다양한 소스에서 데이터 수집
수집한 데이터를 목적에 맞게 전처리
데이터 모델링 및 모델 간 교차검증
다변량 시계열 분석 최종 모델 개발
KOPIS에서 제공해준 데이터
KOPIS 홈페이지에 공개해 둔 데이터
KDI 에서 허용한 데이터
공공데이터포털
서울시 열린 데이터 광장
그 외 필요한 경우 크롤링
# 필요한 라이브러리
import requests
from bs4 import BeautifulSoup as bs
from urllib import parse
import pandas as pd
serviceKey = '본인의 key' # 공공데이터포털에서 발급받은 서비스키
params = {'ServiceKey':parse.unquote(serviceKey), # 서비스키(필수)
'startCreateDt':20200101, # 데이터 생성일 시작범위(선택)
'endCreateDt':20210831} # 데이터 생성일 종료범위(선택)
# 서비스URL
url = 'http://openapi.data.go.kr/openapi/service/rest/Covid19/getCovid19InfStateJson?'
res = requests.get(url, params=params)
soup = bs(res.text, 'lxml')
items = soup.find_all('item')
lst = []
for y in items:
l = {}
for x in y:
l[x.name]=x.text
lst.append(l)
# list를 dataframe으로 만들기
df = pd.DataFrame(lst)
df.columns = df.columns.map(response)
df
누적확진률 | 누적검사 | 누적검사완료 | 치료중 | 격리해제 | 등록일시 | 사망자 | 확진자 | 검사진행 | 음성 | 고유번호 | 기준일 | 기준시간 | 수정 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.5018316920 | 4213867 | 4043529 | 17557 | 42270 | 2020-12-31 09:35:07.493 | 900 | 60727 | 170338 | 3982802 | 371 | 20201231 | 00:00 | 2021-04-20 16:12:20.079 |
1 | 1.4985939397 | 4159509 | 3987738 | 17446 | 41435 | 2020-12-30 09:35:46.266 | 879 | 59760 | 171771 | 3927978 | 370 | 20201230 | 00:00 | 2021-04-20 16:13:02.935 |
2 | 1.4949567172 | 4098166 | 3927204 | 17148 | 40703 | 2020-12-29 09:37:33.024 | 859 | 58710 | 170962 | 3868494 | 369 | 20201229 | 00:00 | 2021-04-20 16:13:28.79 |
3 | 1.4906876613 | 4038294 | 3868416 | 17579 | 39268 | 2020-12-28 09:40:11.19 | 819 | 57666 | 169878 | 3810750 | 368 | 20201228 | 00:00 | 2021-04-20 16:14:02.198 |
4 | 1.4809513093 | 4006400 | 3839424 | 17012 | 39040 | 2020-12-27 09:32:40.326 | 808 | 56860 | 166976 | 3782564 | 367 | 20201227 | 00:00 | 2021-04-20 16:14:27.518 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
331 | NaN | NaN | NaN | NaN | 0 | 2020-02-03 21:26:59.59 | 0 | 0 | 0 | NaN | 4 | 20200204 | 00:00 | 2020-02-03 21:26:59.59 |
332 | NaN | NaN | NaN | NaN | 0 | 2020-02-03 14:41:17.17 | 0 | 15 | 0 | NaN | 3 | 20200203 | 09:00 | 2020-02-03 14:41:17.17 |
333 | NaN | NaN | NaN | NaN | 2 | 2020-02-03 12:22:49.49 | 2 | 2 | 2 | NaN | 2 | 20200202 | 09:00 | 2020-02-03 12:22:49.49 |
334 | NaN | NaN | NaN | NaN | 1 | 2020-01-31 17:47:33.33 | 0 | 0 | 1 | NaN | 1 | 20200201 | 18:00 | 2020-01-31 17:47:33.33 |
335 | NaN | NaN | NaN | 1 | 2020-01-31 17:47:33.33 | 0 | 0 | 1 | NaN | 1 | 20200101 | 18:00 | 2020-02-03 12:21:56.56 |
336 rows × 14 columns
df['확진자']=df['확진자'].astype(int)
decidecnt=[]
for index, val in enumerate(df['확진자']):
if(index < len(df['확진자'])-1):
decidecnt.append(df['확진자'][index]-df['확진자'][index+1])
decidecnt = pd.Series(decidecnt)
df['확진자'] = decidecnt
type(df['확진자'][0])
df['확진자'][0]-df['확진자'][1]
967
df['등록일시']=df['등록일시'].str.split(" ").str[0]
df['등록일시'] = df['등록일시'].apply(pd.to_datetime)
df['등록일시'].value_counts()
2020-02-03 3
2020-01-31 2
2020-08-18 2
2020-12-19 2
2020-12-09 2
..
2020-05-29 1
2020-04-28 1
2020-03-28 1
2020-02-26 1
2020-02-11 1
Name: 등록일시, Length: 330, dtype: int64
df_refined = df[['등록일시','확진자']].sort_values('등록일시',ascending=True).reset_index()
df_refined.drop('index',axis=1, inplace=True)
df_refined.iloc[0,:]
등록일시 2020-01-31 00:00:00
확진자 0
Name: 0, dtype: object
df_refined.to_csv('코로나19일별확진자수.csv')
covid19 = pd.read_excel('결과물파일.xlsx')
covid19['날짜'] = covid19['날짜'].astype(str)
covid19['날짜'] = covid19['날짜'].str.split("-").str[0]+\
covid19['날짜'].str.split("-").str[1]+\
covid19['날짜'].str.split("-").str[2]
covid19['날짜']
0 20200101
1 20200102
2 20200103
3 20200104
4 20200105
...
361 20201227
362 20201228
363 20201229
364 20201230
365 20201231
Name: 날짜, Length: 366, dtype: object
covid19 = pd.read_csv('코로나19일별확진자수.csv', encoding='cp949')
#OTT데이터 전처리
ott = ott[:-1]
ott_user_cnt = []
ott_user_install = []
ott_user_time = []
ott
for index, j in enumerate(ott):
block = j.split(",")
ott_user_cnt.append(block[1].split(":")[1])
ott_user_install.append(block[2].split(":")[1])
ott_user_time.append(block[3].split(":")[1][:-1])
dict_data = {'ott이용자수':ott_user_cnt,
'ott신규설치': ott_user_install,
'ott사용시간': ott_user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_OTT이용.csv")
#배달앱 데이터 전처리
delivery=read_txt("배달앱이용.txt")
delivery = delivery[:-1]
del_user_cnt = []
del_user_install = []
del_user_time = []
ott
for index, j in enumerate(delivery):
block = j.split(",")
del_user_cnt.append(block[1].split(":")[1])
del_user_install.append(block[2].split(":")[1])
del_user_time.append(block[3].split(":")[1][:-1])
len(del_user_cnt)
973
dict_data = {'배달앱이용자수':del_user_cnt,
'배달앱신규설치': del_user_install,
'배달앱사용시간': del_user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_배달앱이용.csv")
#화상회의앱 데이터 전처리
meeting = read_txt("화상회의앱.txt")
meeting = meeting[:-1]
user_cnt = []
user_install = []
user_time = []
ott
for index, j in enumerate(meeting):
block = j.split(",")
user_cnt.append(block[1].split(":")[1])
user_install.append(block[2].split(":")[1])
user_time.append(block[3].split(":")[1][:-1])
print(user_cnt)
print(user_install)
print(user_time)
dict_data = {'화상회의앱이용자수':user_cnt,
'화상회의앱신규설치': user_install,
'화상회의앱사용시간': user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_화상회의앱이용.csv")
#중고거래앱 데이터 전처리
used = read_txt("중고거래앱.txt")
used = used[:-1]
user_cnt = []
user_install = []
user_time = []
for index, j in enumerate(used):
block = j.split(",")
user_cnt.append(block[1].split(":")[1])
user_install.append(block[2].split(":")[1])
user_time.append(block[3].split(":")[1][:-1])
print(user_cnt)
print(user_install)
print(user_time)
dict_data = {'중고거래앱이용자수':user_cnt,
'중고거래앱신규설치': user_install,
'중고거래앱사용시간': user_time}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202008_중고거래앱이용.csv")
#코로나 누락된 일별 데이터가 포함된 세 데이터 전처리
covid = read_txt("코로나.txt")
covid[0].split(",")[0].split(":")[2].split("T")[0].split("\"")[1]
'2020-01-20'
covid = read_txt("코로나.txt")
covid = covid[:-1]
date = []
user_cnt = []
for index, j in enumerate(covid):
block = j.split(",")
date.append(block[0].split(":")[2].split("T")[0].split("\"")[1])
user_cnt.append(block[3].split(":")[1])
print(date)
print(user_cnt)
dict_data = {'날짜':date,'신규확진자수':user_cnt}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202007_코로나확진자수.csv")
['2020-01-20', '2020-01-21', '2020-01-22', '2020-01-23', '2020-01-24', '2020-01-25', '2020-01-26', '2020-01-27', '2020-01-28', '2020-01-29', '2020-01-30', '2020-01-31', '2020-02-01', '2020-02-02', '2020-02-03', '2020-02-04', '2020-02-05', '2020-02-06', '2020-02-07', '2020-02-08', '2020-02-09', '2020-02-10', '2020-02-11', '2020-02-12', '2020-02-13', '2020-02-14', '2020-02-15', '2020-02-16', '2020-02-17', '2020-02-18', '2020-02-19', '2020-02-20', '2020-02-21', '2020-02-22', '2020-02-23', '2020-02-24', '2020-02-25', '2020-02-26', '2020-02-27', '2020-02-28', '2020-02-29', '2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04', '2020-03-05', '2020-03-06', '2020-03-07', '2020-03-08', '2020-03-09', '2020-03-10', '2020-03-11', '2020-03-12', '2020-03-13', '2020-03-14', '2020-03-15', '2020-03-16', '2020-03-17', '2020-03-18', '2020-03-19', '2020-03-20', '2020-03-21', '2020-03-22', '2020-03-23', '2020-03-24', '2020-03-25', '2020-03-26', '2020-03-27', '2020-03-28', '2020-03-29', '2020-03-30', '2020-03-31', '2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04', '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08', '2020-04-09', '2020-04-10', '2020-04-11', '2020-04-12', '2020-04-13', '2020-04-14', '2020-04-15', '2020-04-16', '2020-04-17', '2020-04-18', '2020-04-19', '2020-04-20', '2020-04-21', '2020-04-22', '2020-04-23', '2020-04-24', '2020-04-25', '2020-04-26', '2020-04-27', '2020-04-28', '2020-04-29', '2020-04-30', '2020-05-01', '2020-05-02', '2020-05-03', '2020-05-04', '2020-05-05', '2020-05-06', '2020-05-07', '2020-05-08', '2020-05-09', '2020-05-10', '2020-05-11', '2020-05-12', '2020-05-13', '2020-05-14', '2020-05-15', '2020-05-16', '2020-05-17', '2020-05-18', '2020-05-19', '2020-05-20', '2020-05-21', '2020-05-22', '2020-05-23', '2020-05-24', '2020-05-25', '2020-05-26', '2020-05-27', '2020-05-28', '2020-05-29', '2020-05-30', '2020-05-31', '2020-06-01', '2020-06-02', '2020-06-03', '2020-06-04', '2020-06-05', '2020-06-06', '2020-06-07', '2020-06-08', '2020-06-09', '2020-06-10', '2020-06-11', '2020-06-12', '2020-06-13', '2020-06-14', '2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20', '2020-06-21', '2020-06-22', '2020-06-23', '2020-06-24', '2020-06-25', '2020-06-26', '2020-06-27', '2020-06-28', '2020-06-29', '2020-06-30', '2020-07-01', '2020-07-02', '2020-07-03', '2020-07-04', '2020-07-05', '2020-07-06', '2020-07-07', '2020-07-08', '2020-07-09', '2020-07-10', '2020-07-11', '2020-07-12', '2020-07-13', '2020-07-14', '2020-07-15', '2020-07-16', '2020-07-17', '2020-07-18', '2020-07-19', '2020-07-20', '2020-07-21', '2020-07-22', '2020-07-23', '2020-07-24', '2020-07-25', '2020-07-26', '2020-07-27', '2020-07-28', '2020-07-29', '2020-07-30', '2020-07-31', '2020-08-01', '2020-08-02', '2020-08-03', '2020-08-04', '2020-08-05', '2020-08-06', '2020-08-07', '2020-08-08', '2020-08-09', '2020-08-10', '2020-08-11', '2020-08-12', '2020-08-13', '2020-08-14', '2020-08-15', '2020-08-16', '2020-08-17', '2020-08-18', '2020-08-19', '2020-08-20', '2020-08-21', '2020-08-22', '2020-08-23', '2020-08-24', '2020-08-25', '2020-08-26', '2020-08-27', '2020-08-28', '2020-08-29', '2020-08-30', '2020-08-31', '2020-09-01', '2020-09-02', '2020-09-03', '2020-09-04', '2020-09-05', '2020-09-06', '2020-09-07', '2020-09-08', '2020-09-09', '2020-09-10', '2020-09-11', '2020-09-12', '2020-09-13', '2020-09-14', '2020-09-15', '2020-09-16', '2020-09-17', '2020-09-18', '2020-09-19', '2020-09-20', '2020-09-21', '2020-09-22', '2020-09-23', '2020-09-24', '2020-09-25', '2020-09-26', '2020-09-27', '2020-09-28', '2020-09-29', '2020-09-30', '2020-10-01', '2020-10-02', '2020-10-03', '2020-10-04', '2020-10-05', '2020-10-06', '2020-10-07', '2020-10-08', '2020-10-09', '2020-10-10', '2020-10-11', '2020-10-12', '2020-10-13', '2020-10-14', '2020-10-15', '2020-10-16', '2020-10-17', '2020-10-18', '2020-10-19', '2020-10-20', '2020-10-21', '2020-10-22', '2020-10-23', '2020-10-24', '2020-10-25', '2020-10-26', '2020-10-27', '2020-10-28', '2020-10-29', '2020-10-30', '2020-10-31', '2020-11-01', '2020-11-02', '2020-11-03', '2020-11-04', '2020-11-05', '2020-11-06', '2020-11-07', '2020-11-08', '2020-11-09', '2020-11-10', '2020-11-11', '2020-11-12', '2020-11-13', '2020-11-14', '2020-11-15', '2020-11-16', '2020-11-17', '2020-11-18', '2020-11-19', '2020-11-20', '2020-11-21', '2020-11-22', '2020-11-23', '2020-11-24', '2020-11-25', '2020-11-26', '2020-11-27', '2020-11-28', '2020-11-29', '2020-11-30', '2020-12-01', '2020-12-02', '2020-12-03', '2020-12-04', '2020-12-05', '2020-12-06', '2020-12-07', '2020-12-08', '2020-12-09', '2020-12-10', '2020-12-11', '2020-12-12', '2020-12-13', '2020-12-14', '2020-12-15', '2020-12-16', '2020-12-17', '2020-12-18', '2020-12-19', '2020-12-20', '2020-12-21', '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25', '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29', '2020-12-30', '2020-12-31', '2021-01-01', '2021-01-02', '2021-01-03', '2021-01-04', '2021-01-05', '2021-01-06', '2021-01-07', '2021-01-08', '2021-01-09', '2021-01-10', '2021-01-11', '2021-01-12', '2021-01-13', '2021-01-14', '2021-01-15', '2021-01-16', '2021-01-17', '2021-01-18', '2021-01-19', '2021-01-20', '2021-01-21', '2021-01-22', '2021-01-23', '2021-01-24', '2021-01-25', '2021-01-26', '2021-01-27', '2021-01-28', '2021-01-29', '2021-01-30', '2021-01-31', '2021-02-01', '2021-02-02', '2021-02-03', '2021-02-04', '2021-02-05', '2021-02-06', '2021-02-07', '2021-02-08', '2021-02-09', '2021-02-10', '2021-02-11', '2021-02-12', '2021-02-13', '2021-02-14', '2021-02-15', '2021-02-16', '2021-02-17', '2021-02-18', '2021-02-19', '2021-02-20', '2021-02-21', '2021-02-22', '2021-02-23', '2021-02-24', '2021-02-25', '2021-02-26', '2021-02-27', '2021-02-28', '2021-03-01', '2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05', '2021-03-06', '2021-03-07', '2021-03-08', '2021-03-09', '2021-03-10', '2021-03-11', '2021-03-12', '2021-03-13', '2021-03-14', '2021-03-15', '2021-03-22', '2021-03-21', '2021-03-20', '2021-03-19', '2021-03-18', '2021-03-17', '2021-03-16', '2021-03-23', '2021-03-29', '2021-03-24', '2021-03-25', '2021-03-26', '2021-03-27', '2021-03-28', '2021-04-04', '2021-04-03', '2021-04-02', '2021-04-01', '2021-03-31', '2021-03-30', '2021-04-11', '2021-04-10', '2021-04-09', '2021-04-08', '2021-04-07', '2021-04-06', '2021-04-05', '2021-04-18', '2021-04-17', '2021-04-16', '2021-04-15', '2021-04-14', '2021-04-13', '2021-04-12', '2021-04-25', '2021-04-24', '2021-04-23', '2021-04-22', '2021-04-21', '2021-04-20', '2021-04-19', '2021-05-02', '2021-05-01', '2021-04-30', '2021-04-29', '2021-04-28', '2021-04-27', '2021-04-26', '2021-05-09', '2021-05-08', '2021-05-07', '2021-05-06', '2021-05-05', '2021-05-04', '2021-05-03', '2021-05-12', '2021-05-11', '2021-05-10', '2021-05-23', '2021-05-22', '2021-05-21', '2021-05-20', '2021-05-19', '2021-05-18', '2021-05-17', '2021-05-16', '2021-05-15', '2021-05-14', '2021-05-13', '2021-05-30', '2021-05-29', '2021-05-28', '2021-05-27', '2021-05-26', '2021-05-25', '2021-05-24', '2021-06-05', '2021-06-04', '2021-06-03', '2021-06-02', '2021-06-01', '2021-06-13', '2021-06-12', '2021-06-11', '2021-06-10', '2021-06-09', '2021-06-08', '2021-06-07', '2021-06-20', '2021-06-19', '2021-06-18', '2021-06-17', '2021-06-16', '2021-06-15', '2021-06-14', '2021-06-27', '2021-06-26', '2021-06-25', '2021-06-24', '2021-06-23', '2021-06-22', '2021-06-21', '2021-06-06', '2021-05-31', '2021-07-05', '2021-07-04', '2021-07-03', '2021-07-02', '2021-07-01', '2021-06-30', '2021-06-29', '2021-06-28', '2021-07-11', '2021-07-10', '2021-07-09', '2021-07-08', '2021-07-07', '2021-07-06', '2021-07-18', '2021-07-17', '2021-07-16', '2021-07-15', '2021-07-14', '2021-07-13', '2021-07-12', '2021-07-25', '2021-07-24', '2021-07-23', '2021-07-22', '2021-07-21', '2021-07-20', '2021-07-19', '2021-08-01', '2021-07-31', '2021-07-30', '2021-07-29', '2021-07-28', '2021-07-27', '2021-07-26', '2021-08-09', '2021-08-08', '2021-08-07', '2021-08-06', '2021-08-05', '2021-08-04', '2021-08-03']
[' 1', ' 0', ' 0', ' 0', ' 1', ' 0', ' 1', ' 1', ' 0', ' 0', ' 3', ' 4', ' 1', ' 3', ' 0', ' 1', ' 2', ' 5', ' 1', ' 0', ' 3', ' 0', ' 1', ' 0', ' 0', ' 0', ' 0', ' 1', ' 1', ' 1', ' 15', ' 36', ' 74', ' 190', ' 210', ' 207', ' 130', ' 253', ' 449', ' 427', ' 909', ' 595', ' 686', ' 600', ' 516', ' 438', ' 518', ' 483', ' 367', ' 248', ' 131', ' 242', ' 114', ' 110', ' 107', ' 76', ' 74', ' 84', ' 93', ' 152', ' 87', ' 147', ' 98', ' 64', ' 76', ' 100', ' 104', ' 91', ' 146', ' 105', ' 78', ' 125', ' 101', ' 89', ' 86', ' 94', ' 81', ' 47', ' 47', ' 53', ' 39', ' 27', ' 30', ' 32', ' 25', ' 27', ' 27', ' 22', ' 22', ' 18', ' 8', ' 13', ' 9', ' 11', ' 8', ' 6', ' 10', ' 10', ' 10', ' 14', ' 9', ' 4', ' 9', ' 6', ' 13', ' 8', ' 3', ' 2', ' 4', ' 12', ' 18', ' 34', ' 35', ' 27', ' 26', ' 29', ' 27', ' 19', ' 13', ' 15', ' 13', ' 32', ' 12', ' 20', ' 23', ' 25', ' 16', ' 19', ' 40', ' 79', ' 58', ' 39', ' 27', ' 35', ' 38', ' 49', ' 39', ' 39', ' 51', ' 57', ' 38', ' 38', ' 50', ' 45', ' 56', ' 49', ' 34', ' 37', ' 34', ' 43', ' 59', ' 49', ' 67', ' 48', ' 17', ' 46', ' 51', ' 28', ' 39', ' 51', ' 62', ' 42', ' 43', ' 51', ' 54', ' 61', ' 63', ' 61', ' 48', ' 44', ' 62', ' 50', ' 45', ' 35', ' 44', ' 62', ' 33', ' 39', ' 61', ' 60', ' 39', ' 34', ' 26', ' 45', ' 63', ' 59', ' 41', ' 113', ' 58', ' 25', ' 28', ' 48', ' 18', ' 36', ' 31', ' 30', ' 23', ' 34', ' 33', ' 43', ' 20', ' 43', ' 36', ' 28', ' 34', ' 54', ' 56', ' 103', ' 166', ' 279', ' 197', ' 246', ' 297', ' 288', ' 324', ' 332', ' 397', ' 266', ' 280', ' 320', ' 441', ' 371', ' 323', ' 299', ' 248', ' 235', ' 267', ' 195', ' 198', ' 168', ' 167', ' 119', ' 136', ' 156', ' 155', ' 176', ' 136', ' 121', ' 109', ' 106', ' 113', ' 153', ' 126', ' 110', ' 82', ' 70', ' 61', ' 110', ' 125', ' 114', ' 61', ' 95', ' 50', ' 38', ' 113', ' 77', ' 63', ' 75', ' 64', ' 73', ' 75', ' 114', ' 69', ' 54', ' 72', ' 58', ' 98', ' 102', ' 84', ' 110', ' 47', ' 73', ' 91', ' 76', ' 58', ' 91', ' 121', ' 155', ' 77', ' 61', ' 119', ' 88', ' 103', ' 125', ' 114', ' 127', ' 124', ' 97', ' 75', ' 117', ' 125', ' 145', ' 89', ' 143', ' 126', ' 100', ' 146', ' 143', ' 191', ' 205', ' 208', ' 222', ' 230', ' 285', ' 325', ' 348', ' 386', ' 330', ' 271', ' 349', ' 382', ' 581', ' 555', ' 503', ' 450', ' 438', ' 451', ' 511', ' 540', ' 628', ' 577', ' 631', ' 615', ' 589', ' 670', ' 680', ' 689', ' 950', ' 1030', ' 718', ' 880', ' 1078', ' 1014', ' 1064', ' 1050', ' 1097', ' 925', ' 867', ' 1090', ' 985', ' 1241', ' 1132', ' 970', ' 807', ' 1045', ' 1050', ' 967', ' 1029', ' 824', ' 657', ' 1020', ' 715', ' 840', ' 870', ' 674', ' 641', ' 665', ' 451', ' 537', ' 562', ' 524', ' 513', ' 580', ' 520', ' 389', ' 386', ' 404', ' 401', ' 346', ' 431', ' 392', ' 437', ' 350', ' 559', ' 497', ' 469', ' 456', ' 355', ' 305', ' 336', ' 467', ' 451', ' 370', ' 393', ' 371', ' 287', ' 302', ' 444', ' 504', ' 403', ' 362', ' 326', ' 343', ' 457', ' 621', ' 621', ' 561', ' 448', ' 416', ' 332', ' 356', ' 440', ' 395', ' 387', ' 415', ' 355', ' 355', ' 344', ' 444', ' 424', ' 398', ' 418', ' 416', ' 346', ' 446', ' 470', ' 465', ' 488', ' 490', ' 459', ' 382', ' 415', ' 451', ' 447', ' 463', ' 445', ' 469', ' 363', ' 346', ' 384', ' 428', ' 430', ' 494', ' 505', ' 482', ' 543', ' 543', ' 557', ' 551', ' 503', ' 444', ' 614', ' 677', ' 670', ' 700', ' 667', ' 477', ' 473', ' 672', ' 658', ' 673', ' 698', ' 730', ' 541', ' 586', ' 644', ' 785', ' 797', ' 735', ' 731', ' 549', ' 532', ' 606', ' 627', ' 661', ' 679', ' 769', ' 512', ' 499', ' 564', ' 701', ' 525', ' 574', ' 676', ' 541', ' 488', ' 635', ' 511', ' 463', ' 585', ' 666', ' 561', ' 646', ' 654', ' 528', ' 619', ' 610', ' 681', ' 747', ' 715', ' 480', ' 533', ' 587', ' 629', ' 706', ' 516', ' 530', ' 744', ' 695', ' 681', ' 677', ' 459', ' 452', ' 565', ' 556', ' 611', ' 602', ' 453', ' 485', ' 429', ' 482', ' 507', ' 540', ' 545', ' 373', ' 397', ' 614', ' 668', ' 634', ' 610', ' 645', ' 394', ' 357', ' 555', ' 429', ' 711', ' 743', ' 794', ' 826', ' 761', ' 794', ' 595', ' 501', ' 1324', ' 1378', ' 1316', ' 1275', ' 1212', ' 746', ' 1454', ' 1452', ' 1536', ' 1600', ' 1615', ' 1150', ' 1100', ' 1487', ' 1629', ' 1630', ' 1842', ' 1784', ' 1277', ' 1251', ' 1442', ' 1539', ' 1710', ' 1673', ' 1895', ' 1363', ' 1318', ' 1492', ' 1729', ' 1823', ' 1703', ' 1775', ' 1725', ' 1200']
df.to_csv("201901_202007_코로나확진자수.csv")
df
날짜 | 신규확진자수 | |
---|---|---|
0 | 2020-01-20 | 1 |
1 | 2020-01-21 | 0 |
2 | 2020-01-22 | 0 |
3 | 2020-01-23 | 0 |
4 | 2020-01-24 | 1 |
... | ... | ... |
562 | 2021-08-07 | 1823 |
563 | 2021-08-06 | 1703 |
564 | 2021-08-05 | 1775 |
565 | 2021-08-04 | 1725 |
566 | 2021-08-03 | 1200 |
567 rows × 2 columns
df = df.sort_values(by='날짜')['신규확진자수']
df
0 1
1 0
2 0
3 0
4 1
...
564 1775
563 1703
562 1823
561 1729
560 1492
Name: 신규확진자수, Length: 567, dtype: object
#서울 지하철 이용 데이터 전처리(모델에는 이 데이터가 쓰이지는 않음)
subway = read_txt("서울지하철이용.txt")
user_cnt = []
time = []
for index, j in enumerate(subway):
user_cnt.append(j.split(",")[1].split("{")[1][:-2].split(":")[1][1:])
temp = j.split(",")[0].split("{")[1][:-2].split(":")[1][1:].split("\"")[1].\
split("T")[0]
time.append(temp)
print(time)
dict_data = {'지하철이용자수':user_cnt}
df = pd.DataFrame(dict_data)
df.to_csv("201901_202007_지하철이용자수.csv")
# dict_data = {'날짜': time,
# '확인':time}
# df = pd.DataFrame(dict_data)
df_date = pd.read_csv(
'내 저장 경로\\날짜.csv'
,encoding='cp949')
temp = pd.merge(left = df , right = df_date, how = "outer", on = "날짜")
temp = temp.sort_values(by='날짜')
temp.to_csv('내 저장 경로\\누락날짜확인.csv')
subway = read_txt("서울지하철이용.txt")
subway
['365: {date: "2019-01-01T00:00:00.000Z", sum: {total: 2169874}}',
'366: {date: "2019-01-02T00:00:00.000Z", sum: {total: 5067609}}',
'367: {date: "2019-01-03T00:00:00.000Z", sum: {total: 5255830}}',
'368: {date: "2019-01-04T00:00:00.000Z", sum: {total: 5485567}}',
'369: {date: "2019-01-05T00:00:00.000Z", sum: {total: 3921397}}',
'370: {date: "2019-01-06T00:00:00.000Z", sum: {total: 2777687}}',
'371: {date: "2019-01-07T00:00:00.000Z", sum: {total: 5199344}}',
...
...
'1269: {date: "2021-06-23T00:00:00.000Z", sum: {total: 4393854}}',
'1270: {date: "2021-06-24T00:00:00.000Z", sum: {total: 4433884}}',
'1271: {date: "2021-06-25T00:00:00.000Z", sum: {total: 4565923}}',
'1272: {date: "2021-06-26T00:00:00.000Z", sum: {total: 2918361}}',
'1273: {date: "2021-06-27T00:00:00.000Z", sum: {total: 2113908}}',
'1274: {date: "2021-06-28T00:00:00.000Z", sum: {total: 4247266}}',
'1275: {date: "2021-06-29T00:00:00.000Z", sum: {total: 4373982}}',
'1276: {date: "2021-06-30T00:00:00.000Z", sum: {total: 4358985}}']
#지하철 직접 집계
subway_raw = pd.read_csv("지하철_202108.csv", encoding="cp949")
subway_agg = subway_raw.groupby("사용일자").sum()
subway_agg = subway_agg.sort_values(by='사용일자')
subway_agg.to_csv("지하철202107_202108.csv")
movie = read_txt("영화(201801_202106).txt")
#영화관람 데이터 전처리(모델에는 안쓰임)
movie_list = []
for i in movie:
temp = i.split(",")[3].split(":")[1][1:]
movie_list.append(temp)
movie_list
dict_data = {"movie_sales":movie_list}
movie_df = pd.DataFrame(dict_data)
movie_df
movie_sales | |
---|---|
0 | 10309962150 |
1 | 2805968850 |
2 | 3086189000 |
3 | 3357281295 |
4 | 7145939653 |
... | ... |
907 | 3338439630 |
908 | 2990616610 |
909 | 999626730 |
910 | 996710000 |
911 | 1243958090 |
912 rows × 1 columns
df = pd.read_csv("내 저장 경로\\CARD_SUBWAY_MONTH_201901.csv",encoding='cp949')
#모델에 쓰인 지하철 데이터 전처리
import pandas as pd
subway_list = []
for i in range(1,10):
subway_list.append(pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_20190{i}.csv",encoding='cp949'))
df_10=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201910.csv",encoding='cp949')
df_11=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201911.csv",encoding='cp949')
df_12=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_201912.csv",encoding='cp949')
subway_list.append(df_10)
subway_list.append(df_11)
subway_list.append(df_12)
subway_list
[ 사용일자 노선명 역명 승차총승객수 하차총승객수 등록일자
0 20190101 2호선 을지로4가 3862.0 3728.0 20190104
1 20190101 2호선 을지로3가 8104.0 7554.0 20190104
2 20190101 2호선 을지로입구 22478.0 21330.0 20190104
3 20190101 2호선 시청 8381.0 6049.0 20190104
4 20190101 1호선 동묘앞 8045.0 8504.0 20190104
... ... ... ... ... ... ...
18329 20190131 우이신설선 북한산보국문 6195.0 5786.0 20190203
18330 20190131 우이신설선 정릉 4600.0 4211.0 20190203
18331 20190131 우이신설선 성신여대입구(돈암) 3922.0 4289.0 20190203
18332 20190131 우이신설선 보문 1659.0 1693.0 20190203
18333 20190131 우이신설선 신설동 2075.0 2215.0 20190203
... 후략 ...
[18333 rows x 6 columns]]
#df_8은 나누기 2
subway_list.append(df)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202001.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202002.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202003.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202004.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202005.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202006.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202007.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202008.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
df1=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202009.csv",encoding='cp949')
df2=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202010.csv",encoding='cp949')
df3=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202011.csv",encoding='cp949')
df4=pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_202012.csv",encoding='cp949')
subway_list.append(df1)
subway_list.append(df2)
subway_list.append(df3)
subway_list.append(df4)
len(subway_list)
31
for i in range(1,8):
subway_list.append(pd.read_csv(f"내 저장 경로\\CARD_SUBWAY_MONTH_20210{i}.csv",encoding='cp949'))
subway_list_2 = []
for i in subway_list:
subway_list_2.append(i.loc[:,['사용일자','승차총승객수']])
len(subway_list_2)
31
res=pd.DataFrame()
for i in subway_list_2:
print(i.groupby("사용일자").sum())
res = pd.concat([res,i.groupby("사용일자").sum()])
승차총승객수
사용일자
20190101 3419948.0
20190102 7621668.0
20190103 7873692.0
20190104 8270345.0
20190105 6062294.0
20190106 4408718.0
20190107 7824428.0
20190108 7913162.0
20190109 7815405.0
20190110 7977300.0
20190111 8351322.0
20190112 6282189.0
20190113 4501053.0
20190114 7680436.0
... 후략...
res.to_csv("지하철201901_202107.csv")
다음 포스트에 이어서...