법정동 -> 행정동 mapping 코드 (최종)

ewillwin·2022년 7월 28일
0

TSMtech Record

목록 보기
13/39
import chunk
from typing import final
from venv import create
import pandas as pd
import multiprocessing
import base64
import chunk
import os
import sys
import pymysql
import requests
from sqlalchemy import create_engine
import requests
import warnings
import fileinput
import glob
from multiprocessing import Process
import bisect
#-*- coding: cp949 -*-

# Suppress every FutureWarning for the rest of the run.
warnings.simplefilter('ignore', FutureWarning)

# Number of rows pulled from the input files per read_csv chunk.
Chunksize = 10_000

# Kakao REST API keys; the one at position `index` is sent in the
# Authorization header of each address-search request.
kakao_key = [""]
# Position in `kakao_key` of the key currently in use; the conversion loops
# advance it when a request reports that the key's quota is used up.
index = 0
def get_hangjeongdong(addr):  # fetch the administrative-dong code for an address
    """Look up the administrative-dong code (``h_code``) of *addr* via Kakao.

    Sends *addr* as the ``query`` of the Kakao address-search endpoint,
    authenticating with ``kakao_key[index]`` (both module-level globals).

    Returns:
        * ``100`` when the JSON response has no ``documents`` field. Callers
          treat this as "daily API request quota exceeded".
        * ``False`` when no administrative-dong code could be extracted: no
          documents, no usable ``address`` object, or an empty ``h_code``.
        * otherwise the ``h_code`` value of the first document's address.

    Any exception raised by ``requests.get`` or ``.json()`` propagates.
    NOTE(review): the request has no timeout, so a stalled connection blocks
    indefinitely - consider adding one together with retry handling.
    """
    url = "https://dapi.kakao.com/v2/local/search/address.json"
    params = {'query': addr}
    headers = {"Authorization": "KakaoAK " + kakao_key[index]}
    payload = requests.get(url, params=params, headers=headers).json()

    if 'documents' not in payload:  # e.g. the daily API request quota was exceeded
        return 100

    documents = payload['documents']
    if not documents:  # the search matched nothing
        return False

    # The first match may carry a null/absent `address` object (presumably for
    # matches without a lot-number address - confirm against the Kakao docs).
    # Indexing into it blindly would raise, so treat it as "no code found".
    address = documents[0].get('address')
    if not address:
        return False

    h_code = address.get('h_code')
    if not h_code:  # missing or empty code
        return False
    return h_code


# ---------------------------------------------------------------------------
# Legal-dong (bjd) -> administrative-dong (hjd) conversion of the gas and
# electricity files.
#
# Each input row's `bjd` column is replaced by an administrative-dong code:
#   1. from the map1 Excel table when the legal-dong code is listed there;
#   2. otherwise from the address cache (map2_revised.csv), keyed by address;
#   3. otherwise from the Kakao API via get_hangjeongdong(); the answer is
#      appended to the cache file so it is never requested twice.
# Rows whose address cannot be resolved are dropped from the output.
# ---------------------------------------------------------------------------

_MAP2_PATH = "C:/Users/TSM/Downloads/map2_revised.csv"
_MAP2_COLUMNS = ['지번주소', '행정동코드', '법정동코드']


class _ApiKeysExhausted(Exception):
    """Raised when every key in ``kakao_key`` has reported 'quota exceeded'."""


def _load_bjd_to_hjd(path):
    """Read the map1 Excel table into a ``{legal code: administrative code}`` dict.

    Both columns are converted to ``str``. When a legal-dong code occurs more
    than once the first row wins, matching the previous ``.loc[...][0]`` lookup.
    """
    table = pd.read_excel(path, names=['행정동코드', '법정동코드'])
    table = table.astype({'행정동코드': 'str', '법정동코드': 'str'})
    mapping = {}
    for hjd, bjd in zip(table['행정동코드'], table['법정동코드']):
        mapping.setdefault(bjd, hjd)
    return mapping


def _load_address_cache(path):
    """Read the address cache CSV into a ``{address: code or None}`` dict.

    ``None`` marks an address whose lookup is known to have failed: the cache
    file stores such rows with the text ``False`` (or an empty field). The old
    code compared the cached *string* with ``is not False``, which is always
    true, so those rows ended up with ``bjd == 'False'`` instead of being
    dropped.

    Rows whose code is ``'100'`` are ignored. They are artefacts of the
    quota-exceeded sentinel having been cached by earlier runs, so the address
    is looked up again. The first remaining row for an address wins.
    """
    frame = pd.read_csv(path, names=_MAP2_COLUMNS,
                        dtype={name: 'str' for name in _MAP2_COLUMNS},
                        encoding='cp949', low_memory=False)
    cache = {}
    for addr, hjd in zip(frame['지번주소'], frame['행정동코드']):
        if not isinstance(addr, str) or hjd == '100':
            continue
        usable = isinstance(hjd, str) and hjd != 'False'
        cache.setdefault(addr, hjd if usable else None)
    return cache


def _query_api(addr):
    """Ask the Kakao API for *addr*, rotating API keys on quota errors.

    Returns what ``get_hangjeongdong`` returns for a usable key: the code, or
    ``False`` when the address has no administrative-dong code. Raises
    ``_ApiKeysExhausted`` once ``index`` has moved past the last key, so the
    quota sentinel ``100`` is never handed back as if it were a code and
    ``kakao_key[index]`` is never indexed out of range.

    NOTE(review): ``get_hangjeongdong`` returns 100 for *any* response without
    a ``documents`` field; a non-quota API error is therefore also treated as
    an exhausted key.
    """
    global index
    while index < len(kakao_key):
        answer = get_hangjeongdong(addr)
        if answer != 100:
            return answer
        index += 1  # this key is used up; try the next one
    raise _ApiKeysExhausted(addr)


def _append_to_cache_file(addr, hjd, bjd):
    """Append one lookup result to the cache CSV (same format as before).

    A failed lookup (``hjd is False``) is stored as the text ``False`` so the
    address is not requested again on later runs.
    """
    record = pd.DataFrame({'지번주소': [addr], '행정동코드': [hjd], '법정동코드': [bjd]})
    record = record.astype({'지번주소': 'str', '행정동코드': 'str', '법정동코드': 'str'})
    record.to_csv(_MAP2_PATH, mode='a', index=False, encoding='cp949', header=False)


def _convert_chunk(chunk, bjd_to_hjd, addr_cache):
    """Convert the ``bjd`` column of one chunk.

    Returns ``(converted, processed, sent, exhausted)``:
        converted - copy of the processed rows with ``bjd`` replaced and the
                    unresolvable rows removed;
        processed - number of input rows handled (less than ``len(chunk)``
                    only when the API keys ran out part-way through);
        sent      - number of addresses for which an API lookup was attempted;
        exhausted - True when processing stopped because no API key is left.
    """
    codes = []  # one entry per processed row; None means "drop this row"
    sent = 0
    exhausted = False
    for addr, bjd in zip(chunk['addr'], chunk['bjd']):
        hjd = bjd_to_hjd.get(bjd)
        if hjd is None:  # legal-dong code not in map1 -> resolve by address
            if addr not in addr_cache:
                sent += 1
                try:
                    answer = _query_api(addr)
                except _ApiKeysExhausted:
                    exhausted = True
                    break
                _append_to_cache_file(addr, answer, bjd)
                # Remember the answer in memory too, so a repeated address in
                # this or a later chunk costs no further request.
                addr_cache[addr] = None if answer is False else answer
            hjd = addr_cache[addr]
        codes.append(hjd)

    converted = chunk.iloc[:len(codes)].copy()
    converted['bjd'] = codes
    converted = converted[converted['bjd'].notna()]
    return converted, len(codes), sent, exhausted


def _convert_file(src_path, value_column, dst_path, bjd_to_hjd, addr_cache):
    """Convert one input file chunk by chunk, appending the result to *dst_path*.

    *value_column* names the fourth input column ('gas' / 'electricity').
    Stops early - after writing only the rows converted so far - when the API
    keys are exhausted. Returns ``(rows_read, requests_sent)``.
    """
    rows_read = 0
    requests_sent = 0
    reader = pd.read_csv(src_path, chunksize=Chunksize, sep=',', encoding='cp949',
                         names=['date', 'addr', 'bjd', value_column],
                         header=0, low_memory=False)
    for chunk in reader:
        chunk = chunk.astype({'date': 'str', 'addr': 'str', 'bjd': 'str',
                              value_column: 'int'})
        converted, processed, sent, exhausted = _convert_chunk(
            chunk, bjd_to_hjd, addr_cache)
        # Count the rows actually handled; adding Chunksize over-counted the
        # final, shorter chunk.
        rows_read += processed
        requests_sent += sent
        print("읽음: ", rows_read, "    요청 보냄: ", requests_sent)
        converted.to_csv(dst_path, mode='a', index=False, header=None)
        if exhausted:
            print("API 키 소진으로 중단: ", src_path)
            break

    print("=====================읽음: ", rows_read, "    요청 보냄: ", requests_sent, "=====================")
    return rows_read, requests_sent


# Both lookup tables are loaded once and shared by the two conversions; the
# address cache is kept up to date in memory while new results are appended
# to the CSV, so it does not have to be re-read for every chunk.
map1 = _load_bjd_to_hjd("C:/Users/TSM/Downloads/map1.xlsx")
map2 = _load_address_cache(_MAP2_PATH)

x, y = _convert_file("C:/Users/TSM/Downloads/gas_final/gas_final.txt",
                     'gas', "h_gas.csv", map1, map2)

x, y = _convert_file("C:/Users/TSM/Downloads/electricity_final/electricity_final.txt",
                     'electricity', "h_electricity.csv", map1, map2)

다 돌아가는 데 5일 걸림 ㅠ

profile
Software Engineer @ LG Electronics

0개의 댓글