# API ์ด์ฉ
import urllib.parse
import urllib.request
client_id = '<๊ฐ์ธ id>'
client_secret = '<๊ฐ์ธ secret>'
# url ๋ง๋๋ ํจ์
def gen_search_url(api_node, search_text, start_num, disp_num):
base = 'https://openapi.naver.com/v1/search'
node = '/' + api_node + '.json'
param_query = '?query=' + urllib.parse.quote(search_text)
param_start = '&start=' + str(start_num)
param_disp = '&display=' + str(disp_num)
return base + node + param_query + param_start + param_disp
gen_search_url('book', 'ํ์ด์ฌ', 10, 3)
# page์์ ์ ๋ณด ์ป๊ธฐ
import json
import datetime
def get_result_onpage(url):
request = urllib.request.Request(url)
request.add_header('X-Naver-Client-Id', client_id)
request.add_header('X-Naver-Client-Secret', client_secret)
response = urllib.request.urlopen(request)
print('[%s] Url Request Success' % datetime.datetime.now())
return json.loads(response.read().decode('utf-8'))
url = gen_search_url('book', 'ํ์ด์ฌ', 10, 3)
one_result = get_result_onpage(url)
one_result
# ์ ๋ชฉ์์ tag ์ ๊ฑฐ
def delete_tag(input_str):
input_str = input_str.replace('<b>', '')
input_str = input_str.replace('</b>', '')
return input_str
# ํ ํ์ด์ง์ ๋ด์ฉ์ pandas์
import pandas as pd
def get_fields(json_data):
title = [delete_tag(each['title']) for each in json_data['items']]
link = [each['link'] for each in json_data['items']]
price = [each['discount'] for each in json_data['items']]
publisher = [each['publisher'] for each in json_data['items']]
isbn = [each['isbn'] for each in json_data['items']]
result_pd = pd.DataFrame({'title':title, 'price':price, 'isbn':isbn, 'link':link,
'publisher':publisher}, columns=['title','price','publisher','isbn','link'])
return result_pd
# Quick test of the fetch-and-flatten pipeline
url = gen_search_url('book', 'ํ์ด์ฌ', 10, 3)
json_result = get_result_onpage(url)
pd_result = get_fields(json_result)
pd_result
# API๋ฅผ ์ด์ฉํ ์ ๋ณด ์์ง 1000๊ฐ
result_book = []
for n in range(1, 1000, 10):
url = gen_search_url('book', 'ํ์ด์ฌ', n, 100)
json_result = get_result_onpage(url)
pd_result = get_fields(json_result)
result_book.append(pd_result)
result_book = pd.concat(result_book)
# ์ธ๋ฑ์ค ์ ๋ฆฌ
result_book.reset_index(drop=True, inplace=True)
result_book.info()
# ๊ฐ๊ฒฉ์ ๋ฐ์ดํฐํ ์ ๋ฆฌ
result_book['price'] = result_book['price'].astype('float')
result_book.info()
# Quick test against a single catalog page
from bs4 import BeautifulSoup
from urllib.request import urlopen
url = 'https://search.shopping.naver.com/book/catalog/32456895000'
page = urlopen(url)
soup = BeautifulSoup(page, 'html.parser')
soup
soup.find_all(class_ = 'bookBasicInfo_spec__qmQ_N')[0].text
# re
import re
tmp = soup.find_all(class_ = 'bookBasicInfo_spec__qmQ_N')[0].get_text()
result = tmp
# result = re.search('ํ์ด์ง\s+\d+', tmp)
# result.split()
# ์ง๊ธ๊ณผ ๋ง์ง ์์
result
# ํ์ด์ง ์ ๋ณด ์ป๊ธฐ
import re
import numpy as np
import time
def get_page_num(soup):
#time.sleep(1)
tmp = soup.find_all('span', 'bookBasicInfo_spec__qmQ_N')[0].get_text()
# url์ ๋ค์ด๊ฐ๋๋ฐ ํ์ด์ง ์ ๋ณด๊ฐ ์์ ๊ฒฝ์ฐ
try:
#result = re.search('ํ์ด์ง\s+\d+', tmp).group()
#result = tmp.split()[1]
# ์ง๊ธ๊ณผ ๋ง์ง ์์
result = tmp
return result
except:
print('==> Error in get_page_num!')
return np.nan
get_page_num(soup)
# ๋์น ๋ฐ์ดํฐ๋ ์กด์ฌํ๋ค
result_book.info()
# ๋ฐ์ดํฐํ ์ ๋ฆฌ
result_book['page_num'] = result_book['page_num'].str.replace('์ชฝ','')
result_book['page_num'] = result_book['page_num'].astype('float')
result_book.info()
# ๋์น ๋ฐ์ดํฐ ๋ค์ ์๋
for idx, row in result_book.iterrows():
if np.isnan(row['page_num']):
print('start fix...')
print(row['link'])
page_num = get_page_num(BeautifulSoup(urlopen(row['link']), 'html.parser'))
result_book.loc[idx, 'page_num'] = page_num
time.sleep(0.5)
# ๋ค์ ๋ฐ์ดํฐํ ์ ๋ฆฌ
try:
result_book['page_num'] = result_book['page_num'].str.replace('์ชฝ','')
result_book['page_num'] = result_book['page_num'].astype('float')
result_book.info()
except:
pass
# ํ์ด์ง ์ ๋ณด ์๋ ๊ฒ์ ์ ์ธ
result_book = result_book[result_book['page_num'].notnull()]
result_book.info()
# Save the result to Excel
writer = pd.ExcelWriter('./python_books.xlsx', engine='xlsxwriter')
result_book.to_excel(writer, sheet_name='Sheet1')
workbook = writer.book
worksheet = writer.sheets['Sheet1']
worksheet.set_column('A:A', 5)
worksheet.set_column('B:B', 60)
worksheet.set_column('C:C', 10)
worksheet.set_column('D:D', 15)
worksheet.set_column('E:E', 10)
worksheet.set_column('F:F', 50)
writer.save()
# ๋ฐ์ดํฐ ๋ค์ ์ฝ๊ธฐ
raw_data = pd.read_excel('./python_books.xlsx', index_col=0)
raw_data.head()
# seaborn์ผ๋ก ์ดํด๋ณด๊ธฐ
# ํ์ด์ง์ ๊ฐ๊ฒฉ์ ๊ด๊ณ๊ฐ ์๋ค
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_data,)
plt.show()
# Inspect the unusually expensive books
raw_data[raw_data['price']>140000]
# ์ถํ์ฌ ๋ณ๋ก ํ์ธ
raw_data['publisher'].value_counts()
# ์ถํ์ฌ ์
len(raw_data['publisher'].unique())
# matplotlib ํ๊ธ ๋์
import matplotlib.pyplot as plt
import seaborn as sns
import platform
import warnings
from matplotlib import font_manager, rc
warnings.filterwarnings(action= 'ignore') # ๊ฒฝ๊ณ ๋ฌธ๊ตฌ ๋ฌด์
# matplotlib ํ๊ธ๊นจ์ง ๋ณด์
# matplotlib inline
get_ipython().run_line_magic('matplotlib', 'inline')
path = 'C:/Windows/Fonts/malgun.ttf'
if platform.system() == 'Darwin': # mac์ผ ๊ฒฝ์ฐ ํ๊ธ ์ค์
rc('font', family = 'Arial Unicode MS')
elif platform.system() =='Windows' : # window์ผ ๊ฒฝ์ฐ ํ๊ธ ์ค์
font_name = font_manager.FontProperties(fname=path).get_name()
rc('font', family = font_name)
else:
print('unknown system sorry')
# ์ถํ์ฌ๋ณ ํธ์ค ์กด์ฌ
plt.figure(figsize=(15, 6))
sns.countplot(x='publisher', data= raw_data, palette='RdYlGn',
order=raw_data['publisher'].value_counts().index)
plt.xticks(rotation=90)
plt.show()
raw_1 = raw_data[raw_data['publisher']=='์์ด์ฝ์ถํ']
plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_1)
plt.show()
# ์ถํ์ฌ๋ณ
raw_2 = raw_data[raw_data['publisher']=='ํ๋น๋ฏธ๋์ด']
plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_2)
plt.show()
# ๊ฐ๊ฒฉ์ ๋ณด๋ค ์ ์์ธกํ ์ ์์ ๊ฒ๊ฐ๋ค
raw_3 = raw_data[raw_data['publisher']=='๋น์ ์ดํผ๋ธ๋ฆญ']
plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_2)
plt.show()
raw_4 = raw_data[raw_data['publisher']=='์ํค๋ถ์ค']
plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_2)
plt.show()
# box ๊ทธ๋ํ
import plotly.express as px
px.box(raw_data, y='price')
# ํ๊ท ๋ชจ๋ธ ๊ตฌ์ฑ์ ์ํ ๋ฐ์ดํฐ ๋๋๊ธฐ
from sklearn.model_selection import train_test_split
X = raw_data['page_num'].values
y = raw_data['price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
# ๋ชจ๋ธํ์ต
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train, y_train)
# ์๋ฌ ๊ณ์ฐ
from sklearn.metrics import mean_squared_error
pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = (np.sqrt(mean_squared_error(y_train, pred_tr)))
rmse_test = (np.sqrt(mean_squared_error(y_test, pred_test)))
print('RMSE of Train Data : ', rmse_tr)
print('RMSE of Test Data : ', rmse_test)
# train ๋ฐ์ดํฐ์ ์ด์์น๊ฐ ์๊ณ test์ ์์ ๊ฒฝ์ฐ test๋ฐ์ดํฐ์ ๊ฐ์ด ๋ ์์ ์ ์๋ค.
# ์ฐธ๊ฐ๊ณผ ์์ธก๊ฐ
plt.scatter(y_test, pred_test)
plt.xlabel('Actual')
plt.ylabel('Predict')
plt.plot([0, 80000],[0, 80000], 'r')
plt.show()
# ์ด๋ฒ์ ํน์ ์ถํ์ฌ ๊ฒ๋ง
X = raw_1['page_num'].values
y = raw_1['price'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
reg.fit(X_train, y_train)
# ์ถํ์ฌ๋ณ ์์ธก์ด ๋ง์ ๊ฒ๊ฐ๋ค
pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)
rmse_tr = (np.sqrt(mean_squared_error(y_train, pred_tr)))
rmse_test = (np.sqrt(mean_squared_error(y_test, pred_test)))
print('RMSE of Train Data : ', rmse_tr)
print('RMSE of Teset Data : ', rmse_test)
# ๋ถ์ผ๋ณ๋ก ์ ๊ฒํ์ง ๋ชปํ์ง๋ง ์ถํ์ฌ๋ณ๋ก๋ ์๋ฏธ๊ฐ ์๋ค
plt.scatter(y_test, pred_test)
plt.xlabel('Actual')
plt.ylabel('Predict')
plt.plot([0, 120000],[0, 120000], 'r')
plt.show()
# Source: Zerobase Data School