๐Ÿ˜ข ์Šคํ„ฐ๋””๋…ธํŠธ (Machine Learning 14)

zoeยท2023๋…„ 5์›” 30์ผ
0

๋„ค์ด๋ฒ„ ์ฑ… ๊ฐ€๊ฒฉ ํšŒ๊ท€๋ถ„์„

# API ์ด์šฉ

import urllib.parse
import urllib.request

# Naver Open API credentials, issued per application at developers.naver.com.
# These are placeholders — fill in real values before running.
client_id = '<๊ฐœ์ธ id>'
client_secret = '<๊ฐœ์ธ secret>'
# Helper that composes the search request URL

def gen_search_url(api_node, search_text, start_num, disp_num):
    """Build a Naver Open API search URL.

    Parameters
    ----------
    api_node : str
        API endpoint node, e.g. 'book' or 'news'.
    search_text : str
        Query text; percent-encoded before being placed in the URL.
    start_num : int
        1-based index of the first result to return.
    disp_num : int
        Number of results per page (the API caps this at 100).

    Returns
    -------
    str
        The fully composed request URL.
    """
    base = 'https://openapi.naver.com/v1/search'
    node = '/' + api_node + '.json'
    # NOTE: urllib.parse must be imported explicitly; the original relied on
    # `import urllib.request` pulling it in, which is a CPython side effect.
    param_query = '?query=' + urllib.parse.quote(search_text)
    param_start = '&start=' + str(start_num)
    param_disp = '&display=' + str(disp_num)

    return base + node + param_query + param_start + param_disp

# Smoke test: compose a sample URL (value is displayed by the notebook cell).
gen_search_url('book', 'ํŒŒ์ด์ฌ', 10, 3)
# page์—์„œ ์ •๋ณด ์–ป๊ธฐ

import json
import datetime

def get_result_onpage(url):
    """Request one page of API results and return the decoded JSON dict."""
    # Authentication headers can be handed straight to the Request
    # constructor; it applies them exactly like add_header() would.
    auth_headers = {
        'X-Naver-Client-Id': client_id,
        'X-Naver-Client-Secret': client_secret,
    }
    req = urllib.request.Request(url, headers=auth_headers)

    response = urllib.request.urlopen(req)

    print('[%s] Url Request Success' % datetime.datetime.now())

    body = response.read().decode('utf-8')
    return json.loads(body)
# Fetch one page (start=10, display=3) and inspect the raw JSON response.
url = gen_search_url('book', 'ํŒŒ์ด์ฌ', 10, 3)
one_result = get_result_onpage(url)
one_result
# Strip the <b>...</b> search-highlight tags from titles

def delete_tag(input_str):
    """Return *input_str* with Naver's <b>...</b> highlight markers removed."""
    return input_str.replace('<b>', '').replace('</b>', '')
# ํ•œ ํŽ˜์ด์ง€์˜ ๋‚ด์šฉ์„ pandas์—

import pandas as pd

def get_fields(json_data):
    """Flatten one API result page into a pandas DataFrame.

    Columns: title (highlight tags stripped), price (the API's
    'discount' field), publisher, isbn, link.
    """
    rows = [
        {
            'title': delete_tag(item['title']),
            'price': item['discount'],
            'publisher': item['publisher'],
            'isbn': item['isbn'],
            'link': item['link'],
        }
        for item in json_data['items']
    ]
    return pd.DataFrame(rows, columns=['title', 'price', 'publisher', 'isbn', 'link'])
    
# Quick end-to-end test: one page of results into a DataFrame

url = gen_search_url('book', 'ํŒŒ์ด์ฌ', 10, 3)
json_result = get_result_onpage(url)
pd_result = get_fields(json_result)
pd_result
# Collect 1,000 results through the API

result_book = []

# Page through the API: 'start' must advance by the page size (100).
# BUG FIX: the original stepped by 10 while displaying 100 per request,
# so every record was downloaded ~10 times (massive duplication).
# start = 1, 101, ..., 901 with display=100 yields exactly 1000 rows.
for n in range(1, 1000, 100):
    url = gen_search_url('book', 'ํŒŒ์ด์ฌ', n, 100)
    json_result = get_result_onpage(url)
    pd_result = get_fields(json_result)

    result_book.append(pd_result)

# Concatenate the per-page frames into one DataFrame.
result_book = pd.concat(result_book)
# ์ธ๋ฑ์Šค ์ •๋ฆฌ

result_book.reset_index(drop=True, inplace=True)
result_book.info()
# ๊ฐ€๊ฒฉ์˜ ๋ฐ์ดํ„ฐํ˜• ์ •๋ฆฌ

result_book['price'] = result_book['price'].astype('float')
result_book.info()
# ํ•œ ํŽ˜์ด์ง€์— ๋Œ€ํ•ด ์ผ๋‹จ ํ…Œ์ŠคํŠธ

from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://search.shopping.naver.com/book/catalog/32456895000'
page = urlopen(url)

soup = BeautifulSoup(page, 'html.parser')
soup
soup.find_all(class_ = 'bookBasicInfo_spec__qmQ_N')[0].text
# Regex attempt (kept for reference)

import re

tmp = soup.find_all(class_ = 'bookBasicInfo_spec__qmQ_N')[0].get_text()
result = tmp

# result = re.search('ํŽ˜์ด์ง€\s+\d+', tmp) 
# result.split()
# (commented out: the pattern no longer matches the current page markup)


result
# ํŽ˜์ด์ง€ ์ •๋ณด ์–ป๊ธฐ

import re
import numpy as np
import time

def get_page_num(soup):
    """Extract the spec text (page count) from a parsed book detail page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a Naver book detail page.

    Returns
    -------
    str or float
        The spec element's text, or numpy.nan when the page has no
        spec element.
    """
    #time.sleep(1)
    try:
        # BUG FIX: the lookup must be INSIDE the try. Pages without the
        # spec element make find_all() return [], and [0] then raises
        # IndexError — the exact "no page info" case this guard is for;
        # the original computed it before the try and crashed instead.
        tmp = soup.find_all('span', 'bookBasicInfo_spec__qmQ_N')[0].get_text()
        return tmp
    except (IndexError, AttributeError):
        # Narrowed from a bare `except:` so unrelated bugs still surface.
        print('==> Error in get_page_num!')
        return np.nan

# Run the extractor on the sample page.
get_page_num(soup)
# Missing page counts remain in the collected data

result_book.info()
# Normalize dtype: strip the trailing page-unit suffix and cast to float.
# NOTE(review): 'page_num' is never created anywhere in this excerpt — the
# scraping cell that first fills it appears to be missing from the post.
result_book['page_num'] = result_book['page_num'].str.replace('์ชฝ','')
result_book['page_num'] = result_book['page_num'].astype('float')
result_book.info()
# Retry rows with a missing page count by scraping each book's detail page

for idx, row in result_book.iterrows():
    if np.isnan(row['page_num']):
        print('start fix...')
        print(row['link'])
        # Fetch, parse, and extract the page-count text for this book.
        page_num = get_page_num(BeautifulSoup(urlopen(row['link']), 'html.parser'))

        result_book.loc[idx, 'page_num'] = page_num
        # Throttle requests to be polite to the server.
        time.sleep(0.5)
# Re-run the dtype cleanup: rows repaired above hold raw strings again.

try:
    result_book['page_num'] = result_book['page_num'].str.replace('์ชฝ','')
    result_book['page_num'] = result_book['page_num'].astype('float')
    result_book.info()
except (AttributeError, ValueError):
    # AttributeError: column is already numeric (no .str accessor);
    # ValueError: a value that cannot be cast to float slipped through.
    # Narrowed from a bare `except:` so unrelated bugs are not hidden.
    pass
# ํŽ˜์ด์ง€ ์ •๋ณด ์—†๋Š” ๊ฒƒ์€ ์ œ์™ธ

result_book = result_book[result_book['page_num'].notnull()]
result_book.info()
# Save the cleaned table to Excel with readable column widths.

# Use ExcelWriter as a context manager: the workbook is written and the
# file handle closed even on error. ExcelWriter.save() was deprecated in
# pandas 1.5 and removed in 2.0 in favor of close()/context-manager use.
with pd.ExcelWriter('./python_books.xlsx', engine='xlsxwriter') as writer:
    result_book.to_excel(writer, sheet_name='Sheet1')

    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    worksheet.set_column('A:A', 5)   # index
    worksheet.set_column('B:B', 60)  # title
    worksheet.set_column('C:C', 10)  # price
    worksheet.set_column('D:D', 15)  # publisher
    worksheet.set_column('E:E', 10)  # isbn
    worksheet.set_column('F:F', 50)  # link



ํšŒ๊ท€๋ถ„์„ ํ•ด๋ณด๊ธฐ

# ๋ฐ์ดํ„ฐ ๋‹ค์‹œ ์ฝ๊ธฐ

raw_data = pd.read_excel('./python_books.xlsx', index_col=0)
raw_data.head()
# Explore with seaborn:
# page count and price look positively related


import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_data,)
plt.show()
# Inspect the unusually expensive books

raw_data[raw_data['price']>140000]
# Title counts per publisher

raw_data['publisher'].value_counts()
# Number of distinct publishers

len(raw_data['publisher'].unique())
# Configure matplotlib to render Korean (Hangul) labels

import matplotlib.pyplot as plt
import seaborn as sns
import platform
import warnings

from matplotlib import font_manager, rc

warnings.filterwarnings(action= 'ignore') # suppress warning messages

# keep plots inline in the notebook
# matplotlib inline
get_ipython().run_line_magic('matplotlib', 'inline')

# Path to Malgun Gothic, the Korean font shipped with Windows.
path = 'C:/Windows/Fonts/malgun.ttf'

if platform.system() == 'Darwin': # macOS: use its bundled Unicode font
    rc('font', family = 'Arial Unicode MS')
elif platform.system() =='Windows' : # Windows: register Malgun Gothic
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family = font_name)
else:
    print('unknown system sorry')
# ์ถœํŒ์‚ฌ๋ณ„ ํŽธ์ค‘ ์กด์žฌ

plt.figure(figsize=(15, 6))
sns.countplot(x='publisher', data= raw_data, palette='RdYlGn',
              order=raw_data['publisher'].value_counts().index)
plt.xticks(rotation=90)
plt.show()
raw_1 = raw_data[raw_data['publisher']=='์—์ด์ฝ˜์ถœํŒ']

plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_1)
plt.show()
# ์ถœํŒ์‚ฌ๋ณ„

raw_2 = raw_data[raw_data['publisher']=='ํ•œ๋น›๋ฏธ๋””์–ด']

plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_2)
plt.show()
# ๊ฐ€๊ฒฉ์„ ๋ณด๋‹ค ์ž˜ ์˜ˆ์ธกํ•  ์ˆ˜ ์žˆ์„ ๊ฒƒ๊ฐ™๋‹ค

raw_3 = raw_data[raw_data['publisher']=='๋น„์ œ์ดํผ๋ธ”๋ฆญ']

plt.figure(figsize=(12, 8))
sns.regplot(x='page_num', y='price', data=raw_2)
plt.show()
# Same view for a fourth publisher
raw_4 = raw_data[raw_data['publisher']=='์œ„ํ‚ค๋ถ์Šค']

plt.figure(figsize=(12, 8))
# BUG FIX: the original plotted data=raw_2 again (copy-paste);
# plot the subset built on the line above.
sns.regplot(x='page_num', y='price', data=raw_4)
plt.show()
# Interactive box plot of prices (plotly)

import plotly.express as px

px.box(raw_data, y='price')
# Split the data for regression modelling

from sklearn.model_selection import train_test_split

X = raw_data['page_num'].values
y = raw_data['price'].values


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# sklearn expects a 2-D feature matrix of shape (n_samples, 1)
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
# ๋ชจ๋ธํ•™์Šต

from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
# Compute train/test RMSE

from sklearn.metrics import mean_squared_error

pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)

rmse_tr = (np.sqrt(mean_squared_error(y_train, pred_tr)))
rmse_test = (np.sqrt(mean_squared_error(y_test, pred_test)))

print('RMSE of Train Data : ', rmse_tr)
print('RMSE of Test Data : ', rmse_test)
# Train RMSE can exceed test RMSE when outliers land in the train split.
# Actual vs predicted

plt.scatter(y_test, pred_test)
plt.xlabel('Actual')
plt.ylabel('Predict')
plt.plot([0, 80000],[0, 80000], 'r')  # y = x reference line
plt.show()
# Now fit on a single publisher's books only

X = raw_1['page_num'].values
y = raw_1['price'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=13)

# Reshape to the (n_samples, 1) matrix sklearn expects.
X_train = X_train.reshape(-1, 1)
X_test = X_test.reshape(-1, 1)
# Refit the existing LinearRegression on the publisher subset.
reg.fit(X_train, y_train)
# ์ถœํŒ์‚ฌ๋ณ„ ์˜ˆ์ธก์ด ๋งž์„ ๊ฒƒ๊ฐ™๋‹ค

pred_tr = reg.predict(X_train)
pred_test = reg.predict(X_test)

rmse_tr = (np.sqrt(mean_squared_error(y_train, pred_tr)))
rmse_test = (np.sqrt(mean_squared_error(y_test, pred_test)))

print('RMSE of Train Data : ', rmse_tr)
print('RMSE of Teset Data : ', rmse_test)
# Not segmented by topic, but the per-publisher model looks meaningful

plt.scatter(y_test, pred_test)
plt.xlabel('Actual')
plt.ylabel('Predict')
plt.plot([0, 120000],[0, 120000], 'r')  # y = x reference line
plt.show()

๐Ÿ’ป ์ถœ์ฒ˜ : ์ œ๋กœ๋ฒ ์ด์Šค ๋ฐ์ดํ„ฐ ์ทจ์—… ์Šค์ฟจ

profile
#๋ฐ์ดํ„ฐ๋ถ„์„ #ํผํฌ๋จผ์Šค๋งˆ์ผ€ํŒ… #๋ฐ์ดํ„ฐ #๋””์ง€ํ„ธ๋งˆ์ผ€ํŒ…

0๊ฐœ์˜ ๋Œ“๊ธ€