230517_Python_EDA_Web_Crawling

김지태 · May 22, 2023

03. Web Data

1. BeautifulSoup for web data


BeautifulSoup Basic

    - conda install -c anaconda beautifulsoup4
    - pip install beautifulsoup4
  • data
    • 03.test_first.html
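The queries in this section run against a small test HTML file whose contents are not reproduced in these notes. Purely as a hypothetical stand-in (the class and id names mirror the ones queried below), the file can be imagined roughly like this:

from bs4 import BeautifulSoup

# hypothetical stand-in for the test HTML file; class/id names follow the queries used below
sample_page = """
<html>
 <body>
  <div>
   <p class="inner-text first-item" id="first">first inner text</p>
   <p class="inner-text second-item">second inner text</p>
  </div>
  <p class="outer-text first-item" id="second">outer text</p>
  <a href="http://example.com" id="pw-link">example link</a>
 </body>
</html>
"""
sample_soup = BeautifulSoup(sample_page, "html.parser")
sample_soup.find("p", class_="inner-text second-item").text  # 'second inner text'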

import

from bs4 import BeautifulSoup
page = open("../data/03. zerobase.html", "r").read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())

Check the head tag

soup.head

Check the body tag

soup.body

Check the p tags

Prints only the first p tag found

find()

soup.p
soup.find("p")

Python reserved words

class, id, def, list, str, int, tuple... — class and def are Python keywords, and id, list, str, int, tuple are built-in names, so BeautifulSoup takes class_= (or an attrs dict) instead of class=.

soup.find("p", class_="innter-text second-item")
soup.find("p", {"class":"outer-text first-item"}).text.strip()

Multiple conditions

soup.find("p", {"class":"inner-text first-item", "id":"first"})

find_all(): returns multiple tags

Returned as a list

soup.find_all("p")

Check a specific tag

soup.findall(id="pw-link")[0].text
soup.find_all("p", class
="innter-text second-item")
len(soup.find_all("p"))
print(soup.find_all("p")[0].text)
print(soup.find_all("p")[1].string)
print(soup.find_all("p")[1].get_text())

Print only the text attribute from the list of p tags

for each_tag in soup.find_all("p"):
print("=" * 50)
print(each_tag.text)

Extract the value of the href attribute from the a tags

links = soup.find_all("a")
links[0].get("href"), links[1]["href"]
for each in links:
href = each.get("href") # each["href"]
text = each.get_text()
print(text + "=>" + href)

BeautifulSoup Example 1-1 - Naver Finance

import

from urllib.request import urlopen
from bs4 import BeautifulSoup
url = "https://finance.naver.com/marketindex/"

page = urlopen(url)

response = urlopen(url)
response
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())

Method 1

soup.find_all("span", "value"), len(soup.find_all("span", "value"))

Method 2

soup.findall("span", class="value"), len(soup.find_all("span", "value"))

Method 3

soup.find_all("span", {"class":"value"}), len(soup.find_all("span", {"class":"value"}))
soup.find_all("span", {"class":"value"})[0].text, soup.find_all("span", {"class":"value"})[0].string, soup.find_all("span", {"class":"value"})[0].get_text()

BeautifulSoup Example 1-2 - Naver Finance

  • !pip install requests
  • find, find_all
  • select, select_one
  • find, select_one : select a single element
  • select, find_all : select multiple elements (see the small comparison sketch right below)
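A minimal, self-contained sketch of that difference (the inline HTML string is hypothetical, not the Naver page):

from bs4 import BeautifulSoup

sample = '<div id="box"><p class="value">100</p><p class="value">200</p></div>'
s = BeautifulSoup(sample, "html.parser")
s.find("p", class_="value").text            # first match only; s.select_one("p.value").text gives the same
[t.text for t in s.find_all("p", "value")]  # every match; s.select("#box > p.value") returns the same tags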
import requests  # requests replaces urllib's urlopen/Request for this example

from bs4 import BeautifulSoup
url = "https://finance.naver.com/marketindex/"
response = requests.get(url)

requests.get(), requests.post()
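Both calls have the same shape; a small sketch against httpbin.org (a public echo service, used here only as a stand-in endpoint, not part of the original notes):

import requests

r_get = requests.get("https://httpbin.org/get", params={"q": "test"})      # parameters end up in the query string
r_post = requests.post("https://httpbin.org/post", data={"key": "value"})  # form data goes in the request body
r_get.status_code, r_post.status_code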

response.text

soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())

soup.find_all("li", "on")

id => #

class => .

exchangeList = soup.select("#exchangeList > li")
len(exchangeList), exchangeList
title = exchangeList[0].select_one(".h_lst").text
exchange = exchangeList[0].select_one(".value").text
change = exchangeList[0].select_one(".change").text
updown = exchangeList[0].select_one(".head_info.point_dn > .blind").text

link

title, exchange, change, updown

findmethod = soup.find_all("ul", id="exchangeList")

findmethod[0].find_all("span", "value")

baseUrl = "https://finance.naver.com"
baseUrl + exchangeList[0].select_one("a").get("href")

Collect all 4 data entries

import pandas as pd

exchange_datas = []
baseUrl = "https://finance.naver.com"

for item in exchangeList:
    data = {
        "title": item.select_one(".h_lst").text,
        "exchange": item.select_one(".value").text,
        "change": item.select_one(".change").text,
        "updown": item.select_one(".head_info.point_dn > .blind").text,
        "link": baseUrl + item.select_one("a").get("href"),
    }
    exchange_datas.append(data)

df = pd.DataFrame(exchange_datas)
df.to_excel("./naverfinance.xlsx", encoding="utf-8")  # recent pandas removed the encoding argument; drop it if this raises a TypeError

BeautifulSoup Example 2 - Fetching information from a Wikipedia article

import urllib
from urllib.request import urlopen, Request

html = "https://ko.wikipedia.org/wiki/{search_words}"

https://ko.wikipedia.org/wiki/여명의_눈동자

req = Request(html.format(search_words=urllib.parse.quote("여명의_눈동자"))) # URL-encode the Korean title
response = urlopen(req)
soup = BeautifulSoup(response, "html.parser")
print(soup.prettify())
n = 0
for each in soup.find_all("ul"):
    print("=>" + str(n) + "========================")
    print(each.get_text())
    n += 1
soup.find_all("ul")[15].text.strip().replace("\xa0", "").replace("\n", "")

Python List data type

  • A list is created with square brackets
    colors = ["red", "blue", "green"]

colors[0], colors[1], colors[2]
b = colors         # b and colors refer to the same list object
b
b[1] = "black"
b
colors             # changed too, since b is just another name for the same list
c = colors.copy()  # an independent copy
c
c[1] = "yellow"
c
colors             # unchanged

  • Use a list in a for loop
    for color in colors:
        print(color)
  • Use the in operator in an if condition
    if "white" in colors:
        print("True")
    movies = ["라라랜드", "먼 훗날 우리", "어벤저스", "다크나이트"]
    print(movies)
  • append: add an item at the end of the list
    movies.append("타이타닉")
    movies
  • pop: remove items one at a time from the end of the list
    movies.pop()
    movies
  • extend: add several items at the end
    movies.extend(["위대한쇼맨", "인셉션", "터미네이터"])
    movies
  • remove: delete a given item
    movies.remove("어벤저스")
    movies
  • Slicing: [n:m] takes items from index n up to m-1
    movies[3:5]
    favorite_movies = movies[3:5]
    favorite_movies
  • insert: insert an item at the desired position
    favorite_movies.insert(1, 9.60)
    favorite_movies
    favorite_movies.insert(3, 9.50)
    favorite_movies
  • A list inside a list
    favorite_movies.insert(5, ["레오나르도 디카프리오", "조용하"])
    favorite_movies
  • isinstance: checks whether a value is of a given type (True/False)
    isinstance(favorite_movies, list)
    favorite_movies
    for each_item in favorite_movies:
        if isinstance(each_item, list):
            for nested_item in each_item:
                print("nested_item", nested_item)
        else:
            print("each_item", each_item)

2. Chicago Restaurant Data Analysis - Overview

Final goal
Collect each restaurant's information from a total of 51 pages
- restaurant name
- signature menu item
- price of the signature menu item
- restaurant address

3. Chicago Restaurant Data Analysis - Main Page

!pip install fake-useragent

from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.ie})  # browser-like User-Agent header so the request is not blocked
html = urlopen(req)
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())

soup.find_all("div", "sammy"), len(soup.find_all("div", "sammy"))

soup.select(".sammy"), len(soup.select(".sammy"))

tmp_one = soup.find_all("div", "sammy")[0]
type(tmp_one)
tmp_one.find(class_="sammyRank").get_text()

tmp_one.select_one(".sammyRank").text

tmp_one
tmp_one.find("div", {"class":"sammyListing"}).get_text()

tmp_one.select_one(".sammyListing").text

tmp_one.find("a")["href"]

tmp_one.select_one("a").get("href")

import re

tmp_string = tmp_one.find(class_="sammyListing").get_text()
re.split(("\n|\r\n"), tmp_string)
print(re.split(("\n|\r\n"), tmp_string)[0]) # menu
print(re.split(("\n|\r\n"), tmp_string)[1]) # cafe
from urllib.parse import urljoin

url_base = "http://www.chicagomag.com"

Empty lists to hold the required data

Build each column as a list, then combine them into a DataFrame

rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")  # soup.select(".sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())
    tmp_string = item.find(class_="sammyListing").get_text()
    main_menu.append(re.split(("\n|\r\n"), tmp_string)[0])
    cafe_name.append(re.split(("\n|\r\n"), tmp_string)[1])
    url_add.append(urljoin(url_base, item.find("a")["href"]))
len(rank), len(main_menu), len(cafe_name), len(url_add)
rank[:5]
main_menu[:5]
cafe_name[:5]
url_add[:5]
import pandas as pd

data = {
    "Rank": rank,
    "Menu": main_menu,
    "Cafe": cafe_name,
    "URL": url_add,
}

df = pd.DataFrame(data)
df.tail(2)

Change the column order

df = pd.DataFrame(data, columns=["Rank", "Cafe", "Menu", "URL"])
df.tail()

Save the data

df.to_csv(
    "../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8"
)

4. Chicago Restaurant Data Analysis - Sub-pages

requirements

import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

ua = UserAgent()  # needed again here for the request headers below
df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", index_col=0)
df.tail()
df["URL"][0]
req = Request(df["URL"][0], headers={"user-agent": ua.ie})
html = urlopen(req).read()
soup_tmp = BeautifulSoup(html, "html.parser")
soup_tmp.find("p", "addy")  # soup_tmp.select_one(".addy")

regular expression

price_tmp = soup_tmp.find("p", "addy").text
price_tmp
import re
re.split(".,", price_tmp)
price_tmp = re.split(".,", price_tmp)[0]
price_tmp
tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()  # grab the leading price, e.g. "$10."
price_tmp[len(tmp) + 2:]  # skip the price (plus surrounding whitespace) to keep the address
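As a quick check of what the pattern picks out, here are the same two steps on a hypothetical string shaped like price_tmp (the +2 above suggests a leading newline plus the space after the price):

import re

sample = "\n$10. 2109 W. Chicago Ave"              # hypothetical value in the shape of price_tmp
tmp = re.search(r"\$\d+\.(\d+)?", sample).group()  # '$10.'
sample[len(tmp) + 2:]                              # '2109 W. Chicago Ave'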

from tqdm import tqdm

price = []
address = []

for idx, row in tqdm(df.iterrows()):  # use df[:5].iterrows() for a quick test on the first 5 rows
    req = Request(row["URL"], headers={"user-agent": ua.ie})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])
    print(idx)
len(price), len(address)
price[:5]
address[:5]
df.tail(2)
df["Price"] = price
df["Address"] = address
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()
df.to_csv(
"../data/03. best_sandwiches_list_chicago2.csv", sep=",", encoding="UTF-8"
)
pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)

5. Chicago Restaurant Data Map Visualization

requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm
df = pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)
df.tail(10)
gmaps_key = "your Google Maps API key"  # paste your own key here
gmaps = googlemaps.Client(key=gmaps_key)
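gmaps.geocode() returns a list of candidate matches, and each candidate's "geometry" → "location" dict holds the coordinates that the loop below pulls out. A quick sanity check with a hypothetical address:

gmaps.geocode("2109 W. Chicago Ave., Chicago")[0]["geometry"]["location"]  # hypothetical address; returns e.g. {'lat': ..., 'lng': ...}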
lat = []
lng = []

for idx, row in tqdm(df.iterrows()):
    if not row["Address"] == "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        # print(target_name)
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])
        # location_output = gmaps_output[0]
    else:
        lat.append(np.nan)
        lng.append(np.nan)

len(lat), len(lng)
df.tail()

Add latitude and longitude values to the DataFrame

df["lat"] = lat
df["lng"] = lng
df.tail()
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)

for idx, row in df.iterrows():
    if not row["Address"] == "Multiple location":
        folium.Marker(
            location=[row["lat"], row["lng"]],
            popup=row["Cafe"],
            tooltip=row["Menu"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa"
            )
        ).add_to(mapping)

mapping
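The notebook renders the map inline; to keep it as a file, the folium map can also be saved as standalone HTML (the output path is an assumption):

mapping.save("../data/03. best_sandwiches_map.html")  # hypothetical output path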
