[Zerobase Data Job School, Cohort 15] Week 5 (EDA: 3. Web Data)

김지환 · June 10, 2023

Week 5: 5/29/2023 - 6/4/2023


BeautifulSoup for web data


Basics

  • install (a quick sanity check is sketched below)
    - conda install -c anaconda beautifulsoup4
    - pip install beautifulsoup4
  • data
    - 03.test_first.html
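A quick way to confirm the install, without the course data file, is to parse an inline HTML string (a minimal sketch; the snippet and variable names here are made up for illustration):

from bs4 import BeautifulSoup

html_doc = "<html><body><p class='greet'>Hello, BeautifulSoup!</p></body></html>"
check = BeautifulSoup(html_doc, "html.parser")
print(check.find("p", class_="greet").text)  # Hello, BeautifulSoup!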
# import
from bs4 import BeautifulSoup

# Read the local HTML file from the course data and parse it
with open("../data/03. zerobase.html", "r") as f:
    page = f.read()
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())

# Read head tag
soup.head

# Read body tag
soup.body

# Read p tag
# Print the first p tag
soup.p

soup.find("p")

soup.find("p", class_="inner-text second-item")

soup.find("p", {"class":"outer-text first-item"}).text.strip()

# Multiple conditions
soup.find("p", {"class":"inner-text first-item", "id":"first"})

# find_all(): Find all
# Return multiple tags in a list
soup.find_all("p")

# Filter by a specific class
soup.find_all(class_="outer-text")

soup.find_all(id="pw-link")[0].text

soup.find_all("p", class_="inner-text second-item")

print(soup.find_all("p")[0].text)
print(soup.find_all("p")[1].get_text())

# Print only the text property in the p tag list
for each_tag in soup.find_all("p"):
    print("=" * 50)
    print(each_tag.text)
    
# Extract href values from a tag
links = soup.find_all("a")
links[0].get("href"), links[1]["href"]

for each in links:
    href = each.get("href")  # or: each["href"]
    text = each.get_text()
    print(text + " => " + href)

Example 1-1. Naver Finance

# import 
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = "https://finance.naver.com/marketindex/"
page = urlopen(url)  # the HTTP response object (often also named response or res)
soup = BeautifulSoup(page, "html.parser")
print(soup.prettify())

response = urlopen(url)
response.status  # HTTP status code; 200 means the request succeeded

# 1
soup.find_all("span", "value")

# 2
soup.find_all("span", class_="value"), len(soup.find_all("span", "value"))

# 3
soup.find_all("span", {"class": "value"}), len(soup.find_all("span", {"class": "value"})) 

soup.find_all("span", {"class": "value"})[0].text, soup.find_all("span", {"class": "value"})[0].string, soup.find_all("span", {"class": "value"})[0].get_text()

Example 1-2. Naver Finance

  • !pip install requests
  • find, select_one: select a single tag
  • find_all, select: select multiple tags (a quick comparison is sketched below)
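A quick side-by-side of the two APIs, sketched on a made-up HTML snippet (the snippet and names below are illustrative, not from Naver Finance):

from bs4 import BeautifulSoup

snippet = '<ul id="menu"><li class="item">A</li><li class="item">B</li></ul>'
demo = BeautifulSoup(snippet, "html.parser")

# single tag: find() and select_one() pick the same element here
demo.find("li", class_="item").text         # 'A'
demo.select_one("#menu > li.item").text     # 'A'

# multiple tags: find_all() and select() both return lists
[li.text for li in demo.find_all("li", "item")]     # ['A', 'B']
[li.text for li in demo.select("#menu > li.item")]  # ['A', 'B']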
import requests
# from urllib.request import Request  (an alternative to the requests library)
from bs4 import BeautifulSoup
import pandas as pd

url = "https://finance.naver.com/marketindex/"
response = requests.get(url)
# response.text holds the raw HTML
soup = BeautifulSoup(response.text, "html.parser")
print(soup.prettify())
# soup.find_all("li", "on")
# CSS selector syntax: id -> #, class -> ., > -> direct child
exchangeList = soup.select("#exchangeList > li")
len(exchangeList), exchangeList
title = exchangeList[0].select_one(".h_lst").text
exchange = exchangeList[0].select_one(".value").text
change = exchangeList[0].select_one(".change").text
updown = exchangeList[0].select_one(".head_info.point_dn > .blind").text  # note: .point_dn only matches when the rate is down

title, exchange, change, updown
findmethod = soup.find_all("ul", id="exchangeList")
findmethod[0].find_all("span", "value"), findmethod[0].find_all("span", "blind")

baseUrl = "https://finance.naver.com"
baseUrl + exchangeList[0].select_one("a").get("href")
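The same result can also be built with urljoin from urllib.parse (used again later in the Chicago example) instead of manual string concatenation:

from urllib.parse import urljoin
urljoin(baseUrl, exchangeList[0].select_one("a").get("href"))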
# Collect title, exchange, change, updown, and the detail link for every currency on Naver Finance

exchange_data = []
baseUrl = "https://finance.naver.com"

for item in exchangeList:
    data = {
        "title": item.select_one(".h_lst").text,
        "exchange": item.select_one(".value").text,
        "change": item.select_one(".change").text,
        "updown": item.select_one(".head_info.point_dn > .blind").text,
        "link": baseUrl + item.select_one("a").get("href")
    }
    exchange_data.append(data)
exchange_data

df = pd.DataFrame(exchange_data)
df.to_excel("./naverfinance.xlsx")  # recent pandas versions no longer accept an encoding argument for to_excel
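To double-check the export, the file can be read back with pandas (requires an Excel engine such as openpyxl):

pd.read_excel("./naverfinance.xlsx", index_col=0).head()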

Example 2. Scraping from Wikipedia

import urllib
from urllib.request import urlopen, Request
from bs4 import BeautifulSoup

html = "https://ko.wikipedia.org/wiki/{search_words}"
# https://ko.wikipedia.org/wiki/여명의_눈동자

req = Request(html.format(search_words=urllib.parse.quote("여명의_눈동자")))  # Encode the characters as URL
response = urlopen(req)
soup = BeautifulSoup(response, "html.parser")
print(soup.prettify())
n = 0

for each in soup.find_all("ul"):
    print("=>" + str(n) + "=================================")
    print(each.get_text())
    n += 1
soup.find_all("ul")[32].text.strip().replace("\xa0", "").replace("\n", "")

Chicago sandwich data analysis


Main page

# !pip install fake-useragent

from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url_base = "https://www.chicagomag.com/"
url_sub = "chicago-magazine/november-2012/best-sandwiches-chicago/"
url = url_base + url_sub

ua = UserAgent()
req = Request(url, headers={"User-Agent": ua.ie})
html = urlopen(req)
soup = BeautifulSoup(html, "html.parser")
print(soup.prettify())
soup.find_all("div", "sammy"), len(soup.find_all("div", "sammy"))
# soup.select(".sammy"), len(soup.select(".sammy"))

tmp_one = soup.find_all("div", "sammy")[0]
type(tmp_one)

tmp_one.find(class_="sammyRank").get_text()
# tmp_one.select_one(".sammyRank").text

tmp_one.find("div", {"class": "sammyListing"}).get_text()
# tmp_one.select_one(".sammyListing").text

tmp_one.find("a")["href"]
# tmp_one.select_one("a").get("href")

import re

tmp_string = tmp_one.find(class_="sammyListing").get_text()
re.split("\n|\r\n", tmp_string)

print(re.split("\n|\r\n", tmp_string)[0])  # menu
print(re.split("\n|\r\n", tmp_string)[1])  # cafe name
from urllib.parse import urljoin

url_base = "http://www.chicagomag.com"

# Empty lists
# Create one list for each column and combine in a DataFrame
rank = []
main_menu = []
cafe_name = []
url_add = []

list_soup = soup.find_all("div", "sammy")  # soup.select(".sammy")

for item in list_soup:
    rank.append(item.find(class_="sammyRank").get_text())
    tmp_string = item.find(class_="sammyListing").get_text()
    main_menu.append(re.split("\n|\r\n", tmp_string)[0])
    cafe_name.append(re.split("\n|\r\n", tmp_string)[1])
    url_add.append(urljoin(url_base, item.find("a")["href"]))

len(rank), len(main_menu), len(cafe_name), len(url_add)
import pandas as pd

data = {
    "Rank": rank,
    "Menu": main_menu,
    "Cafe": cafe_name,
    "URL": url_add
}

df = pd.DataFrame(data)
df.tail(2)
# Change column order
df = pd.DataFrame(data, columns=["Rank", "Cafe", "Menu", "URL"])
df.tail(5)
# Save data
df.to_csv(
    "../data/03. best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8"
)

Subpages

# requirements
import pandas as pd
from urllib.request import urlopen, Request
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

df = pd.read_csv("../data/03. best_sandwiches_list_chicago.csv", index_col=0)
df.tail()
ua = UserAgent()
req = Request(df["URL"][0], headers={"User-Agent": ua.ie})
html = urlopen(req).read()
soup_tmp = BeautifulSoup(html, "html.parser")
soup_tmp.find("p", "addy")  # soup_tmp.select_one(".addy")

# regular expression
price_tmp = soup_tmp.find("p", "addy").text
price_tmp

import re
re.split(".,", price_tmp)  # splits on any character followed by a comma

price_tmp = re.split(".,", price_tmp)[0]
price_tmp

tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()  # the leading price, e.g. "$10."
price_tmp[len(tmp) + 2:]  # the remainder is treated as the address
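A slightly more defensive split, sketched under the assumption that the ".addy" text starts with a price such as "$10." followed by the street address (as in the output above):

m = re.search(r"\$\d+(\.\d+)?", price_tmp)
price_val = m.group()                        # e.g. "$10"
addr_val = price_tmp[m.end():].lstrip(". ")  # everything after the price
price_val, addr_val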
price = []
address = []

for n in df.index[:3]:
    req = Request(df["URL"][n], headers={"User-Agent": ua.ie})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")

    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])
    print(n)
price = []
address = []

for idx, row in df[:3].iterrows():
    req = Request(row["URL"], headers={"User-Agent": ua.ie})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")

    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])
    print(idx)
from tqdm import tqdm

price = []
address = []

for idx, row in tqdm(df.iterrows()):
    req = Request(row["URL"], headers={"user-agent": "Chrome"})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search(r"\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp) + 2:])
    print(idx)
df["Price"] = price
df["Address"] = address
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()
# Saving data
df.to_csv(
    "../data/03. best_sandwiches_list_chicago2.csv", sep=",", encoding="utf-8"
)

Map visualization

# requirements

import folium
import pandas as pd
import numpy as np
import googlemaps
from tqdm import tqdm

df = pd.read_csv("../data/03. best_sandwiches_list_chicago2.csv", index_col=0)
df.tail(10)
gmaps_key = "Insert Key"  # paste your own Google Maps API key here
gmaps = googlemaps.Client(key=gmaps_key)

lat = []
lng = []

for idx, row in tqdm(df.iterrows()):
    if not row["Address"] == "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        location_output = gmaps_output[0].get("geometry")
        lat.append(location_output["location"]["lat"])
        lng.append(location_output["location"]["lng"])
    else:
        lat.append(np.nan)
        lng.append(np.nan)

df["lat"] = lat
df["lng"] = lng
df.tail()
mapping = folium.Map(location=[41.8781, -87.6298], zoom_start=11)  # centered on Chicago

for idx, row in df.iterrows():
    if not row["Address"] == "Multiple location":
        folium.Marker(
            location=[row["lat"], row["lng"]],
            popup=row["Cafe"],
            tooltip=row["Menu"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa"
            )
        ).add_to(mapping)

mapping
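To keep the result outside the notebook, the folium map can also be written to a standalone HTML file (the file name is just an example):

mapping.save("../data/03. best_sandwiches_map.html")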