BeautifulSoup Basics

svenskpotatis · September 8, 2023

📌 BeautifulSoup Basics

  • install
    - conda install -c anaconda beautifulsoup4
    - pip install beautifulsoup4
  • import
# import
from bs4 import BeautifulSoup
page = open('path/to/page.html', 'r').read()  # read a local HTML file
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())

# inspect the head tag
soup.head

# inspect the body tag
soup.body

# inspect p tags
# returns only the first p tag found
# find()
soup.p
soup.find('p')
  • soup.find
soup.find('p', class_='inner-text first-item')
soup.find('p', {'class':'outer-text first-item'}).text.strip()
# multiple conditions
soup.find('p', {'class':'inner-text first-item', 'id':'first'})
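To try these calls without a local file, here is a minimal, self-contained sketch; the HTML sample is assumed, written only to match the class and id names used above:

from bs4 import BeautifulSoup

html = '''
<html><body>
  <p class="inner-text first-item" id="first">first inner paragraph</p>
  <p class="inner-text second-item">second inner paragraph</p>
  <p class="outer-text first-item">outer paragraph</p>
</body></html>
'''
soup = BeautifulSoup(html, 'html.parser')
soup.find('p', class_='inner-text first-item').text                # 'first inner paragraph'
soup.find('p', {'class': 'inner-text first-item', 'id': 'first'})  # multiple conditions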
  • soup.find_all
# find_all(): returns every matching tag
# returned as a list

soup.find_all('p')
soup.find_all('p', class_='inner-text second-item')
  • Data extraction examples
# print only the text attribute of each p tag in the list

for each_tag in soup.find_all('p'):
    print('='*50)
    print(each_tag.text)
# extract the value of the href attribute from a tags
links = soup.find_all('a')
links[0].get('href'), links[1]['href']
for each in links:
    href = each.get('href')  # same as each['href']
    text = each.get_text()
    print(text + '=>' + href)
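Not every a tag carries an href attribute; .get('href') returns None instead of raising a KeyError, so a small guard keeps the loop safe (a sketch):

for each in links:
    href = each.get('href')
    if href:  # skip anchors without an href
        print(each.get_text(strip=True), '=>', href)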

📌 BeautifulSoup Example 1 - Naver Finance

  • !pip install requests
  • find, find_all
  • select, select_one
  • find, select_one: select a single element
  • select, find_all: select multiple elements (see the comparison sketch after the import code below)
  • import
# import 
from urllib.request import urlopen
from bs4 import BeautifulSoup

import requests
url = "https://finance.naver.com/marketindex/"
# the response object is also commonly named page or res
response = urlopen(url)
# print(response.status)
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())
# the same page fetched via requests instead of urllib
url = 'https://finance.naver.com/marketindex/'
response = requests.get(url)  # requests.get(), requests.post()
soup = BeautifulSoup(response.text, 'html.parser')  # response.text holds the HTML
print(soup.prettify())
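Both APIs reach the same elements; a quick side-by-side sketch using the soup just parsed:

# find/find_all take tag names and attributes; select/select_one take CSS selectors
soup.find_all('li', 'on')          # class passed positionally, returns a list
soup.select('#exchangeList > li')  # CSS selector, returns a list
soup.find(class_='value')          # first element with class "value"
soup.select_one('.value')          # the same element via CSS syntax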
  1. Extraction
# soup.find_all('li', 'on')
# id => #
# class => .
exchangeList = soup.select('#exchangeList > li')  # an id occurs only once per page
len(exchangeList), exchangeList
title = exchangeList[0].select_one('.h_lst').text
exchange = exchangeList[0].select_one('.value').text
change = exchangeList[0].select_one('.change').text
updown = exchangeList[0].select_one('div.head_info.point_dn > .blind').text  # >: direct child
# link

title, exchange, change, updown
  • id: #
  • class: .
  • >: direct child
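select and select_one also work on a single element, not just the whole soup, which is what the indexing above relies on:

# narrow down to one <li> first, then search only inside it
first = soup.select_one('#exchangeList > li')
first.select_one('.h_lst').text  # same result as exchangeList[0].select_one('.h_lst').text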
  2. Data collection and saving
# collect the 4 rows of data

exchange_datas = []
baseUrl = 'http://finance.naver.com'

for item in exchangeList:
    data = {
        'title': item.select_one('.h_lst').text,
        'exchange': item.select_one('.value').text,
        'change': item.select_one('.change').text,
        'updown': item.select_one('.head_info.point_dn > .blind').text,
        'link': baseUrl + item.select_one('a').get('href')
    }
    exchange_datas.append(data)
import pandas as pd

df = pd.DataFrame(exchange_datas)
df.to_excel('./naverfinance.xlsx')
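Note: to_excel writes .xlsx files through an engine such as openpyxl, so it must be installed (pip install openpyxl). A quick read-back check:

pd.read_excel('./naverfinance.xlsx')  # verify the file was written correctly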

📌 BeautifulSoup Example 2 - Wikipedia

import urllib
from urllib.request import urlopen, Request

html = 'https://ko.wikipedia.org/wiki/{search_words}'
# https://ko.wikipedia.org/wiki/여명의_눈동자
req = Request(html.format(search_words=urllib.parse.quote('여명의_눈동자')))  # percent-encode the Korean title for the URL
response = urlopen(req)
response.status
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())
  • URL decoder site (urllib.parse can do the same in code; see below)
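Instead of an external decoder site, urllib.parse can encode and decode these strings directly:

import urllib.parse

encoded = urllib.parse.quote('여명의_눈동자')
print(encoded)                        # %EC%97%AC%EB%AA%85%EC%9D%98_%EB%88%88%EB%8F%99%EC%9E%90
print(urllib.parse.unquote(encoded))  # 여명의_눈동자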

📌 Chicago Restaurant Data Analysis

# !pip install fake-useragent
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup

url_base = 'https://www.chicagomag.com/'
url_sub = 'chicago-magazine/november-2012/best-sandwiches-chicago/'
url = url_base + url_sub
# ua = UserAgent()
# ua.ie

req = Request(url, headers={'User-Agent': 'Chrome'})   # a User-Agent header avoids 403 Forbidden
html = urlopen(req)
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())
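The commented-out lines hint at the alternative: fake-useragent generates a realistic browser string instead of the hand-written 'Chrome' value:

from fake_useragent import UserAgent

ua = UserAgent()
req = Request(url, headers={'User-Agent': ua.ie})  # or ua.chrome, ua.random
html = urlopen(req)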
  • Resolving 403 Forbidden
  • re
import re

# tmp_one: one entry from the sandwich list (assumed, e.g. soup.find_all('div', 'sammy')[0])
tmp_string = tmp_one.find(class_='sammyListing').get_text()
re.split('\n|\r\n', tmp_string)  # split the listing text on either newline style

>>>
['BLT', 'Old Oak Tap', 'Read more ']
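The three pieces are the menu name, the cafe name, and the trailing 'Read more' text, so the first two can be unpacked directly (assuming the corrected split above):

menu, cafe = re.split('\n|\r\n', tmp_string)[:2]
print(menu, '/', cafe)  # BLT / Old Oak Tap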
