📌 BeautifulSoup Basic
- conda install -c anaconda beautifulsoup4
- pip install beautifulsoup4
from bs4 import BeautifulSoup
page = open('path/to/sample.html', 'r').read()  # read a local HTML file (the path is a placeholder)
soup = BeautifulSoup(page, 'html.parser')
print(soup.prettify())
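- The selectors below assume a small sample page with nested p tags; a hypothetical stand-in consistent with the class names used below (the actual file contents are not shown in these notes):
page = '''
<html><head><title>Sample</title></head>
<body>
  <div>
    <p class="inner-text first-item" id="first">Happy Data Science.</p>
    <p class="inner-text second-item">Happy Coding.</p>
  </div>
  <p class="outer-text first-item" id="second"><b>Data Science is fun.</b></p>
</body></html>
'''
soup = BeautifulSoup(page, 'html.parser')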
soup.head
soup.body
soup.p
soup.find('p')
soup.find('p', class_='inner-text first-item')
soup.find('p', {'class':'outer-text first-item'}).text.strip()
soup.find('p', {'class':'inner-text first-item', 'id':'first'})
soup.find_all('p')
soup.find_all('p', class_='inner-text second-item')
for each_tag in soup.find_all('p'):
    print('='*50)
    print(each_tag.text)
links = soup.find_all('a')
links[0].get('href'), links[1]['href']
for each in links:
    href = each.get('href')
    text = each.get_text()
    print(text + ' => ' + href)
📌 BeautifulSoup Example 1 - Naver Finance
!pip install requests
- find, select_one : select a single element
- find_all, select : select multiple elements (see the equivalence sketch below)
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests
url = "https://finance.naver.com/marketindex/"
response = urlopen(url)  # urllib: BeautifulSoup accepts the response object directly
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())
url = 'https://finance.naver.com/marketindex/'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')  # requests: parse the decoded body via .text
print(soup.prettify())
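- Before parsing, it can help to confirm the fetch succeeded; a minimal check with the requests API:
response.status_code           # 200 on success
response.raise_for_status()    # raises HTTPError for 4xx/5xx responses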
- Extraction
exchangeList = soup.select('#exchangeList > li')
len(exchangeList), exchangeList
title = exchangeList[0].select_one('.h_lst').text
exchange = exchangeList[0].select_one('.value').text
change = exchangeList[0].select_one('.change').text
updown = exchangeList[0].select_one('div.head_info.point_dn>.blind').text
title, exchange, change, updown
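- Note: the point_dn class appears only when the rate fell (point_up when it rose), so the selector above returns None for rising currencies. A more robust selector drops the direction class (an assumption about the page markup):
updown = exchangeList[0].select_one('.head_info > .blind').text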
- Data collection and saving
import pandas as pd

exchange_datas = []
baseUrl = 'http://finance.naver.com'
for item in exchangeList:
    data = {
        'title': item.select_one('.h_lst').text,
        'exchange': item.select_one('.value').text,
        'change': item.select_one('.change').text,
        'updown': item.select_one('.head_info.point_dn > .blind').text,
        'link': baseUrl + item.select_one('a').get('href')
    }
    exchange_datas.append(data)
df = pd.DataFrame(exchange_datas)
df.to_excel('./naverfinance.xlsx')  # requires the openpyxl package
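- Design note: the concatenation above assumes root-relative hrefs; urllib.parse.urljoin handles relative links more robustly:
from urllib.parse import urljoin
link = urljoin(baseUrl, item.select_one('a').get('href'))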
📌 BeautifulSoup Example 2 - Wikipedia
import urllib
from urllib.request import urlopen, Request
url_template = 'https://ko.wikipedia.org/wiki/{search_words}'
req = Request(url_template.format(search_words=urllib.parse.quote('여명의_눈동자')))  # percent-encode the Korean title
response = urlopen(req)
response.status
soup = BeautifulSoup(response, 'html.parser')
print(soup.prettify())
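- A quick sanity check that the right article came back; firstHeading is the standard MediaWiki id for the page title:
soup.find('h1', id='firstHeading').get_text()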
📌 Chicago Restaurant Data Analysis
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
url_base = 'https://www.chicagomag.com/'
url_sub = 'chicago-magazine/november-2012/best-sandwiches-chicago/'
url = url_base + url_sub
ua = UserAgent()
req = Request(url, headers={'User-Agent': ua.chrome})  # random Chrome UA string from fake_useragent
html = urlopen(req)
soup = BeautifulSoup(html, 'html.parser')
print(soup.prettify())
import re
tmp_one = soup.find_all('div', 'sammy')[0]  # first ranked entry (div.sammy in this page's markup)
tmp_string = tmp_one.find(class_='sammyListing').get_text()
re.split('\n|\r\n', tmp_string)  # split on newline or CRLF
>>>
['BLT', 'Old Oak Tap', 'Read more ']
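- Applying the same split to every entry collects rank, menu, cafe name, and link; a sketch under the assumption that each div.sammy also carries a sammyRank class and an a tag (only sammyListing appears in the notes above):
from urllib.parse import urljoin
rank, main_menu, cafe_name, url_add = [], [], [], []
for item in soup.find_all('div', 'sammy'):
    rank.append(item.find(class_='sammyRank').get_text())
    tmp_string = item.find(class_='sammyListing').get_text()
    main_menu.append(re.split('\n|\r\n', tmp_string)[0])
    cafe_name.append(re.split('\n|\r\n', tmp_string)[1])
    url_add.append(urljoin(url_base, item.find('a')['href']))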