모든 예제는 네이버 블로그 데이터 수집을 수행한다.
소요 시간 : 2.25 sec
import asyncio
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Chrome options for quiet, display-less scraping.
options = Options()
for flag in (
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    '--log-level=3',
    '--headless',
    '--disable-logging',
    '--no-sandbox',
    '--disable-gpu',
):
    options.add_argument(flag)

# Naver blog search: one URL per result page for the keyword.
url = 'https://section.blog.naver.com/Search/Post.naver'
keyword = '광안리'
page = 20
urls = [f'{url}?pageNo={n}&rangeType=ALL&orderBy=sim&keyword={keyword}'
        for n in range(1, page + 1)]
data = []  # post titles, filled by the worker threads
async def scrape(url, *, loop):
    """Run the blocking `scraper` for *url* on the default thread pool."""
    future = loop.run_in_executor(None, scraper, url)
    await future
def scraper(url):
    """Fetch one Naver blog search page and append its post titles to `data`.

    Runs inside a worker thread and creates its own driver, since a
    WebDriver instance is not safe to share across threads.
    """
    driver = webdriver.Chrome(executable_path='chromedriver', options=options)
    try:
        driver.get(url)
        elems = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.list_search_post div.desc span.title')))
        # list.extend is atomic under the GIL, so concurrent workers are safe.
        data.extend(elem.text for elem in elems)
    finally:
        # quit() (unlike close()) also terminates the chromedriver process;
        # the finally block prevents a browser leak when the wait times out.
        driver.quit()
# Fan out one coroutine per page, wait for all of them, then report.
start = time.time()
loop = asyncio.get_event_loop()
jobs = [scrape(page_url, loop=loop) for page_url in urls]
loop.run_until_complete(asyncio.gather(*jobs))
print(data)
print(time.time() - start)
참고 : https://stackoverflow.com/questions/50303797/python-webdriver-and-asyncio
소요 시간 : ?
import asyncio
import time
from concurrent.futures.thread import ThreadPoolExecutor
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
# Chrome options for quiet, display-less scraping.
options = Options()
for flag in (
    "start-maximized",
    "disable-infobars",
    "--disable-extensions",
    '--log-level=3',
    '--headless',
    '--disable-logging',
    '--no-sandbox',
    '--disable-gpu',
):
    options.add_argument(flag)

# Naver blog search: one URL per result page for the keyword.
url = 'https://section.blog.naver.com/Search/Post.naver'
keyword = '광안리'
page = 20
urls = [f'{url}?pageNo={n}&rangeType=ALL&orderBy=sim&keyword={keyword}'
        for n in range(1, page + 1)]
executor = ThreadPoolExecutor(page)  # one worker thread per result page
def scrape(url, *, loop):
    """Schedule the blocking `scraper` for *url* on the shared executor.

    Returns the asyncio.Future wrapping the executor job.  The original
    discarded it, which both swallows any exception raised inside
    scraper() and leaves callers no way to wait for completion; returning
    it is backward-compatible (callers may still ignore it).
    """
    return loop.run_in_executor(executor, scraper, url)
def scraper(url):
    """Fetch one search page; return (post_urls, titles) scraped from it.

    Runs inside an executor thread and creates its own driver, since a
    WebDriver instance is not safe to share across threads.
    """
    driver = webdriver.Chrome(executable_path='chromedriver', options=options)
    try:
        driver.get(url)
        elems = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.list_search_post div.desc [href]')))
        post_urls = list(set(elem.get_attribute('href') for elem in elems))
        elems = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.list_search_post div.desc span.title')))
        titles = [elem.text for elem in elems]
        # TODO: connect a Kafka producer here.  Until then, return the data
        # instead of dropping it so callers can still consume the results.
        return post_urls, titles
    finally:
        # The original never released the driver, leaking one browser and
        # one chromedriver process per call; quit() in a finally fixes both.
        driver.quit()
loop = asyncio.get_event_loop()
# run_in_executor returns plain asyncio.Futures, not Tasks, so
# asyncio.all_tasks() cannot find them (and raises RuntimeError when no
# loop is running on recent Python versions) — the original gather never
# actually waited for the executor jobs.  Keep the futures explicitly
# and gather those instead.  Scheduling happens before the clock starts,
# matching the original call order.
futures = [loop.run_in_executor(executor, scraper, page_url) for page_url in urls]
start = time.time()
loop.run_until_complete(asyncio.gather(*futures))
print(time.time() - start)
참고 :
https://changmyeong.tistory.com/49
https://github.com/HENNGE/arsenic/issues/102
소요 시간 : 1.37 sec
import asyncio
import time
from arsenic import get_session
from arsenic.browsers import Chrome
from arsenic.services import Chromedriver
a = []
async def run(url):
    """Scrape one search-result page with arsenic and collect its titles into `a`."""
    service = Chromedriver(binary='./chromedriver')
    browser = Chrome(**{
        "goog:chromeOptions": {'args': ['--headless', '--disable-gpu']},
    })
    selector = 'div.list_search_post div.desc span.title'
    async with get_session(service, browser) as session:
        await session.get(url)
        await session.wait_for_element(5, selector)
        elems = await session.get_elements(selector)
        titles = [await elem.get_text() for elem in elems]
        a.extend(titles)
async def loops():
    """Fan out one scraping coroutine per search-result page and await them all."""
    base = 'https://section.blog.naver.com/Search/Post.naver'
    keyword = '광안리'
    pages = 20
    coros = [
        run(url=f'{base}?pageNo={n}&rangeType=ALL&orderBy=sim&keyword={keyword}')
        for n in range(1, pages + 1)
    ]
    await asyncio.gather(*coros)
# Time the full arsenic run, then show the elapsed time and the titles.
start = time.time()
asyncio.run(loops())
elapsed = time.time() - start
print(elapsed)
print(a)
arsenic이 가장 빠르다.