Parallel Crawling of Dynamic Web Pages

May 23, 2023

Optimizing Dynamic Web Page Scraping

Three approaches:

  • selenium using async
  • selenium using ThreadPoolExecutor
  • arsenic

Every example scrapes the same Naver Blog search results for comparison.

1. selenium using async

Elapsed time: 2.25 sec

import asyncio
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument('--log-level=3')
options.add_argument('--headless')
options.add_argument('--disable-logging')
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')


url = 'https://section.blog.naver.com/Search/Post.naver'
keyword = '광안리'
page = 20
urls = [url + f'?pageNo={i + 1}&rangeType=ALL&orderBy=sim&keyword={keyword}' for i in range(page)]

data = []


async def scrape(url, *, loop):
    # Offload the blocking selenium work to a thread so the event loop stays free.
    await loop.run_in_executor(None, scraper, url)


def scraper(url):
    # Selenium 4 removed executable_path; point at the driver binary via Service.
    driver = webdriver.Chrome(service=Service('chromedriver'), options=options)
    driver.get(url)

    # elems = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.list_search_post div.desc [href]')))
    # urls = list(set(elem.get_attribute('href') for elem in elems))
    elems = WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.list_search_post div.desc span.title')))
    titles = [i.text for i in elems]
    data.extend(titles)
    # quit() (not close()) also shuts down the chromedriver process.
    driver.quit()



start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(*[scrape(url, loop=loop) for url in urls ]))
print(data)
print(time.time() - start)
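Note that passing None as the executor makes run_in_executor fall back to the event loop's default thread pool, whose size Python caps (min(32, os.cpu_count() + 4) on recent versions), so the 20 pages may not all run at once. A minimal sketch of pinning the pool size explicitly, which is essentially what approach 2 below does; the one-worker-per-page sizing is an assumption, not something the original measured:

from concurrent.futures import ThreadPoolExecutor

# Assumed sizing: one worker per page so every driver can launch together.
pool = ThreadPoolExecutor(max_workers=page)


async def scrape(url, *, loop):
    await loop.run_in_executor(pool, scraper, url)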

2. selenium using ThreadPoolExecutor

Reference: https://stackoverflow.com/questions/50303797/python-webdriver-and-asyncio
Elapsed time: ? (not recorded in the original post)

import asyncio
import time
from concurrent.futures.thread import ThreadPoolExecutor
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("start-maximized")
options.add_argument("disable-infobars")
options.add_argument("--disable-extensions")
options.add_argument('--log-level=3')
options.add_argument('--headless')
options.add_argument('--disable-logging')
options.add_argument('--no-sandbox')
options.add_argument('--disable-gpu')

url = 'https://section.blog.naver.com/Search/Post.naver'
keyword = '광안리'
page = 20
urls = [url + f'?pageNo={i + 1}&rangeType=ALL&orderBy=sim&keyword={keyword}' for i in range(page)]

executor = ThreadPoolExecutor(page)  # one worker per search page


def scrape(url, *, loop):
    # Return the future; the original discarded it, so nothing could be awaited.
    return loop.run_in_executor(executor, scraper, url)


def scraper(url):
    driver = webdriver.Chrome(service=Service('chromedriver'), options=options)
    driver.get(url)

    elems = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.list_search_post div.desc [href]')))
    urls = list(set(elem.get_attribute('href') for elem in elems))
    elems = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'div.list_search_post div.desc span.title')))
    titles = [i.text for i in elems]

    # TODO: connect a Kafka producer here and publish urls/titles
    # (a sketch follows after this example).
    driver.quit()


loop = asyncio.get_event_loop()

start = time.time()
# run_in_executor returns futures, not tasks, so asyncio.all_tasks() never
# saw this work; collect the futures and gather them explicitly instead.
futures = [scrape(url, loop=loop) for url in urls]
loop.run_until_complete(asyncio.gather(*futures))
print(time.time() - start)
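The TODO above leaves the Kafka hand-off unimplemented. A minimal sketch using the kafka-python client; the broker address and topic name are assumptions, not values from the original post:

import json
from kafka import KafkaProducer  # pip install kafka-python

# Assumed broker and topic; point these at your own cluster.
producer = KafkaProducer(
    bootstrap_servers='localhost:9092',
    value_serializer=lambda v: json.dumps(v, ensure_ascii=False).encode('utf-8'),
)


def publish(titles):
    for title in titles:
        producer.send('naver-blog-titles', title)
    producer.flush()  # push buffered messages out before the thread exits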

3. arsenic

References:
https://changmyeong.tistory.com/49
https://github.com/HENNGE/arsenic/issues/102
Elapsed time: 1.37 sec

import asyncio
import time

from arsenic import get_session
from arsenic.browsers import Chrome
from arsenic.services import Chromedriver

data = []


async def run(url):
    # arsenic drives chromedriver over an async HTTP client, so no threads
    # are needed; every session shares one event loop.
    service = Chromedriver(binary='./chromedriver')
    browser = Chrome(**{"goog:chromeOptions": {
        'args': ['--headless', '--disable-gpu']
    }})

    async with get_session(service, browser) as session:
        await session.get(url)
        await session.wait_for_element(5, 'div.list_search_post div.desc span.title')
        elems = await session.get_elements('div.list_search_post div.desc span.title')
        titles = [await i.get_text() for i in elems]
        data.extend(titles)


async def loops():
    url = 'https://section.blog.naver.com/Search/Post.naver'
    keyword = '광안리'
    page = 20
    urls = [url + f'?pageNo={i + 1}&rangeType=ALL&orderBy=sim&keyword={keyword}' for i in range(page)]
    await asyncio.gather(*[run(url=url) for url in urls])


start = time.time()
asyncio.run(loops())
print(time.time() - start)
print(data)
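Gathering all 20 sessions at once launches 20 headless Chrome processes in parallel, which can exhaust memory on smaller machines. A common mitigation (not part of the original post) is capping concurrency with an asyncio.Semaphore; the limit of 5 is an assumed value:

sem = asyncio.Semaphore(5)  # assumed cap; tune to available RAM


async def run_limited(url):
    # At most 5 arsenic sessions are open at any moment.
    async with sem:
        await run(url)

In loops(), gather run_limited(url) instead of run(url) to apply the cap.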

Conclusion

arsenic is the fastest: 1.37 sec versus 2.25 sec for selenium with async. This is plausible because arsenic speaks the WebDriver protocol asynchronously over HTTP on a single event loop, while the selenium variants still dedicate a blocking thread to each driver.
