Selenium 알라딘 사이트 크롤링

신준호·2023년 9월 19일

크롤링

이번 프로젝트에서 알라딘의 도서 리뷰, 제목, 출판사, 출판일, 작가를 크롤링해야 했다.
과정은 도서 isbn으로 검색을 하고 해당 도서로 들어가 도서 정보를 저장하는 것이었다.
따라서 이 과정들을 자동화하기 위해 Selenium을 사용했다.

Selenium 이란?

웹 애플리케이션 테스트를 위한 포터블 프레임워크이다.
셀레늄은 테스트 스크립트 언어를 학습할 필요 없이 기능 테스트를 만들기 위한 플레이백 도구를 제공한다.

Selenium 사용 이유

BeautifulSoup 라이브러리만으로도 다양한 사이트의 정보를 크롤링 할 수 있다. 하지만, BeautifulSoup에게는 한계가 존재한다.
바로, 자바스크립트로 동적으로 생성된 정보는 가져올 수 없다는 점이다.

Selenium 설치

pip install selenium

해당 명령어로 설치하고 chrome driver를 설치해야 한다.

Chrome driver 설치

주소
해당 주소로 들어가서 자신의 chrome 버전에 맞는 드라이버를 설치한다.

코드

필요 라이브러리 import

import os
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

html 태그 삭제

def remove_html(sentence):
    """Strip HTML tags and whitespace/entity noise from an innerHTML string.

    Removes every ``<...>`` tag, all newline and tab characters, and the
    literal entities ``&nbsp;``, ``&amp;`` and ``&gt;``.  The entities are
    deleted, not decoded; any other entity (for example ``&lt;``) is left
    untouched.

    Args:
        sentence: Raw HTML text, or ``None`` when the element could not be read.

    Returns:
        The cleaned text, or an empty string when ``sentence`` is ``None``.
    """
    if sentence is None:
        # Callers may pass None after a failed element lookup; re.sub would
        # raise TypeError on it.
        return ''
    return re.sub('(<([^>]+)>)|\n|\t|&nbsp;|&amp;|&gt;', '', sentence)

검색할 csv 파일 읽기 및 chrome driver 설정

# Load the list of books to look up. The crawl loop reads the 'isbn',
# 'image_url', 'content' and 'genre_detail_dict_id' columns of each row.
df = pd.read_csv('../bookwave_crawling_test_2.csv', encoding='utf-8')
# Configure Chrome with the 'enable-logging' switch excluded
# (presumably to keep the browser's own log output off the console -- TODO confirm).
options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Launch the browser session that the crawl loop drives.
driver = webdriver.Chrome(options=options)

크롤링 시작

# Element locators used while crawling.
# NOTE(review): the ids/classes below ("yesSchList", "yDetailTopWrap", "gd_*")
# and the "/Product/Search" URL path resemble YES24 markup, while the host is
# aladin.co.kr -- confirm them against the live site before a long run.
_FIRST_RESULT_XPATH = '//*[@id="yesSchList"]/li/div/div[1]/div[1]/span[1]/span[1]/a[1]'
_DETAIL_XPATH = '//*[@id="yDetailTopWrap"]/div[2]/div[1]'
_PUB_AREA_XPATH = _DETAIL_XPATH + '/span[@class="gd_pubArea"]'
_TITLE_XPATH = _DETAIL_XPATH + '/div[1]/h2'
_AUTHOR_LINK_XPATH = _PUB_AREA_XPATH + '/span[@class="gd_auth"]/a'
_AUTHOR_SPAN_XPATH = _PUB_AREA_XPATH + '/span[@class="gd_auth"]'
_PUBLISHER_XPATH = _PUB_AREA_XPATH + '/span[@class="gd_pub"]'
_PUBLISH_DATE_XPATH = _PUB_AREA_XPATH + '/span[@class="gd_date"]'
_REVIEW_LIST_ID = "infoset_reviewContentList"
_SCROLL_TO_REVIEWS_JS = f"document.getElementById('{_REVIEW_LIST_ID}').scrollIntoView();"

# Matches dates written like "2023년 9월 19일" and captures year, month, day.
_PUBLISH_DATE_RE = re.compile(r"(\d+)년 (\d+)월 (\d+)일")

_BOOK_COLUMNS = ['isbn', 'title', 'author', 'publisher', 'image_url', 'content',
                 'publish_date', 'genre_detail_dict_id']
_REVIEW_COLUMNS = ['isbn', 'review']


def _read_inner_html(xpath, timeout):
    """Return the innerHTML of the element at ``xpath`` once it is visible.

    Args:
        xpath: XPath locator of the element on the current page.
        timeout: Seconds to wait for the element to become visible.

    Returns:
        The element's innerHTML, or ``None`` when the element does not become
        visible in time or any other WebDriver error occurs.
    """
    try:
        element = WebDriverWait(driver, timeout).until(
            EC.visibility_of_element_located((By.XPATH, xpath)))
        return element.get_attribute("innerHTML")
    except WebDriverException:  # TimeoutException is a subclass
        return None


# Rows are collected in plain lists and turned into DataFrames once at the end;
# concatenating a DataFrame per row copies all previous rows every time.
book_rows = []
review_rows = []

for index, row in df.iterrows():
    isbn = row['isbn']
    url = f"https://www.aladin.co.kr/Product/Search?domain=BOOK&query={isbn}"
    driver.get(url)
    print(f"현재 idx는 {index} 입니다!!!!!!!")

    # Open the first search result. Skip this book when there is no clickable
    # result, when an alert pops up after the click, or on any WebDriver error.
    try:
        WebDriverWait(driver, 2).until(
            EC.element_to_be_clickable((By.XPATH, _FIRST_RESULT_XPATH))).click()
        try:
            alert = WebDriverWait(driver, 2).until(EC.alert_is_present())
        except TimeoutException:
            alert = None  # no alert: the detail page opened normally
        if alert is not None:
            alert.accept()
            continue
    except WebDriverException:
        continue

    # Title: a book without a readable title is skipped instead of aborting the run.
    title_html = _read_inner_html(_TITLE_XPATH, 2)
    if title_html is None:
        continue
    title = remove_html(title_html)
    print(title)

    # Author: try the link inside the author span first, then the span itself.
    author_html = _read_inner_html(_AUTHOR_LINK_XPATH, 2)
    if author_html is None:
        author_html = _read_inner_html(_AUTHOR_SPAN_XPATH, 2)
        if author_html is None:
            print("Failed to get the text from both XPaths")
        else:
            print("나 발동돼")
    # A missing author is stored as an empty string rather than crashing.
    author = remove_html(author_html) if author_html is not None else ''
    print(author)

    # Publisher: required; skip the book when it cannot be read.
    publisher_html = _read_inner_html(_PUBLISHER_XPATH, 1)
    if publisher_html is None:
        continue
    publisher = remove_html(publisher_html)
    print(publisher)

    # Publication date: required; skip the book when the element is missing or
    # its text does not match the expected format, so a date left over from a
    # previous book can never be written for this one.
    publisher_day_html = _read_inner_html(_PUBLISH_DATE_XPATH, 1)
    if publisher_day_html is None:
        continue
    match = _PUBLISH_DATE_RE.search(remove_html(publisher_day_html))
    if match is None:
        continue
    formatted_date = "-".join(match.groups())
    print(formatted_date)

    book_rows.append({
        'isbn': isbn,
        'title': title,
        'author': author,
        'publisher': publisher,
        'image_url': row['image_url'],
        'content': row['content'],
        'publish_date': formatted_date,
        'genre_detail_dict_id': row['genre_detail_dict_id'],
    })

    # Reviews: reads the review list's div[2]..div[6] children (at most five).
    for i in range(2, 7):
        review_xpath = f'//*[@id="{_REVIEW_LIST_ID}"]/div[{i}]'
        try:
            driver.execute_script(_SCROLL_TO_REVIEWS_JS)
            time.sleep(1)  # give the page a moment to settle after scrolling

            # Click the review's toggle before reading its body.
            WebDriverWait(driver, 1).until(EC.element_to_be_clickable(
                (By.XPATH, review_xpath + '/div[2]/a/div/span'))).click()

            review_html = WebDriverWait(driver, 1).until(EC.visibility_of_element_located(
                (By.XPATH, review_xpath + '/div[3]/div[2]'))).get_attribute("innerHTML")
        except WebDriverException:
            # This review slot is missing or not clickable; try the next one.
            continue

        if review_html is None:
            continue
        sentence = remove_html(review_html)
        if not sentence:
            # An empty review body ends review collection for this book.
            break
        review_rows.append({'isbn': isbn, 'review': sentence})

# Explicit columns keep the CSV headers even when nothing was collected.
bookwave_book = pd.DataFrame(book_rows, columns=_BOOK_COLUMNS)
final_df = pd.DataFrame(review_rows, columns=_REVIEW_COLUMNS)

도서 정보 저장

# Save the final results as CSV files, then close the browser.
try:
    # Add a 1-based 'id' column.
    bookwave_book['id'] = range(1, len(bookwave_book) + 1)

    # to_csv does not create missing directories; make sure they exist so a
    # finished crawl is not lost at the very last step.
    os.makedirs('review', exist_ok=True)
    os.makedirs('booktable', exist_ok=True)

    final_df.to_csv('review/bookwave_reviews_1.csv', index=False, encoding='utf-8-sig')
    bookwave_book.to_csv('booktable/bookwave_book_table_1.csv', index=False, encoding='utf-8-sig')
finally:
    # Always end the browser session, even when saving fails.
    driver.quit()