๐Ÿž๏ธ ์—ฐ์˜ˆ์ธ ์‚ฌ์ง„ ํฌ๋กค๋ง ํ•˜๊ธฐ

문해피·2023년 5월 21일
0

5์›”์˜ ํ”„๋กœ์ ํŠธ

목록 보기
2/3
post-thumbnail

๋ชจ๋ธ์„ ๋งŒ๋“ค๊ธฐ ์œ„ํ•ด์„œ๋Š” ์ œ์ผ ์ฒ˜์Œ ์žˆ์–ด์•ผ ํ•˜๋Š”๊ฒƒ์ด ์ด๋ฏธ์ง€ ์•„๋‹๊นŒ์š”?
๊ทธ๋ ‡๊ธฐ ๋•Œ๋ฌธ์— ๊ตฌ๊ธ€์—์„œ ์ด๋ฏธ์ง€๋ฅผ ์—ฐ์˜ˆ์ธ๋งˆ๋‹ค ํด๋”๋ฅผ ๋งŒ๋“ค์–ด์„œ ์ €์žฅํ•˜๋Š” ๊ณ„ํš์„ ์ƒˆ์› ์Šต๋‹ˆ๋‹ค.

1차 파이썬 코드

import urllib
import urllib.request

import requests
from bs4 import BeautifulSoup
# Download thumbnail images from Google image-search result pages.
#
# For each of PAGE_COUNT result pages the script requests the search page,
# collects every <img> tag and saves each image whose `src` is an absolute
# http(s) URL into the current working directory as
# image_<index within page>_<running tag index>.jpg.
PAGE_COUNT = 10
# NOTE(review): assumes the static image-search page lists 20 results per
# page, so `start` advances by 20 per pass -- confirm against live responses.
RESULTS_PER_PAGE = 20
REQUEST_TIMEOUT = 10  # seconds; keeps one stalled connection from hanging the crawl

# Search keyword.
keyword = "๋ฐฐ์šฐ ์žฅ๊ธฐ์šฉ ์ธ๋ฌผ์‚ฌ์ง„"

# Browser-like User-Agent sent with every search request.
header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 Edge/16.16299"}

count2 = 1  # running index over every <img> tag seen; keeps file names unique across pages
for i in range(PAGE_COUNT):
    # Let requests build the query string so the keyword (non-ASCII text and
    # spaces) is percent-encoded, and advance `start` so every pass fetches a
    # different result page instead of re-downloading the same one.
    params = {
        "q": keyword,
        "source": "lnms",
        "tbm": "isch",
        "start": i * RESULTS_PER_PAGE,
    }

    # Send the HTTP request; a failed page is reported and skipped.
    try:
        response = requests.get(
            "https://www.google.com/search",
            params=params,
            headers=header,
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
        continue
    html = response.text

    # Extract the image tags from the HTTP response.
    soup = BeautifulSoup(html, "html.parser")
    img_tags = soup.find_all("img")

    # Download the images.
    count = 0  # number of images saved from this page

    for img in img_tags:
        count2 += 1
        # Tags without a src, or with a relative / data: src, cannot be
        # fetched by urlretrieve, so skip them up front.
        img_url = img.get("src")
        if not img_url or not img_url.startswith("http"):
            continue
        img_name = f"image_{count}_{count2}.jpg"
        try:
            urllib.request.urlretrieve(img_url, img_name)
        except (OSError, ValueError) as e:
            # URLError/HTTPError are OSError subclasses; ValueError covers
            # malformed URLs. Report and keep going (best-effort crawl).
            print(f"์˜ค๋ฅ˜ ๋ฐœ์ƒ: {e}")
            continue
        print(f"๋‹ค์šด๋กœ๋“œ ์™„๋ฃŒ: {img_name}")
        count += 1
  • ์œ„์˜ ์ฝ”๋“œ๋Š” ์žฅ๊ธฐ์šฉ์ด๋ผ ๋ฐฐ์šฐ์˜ ์‚ฌ์ง„์„ ๊ตฌ๊ธ€์—์„œ ํฌ๋กค๋งํ•˜๊ธฐ ์œ„ํ•ด์„œ ์ œ์ผ ๋จผ์ € url์˜ ํ˜•์‹์„ ๊ด€์ฐฐํ•œ ๊ฒฐ๊ณผ GET๋ฐฉ์‹์ด๋ž€๊ฒƒ์„ ์•Œ์ˆ˜ ์žˆ์—ˆ์Šต๋‹ˆ๋‹ค

그렇다면 GET 방식이란 무엇일까요?

  • ํด๋ผ์ด์–ธํŠธ๊ฐ€ ์„œ๋ฒ„์— ์ •๋ณด๋ฅผ ์š”์ฒญํ•  ๋•Œ URL์— ํŠน์ •ํ•œ ๋งค๊ฐœ๋ณ€์ˆ˜๋ฅผ ์ถ”๊ฐ€ํ•˜์—ฌ ์š”์ฒญํ•˜๋Š” ๋ฐฉ์‹์ž…๋‹ˆ๋‹ค.

GET ๋ฐฉ์‹์€ ์ •๋ณด๋ฅผ ๊ฒ€์ƒ‰ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉ๋˜๋ฉฐ, URL์— ๋งค๊ฐœ๋ณ€์ˆ˜๊ฐ€ ํฌํ•จ๋˜์–ด ์žˆ์–ด ์ด๋ฅผ ์‚ฌ์šฉํ•ด ์„œ๋ฒ„์— ์š”์ฒญ์„ ๋ณด๋‚ผ ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค. ์ด๋Ÿฐ ํŠน์„ฑ ๋•Œ๋ฌธ์— ์‚ฌ์šฉ์ž๊ฐ€ ์ฃผ์†Œ์ฐฝ์˜ URL์„ ๋ณ€๊ฒฝํ•˜์—ฌ ๋‹ค์–‘ํ•œ ์ •๋ณด๋ฅผ ์–ป์„ ์ˆ˜ ์žˆ๋Š” ๊ฒƒ์ด ๊ฐ€๋Šฅํ•ฉ๋‹ˆ๋‹ค.


๋ณ€์ˆ˜ keyword๋Š” ๊ฒ€์ƒ‰ํ•˜๊ณ  ์‹ถ์€ ๋ฐฐ์šฐ๋‚˜ ๊ฐ€์ˆ˜์˜ ์ด๋ฆ„๋“ฑ์„ url์— f์ŠคํŠธ๋ง์˜ ํ˜•์‹์œผ๋กœ ๋„ฃ์—ˆ๊ณ  ๊ทธ ์ด์œ ๋Š”
์ถ”ํ›„์— ๋ฐ˜๋ณต๋ฌธ์„ ํ†ตํ•ด์„œ ๋ฆฌ์ŠคํŠธ์— ์—ฐ์˜ˆ์ธ์˜ ์ด๋ฆ„์„ ๋„ฃ๊ณ  ์ด๋ฆ„์„ ๋ฐ˜๋ณตํ•˜๊ธฐ ์œ„ํ•ด์„œ ์ž…๋‹ˆ๋‹ค.

๊ทธ๋Ÿฌ๋ฉด์„œ URL์—์„œ ์ฝ์–ด์˜จ ์›น์˜ ์ •๋ณด๋ฅผ BeautifulSoup์„ ํ†ตํ•ด์„œ ์›นํŽ˜์ด์ง€์˜ ์ฝ”๋“œ๋ฅผ ํ•ด์„ํ•˜์—ฌ ๋ฌธ์„œ ๊ฐ์ฒด ๋ชจ๋ธ(DOM, Document Object Model)๊ณผ ์œ ์‚ฌํ•œ ๊ตฌ์กฐ๋กœ ๋ณ€ํ™˜ํ•œ๊ฒƒ์„ soup์— ๋‹ด์Šต๋‹ˆ๋‹ค.
๋ณ€ํ™˜ํ•œ๊ฒƒ์„ ํ†ตํ•ด์„œ soup.find_all("img") ๋ฅผ ํ†ตํ•ด์„œ ์ด๋ฏธ์ง€๋งŒ ์ฐพ์Šต๋‹ˆ๋‹ค.



반복문

  • ์ด์ œ < img >ํƒœ๊ทธ์— ๋Œ€ํ•œ ๋ฐ˜๋ณต๋ฌธ์„ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค.
    ๊ทธ๋ฆฌ๊ณ  ์ด๋ฏธ์ง€ ์ˆ˜์ง‘์— ์‹œ๊ฐ„์ด ๋งŽ์ด ๊ฑธ๋ฆฌ๊ธฐ ๋•Œ๋ฌธ์— ์—๋Ÿฌ๊ฐ€ ๋‚˜๋ฉด ๋ฉˆ์ถ”๋Š”๊ฒŒ ์•„๋‹ˆ๋ผ
    try ์™€ except๋ฅผ ํ†ตํ•ด์„œ ์—๋Ÿฌ์˜ ๋ฉ”์„ธ์ง€๋งŒ์„ ์ถœ๋ ฅํ•˜๊ณ  ๋ฉˆ์ถ”์ง€ ์•Š๊ฒŒ ํ•˜์˜€์Šต๋‹ˆ๋‹ค.

  • img_url = img["src"]
    ๊ฐ < img > ํƒœ๊ทธ์˜ "src" ์†์„ฑ๊ฐ’์„ ๊ฐ€์ ธ์™€ img_url ๋ณ€์ˆ˜์— ์ €์žฅํ•ฉ๋‹ˆ๋‹ค. "src"๋Š” ์ด๋ฏธ์ง€ ํŒŒ์ผ์˜ URL๋ฅผ ๊ฐ€๋ฆฌํ‚ต๋‹ˆ๋‹ค.

  • if img_url.startswith("http"):
    ์ด๋ฏธ์ง€ URL์ด "http"๋กœ ์‹œ์ž‘ํ•˜๋Š”์ง€ ํ™•์ธํ•˜๊ณ , ๊ทธ๋ ‡๋‹ค๋ฉด ํ•ด๋‹น ์ด๋ฏธ์ง€๋ฅผ ๋‹ค์šด๋กœ๋“œํ•˜๊ธฐ ์œ„ํ•ด ๋‹ค์Œ ์ฝ”๋“œ๋ฅผ ์‹คํ–‰ํ•ฉ๋‹ˆ๋‹ค.

  • urllib.request.urlretrieve(img_url, img_name)
    ์ด๋ฏธ์ง€ URL์—์„œ ์ด๋ฏธ์ง€๋ฅผ ๋‹ค์šด๋กœ๋“œํ•˜๊ณ  ์ด๋ฏธ์ง€ ํŒŒ์ผ์˜ ์ด๋ฆ„์ธ img_name์œผ๋กœ ์ €์žฅํ•ฉ๋‹ˆ๋‹ค.

์œ„์˜ ์ฝ”๋“œ๋กœ ์ด๋ฏธ์ง€๋Š” ๋‹ค์šด์ด ๋ฐ›์•„์กŒ์ง€๋งŒ ์—ฐ์˜ˆ์ธ์˜ ์‚ฌ์ง„ ๋ฟ๋งŒ์•„๋‹ˆ๋ผ ์‚ฌ์ง„์„ ํฌ์ŠคํŒ…ํ•œ๊ณณ์˜ ๋กœ๊ณ ๋‚˜ ๊ธฐํƒ€ ํ•„์š”์—†๋Š” ์ด๋ฏธ์ง€ ๊นŒ์ง€ ์ €์žฅ์ด ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.
๊ทธ๋ฆฌ๊ณ  ์Šคํฌ๋กคํ•ด์„œ ์ด๋ฏธ์ง€๊ฐ€ ์ถ”๊ฐ€๋กœ ๋กœ๋“œ๋˜๋Š” ํ˜•์‹์ด๊ธฐ ๋•Œ๋ฌธ์— ์Šคํฌ๋กค๊ธฐ๋Šฅ๋„ ์ถ”๊ฐ€ํ•ด์•ผ ํ–ˆ์Šต๋‹ˆ๋‹ค.

๊ทธ๊ฒƒ์˜ ํ•ด๊ฒฐ๋ฐฉ๋ฒ•์œผ๋กœ ๋‹ค๋ฅธ ์ฝ”๋“œ๋ฅผ ์ž‘์„ฑํ•ด์•ผ ํ–ˆ์Šต๋‹ˆ๋‹ค.


2차 파이썬 코드

import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from PIL import Image
import io
import time
# Crawl Google image search with headless Chrome for every term in `listm`
# and store up to MAX_IMAGES_PER_TERM images per term in ./images<term>/.

# Chromedriver location.
DRIVER_PATH = '/usr/local/bin/chromedriver'
# Search terms; one output folder is created per entry.
# NOTE(review): the 3rd and 7th entries appear to end with a stray extra word
# (looks like a leftover from a neighbouring entry) -- confirm before crawling.
listm=['๊ฐ€์ˆ˜ ๋ฐฑ์˜ˆ๋ฆฐ', '๊ฐ€์ˆ˜ ์ด์†Œ๋ผ','๊ฐ€์ˆ˜ ์œค์•„๋ฐฐ์šฐ', '๋ฐฐ์šฐ ๋ฏผํšจ๋ฆฐ','๋ฐฐ์šฐ ์‹ ๋ฏผ์•„','๋ฐฐ์šฐ ์ˆ˜์• ','๋ฐฐ์šฐ ์„œํ˜„์ง„๊ฐ€์ˆ˜', '๊ฐ€์ˆ˜ ์†ก๊ฐ€์ธ','๊ฐœ๊ทธ์šฐ๋จผ ์‹ ๋ด‰์„ ','๊ฐœ๊ทธ์šฐ๋จผ ์ด์˜์ž']

MAX_IMAGES_PER_TERM = 200  # stop after this many saved images per search term
MIN_IMAGE_SIDE = 20        # pixels; anything smaller is skipped (tiny icons / logos)
PAGE_LOAD_WAIT = 2         # seconds to wait after opening the results page
SCROLL_WAIT = 1            # seconds to wait after each scroll for new images to load
REQUEST_TIMEOUT = 10       # seconds; keeps one stalled download from hanging the crawl


def _build_options():
    """Return the Chrome options used for every crawl (headless run)."""
    options = Options()
    options.add_argument('--headless')  # do not open a browser window
    options.add_argument('--no-sandbox')  # marked by the author as required on Linux
    options.add_argument('--disable-dev-shm-usage')  # marked by the author as required on Linux
    return options


def _scroll_to_bottom(driver):
    """Keep scrolling to the bottom until the page height stops growing."""
    last_height = driver.execute_script("return document.body.scrollHeight")
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(SCROLL_WAIT)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


def _collect_image_urls(page_source):
    """Return the http(s) `src` of every <img> tag found in *page_source*."""
    soup = BeautifulSoup(page_source, 'html.parser')
    sources = (img.get('src') for img in soup.find_all('img'))
    # startswith (not a substring test) so data: URIs and relative paths that
    # merely contain "http" are not queued for download.
    return [src for src in sources if src and src.startswith('http')]


def _save_images(urls, folder):
    """Download *urls* into *folder*; return how many images were saved.

    Images smaller than MIN_IMAGE_SIDE on either side are skipped, and the
    download stops once MAX_IMAGES_PER_TERM files have been written. A URL
    that fails to download or decode is reported and skipped.
    """
    os.makedirs(folder, exist_ok=True)
    count = 0
    for image_url in urls:
        try:
            response = requests.get(image_url, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # Check the image size; the context manager releases the decoder.
            with Image.open(io.BytesIO(response.content)) as img:
                width, height = img.size
        except (requests.RequestException, OSError, ValueError) as e:
            # Pillow's "cannot identify image" error is an OSError subclass.
            print(f'skipped {image_url}: {e}')
            continue
        if width < MIN_IMAGE_SIDE or height < MIN_IMAGE_SIDE:
            continue
        # NOTE: the bytes are written as received; the .jpg suffix is kept
        # from the original script even when the source is another format.
        file_name = f'{folder}/{count}.jpg'
        with open(file_name, 'wb') as out_file:
            out_file.write(response.content)
        print(f'{file_name} saved')
        count += 1
        if count == MAX_IMAGES_PER_TERM:
            break
    return count


options = _build_options()
for search_name in listm:
    # Google image-search URL for this search term.
    url = f'https://www.google.com/search?q={search_name}&source=lnms&tbm=isch'
    # Start Chrome; try/finally guarantees the browser process is closed even
    # when loading or scrolling the page fails.
    service = Service(DRIVER_PATH)
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.get(url)
        time.sleep(PAGE_LOAD_WAIT)  # wait for the initial page load
        _scroll_to_bottom(driver)   # trigger loading of additional images
        page_source = driver.page_source
    finally:
        driver.quit()
    # Extract the image links, then download them into the term's folder.
    image_urls = _collect_image_urls(page_source)
    _save_images(image_urls, f'./images{search_name}')

์—ฌ๊ธฐ์„œ selenium์„ ์ด์šฉํ•ด์„œ ์•ž์—์„œ ๋ง์”€๋“œ๋ฆฐ ์Šคํฌ๋กค๋“ฑ์˜ ๋ธŒ๋ผ์šฐ์ €์˜ ์ž๋™ํ™”๋ฅผ ์ง„ํ–‰ํ•˜์˜€์Šต๋‹ˆ๋‹ค.

이제 코드를 보면 처음 것과 크게 다른 것은 없습니다.
하지만 새로 추가된 것이 크게 두 가지 있습니다.

  • time.sleep()

    ์Šฌ๋ฆฝ์„ ์ด์šฉํ•˜๋Š” ์ด์œ ๋Š” ํฌ๊ฒŒ ๋‘๊ฐ€์ง€๊ฐ€ ์žˆ์Šต๋‹ˆ๋‹ค.

    • ํŽ˜์ด์ง€ ๋กœ๋”ฉ ๋Œ€๊ธฐ : ์›น ํŽ˜์ด์ง€์˜ ๋™์  ์ปจํ…์ธ ๊ฐ€ ์™„์ „ํžˆ ๋กœ๋“œ๋˜๊ณ  ๋ Œ๋”๋ง๋  ์‹œ๊ฐ„์„ ํ™•๋ณดํ•˜๊ธฐ ์œ„ํ•ด ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
    • ์š”์ฒญ ๊ฐ„๊ฒฉ ์กฐ์ ˆ : ์›น ์„œ๋ฒ„์— ๋ถ€๋‹ด์„ ์ฃผ์ง€ ์•Š๊ธฐ ์œ„ํ•ด ์›น ํŽ˜์ด์ง€์— ์ผ์ •ํ•œ ๊ฐ„๊ฒฉ์œผ๋กœ ๋ณด๋‚ด๋Š”๋ฐ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.
  • driver.execute_script()

    • ์›น ํŽ˜์ด์ง€์˜ ์ด ์Šคํฌ๋กค ๋†’์ด๋ฅผ ์ธก์ •ํ•˜๊ณ  ๊ทธ ๊ฐ’์„ last_height ๋ณ€์ˆ˜์— ํ• ๋‹นํ•˜๋Š” ๊ฒƒ์ž…๋‹ˆ๋‹ค. ์ด ์ •๋ณด๋Š” ๋ฌดํ•œ ์Šคํฌ๋กค ํŽ˜์ด์ง€์—์„œ ์ปจํ…์ธ ๋ฅผ ๋กœ๋“œํ•˜๊ฑฐ๋‚˜ ์Šคํฌ๋กค์„ ์ž๋™ํ™”ํ•˜๋Š” ๋ฐ ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ์Šต๋‹ˆ๋‹ค.
    • driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
      위와 같은 execute_script 호출이지만 여기서는 스크롤을 수행합니다. 첫 번째 인자 0은 가로 위치를 그대로 둔다는 뜻이고, 두 번째 인자는 아래로 스크롤을 최대로 한다는 뜻입니다.
      1. ์ด๋ ‡๊ฒŒ ์Šคํฌ๋กค์„ ํ†ตํ•ด์„œ ๋‚˜์˜จ ๊ธธ์ด๊ฐ€ ๋‘๊ฐœ๊ฐ€ ์„œ๋กœ ๊ฐ™์œผ๋ฉด while๋ฌธ์„ ๋ฉˆ์ถ”๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.
        ๊ทธ๋ฆฌ๊ณ  ์ด๋ ‡๊ฒŒ ๋‚˜์˜จ ํŽ˜์ด์ง€ ์ „์ฒด์˜ html์„ ๊ฐ€์ ธ์˜ค๊ฒŒ ๋˜๊ณ  < img >ํƒœ๊ทธ๋ฅผ ๊ฐ€์ ธ์˜ค๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.

      2. ์ง€๊ธˆ๊นŒ์ง€ urls์— ๋‹ด๊ธด ๊ฒƒ๋“ค์ด ๋˜๋‹ค์‹œ ๋ฐ˜๋ณต๋ฌธ์„ ํ†ตํ•ด์„œ ์ด๋ฏธ์ง€๋ฅผ ๋‹ค์šด๋ฐ›์„ ํ˜•์‹์„ ์ŠคํŠธ๋ฆฌ๋ฐ ๋ฐฉ์‹์œผ๋กœ ํ•˜๊ฒŒ๋ฉ๋‹ˆ๋‹ค.

      3. 그리고 1차 코드에서처럼 이미지에 첨부되어 있는 로고들까지 같이 받아져 오는 것을 방지하기 위해서, 이미지의 크기를 측정하여 20*20 이상이어야 다운로드가 되도록 설정하였습니다.

      ๊ทธ๋ ‡๋‹ค๋ฉด ๊ฒฐ๊ณผ์ ์œผ๋กœ ์ฒ˜์Œ์— listm์ด๋ผ๋Š” ๋ฆฌ์ŠคํŠธ์— ์žˆ๋Š” ๋ฐฐ์šฐ์™€ ์—ฐ์˜ˆ์ธ์˜ ์ด๋ฆ„๋“ค์— ๋Œ€ํ•œ ํด๋”๊ฐ€ ๋งŒ๋“ค์–ด์ง€๊ณ  ๊ทธ ํด๋”์— ํ•ด๋‹น ์—ฐ์˜ˆ์ธ์˜ ์‚ฌ์ง„์ด 200์žฅ๊นŒ์ง€ ์ €์žฅ๋˜๊ฒŒ ๋ฉ๋‹ˆ๋‹ค.

๋‹ค์Œ ํฌ์ŠคํŒ…์€ 6๋ช…์˜ ํŒ€์›๋“ค์ด ๊ฐ์ž ๋ชจ์€ ์—ฐ์˜ˆ์ธ๋“ค์„ ํŒŒ์ผ์งˆ๋ผ ๋ฅผ ํ†ตํ•ด์„œ EC2์— ๋ชจ์•„๋ณด๊ฒ ์Šต๋‹ˆ๋‹ค

profile
행복하려고 개발공부하는 문광식의 로그파일입니다.

0๊ฐœ์˜ ๋Œ“๊ธ€