[제로베이스 데이터 취업 스쿨 15기] 5주차 (EDA: 3. Movie Ranking)

김지환·2023년 6월 10일
0
post-thumbnail

5주차: 5/29/2023 - 6/4/2023


Movie ratings site analysis


# requirements

import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup
  • IMDB top 250 movies
# Download the IMDB Top 250 chart page and parse it into a BeautifulSoup tree.
url = "https://www.imdb.com/chart/top/"

# The context manager closes the HTTP response once the parser has consumed
# its body, instead of leaving the connection open for the rest of the run.
with urlopen(url) as response:
    # response.status
    soup = BeautifulSoup(response, "html.parser")

# Dump the indented HTML so the tag structure can be inspected by eye.
print(soup.prettify())

# Movie title tag: every <td> cell whose class is "titleColumn"
title_cells = soup.find_all("td", "titleColumn")
title_cells

# Three ways to read the link text of the first title cell.
# 1: attribute navigation on the find_all() result
first_found = title_cells[0]
first_found.a.string

# 2: CSS selector, then find()
first_selected = soup.select(".titleColumn")[0]
first_selected.find("a").text

# 3: CSS selector, then select_one()
first_selected.select_one("a").get_text()

# Movie ratings tag
rating_cells = soup.select("td.ratingColumn.imdbRating")
rating_cells

# Text of the first rating cell with surrounding whitespace removed
rating_cells[0].text.strip()

# Number of title cells next to the number of rating cells
len(title_cells), len(rating_cells)
# Movie title list
# Each selector runs once and the resulting cells are iterated directly;
# indexing a fresh find_all()/select() result for every n re-scanned the
# whole document on each iteration.
title_cells = soup.find_all("td", "titleColumn")
movie_name = [cell.a.string for cell in title_cells]
movie_name

# Same list built through the CSS-selector API.
movie_name = [cell.a.text for cell in soup.select(".titleColumn")]
movie_name

# Movie ratings list
rating_cells = soup.select("td.ratingColumn.imdbRating")
movie_rating = [cell.text.strip() for cell in rating_cells]
movie_rating

# Check the size of data
len(movie_name), len(movie_rating)

Automation


import time
from tqdm import tqdm  # was misspelled "tdqm", which raises ImportError

# Accumulators: one entry per (date, movie) row across all fetched pages.
movie_date = []
movie_name = []
movie_rating = []

# NOTE(review): `date` is not defined anywhere in the visible code. The loop
# needs an iterable of objects that provide .strftime() (presumably something
# like a pd.date_range) — define it before running this cell.
for today in tqdm(date):
    # NOTE(review): the URL contains no "{date}" placeholder, so .format()
    # returns it unchanged and every iteration fetches the same page —
    # confirm whether a date-specific URL was intended.
    url = "https://www.imdb.com/chart/top/"
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(url.format(date=today.strftime("%Y%m%d"))) as response:
        soup = BeautifulSoup(response, "html.parser")

    # Run each selector once per page instead of once per row.
    title_cells = soup.select(".titleColumn")
    rating_cells = soup.select("td.ratingColumn.imdbRating")

    # Pair titles with ratings so the three lists always grow in lockstep.
    rows = list(zip(title_cells, rating_cells))

    movie_date.extend([today] * len(rows))
    movie_name.extend(title.find("a").text for title, _ in rows)
    movie_rating.extend(rating.text.strip() for _, rating in rows)

    # Pause between requests.
    time.sleep(0.5)

# "date" is included because the later analysis reads movie["date"]
# (line plot x-axis and the pivot index).
movie = pd.DataFrame({
    "date": movie_date,
    "name": movie_name,
    "rating": movie_rating,
})
movie.tail()

# Ratings were scraped as text; convert them to numbers.
movie["rating"] = movie["rating"].astype(float)

Saving data

# Write the collected table to disk as a comma-separated, UTF-8 encoded CSV
csv_path = "../data/03. movie_data.csv"
movie.to_csv(csv_path, sep=",", encoding="utf-8")

Movie ratings data analysis


import numpy as np
import pandas as pd

# Reload the saved CSV, using its first column as the row index.
movie = pd.read_csv("../data/03. movie_data.csv", index_col=0)
movie.tail()

# pivot table: sum of ratings per movie name.
# values="rating" restricts the aggregation to the rating column so any other
# column in the file is not summed as well, and the string "sum" avoids the
# FutureWarning that pandas 2.x emits for aggfunc=np.sum.
movie_unique = pd.pivot_table(
    data=movie, index="name", values="rating", aggfunc="sum"
)
movie_unique

# Movies ordered from the highest summed rating to the lowest.
movie_best = movie_unique.sort_values(by="rating", ascending=False)
movie_best.head()

# All rows for a single title, used for the line plot further down.
tmp = movie.query("name == ['Daeboo']")
tmp

Visualization

import matplotlib.pyplot as plt
from matplotlib import rc

# Set the default font family and render figures inline in the notebook
rc("font", family="Malgun Gothic")
get_ipython().run_line_magic("matplotlib", "inline")

# Line chart of the selected movie's rating against its date column
plt.figure(figsize=(20, 8))
plt.plot(tmp["date"], tmp["rating"], label="Rating trend")

# Titles and axis captions
plt.title("Rating per date")
plt.xlabel("Date")
plt.ylabel("Rating")

# Vertical tick labels, legend, grid, then draw
plt.xticks(rotation="vertical")
plt.legend(loc="best")
plt.grid(True)
plt.show()
# Reshape to one row per date and one column per movie name, holding ratings
movie_pivot = movie.pivot_table(index="date", columns="name", values="rating")
movie_pivot.head()

# Export the reshaped table to an Excel workbook
excel_path = "../data/03. movie_pivot.xlsx"
movie_pivot.to_excel(excel_path)
import platform
import seaborn as sns
from matplotlib import font_manager, rc

# Pick the matplotlib font family according to the operating system
path = "C:/Windows/Fonts/malgun.ttf"
system_name = platform.system()

if system_name == "Windows":
    # Resolve the family name from the font file, then register it
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc("font", family=font_name)
elif system_name == "Darwin":
    rc("font", family="Arial Unicode MS")
else:
    print("Unknown system")
# Movies to compare; each name must be a column of movie_pivot.
target_col = ["쇼생크 탈출", "다크 나이트", "Daeboo", "12명의 성난 사람들", "The Godfather Part II"]

plt.figure(figsize=(20, 8))
plt.title("Rating per date")
plt.xlabel("Date")
plt.ylabel("Rating")
plt.xticks(rotation="vertical")
# Hide the bottom ticks and their labels. tick_params expects booleans here;
# the legacy string "off" is a non-empty (truthy) value, so it does not
# switch the ticks off.
plt.tick_params(bottom=False, labelbottom=False)
# One line per selected movie, indexed by date.
plt.plot(movie_pivot[target_col])
plt.legend(target_col, loc="best")
plt.grid(True)
# Render explicitly, matching the single-movie plot above in this file.
plt.show()
profile
데이터 분석 공부하고 있습니다

0개의 댓글