일반적으로는 사용자 기반보다는 아이템 기반 협업 필터링이 정확도가 더 높다
- 비슷한 영화를 좋아한다고 취향이 비슷하다고 판단하기 어렵거나
- 매우 유명한 영화는 취향과 관계없이 관람하는 경우가 많고
- 사용자들이 평점을 매기지 않는 경우가 많기 때문
잠재 요인 협업 필터링 : 사용자 - 아이템 평점 행렬 데이터를 이용해서 '잠재요인'을 도출하는 것. 주요인과 아이템에 대한 잠재요인에 대해 행렬분해를 하고 다시 행렬곱을 통해 아직 평점을 부여하지 않은 아이템에 대한 예측 평점을 생성하는 것
코사인 유사도
코사인 유사도 계산식
# Read the TMDB 5000 movies dataset.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
movies = pd.read_csv('./tmdb5000movies/tmdb_5000_movies.csv')
print(movies.shape)
movies.head()
# Select the columns used by the rest of the analysis.
# An explicit copy is taken because new columns are assigned to movies_df
# later on; without it pandas treats those writes as chained assignment on a
# slice of `movies`.
movies_df = movies[['id', 'title', 'genres', 'vote_average',
                    'vote_count', 'popularity', 'keywords', 'overview']].copy()
movies_df.head()
# Data caveat:
# the genres and keywords columns hold dict-like structures stored as text.
movies_df[['genres']].head(1).to_numpy()
type(movies_df['genres'].loc[0])
movies_df['genres'].loc[0]
# Converting data that is stored as a string: ast.literal_eval parses a
# string containing a Python literal back into the corresponding object.
from ast import literal_eval
code = "(1, 2, {'foo':'bar'})"
code
type(code)
literal_eval(code)
type(literal_eval(code))
# Restore the contents of genres and keywords from text to lists of dicts.
from ast import literal_eval
movies_df['genres'] = movies_df['genres'].map(literal_eval)
movies_df['keywords'] = movies_df['keywords'].map(literal_eval)
movies_df.head()
# Keep only each dict's 'name' value so it can be used as a feature.
movies_df['genres'] = movies_df['genres'].map(
    lambda entries: [entry['name'] for entry in entries])
movies_df['keywords'] = movies_df['keywords'].map(
    lambda entries: [entry['name'] for entry in entries])
movies_df[['genres', 'keywords']].head(2)
# Join the genre words of each movie into one space-separated "sentence".
movies_df['genres_literal'] = movies_df['genres'].map(' '.join)
movies_df.head()
# Run CountVectorizer on the genres that were converted to strings.
# CountVectorizer overview: https://wikidocs.net/33661
# ngram_range sets the size of the word groups that are counted; (1, 2)
# counts single words and two-word sequences.
from sklearn.feature_extraction.text import CountVectorizer
# min_df=1 keeps every term that occurs in at least one document, which is
# what the previous integer min_df=0 amounted to; newer scikit-learn releases
# validate this parameter and reject an integer below 1.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)
# Measure cosine similarity, one of the ways to compare how alike two
# "sentences" are.
# Original note: the resulting matrix can be read in a way similar to a
# confusion matrix.
from sklearn.metrics.pairwise import cosine_similarity
# Similarity between every pair of rows of the two (identical) matrices.
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[0:2])
# Order each row of genre_sim from the highest similarity to the lowest.
# argsort() only sorts ascending, so the column order of its result is
# reversed with [:, ::-1] to obtain a descending ranking.
genre_sim_sorted_ind = np.argsort(genre_sim)[:, ::-1]
print(genre_sim_sorted_ind[0:1])
# Function that returns the recommended movies as a DataFrame.
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the rows of ``df`` ranked most similar to ``title_name``.

    Args:
        df: DataFrame with a ``title`` column. Row *positions* of ``df`` must
            correspond to the rows of ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` holds row positions of ``df``
            ordered from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of leading positions taken from each matching row.

    Returns:
        DataFrame with the selected rows of ``df``, in ranking order. It is
        empty when no row has the given title. The queried movie itself is
        not filtered out.
    """
    # Use row positions rather than index labels: sorted_ind and df.iloc are
    # both positional, so labels only work when df has a default RangeIndex.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    similar_indexes = sorted_ind[title_positions, :top_n]
    # Kept from the original notebook: show the raw ranked positions.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)
    return df.iloc[similar_indexes]
# Which movies are similar to "The Godfather"?
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_ind, title_name='The Godfather', top_n=10)
similar_movies[['title', 'vote_average']]
# Check the data again:
# looking at the rating together with the number of votes reveals
# problematic rows.
movies_df[['title', 'vote_average', 'vote_count']].sort_values(
    by='vote_average', ascending=False).head(10)
# Choose the weights used for ranking movies:
# C is the mean rating over all movies and m, the minimum vote count, is set
# to the 60% quantile of vote_count.
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print('C: ', round(C, 3), 'm: ', round(m, 3))
# Function that computes the weighted rating.
def weighted_vote_average(recode, min_votes=None, mean_vote=None):
    """Return the vote-count-weighted rating of one movie record.

    The result is ``(v / (v + m)) * R + (m / (m + v)) * C`` where ``v`` is the
    record's ``vote_count``, ``R`` its ``vote_average``, ``m`` the minimum
    vote count and ``C`` the overall mean rating.

    Args:
        recode: Mapping or row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with ``vote_count`` and
            ``vote_average`` entries.
        min_votes: Value used for ``m``. When ``None`` the module-level
            ``m`` is used, as in the original notebook.
        mean_vote: Value used for ``C``. When ``None`` the module-level
            ``C`` is used.

    Returns:
        The weighted rating.
    """
    # Fall back to the notebook's globals so existing calls keep working.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C
    v = recode['vote_count']
    R = recode['vote_average']
    return ((v / (v + min_votes)) * R) + ((min_votes / (min_votes + v)) * mean_vote)
# Recompute: add the weighted rating of every movie as a new column.
movies_df['weighted_vote'] = movies_df.apply(weighted_vote_average, axis='columns')
movies_df.head()
# Result of sorting the whole dataset by the weighted rating.
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values(
    by='weighted_vote', ascending=False).head(10)
# Comparing vote_average with weighted_vote shows that the weighting has
# changed the values.
movies_df.tail()
movies_df.loc[movies_df['vote_count'] < 10]
# Sort the whole dataset by the weighted rating.
movies_df[['title', 'vote_average', 'weighted_vote', 'vote_count']].sort_values(
    by='weighted_vote', ascending=False).head(10)
# Revised version of the function that finds similar movies.
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return up to ``top_n`` similar movies, best weighted rating first.

    Twice as many candidates as requested are taken from the similarity
    ranking, the queried movie is removed, and the remaining candidates are
    ordered by ``weighted_vote``.

    Args:
        df: DataFrame with ``title`` and ``weighted_vote`` columns. Row
            *positions* of ``df`` must correspond to the rows of
            ``sorted_ind``.
        sorted_ind: 2-D integer array; row ``i`` holds row positions of ``df``
            ordered from most to least similar to row ``i``.
        title_name: Exact title to look up in ``df['title']``.
        top_n: Maximum number of movies to return.

    Returns:
        DataFrame with at most ``top_n`` rows sorted by ``weighted_vote`` in
        descending order; empty when no row has the given title.
    """
    # Row positions (not index labels) of the queried title, so the function
    # also works when df does not have a default RangeIndex.
    title_positions = np.flatnonzero((df['title'] == title_name).to_numpy())
    candidates = sorted_ind[title_positions, :top_n * 2].reshape(-1)
    # Drop the queried movie itself. np.isin also handles several rows
    # sharing the title, where an element-wise `!=` between arrays of
    # different lengths does not work.
    candidates = candidates[~np.isin(candidates, title_positions)]
    # Several matching rows can contribute the same candidate; keep the first
    # occurrence of each while preserving the ranking order.
    _, first_seen = np.unique(candidates, return_index=True)
    candidates = candidates[np.sort(first_seen)]
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]
# Find movies similar to "The Godfather".
similar_movies = find_sim_movie(
    movies_df, genre_sim_sorted_ind, title_name='The Godfather', top_n=10)
similar_movies[['title', 'vote_average', 'weighted_vote']]
์ํ์ ํ์ ์ ๋งค๊ธด ์ฌ์ฉ์์ ์ํ ํ์ ํ๋ ฌ ๋ฑ์ ๋ฐ์ดํฐ
์ด์ค์์ 1MB์ง๋ฆฌ small ๋ฐ์ดํฐ ์ด์ฉ
# Read the data.
import pandas as pd
import numpy as np
movies = pd.read_csv('./ml-latest-small/movies.csv')
ratings = pd.read_csv('./ml-latest-small/ratings.csv')
print(movies.shape)
print(ratings.shape)
# movies holds the movie title and genres.
movies.head()
# ratings holds one rating per user and per movie.
ratings.head()
# The raw ratings need reshaping:
# pivot them into a user-by-movie table.
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table(values='rating', index='userId', columns='movieId')
ratings_matrix.head()
# Join ratings and movies on movieId.
ratings_movies = ratings.merge(movies, on='movieId')
ratings_movies.head()
# Pivot again, this time with movie titles as the columns.
ratings_matrix = ratings_movies.pivot_table(values='rating', index='userId', columns='title')
ratings_matrix.head()
# Replace NaN with 0.
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head()
# Transpose the matrix so that each row is a movie, ready for the
# item-to-item similarity computation.
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head()
# Similarity result: cosine similarity between every pair of movies, wrapped
# in a DataFrame labelled by title on both axes.
from sklearn.metrics.pairwise import cosine_similarity
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)
item_sim_df = pd.DataFrame(data=item_sim,
                           index=ratings_matrix.columns, columns=ratings_matrix.columns)
print(item_sim_df.shape)
item_sim_df.head()
# Which movies are similar to "The Godfather"?
item_sim_df['Godfather, The (1972)'].sort_values(ascending=False)[:6]
# Which movies are similar to "Inception"? The slice starts at 1 to skip the
# top-ranked entry.
item_sim_df['Inception (2010)'].sort_values(ascending=False)[1:6]
# Check the data directory.
import os
import numpy as np
import pandas as pd
print(os.listdir('./good book recommendation/'))
# books.csv
# Original note on the ratings 1-5 columns: the number of people who gave
# each star rating.
books = pd.read_csv('./good book recommendation/books.csv', encoding='ISO-8859-1')
books.head()
# ratings.csv
# Original note: user_id and the rating that user_id gave.
ratings = pd.read_csv('./good book recommendation/ratings.csv', encoding='ISO-8859-1')
ratings.head()
# book_tags.csv
# Original note: book_id and tag_id.
book_tags = pd.read_csv('./good book recommendation/book_tags.csv', encoding='ISO-8859-1')
book_tags.head()
# tags.csv
# Original note: tag_id and tag_name.
tags = pd.read_csv('./good book recommendation/tags.csv', encoding='ISO-8859-1')
tags.head()
# Merge book_tags with tags so every tag row carries its tag name.
tags_join_DF = book_tags.merge(tags, left_on='tag_id', right_on='tag_id', how='inner')
tags_join_DF.head()
# to_read.csv
# Original note: the book_id values associated with each user_id's
# reading list.
to_read = pd.read_csv('./good book recommendation/to_read.csv')
to_read.head()
# The authors column of books.
books['authors'][:5]
# Run TF-IDF on the authors.
# TF-IDF is a weight used in information retrieval and text mining: a
# statistic that expresses how important a word is within a particular
# document, given a collection made up of many documents.
# https://wikidocs.net/31698
from sklearn.feature_extraction.text import TfidfVectorizer
# min_df=1 keeps every term that occurs in at least one document, which is
# what the previous integer min_df=0 amounted to; newer scikit-learn releases
# validate this parameter and reject an integer below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(books['authors'])
tfidf_matrix
# Measure similarity between every pair of rows of the TF-IDF matrix.
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim
# Original note: the index of 'The Hobbit' is 6.
titles = books['title']
# Title -> row index lookup table.
indices = pd.Series(books.index, index=books['title'])
indices['The Hobbit']
# Fetch the similarity values for 'The Hobbit'.
cosine_sim[indices['The Hobbit']]
# Turn the similarity result into a list of (index, score) pairs.
cosine_sim[indices['The Hobbit']].shape
list(enumerate(cosine_sim[indices['The Hobbit']]))
# Find the indexes of the most similar books by sorting the pairs by score,
# highest first.
sim_scores = list(enumerate(cosine_sim[indices['The Hobbit']]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores[:3]
# Look up similar books by author: skip the top-ranked entry and keep the
# next ten.
sim_scores = sim_scores[1:11]
book_indices = [i[0] for i in sim_scores]
titles.iloc[book_indices]
# Attach the tags to the books.
books_with_tags = pd.merge(books, tags_join_DF,
                           left_on='book_id', right_on='goodreads_book_id', how='inner')
books_with_tags.head()
# TF-IDF on the tags.
# min_df=1 keeps every term that occurs in at least one document, which is
# what the previous integer min_df=0 amounted to; newer scikit-learn releases
# validate this parameter and reject an integer below 1.
tf1 = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
# NOTE(review): only the first 10000 rows of books_with_tags are vectorized,
# and each of those rows is one (book, tag) pair, so cosine_sim1 is a
# 10000 x 10000 matrix over tag rows rather than over books. Confirm this is
# what the lookup through indices1 below is meant to use.
tfidf_matrix1 = tf1.fit_transform(books_with_tags['tag_name'].head(10000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)
cosine_sim1
# Lookup tables for the function that returns recommended books.
titles1 = books['title']
indices1 = pd.Series(books.index, index=books['title'])
def tags_recommendations(title):
    """Return the titles of the ten entries ranked most similar by tags.

    Reads the module-level ``indices1`` (title -> row index), ``cosine_sim1``
    (similarity matrix) and ``titles1`` (title Series).

    Args:
        title: Exact book title; must be present in ``indices1``.

    Returns:
        Series with up to ten titles, highest similarity first, excluding the
        queried row itself.

    Raises:
        KeyError: If ``title`` is not in ``indices1``.
    """
    # NOTE(review): cosine_sim1 is computed from the first 10000 rows of
    # books_with_tags, so its row positions may not line up with the book
    # positions stored in indices1 - verify before relying on the result.
    idx = indices1[title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    sim_scores = sorted(enumerate(cosine_sim1[idx]), key=lambda x: x[1], reverse=True)
    # Exclude the queried row explicitly: when other rows tie with it for the
    # top score it is not guaranteed to be the first entry.
    book_indices = [i for i, _ in sim_scores if i != idx][:10]
    # titles1 is the table defined together with indices1; the global
    # `titles` is rebound later in the file.
    return titles1.iloc[book_indices]
# Books similar to 'The Hobbit' according to the tags.
tags_recommendations('The Hobbit').head(20)
# Temporarily collect, for every book id, all of its tags in one string.
temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()
# Merge the joined tags into books.
books = pd.merge(books, temp_df, left_on='book_id', right_on='book_id', how='inner')
books.head()
pd.Series(books[['authors', 'tag_name']].fillna('').values.tolist())
# Combine the author names and the tags into one text per book.
books['corpus'] = (pd.Series(books[['authors', 'tag_name']].fillna('').values.tolist()).str.join(' '))
books['corpus'][:3]
# Run TF-IDF on the combined text.
# min_df=1 keeps every term that occurs in at least one document, which is
# what the previous integer min_df=0 amounted to; newer scikit-learn releases
# validate this parameter and reject an integer below 1.
tf_corpus = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix_corpus = tf_corpus.fit_transform(books['corpus'])
cosine_sim_corpus = linear_kernel(tfidf_matrix_corpus, tfidf_matrix_corpus)
# Rebuild the lookup tables from the merged books frame so they match the
# rows of cosine_sim_corpus.
titles = books['title']
indices = pd.Series(books.index, index=books['title'])
# Build the recommendation function.
def corpus_recommendations(title):
    """Return the ten books most similar by combined author and tag text.

    Reads the module-level ``indices`` (title -> row index),
    ``cosine_sim_corpus`` (similarity matrix) and ``titles`` (title Series),
    all of which are built from the merged ``books`` frame.

    Args:
        title: Exact book title; must be present in ``indices``.

    Returns:
        Series with up to ten titles, highest similarity first, excluding the
        queried book itself.

    Raises:
        KeyError: If ``title`` is not in ``indices``.
    """
    # Use `indices`, which is rebuilt from the same merged frame that
    # cosine_sim_corpus is computed from; `indices1` predates that merge.
    idx = indices[title]
    # A title that occurs more than once yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    sim_scores = sorted(enumerate(cosine_sim_corpus[idx]), key=lambda x: x[1], reverse=True)
    # Exclude the queried book explicitly: when other books tie with it for
    # the top score it is not guaranteed to be the first entry.
    book_indices = [i for i, _ in sim_scores if i != idx][:10]
    return titles.iloc[book_indices]
# Books similar to "The Hobbit".
corpus_recommendations(title='The Hobbit')
# Books similar to "Twilight".
corpus_recommendations(title='Twilight (Twilight, #1)')
# Books similar to "Romeo and Juliet".
corpus_recommendations(title='Romeo and Juliet')
💻 출처 : 제로베이스 데이터 취업 스쿨