- data2.csv를 로드하여 데이터프레임 구성
- 한글에 맞게 형태소 분석기를 이용해서 단어를 토큰화한 후 TF-IDF 계산
- 코사인 유사도를 기반으로 '올드보이'와 유사한 영화 3종 추천
import pandas as pd
import numpy as np
! pip install konlpy
from konlpy.tag import Okt
# Load the movie table; the bare expression below is notebook display of the
# first rows together with the table's shape.
data = pd.read_csv('data2.csv')
data.head(), data.shape

# Run the Okt morphological analyzer over every 'content' entry and re-join
# the resulting morphemes with single spaces, giving one string per row.
okt = Okt()
data_l = [' '.join(morphemes) for morphemes in map(okt.morphs, data['content'])]
# Notebook display: the tokenized text of the first row.
data_l[0]
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the space-joined morpheme strings in data_l and
# keep the resulting document-term matrix in tfidf_m (one row per document).
# NOTE(review): the vectorizer is created with default settings; scikit-learn's
# documented default token_pattern keeps only tokens of two or more word
# characters, so single-character morphemes are presumably dropped here —
# confirm this is intended for Korean text.
tfidf_v = TfidfVectorizer()
tfidf_m = tfidf_v.fit_transform(data_l)
# Notebook display only: dense view of the TF-IDF matrix (result is not stored).
tfidf_m.toarray()
from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between all rows of tfidf_m (higher = more similar).
cos = cosine_similarity(tfidf_m, tfidf_m)
from sklearn.metrics import euclidean_distances
# Pairwise Euclidean distance between the same rows (lower = more similar).
eud = euclidean_distances(tfidf_m, tfidf_m)
# Notebook display only: shapes of the two pairwise matrices.
cos.shape, eud.shape
# Notebook display only: inspect the row index of data.
data.index
# Lookup table from movie name to that row's index label in data.
# If a name appears more than once, dict() keeps the last occurrence.
# NOTE(review): ck_s_t uses these values as row positions (matrix row and
# .iloc), which matches the labels only while data keeps the default
# 0..n-1 index from read_csv — confirm data is never re-indexed or filtered.
t_idx = dict(zip(data['name'], data.index ))
# Notebook display only: show the name -> index mapping.
t_idx
def ck_s_t(t, cosine_sim=cos, top_n=5):
    """Return the names of the movies most similar to the movie titled *t*.

    Args:
        t: Movie title; must be a key of the module-level ``t_idx`` mapping.
        cosine_sim: Square similarity matrix in which row ``t_idx[t]`` holds
            the score of that movie against every movie (higher means more
            similar). Defaults to the module-level ``cos`` matrix.
        top_n: Maximum number of recommendations to return (default 5, the
            number the original implementation always returned).

    Returns:
        The ``data['name']`` entries of the ``top_n`` highest-scoring movies,
        best match first. The queried movie itself is never included.

    Raises:
        KeyError: If *t* is not a key of ``t_idx``.
    """
    idx = t_idx[t]
    ranked = sorted(
        enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True
    )
    # Exclude the query by its index instead of assuming it is ranked first:
    # a row with identical content ties with it, and an all-zero TF-IDF row
    # has self-similarity 0, so in both cases the query may not be at rank 0.
    best = [i for i, _ in ranked if i != idx][:top_n]
    return data['name'].iloc[best]
# Notebook display: cosine-similarity recommendations for '올드보이' as a DataFrame.
pd.DataFrame(ck_s_t('올드보이'))
def ck_s_t(t, cosine_sim=eud, top_n=5):
    """Return the names of the movies closest to the movie titled *t*.

    This definition reuses the name ``ck_s_t`` and therefore replaces the
    earlier cosine-similarity version from this point on in the file.

    Args:
        t: Movie title; must be a key of the module-level ``t_idx`` mapping.
        cosine_sim: Square *distance* matrix in which row ``t_idx[t]`` holds
            the distance of that movie to every movie (lower means more
            similar). The parameter keeps its historical name for
            compatibility; it defaults to the module-level ``eud`` matrix.
        top_n: Maximum number of recommendations to return (default 5, the
            number the original implementation always returned).

    Returns:
        The ``data['name']`` entries of the ``top_n`` nearest movies, closest
        first. The queried movie itself is never included.

    Raises:
        KeyError: If *t* is not a key of ``t_idx``.
    """
    idx = t_idx[t]
    # Sort farthest-first and walk the result backwards so that the order
    # among equal distances stays the same as in the original slice-based code.
    ranked = sorted(
        enumerate(cosine_sim[idx]), key=lambda pair: pair[1], reverse=True
    )
    # Exclude the query by its index instead of assuming it is the very last
    # entry: another row at distance 0 (identical content) can take that slot.
    nearest = [i for i, _ in reversed(ranked) if i != idx][:top_n]
    return data['name'].iloc[nearest]
# Notebook display: Euclidean-distance recommendations for '올드보이' as a DataFrame
# (ck_s_t now refers to the redefinition whose default matrix is eud).
pd.DataFrame(ck_s_t('올드보이'))