from gensim.summarization.summarizer import summarize
# Sample input for the extractive summarizer: A. A. Milne's poem "Rice Pudding".
# The poem is stored without the doctest continuation prompts ("... ") that a
# pasted interpreter session carries, so the summarizer only sees the verse.
text = '''Rice Pudding - Poem by Alan Alexander Milne
What is the matter with Mary Jane?
She's crying with all her might and main,
And she won't eat her dinner - rice pudding again -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
I've promised her dolls and a daisy-chain,
And a book about animals - all in vain -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
She's perfectly well, and she hasn't a pain;
But, look at her, now she's beginning again! -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
I've promised her sweets and a ride in the train,
And I've begged her to stop for a bit and explain -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
She's perfectly well and she hasn't a pain,
And it's lovely rice pudding for dinner again!
What is the matter with Mary Jane?'''

# NOTE(review): `gensim.summarization` is reportedly absent from gensim >= 4.0;
# confirm the installed gensim version before relying on this call.
print(summarize(text))
And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!
[15-2] 비지도 학습을 이용한 음식 리뷰 추출 요약
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
from nltk.tokenize import sent_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim.models.keyedvectors as word2vec
import gc
import string
import nltk
# Download the 'punkt' resource from NLTK (presumably what sent_tokenize
# needs -- confirm against the NLTK version in use).
nltk.download('punkt')

from google.colab import files
import io

# Ask for Reviews.csv through the Colab upload dialog; the returned mapping
# is indexed by file name and its value is wrapped in a BytesIO for pandas.
uploaded = files.upload()
reviews_csv = io.BytesIO(uploaded['Reviews.csv'])
df = pd.read_csv(reviews_csv)
df.head(3)
Id ProductId UserId ProfileName HelpfulnessNumerator HelpfulnessDenominator Score Time Summary Text
0 1 B001E4KFG0 A3SGXH7AUHU8GW delmartian 1 1 5 1303862400 Good Quality Dog Food I have bought several of the Vitality canned d...
1 2 B00813GRG4 A1D87F6ZCVE5NK dll pa 0 0 1 1346976000 Not as Advertised Product arrived labeled as Jumbo Salted Peanut...
2 3 B000LQOCH0 ABXLMWJIXXAIN Natalia Corres "Natalia Corres" 1 1 4 1219017600 "Delight" says it all This is a confection that has been around a fe...
def split_sentences(reviews, tokenize=None):
    """Replace every review in ``reviews`` with its list of sentences, in place.

    Each review string is split with ``tokenize``; every resulting sentence is
    stripped of surrounding whitespace, and sentences that are empty *after*
    stripping are dropped.

    Args:
        reviews: Mutable sequence of review strings. It is modified in place:
            element ``i`` becomes the list of sentences of review ``i``.
        tokenize: Optional callable mapping a string to a list of sentence
            strings. When omitted, the module-level ``sent_tokenize`` is used.

    Returns:
        None. The result is delivered through the mutation of ``reviews``.
    """
    if tokenize is None:
        tokenize = sent_tokenize
    for i, review in enumerate(reviews):
        stripped = (sentence.strip() for sentence in tokenize(review))
        # Filter on the stripped text so whitespace-only sentences are removed.
        reviews[i] = [sentence for sentence in stripped if sentence]
# Sentence-tokenize every review text; split_sentences rewrites the list in
# place, so rev_list holds one list of sentences per review afterwards.
rev_list = df['Text'].tolist()
split_sentences(rev_list)
df['sent_tokens'] = rev_list

# Keep only the reviews that contain more than `choice_length` sentences.
choice_length = 5
df['length_of_rv'] = df['sent_tokens'].map(len)
has_enough_sentences = df['length_of_rv'] > choice_length
df = df[has_enough_sentences]
df.shape
(172765, 12)
# Settings handed to the Keras tokenizer (num_words) and to pad_sequences
# (maxlen) below.
max_features = 5000
maxlen = 200

# Fit the tokenizer on the raw review texts, convert them to integer
# sequences, and pad the sequences to `maxlen`.
list_sentences_train = df['Text']
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
def loadEmbeddingMatrix(typeToLoad):
    """Load pre-trained word vectors into a ``{word: vector}`` dictionary.

    Args:
        typeToLoad: ``"glove"`` (reads ``glove.twitter.27B.25d.txt``) or
            ``"fasttext"`` (reads ``wiki.simple.vec/wiki.simple.vec``).

    Returns:
        dict mapping the first whitespace-separated token of every line to a
        float32 NumPy array built from the remaining tokens of that line.

    Raises:
        ValueError: if ``typeToLoad`` is neither ``"glove"`` nor ``"fasttext"``.
    """
    if typeToLoad == "glove":
        # Prompt for the GloVe file in Colab. The file is then opened by name,
        # which presumably relies on the upload landing in the working
        # directory -- confirm outside Colab.
        files.upload()
        embedding_file = 'glove.twitter.27B.25d.txt'
    elif typeToLoad == "fasttext":
        embedding_file = 'wiki.simple.vec/wiki.simple.vec'
    else:
        raise ValueError(
            "Unsupported embedding type %r; expected 'glove' or 'fasttext'."
            % (typeToLoad,)
        )

    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(embedding_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Blank line: nothing to index.
                continue
            embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Loaded %s word vectors.' % len(embeddings_index))
    gc.collect()
    return embeddings_index
# Load the GloVe vectors once at module level; calculate_sentence_embedding
# looks words up in this global dictionary.
emb_index= loadEmbeddingMatrix('glove')
Loaded 1193514 word vectors.
def calculate_sentence_embedding(wordList, embed_size=25):
    """Return the sentence embedding as the mean of its word embeddings.

    Words are looked up in the module-level ``emb_index`` dictionary. Words
    that are missing, or whose vector does not have ``embed_size`` entries,
    are ignored.

    Args:
        wordList: Iterable of word strings making up the sentence.
        embed_size: Expected length of each word vector (default 25).

    Returns:
        1-D NumPy array of length ``embed_size``: the element-wise mean of the
        usable word vectors, or an all-zero vector when no word is usable
        (instead of the NaN scalar that averaging an empty array produces).
    """
    vectors = []
    for word in wordList:
        vector = emb_index.get(word)
        if vector is not None and len(vector) == embed_size:
            vectors.append(vector)
    if not vectors:
        # Nothing to average: keep the return shape stable for callers.
        return np.zeros(embed_size, dtype='float32')
    return np.mean(np.array(vectors), axis=0)
def get_sent_embedding(mylist):
    """Compute one embedding per usable sentence of a review.

    Every sentence is lower-cased, all non-word characters are replaced by
    spaces, and the result is split into words. A sentence with no words at
    all is reported on stdout and skipped; a sentence with two or fewer words
    left is skipped silently.

    Args:
        mylist: Iterable of sentence strings.

    Returns:
        NumPy array with one row per *kept* sentence, in input order. Because
        skipped sentences produce no row, row ``r`` does not necessarily
        correspond to ``mylist[r]``.
    """
    sent_emb = []
    for sentence in mylist:
        lowered = sentence.lower()
        words = re.sub(r"[^\w]", " ", lowered).split()
        if not words:
            print("Sentence Removed: ", lowered)
            continue
        # Build a new list rather than removing while iterating, which would
        # skip the neighbour of every removed token. After the substitution
        # only word characters remain, so this drops bare "_" tokens.
        words = [word for word in words if word not in string.punctuation]
        if len(words) <= 2:
            continue
        sent_emb.append(list(calculate_sentence_embedding(words)))
    return np.array(sent_emb)
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
# Extractive summary per review: embed the sentences, cluster them with
# k-means (k = ceil(sqrt(#sentences))), pick the sentence nearest to each
# cluster centre, and emit the picks ordered by the mean position of their
# cluster's members.
how_many_summaries = 5000
summary = [None]*how_many_summaries
for rv in range(how_many_summaries):
    review = df['sent_tokens'].iloc[rv]

    # get_sent_embedding drops short sentences, so embed one sentence at a
    # time and remember which sentences actually produced a vector. This keeps
    # row r of enc_email aligned with kept_sentences[r].
    kept_sentences = []
    kept_vectors = []
    for sentence in review:
        vectors = get_sent_embedding([sentence])
        if len(vectors) > 0:
            kept_sentences.append(sentence)
            kept_vectors.append(vectors[0])

    if not kept_sentences:
        print("This is not a valid review")
        continue

    enc_email = np.array(kept_vectors)
    n_clusters = int(np.ceil(len(enc_email)**0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans = kmeans.fit(enc_email)

    # Mean row index of each cluster's members, used to order the summary.
    avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]
    # For every cluster centre, the row index of its nearest sentence vector.
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                               enc_email)
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    summary[rv] = ' '.join([kept_sentences[closest[idx]] for idx in ordering])
    print("Done for review # = ", rv)
Done for review
Done for review
Done for review
# Take the reviews that were summarised (one per entry of `summary`) as an
# independent copy, so that adding a column does not write into a slice of df.
df_5000 = df.iloc[:len(summary)].copy()
df_5000['PredictedSummary'] = summary

# Write text and predicted summary to CSV, then read the file back to inspect
# it. The DataFrame index is written too, so the reloaded frame has an extra
# leading column.
df_5000[['Text', 'PredictedSummary']].to_csv('top_5000_summary.csv')
df_result = pd.read_csv('top_5000_summary.csv')
df_result.head(3)
Unnamed: 0 Text PredictedSummary
0 2 This is a confection that has been around a fe... This is a confection that has been around a fe...
1 7 This taffy is so good. It is very soft and ch... This taffy is so good. The flavors are amazing.
2 30 I have never been a huge coffee fan. However, ... ).<br />The little Dolche Guesto Machine is su...