Text summarization

Jane's study note · November 30, 2022

Series: NLP (Natural Language Processing)


[15-1] Extractive Summarization with TextRank

# Note: gensim.summarization was removed in gensim 4.0, so this requires gensim < 4.0 (e.g. 3.8.x).
from gensim.summarization.summarizer import summarize

text = '''Rice Pudding - Poem by Alan Alexander Milne
What is the matter with Mary Jane?
She's crying with all her might and main,
And she won't eat her dinner - rice pudding again -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
I've promised her dolls and a daisy-chain,
And a book about animals - all in vain -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
She's perfectly well, and she hasn't a pain;
But, look at her, now she's beginning again! -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
I've promised her sweets and a ride in the train,
And I've begged her to stop for a bit and explain -
What is the matter with Mary Jane?
What is the matter with Mary Jane?
She's perfectly well and she hasn't a pain,
And it's lovely rice pudding for dinner again!
What is the matter with Mary Jane?'''

print(summarize(text))
And she won't eat her dinner - rice pudding again -
I've promised her dolls and a daisy-chain,
I've promised her sweets and a ride in the train,
And it's lovely rice pudding for dinner again!
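
Besides the default (roughly the top 20% of sentences), summarize also exposes ratio, word_count, and split parameters in gensim 3.x to control summary length and output type:

print(summarize(text, ratio=0.5))      # keep roughly half of the scored sentences
print(summarize(text, word_count=25))  # cap the summary at about 25 words
print(summarize(text, split=True))     # return a list of sentences instead of one string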

[15-2] Extractive Summarization of Food Reviews with Unsupervised Learning

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import re
from nltk.tokenize import sent_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gensim.models.keyedvectors as word2vec
import gc
import string
import nltk
nltk.download('punkt')

from google.colab import files
uploaded = files.upload()

import io
df = pd.read_csv(io.BytesIO(uploaded['Reviews.csv']))
# The dataset is now stored in a pandas DataFrame

df.head(3)

	Id	ProductId	UserId	ProfileName	HelpfulnessNumerator	HelpfulnessDenominator	Score	Time	Summary	Text
0	1	B001E4KFG0	A3SGXH7AUHU8GW	delmartian	1	1	5	1303862400	Good Quality Dog Food	I have bought several of the Vitality canned d...
1	2	B00813GRG4	A1D87F6ZCVE5NK	dll pa	0	0	1	1346976000	Not as Advertised	Product arrived labeled as Jumbo Salted Peanut...
2	3	B000LQOCH0	ABXLMWJIXXAIN	Natalia Corres "Natalia Corres"	1	1	4	1219017600	"Delight" says it all	This is a confection that has been around a fe...

def split_sentences(reviews):
    """
    Splits each review (in place) into a list of stripped sentences,
    dropping any that are empty.
    """
    n_reviews = len(reviews)
    for i in range(n_reviews):
        sentences = sent_tokenize(reviews[i])
        # Iterate in reverse so popping does not shift the remaining indices.
        for j in reversed(range(len(sentences))):
            sentences[j] = sentences[j].strip()
            if sentences[j] == '':
                sentences.pop(j)
        reviews[i] = sentences

rev_list = list(df['Text'])
split_sentences(rev_list)
# Add the sentence-split reviews to the data frame
df['sent_tokens'] = rev_list
# Count the number of sentences in each review
df['length_of_rv'] = df['sent_tokens'].map(len)

# Keep only reviews with more than `choice_length` sentences.
choice_length = 5
df = df[df['length_of_rv'] > choice_length]
df.shape
(172765, 12)
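
The cutoff of 5 sentences is a judgment call. A quick look at the length distribution can sanity-check it; a minimal sketch using the matplotlib import above (run it before the filtering step to see the full distribution):

df['length_of_rv'].describe()          # min / quartiles / max sentences per review
df['length_of_rv'].hist(bins=50)
plt.xlabel('sentences per review')
plt.show()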

# Build a vocabulary over the reviews, keeping the 5,000 most frequent words.
list_sentences_train = df['Text']
max_features = 5000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
maxlen = 200
# Pad/truncate every tokenized review to 200 tokens. (X_t is not used again
# below; this is the standard Keras preprocessing step, kept for reference.)
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
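
A quick way to sanity-check what the tokenizer learned is its word_index attribute (the printed values depend on the data):

print(len(tokenizer.word_index))               # distinct words seen while fitting
print(list(tokenizer.word_index.items())[:5])  # most frequent words get the smallest indices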

def loadEmbeddingMatrix(typeToLoad):
    # Load a different pretrained embedding file depending on which
    # embedding we want to experiment with.
    if typeToLoad == "glove":
        # files.upload() saves the uploaded file to the local Colab disk,
        # so it can simply be opened by name afterwards.
        uploaded_glove = files.upload()
        EMBEDDING_FILE = 'glove.twitter.27B.25d.txt'
        embed_size = 25
    elif typeToLoad == "fasttext":
        EMBEDDING_FILE = 'wiki.simple.vec/wiki.simple.vec'
        embed_size = 300

    if typeToLoad == "glove" or typeToLoad == "fasttext":
        embeddings_index = dict()
        # Transfer the embedding weights into a dictionary by iterating
        # through every line of the file.
        f = open(EMBEDDING_FILE, encoding='utf-8')
        for line in f:
            values = line.split()
            # The first token is the word; the rest are its vector components.
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs  # embed_size dimensions
        f.close()
        print('Loaded %s word vectors.' % len(embeddings_index))

    gc.collect()
    return embeddings_index
        
## Loading 'glove' words
emb_index = loadEmbeddingMatrix('glove')
Loaded 1193514 word vectors.
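
As an aside, the gensim.models.keyedvectors import above is never used; a hedged sketch of that alternative loading path with gensim's KeyedVectors API is below. Note that no_header requires gensim >= 4.0, while the summarize example in [15-1] needs gensim < 4.0, so the two cannot run in the same environment.

from gensim.models import KeyedVectors

# fastText .vec files are already in word2vec text format:
ft = KeyedVectors.load_word2vec_format('wiki.simple.vec/wiki.simple.vec')

# GloVe files lack the word2vec header line; gensim >= 4.0 can skip it:
glove = KeyedVectors.load_word2vec_format('glove.twitter.27B.25d.txt', no_header=True)
print(glove['coffee'].shape)  # (25,)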

def calculate_sentence_embedding(wordList):
    """
    Calculates the embedding of an entire sentence as the plain mean of the
    embeddings of its words. (A simple baseline; see the weighted variant below.)
    """
    emb_li = []
    for k in wordList:
        embedding_vector = emb_index.get(k)
        # Skip out-of-vocabulary words and malformed rows.
        if embedding_vector is not None and len(embedding_vector) == 25:
            emb_li.append(list(embedding_vector))
    if not emb_li:
        return None  # every word was out of vocabulary
    return np.mean(np.array(emb_li), axis=0)
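
One way to act on the docstring's "to be improved" note is to weight words instead of taking a plain mean. A hedged sketch of a TF-IDF-weighted variant; weighted_sentence_embedding and the IDF weighting scheme are my additions, not part of the original notebook:

from sklearn.feature_extraction.text import TfidfVectorizer

# Fit IDF weights once on the raw review texts.
tfidf = TfidfVectorizer()
tfidf.fit(df['Text'])
idf_weight = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

def weighted_sentence_embedding(wordList):
    """Mean of word vectors, weighted by IDF so rare words count for more."""
    vecs, weights = [], []
    for k in wordList:
        v = emb_index.get(k)
        if v is not None and len(v) == 25:
            vecs.append(v)
            weights.append(idf_weight.get(k, 1.0))  # default weight for unseen words
    if not vecs:
        return None
    return np.average(np.array(vecs), axis=0, weights=weights)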


def get_sent_embedding(mylist):
    """
    Calculates the embedding of each sentence in a review, skipping sentences
    that are empty or too short once punctuation and emojis are stripped.
    Returns the embeddings together with the sentences that were kept, so that
    embedding row indices line up with actual sentences.
    """
    sent_emb = []
    kept = []
    for sent in mylist:
        lowered = sent.lower()
        wL = re.sub(r"[^\w]", " ", lowered).split()
        if len(wL) == 0:
            print("Sentence Removed: ", sent)
            continue
        # Filter punctuation tokens without mutating the list mid-iteration.
        wL = [k for k in wL if k not in string.punctuation]
        if len(wL) <= 2:
            continue
        res = calculate_sentence_embedding(wL)
        if res is None:
            continue  # every word was out of vocabulary
        sent_emb.append(list(res))
        kept.append(sent)
    return np.array(sent_emb), kept
    
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min

## Calculating summaries for the first 5000 reviews.
how_many_summaries = 5000
summary = [None] * how_many_summaries
for rv in range(how_many_summaries):
    review = df['sent_tokens'].iloc[rv]
    enc_review, kept_sents = get_sent_embedding(review)
    if len(enc_review) > 0:
        # Heuristic: sqrt(#sentences) clusters, so longer reviews get
        # proportionally more summary sentences.
        n_clusters = int(np.ceil(len(enc_review)**0.5))
        kmeans = KMeans(n_clusters=n_clusters, random_state=0)
        kmeans = kmeans.fit(enc_review)
        # Mean position of each cluster's sentences, used to order the
        # summary sentences by where they occur in the review.
        avg = []
        for j in range(n_clusters):
            idx = np.where(kmeans.labels_ == j)[0]
            avg.append(np.mean(idx))
        # The sentence closest to each cluster centre is its representative.
        closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_,
                                                   enc_review)
        ordering = sorted(range(n_clusters), key=lambda k: avg[k])
        # Index into kept_sents (not review) so indices match the embeddings.
        summary[rv] = ' '.join([kept_sents[closest[idx]] for idx in ordering])
        print("Done for review # = ", rv)
    else:
        print("This is not a valid review")

Done for review # =  4997
Done for review # =  4998
Done for review # =  4999
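
For reuse, the per-review logic above can be folded into one helper; a minimal sketch built from the functions already defined (summarize_review is a hypothetical name, not from the original notebook):

def summarize_review(sentences):
    """Extractive summary: one representative sentence per KMeans cluster,
    ordered by where each cluster occurs in the review."""
    enc, kept = get_sent_embedding(sentences)
    if len(enc) == 0:
        return None
    n_clusters = int(np.ceil(len(enc)**0.5))
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(enc)
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, enc)
    avg = [np.mean(np.where(kmeans.labels_ == j)[0]) for j in range(n_clusters)]
    ordering = sorted(range(n_clusters), key=lambda k: avg[k])
    return ' '.join(kept[closest[idx]] for idx in ordering)

print(summarize_review(df['sent_tokens'].iloc[0]))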

df_5000 = df.iloc[:5000].copy()  # .copy() avoids pandas' SettingWithCopyWarning
df_5000['PredictedSummary'] = summary
df_5000[['Text', 'PredictedSummary']].to_csv('top_5000_summary.csv')


df_result = pd.read_csv('top_5000_summary.csv')
df_result.head(3)

	Unnamed: 0	Text	PredictedSummary
0	2	This is a confection that has been around a fe...	This is a confection that has been around a fe...
1	7	This taffy is so good. It is very soft and ch...	This taffy is so good. The flavors are amazing.
2	30	I have never been a huge coffee fan. However, ...	).<br />The little Dolche Guesto Machine is su...
