의미분석

Jane의 study note.·2022년 11월 30일
0

NLP 자연어처리

목록 보기
4/24

[7-1] Lesk 알고리즘을 이용한 단어 중의성 해소

!pip3 install nltk==3.3

import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import wordnet 
from nltk import word_tokenize
from nltk.corpus import stopwords
import sys

# 단어와 문장에 나타난 단어에 대해 Best Sense 추출
def disambiguate(word, sentence, stopwords):
    """Choose the WordNet sense of ``word`` that best matches ``sentence``.

    Simplified Lesk: each candidate sense is scored by how many of its
    non-stop-word gloss tokens also occur among the tokens of ``sentence``.

    Args:
        word: The ambiguous word to look up in WordNet.
        sentence: The sentence providing the context for ``word``.
        stopwords: Collection of tokens to ignore when scoring.

    Returns:
        The synset with the highest overlap score. When no sense scores
        above zero, the first synset returned by WordNet is used as the
        fallback (it is assumed to be the most frequent sense). Returns
        ``None`` when WordNet has no synsets for ``word``.
    """
    word_senses = wordnet.synsets(word)
    if not word_senses:
        # Unknown word: nothing to disambiguate (previously an IndexError).
        return None

    context = set(word_tokenize(sentence))

    # max() keeps the first item among equal scores, so ties -- including
    # the all-zero case -- resolve to the earliest sense in WordNet's order.
    return max(
        word_senses,
        key=lambda sense: compute_overlap(
            tokenized_gloss(sense), context, stopwords
        ),
    )

# sense의 definition에 대한 모든 token 추출
def tokenized_gloss(sense):
    """Return the set of tokens in a sense's definition and example sentences.

    Args:
        sense: An object exposing ``definition()`` (a string) and
            ``examples()`` (an iterable of strings), e.g. a WordNet synset.

    Returns:
        A set containing every token produced by ``word_tokenize`` for the
        definition and for each example.
    """
    tokens = set(word_tokenize(sense.definition()))
    for example in sense.examples():
        # set.union() returns a new set and leaves `tokens` untouched, so the
        # example tokens were silently dropped; update() merges them in place.
        tokens.update(word_tokenize(example))
    return tokens

# 겹치는 단어 비교
def compute_overlap(signature, context, stopwords):
    """Count the gloss tokens shared with the context, ignoring stop words.

    Args:
        signature: Set of tokens taken from a sense's gloss.
        context: Tokens of the sentence being disambiguated.
        stopwords: Tokens that must not contribute to the score.

    Returns:
        The number of tokens that are in ``signature`` and ``context`` but
        not in ``stopwords``.
    """
    # Find the shared tokens first, then discard the stop words among them.
    shared = signature.intersection(context)
    informative = shared.difference(stopwords)
    return len(informative)
        
# English stop words defined by NLTK (e.g. i, my, they...).
# NOTE: this rebinds the name `stopwords` from the imported corpus reader to a
# plain set, so this line cannot be executed a second time in the same session.
stopwords = set(stopwords.words('english'))
sentence = "They eat a meal"
context = set(word_tokenize(sentence))
word = 'eat'

print("Word :", word)
# Inspect the second WordNet sense of the target word; look it up via `word`
# so the printed sense always matches the word being disambiguated.
syn = wordnet.synsets(word)[1]
print("Sense :", syn.name())
print("Definition :", syn.definition())
print("Sentence :", sentence)

# Show the gloss tokens for that sense and its overlap with the sentence.
signature = tokenized_gloss(syn)
print(signature)
print(compute_overlap(signature, context, stopwords))
# Run the full disambiguation over all senses of the word.
print("Best sense: ", disambiguate(word, sentence, stopwords))

Word : eat
Sense : eat.v.02
Definition : eat a meal; take a meal
Sentence : They eat a meal
{';', 'a', 'meal', 'eat', 'take'}
2
Best sense:  Synset('eat.v.02')

0개의 댓글