import requests
from bs4 import BeautifulSoup
url = 'URL of the article to fetch'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
eng_news = soup.select('p')
eng_text = eng_news[3].get_text()
eng_text
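The paragraph index `eng_news[3]` depends entirely on the page's markup, so it will vary by article. A minimal, slightly more defensive sketch of the same fetch (same assumed URL):
import requests
from bs4 import BeautifulSoup

response = requests.get(url)
response.raise_for_status()   # fail fast on HTTP errors
soup = BeautifulSoup(response.text, 'html.parser')
paragraphs = [p.get_text(strip=True) for p in soup.select('p')]
eng_text = paragraphs[3]      # index 3 is specific to this page layout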
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('omw-1.4')
import nltk
from nltk.tokenize import word_tokenize
token1 = word_tokenize(eng_text)
print(token1)
=> ['It', 'is', 'the', 'present-day', 'darling',
'of', 'the', 'tech']
import nltk
from nltk.tokenize import WordPunctTokenizer
wordpuncttoken = WordPunctTokenizer().tokenize(eng_text)
print(wordpuncttoken)
=> ['It', 'is', 'the', 'present', '-', 'day', 'darling',
'of', 'the', 'tech']
import nltk
from nltk.tokenize import TreebankWordTokenizer
treebankwordtoken = TreebankWordTokenizer().tokenize(eng_text)
print(treebankwordtoken)
=> ['It', 'is', 'the', 'present-day', 'darling',
'of', 'the', 'tech']
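The difference between the three tokenizers shows up around punctuation: WordPunctTokenizer splits on every punctuation character, while word_tokenize and TreebankWordTokenizer keep hyphenated words intact and split contractions. A quick check (the sample sentence here is ours, not from the article):
from nltk.tokenize import word_tokenize, WordPunctTokenizer, TreebankWordTokenizer
sample = "Don't split present-day."
print(word_tokenize(sample))                    # ['Do', "n't", 'split', 'present-day', '.']
print(WordPunctTokenizer().tokenize(sample))    # ['Don', "'", 't', 'split', 'present', '-', 'day', '.']
print(TreebankWordTokenizer().tokenize(sample)) # ['Do', "n't", 'split', 'present-day', '.']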
Attach a part-of-speech (POS) tag to each token: https://www.nltk.org/api/nltk.tag.html
Tag list: https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
from nltk import pos_tag
nltk.download('averaged_perceptron_tagger')
word_tokens = word_tokenize("James is working at Disney in London")
taggedToken = pos_tag(word_tokens)
print(taggedToken)
=> [('James', 'NNP'), ('is', 'VBZ'), ('working', 'VBG'),
('at', 'IN'), ('Disney', 'NNP'),
('in', 'IN'), ('London', 'NNP')]
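If any tag code is unfamiliar, NLTK can print its definition (requires the 'tagsets' data package):
import nltk
nltk.download('tagsets')
nltk.help.upenn_tagset('NNP')   # NNP: noun, proper, singular ...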
Chunking/NER API: http://www.nltk.org/api/nltk.chunk.html
nltk.download('words')
nltk.download('maxent_ne_chunker')
from nltk import ne_chunk
neToken = ne_chunk(taggedToken)
print(neToken)
=> (S
(PERSON James/NNP)
is/VBZ
working/VBG
at/IN
(ORGANIZATION Disney/NNP)
in/IN
(GPE London/NNP))
James is tagged PERSON,
Disney is ORGANIZATION,
London is GPE (geo-political entity, i.e. a location).
Rule details: http://www.nltk.org/api/nltk.chunk.html
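ne_chunk returns an nltk.Tree; one common way to pull out just the entities is to walk its subtrees (a minimal sketch):
for subtree in neToken.subtrees():
    if subtree.label() != 'S':   # skip the root sentence node
        entity = " ".join(token for token, tag in subtree.leaves())
        print(subtree.label(), '->', entity)
# => PERSON -> James
#    ORGANIZATION -> Disney
#    GPE -> London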
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print("running -> " + ps.stem("running"))
print("believes -> "+ps.stem('believes'))
print('using ->' + ps.stem("using"))
print("conversation ->" + ps.stem('conversation'))
print('organization ->'+ ps.stem('organization'))
print('studies -> '+ ps.stem("studies"))
=>
running -> run
believes -> believ
using -> use
conversation -> convers
organization -> organ
studies -> studi
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
print("running -> " + wl.lemmatize("running"))
print("believes -> "+wl.lemmatize('believes'))
print('using ->' + wl.lemmatize("using"))
print("conversation ->" + wl.lemmatize('conversation'))
print('organization ->'+ wl.lemmatize('organization'))
print('studies -> '+ wl.lemmatize("studies"))
=>
running -> running
believes -> belief
using -> using
conversation -> conversation
organization -> organization
studies -> study
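WordNetLemmatizer assumes every word is a noun unless told otherwise, which is why "running" and "using" came back unchanged; passing the part of speech fixes this:
print(wl.lemmatize("running", pos="v"))   # run
print(wl.lemmatize("using", pos="v"))     # use
print(wl.lemmatize("believes", pos="v"))  # believe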
stopPos = ["IN", "CC", "UH", "TO", "MD", "DT", "VBZ", "VBP"]
from collections import Counter
taggedToken = pos_tag(token1)   # re-tag the article tokens from above
Counter(taggedToken).most_common()
=>
[(('its', 'PRP$'), 5),
(('of', 'IN'), 3),
(('the', 'DT'), 2),
(('.', '.'), 2),
(('(', '('), 2),
(('AI', 'NNP'), 2),
((')', ')'), 2),
(('It', 'PRP'), 1),
(('is', 'VBZ'), 1),
(('present-day', 'JJ'), 1),
(('darling', 'NN'), 1),
(('tech', 'JJ'), 1)]
stopWord = [",", "be", "able"]
word = []
for tag in taggedToken:
    if tag[1] not in stopPos:
        if tag[0] not in stopWord:
            word.append(tag[0])
print(word)
=>
['It', 'present-day', 'darling', 'tech',
'world', '.', 'current', ...]
import nltk
nltk.download('averaged_perceptron_tagger') #pos tagging
nltk.download('words') #NER
nltk.download('maxent_ne_chunker') #NER
nltk.download('wordnet') #Lemmatization
from nltk.tokenize import TreebankWordTokenizer
token = TreebankWordTokenizer().tokenize("Obama loves fried chicken of KFC")
print('token:', token)
from nltk import pos_tag
TaggedToken = pos_tag(token)
print('tagged token:', TaggedToken)
from nltk.stem import PorterStemmer
ps = PorterStemmer()
print("loves -> " + ps.stem("loves"))
print("fried => " + ps.stem('fried'))
from nltk.stem import WordNetLemmatizer
wl = WordNetLemmatizer()
print("loves -> " + wl.lemmatize("loves"))
print("fried => " + wl.lemmatize('fried'))
# Stop-word removal
StopPos = ['IN']
StopWord = ["fried"]
word = []
for tag in TaggedToken:
    if tag[1] not in StopPos:
        if tag[0] not in StopWord:
            word.append(wl.lemmatize(tag[0]))
print(word)
=>
token: ['Obama', 'loves', 'fried', 'chicken', 'of', 'KFC']
tagged token: [('Obama', 'NNP'), ('loves', 'VBZ'),
('fried', 'VBN'), ('chicken', 'NN'),
('of', 'IN'), ('KFC', 'NNP')]
loves -> love
fried -> fri
loves -> love
fried -> fried
['Obama', 'love', 'chicken', 'KFC']
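For reuse, the steps above can be folded into a single function; a minimal sketch under the same assumptions (the default stop lists are just the ones used in this example):
from nltk import pos_tag
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer

def preprocess(text, stop_pos=("IN",), stop_words=("fried",)):
    """Tokenize, POS-tag, filter stop POS/words, then lemmatize."""
    tokens = TreebankWordTokenizer().tokenize(text)
    wl = WordNetLemmatizer()
    return [wl.lemmatize(tok) for tok, tag in pos_tag(tokens)
            if tag not in stop_pos and tok not in stop_words]

print(preprocess("Obama loves fried chicken of KFC"))
# => ['Obama', 'love', 'chicken', 'KFC']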