[10-1] 정보추출에서 관계형 튜플 표현
# Relation triples of the form (entity, relation, location).
locs = [
    ('고려대학교', 'In', '서울'),
    ('Naver', 'In', '성남'),
    ('KT 위즈', 'In', '수원'),
    ('한화 이글스', 'In', '대전'),
    ('NC 소프트', 'In', '성남'),
    ('삼성', 'In', '서울'),
]

# Answer the query "which entities are located in Seoul?" by filtering on
# the third element of each triple; the input order is preserved.
query = [entity for entity, _rel, place in locs if place == '서울']
print(query)
['고려대학교', '삼성']
[10-2] 정규식을 적용한 관계추출 실습 1
import re

import nltk

# Fetch the IEER corpus so nltk.corpus.ieer can be read below.
nltk.download('ieer')

# Filler pattern: any text ending in the whole word "in", rejected by the
# negative lookahead when that "in" is followed by text containing "ing".
IN = re.compile(r'.*\bin\b(?!\b.+ing)')

for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    print("doc : ", doc)
    # Report every ORG ... LOC pair in this document that satisfies IN.
    for rel in nltk.sem.extract_rels('ORG', 'LOC', doc, corpus='ieer', pattern=IN):
        print(nltk.sem.rtuple(rel))
doc : <IEERDocument NYT19980315.0063: 'PUBLIC RADIO HOSTS DROP IN AND MAYBE STAY TOO LONG'>
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
doc : <IEERDocument NYT19980315.0064: 'IN CYBERSPACE, IS THERE LAW WHERE THERE IS NO LAND?'>
[ORG: 'McGlashan & Sarrail'] 'firm in' [LOC: 'San Mateo']
doc : <IEERDocument NYT19980315.0067: 'THE SITES: TUNING INTO MUSIC ON THE WEB'>
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
doc : <IEERDocument NYT19980315.0069: 'ANALYSIS: TAXING INTERNET SALES _ GOVERNORS VS. TAX FREEDOM ACT'>
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
doc : <IEERDocument NYT19980315.0070: 'A SEARCH ENGINE THAT CHARGES FOR TOP BILLING'>
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
doc : <IEERDocument NYT19980315.0071: 'COMING SOON: TV DOCUMENTARIES TO A BOOKSTORE NEAR YOU'>
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
doc : <IEERDocument NYT19980315.0072: 'WEATHER'>
doc : <IEERDocument NYT19980315.0073: 'DICAPRIO , CHARISMATIC STAR, BALKS AT TEEN IDOL IMAGE'>
doc : <IEERDocument NYT19980315.0074: "You've read the book? Now see the television program. Inspired by ``Angela's...">
doc : <IEERDocument NYT19980315.0084: 'A DIRECTOR WHO DARES, AND TAKES THE HEAT'>
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
doc : <IEERDocument NYT19980315.0085: 'ADVERTISING: AFTER 32 YEARS , WELLS BDDP WILL CLOSE'>
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
doc : <IEERDocument NYT19980315.0086: "`THE GIFTS OF THE JEWS': DESERT NOMADS CHANGED THE WORLD WITH THE">
doc : <IEERDocument NYT19980315.0087: 'MUSICIANS ON BROADWAY TO VOTE ON A CONTRACT'>
[10-3] 정규식을 적용한 관계추출 실습 2
import re

import nltk
from nltk.corpus import conll2002

# Fetch the CoNLL-2002 corpus so the 'ned.train' sentences can be read below.
nltk.download('conll2002')

# Verbose-mode pattern over word/tag text: one of the verb tokens
# is/V, was/V, werd/V or wordt/V, then anything, then van/Prep.
# Whitespace and newlines inside the literal are ignored by re.VERBOSE.
vnv = """
(
is/V|
was/V|
werd/V|
wordt/V
)
.*
van/Prep
"""
VAN = re.compile(vnv, re.VERBOSE)

for doc in conll2002.chunked_sents('ned.train'):
    # Report every PER ... ORG pair in this sentence that satisfies VAN.
    for rel in nltk.sem.extract_rels('PER', 'ORG', doc, corpus='conll2002', pattern=VAN):
        print("", nltk.sem.clause(rel, relsym="VAN"))

# NOTE: to inspect a hit, replace the print above with
#     print(nltk.sem.rtuple(rel, lcon=True, rcon=True))
# This shows the actual words between the two named entities together with
# their left and right context (the original note describes this as the
# default 10-word window). With a Dutch dictionary you can then see why the
# result VAN('annie_lennox', 'eurythmics') is a false hit.
VAN("cornet_d'elzius", 'buitenlandse_handel')
VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
VAN('annie_lennox', 'eurythmics')