Information Extraction

Jane의 study note.·2022년 11월 30일
0

NLP 자연어처리

목록 보기
7/24

[10-1] 정보추출에서 관계형 튜플 표현

# Structured data based on the example in [Table 11-1]:
# each entry is an (entity, relation, location) tuple.
locs = [
    ('고려대학교', 'In', '서울'),
    ('Naver', 'In', '성남'),
    ('KT 위즈', 'In', '수원'),
    ('한화 이글스', 'In', '대전'),
    ('NC 소프트', 'In', '성남'),
    ('삼성', 'In', '서울'),
]

# Collect every entity whose location (third field) is '서울' (Seoul).
query = [entity for entity, _relation, place in locs if place == '서울']

print(query)
['고려대학교', '삼성']

[10-2] 정규식을 적용한 관계추출 실습 1

import re
import nltk

# Download the IEER corpus data used below.
nltk.download('ieer')

# Pattern handed to extract_rels: any text that ends in the standalone word
# "in". The negative lookahead (?!\b.+ing) rejects a match when the text
# after "in" contains "ing" — intended to skip cases where "in" is followed
# by a gerund rather than a location.
IN = re.compile(r'.*\bin\b(?!\b.+ing)')

# Walk every parsed document of the NYT_19980315 file and print each
# ORG -> LOC relation whose pattern matches IN.
for doc in nltk.corpus.ieer.parsed_docs('NYT_19980315'):
    print("doc : ", doc)
    org_loc_relations = nltk.sem.extract_rels(
        'ORG', 'LOC', doc, corpus='ieer', pattern=IN
    )
    for relation in org_loc_relations:
        print(nltk.sem.rtuple(relation))

doc :  <IEERDocument NYT19980315.0063: 'PUBLIC RADIO HOSTS DROP IN AND MAYBE STAY TOO LONG'>
[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
doc :  <IEERDocument NYT19980315.0064: 'IN CYBERSPACE, IS THERE LAW WHERE THERE IS NO LAND?'>
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
doc :  <IEERDocument NYT19980315.0067: 'THE SITES: TUNING INTO MUSIC ON THE WEB'>
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
doc :  <IEERDocument NYT19980315.0069: 'ANALYSIS: TAXING INTERNET SALES _ GOVERNORS VS. TAX FREEDOM ACT'>
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
doc :  <IEERDocument NYT19980315.0070: 'A SEARCH ENGINE THAT CHARGES FOR TOP BILLING'>
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
doc :  <IEERDocument NYT19980315.0071: 'COMING SOON: TV DOCUMENTARIES TO A BOOKSTORE NEAR YOU'>
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
doc :  <IEERDocument NYT19980315.0072: 'WEATHER'>
doc :  <IEERDocument NYT19980315.0073: 'DICAPRIO , CHARISMATIC STAR, BALKS AT TEEN IDOL IMAGE'>
doc :  <IEERDocument NYT19980315.0074: "You've read the book? Now see the television program. Inspired by ``Angela's...">
doc :  <IEERDocument NYT19980315.0084: 'A DIRECTOR WHO DARES, AND TAKES THE HEAT'>
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
doc :  <IEERDocument NYT19980315.0085: 'ADVERTISING: AFTER 32 YEARS , WELLS BDDP WILL CLOSE'>
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
doc :  <IEERDocument NYT19980315.0086: "`THE GIFTS OF THE JEWS': DESERT NOMADS CHANGED THE WORLD WITH THE">
doc :  <IEERDocument NYT19980315.0087: 'MUSICIANS ON BROADWAY TO VOTE ON A CONTRACT'>

[10-3] 정규식을 적용한 관계추출 실습 2

import re

import nltk
from nltk.corpus import conll2002

# Download the CoNLL-2002 corpus data used below.
nltk.download('conll2002')

# Verbose-mode pattern combining several alternatives: one of the verbs
# is / was / werd / wordt tagged V, then any text, then "van" tagged Prep.
vnv = """
 (
 is/V|   
 was/V|   
 werd/V|  
 wordt/V  
 )
 .*       
 van/Prep 
 """

# re.VERBOSE lets the pattern above be laid out over several lines.
VAN = re.compile(vnv, re.VERBOSE)

# Extract PER -> ORG relations from the Dutch training sentences of CoNLL-2002.
for sentence in conll2002.chunked_sents('ned.train'):
    per_org_relations = nltk.sem.extract_rels(
        'PER', 'ORG', sentence, corpus='conll2002', pattern=VAN
    )
    for relation in per_org_relations:
        # Print the Dutch relation in clause form, e.g. VAN('subj', 'obj');
        # the leading "" argument makes print emit one space before it.
        print("", nltk.sem.clause(relation, relsym="VAN"))
        # To see the actual words between the two named entities, together
        # with their left and right context inside the default 10-word
        # window, print this instead:
        #     nltk.rtuple(relation, lcon=True, rcon=True)
        # With a Dutch dictionary, that output shows why the result
        # VAN('annie_lennox', 'eurythmics') is incorrect.

 VAN("cornet_d'elzius", 'buitenlandse_handel')
 VAN('johan_rottiers', 'kardinaal_van_roey_instituut')
 VAN('annie_lennox', 'eurythmics')

0개의 댓글