Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: Relation Extraction | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Ewan Klein <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
""" | |
Code for extracting relational triples from the ieer and conll2002 corpora. | |
Relations are stored internally as dictionaries ('reldicts'). | |
The two serialization outputs are "rtuple" and "clause". | |
- An rtuple is a tuple of the form ``(subj, filler, obj)``, | |
where ``subj`` and ``obj`` are pairs of Named Entity mentions, and ``filler`` is the string of words | |
occurring between ``subj`` and ``obj`` (with no intervening NEs). Strings are printed via ``repr()`` to
circumvent locale variations in rendering utf-8 encoded strings. | |
- A clause is an atom of the form ``relsym(subjsym, objsym)``, | |
where the relation, subject and object have been canonicalized to single strings. | |
""" | |
# todo: get a more general solution to canonicalized symbols for clauses -- maybe use xmlcharrefs? | |
import html | |
import re | |
from collections import defaultdict | |
# Named Entity class labels accepted for each supported corpus.
NE_CLASSES = {
    "ieer": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
    ],
    "conll2002": ["LOC", "PER", "ORG"],
    "ace": [
        "LOCATION",
        "ORGANIZATION",
        "PERSON",
        "DURATION",
        "DATE",
        "CARDINAL",
        "PERCENT",
        "MONEY",
        "MEASURE",
        "FACILITY",
        "GPE",
    ],
}

# Abbreviated class labels mapped to their long forms, and the reverse mapping.
short2long = {"LOC": "LOCATION", "ORG": "ORGANIZATION", "PER": "PERSON"}
long2short = {full: abbrev for abbrev, full in short2long.items()}
def _expand(type):
    """
    Expand an abbreviated NE class name to its long form.

    The label is looked up in ``short2long``; a label with no entry there
    is returned unchanged.

    :param type: an NE class label, e.g. ``"LOC"``
    :type type: str
    :rtype: str
    """
    return short2long.get(type, type)
def class_abbrev(type):
    """
    Abbreviate a long NE class name.

    The label is looked up in ``long2short``; a label with no entry there
    is returned unchanged.

    :param type: an NE class label, e.g. ``"LOCATION"``
    :type type: str
    :rtype: str
    """
    return long2short.get(type, type)
def _join(lst, sep=" ", untag=False): | |
""" | |
Join a list into a string, turning tags tuples into tag strings or just words. | |
:param untag: if ``True``, omit the tag from tagged input strings. | |
:type lst: list | |
:rtype: str | |
""" | |
try: | |
return sep.join(lst) | |
except TypeError: | |
if untag: | |
return sep.join(tup[0] for tup in lst) | |
from nltk.tag import tuple2str | |
return sep.join(tuple2str(tup) for tup in lst) | |
def descape_entity(m, defs=html.entities.entitydefs):
    """
    Translate one matched entity reference to the character it names.

    Intended as the replacement callback for ``re.sub``: group 1 of the
    match is the entity name, which is looked up in ``defs``. Entities not
    found in ``defs`` are left exactly as matched.

    :param m: a regex match whose group 1 is the entity name
    :param defs: mapping from entity names to replacement strings
    :type defs: dict
    :rtype: str
    """
    entity_name = m.group(1)
    as_matched = m.group(0)
    return defs.get(entity_name, as_matched)
def list2sym(lst):
    """
    Convert a list of strings (or tagged tokens) into a canonical symbol.

    The items are joined with underscores (tags dropped), lower-cased,
    ``&name;`` entity references are replaced via ``descape_entity``, and
    full stops are removed.

    :param lst: the words making up the symbol
    :type lst: list
    :return: the canonicalized symbol
    :rtype: str
    """
    entity_ref = re.compile(r"&(\w+?);")
    lowered = _join(lst, "_", untag=True).lower()
    return entity_ref.sub(descape_entity, lowered).replace(".", "")
def tree2semi_rel(tree):
    """
    Group a chunk structure into a list of 'semi-relations'.

    Each semi-relation is a two-item list ``[words, ne]`` where ``ne`` is a
    ``Tree`` daughter of ``tree`` and ``words`` is the (possibly empty) list
    of non-``Tree`` daughters that precede it since the previous ``Tree``.
    This shape makes it easy to build (``Tree``, filler, ``Tree``) triples
    from consecutive entries.

    Non-``Tree`` daughters that follow the last ``Tree`` are not part of
    any semi-relation and are therefore not returned.

    :param tree: a chunk tree
    :return: a list of ``[list, Tree]`` pairs
    :rtype: list
    """
    from nltk.tree import Tree

    semi_rels = []
    preceding = []
    for child in tree:
        if isinstance(child, Tree):
            # A chunk closes the current run of plain tokens.
            semi_rels.append([preceding, child])
            preceding = []
        else:
            preceding.append(child)
    return semi_rels
def semi_rel2reldict(pairs, window=5, trace=False):
    """
    Convert the pairs generated by ``tree2semi_rel`` into 'reldicts'.

    A reldict is a dictionary which stores information about a subject NE,
    the object NE that immediately follows it, and the filler between them.
    Additionally, a left and right context of length =< window are captured
    (within a given input sentence).

    One reldict is produced for every two consecutive pairs, including the
    final two. The right context is taken from the words of the pair after
    the object; for the final relation there is no such pair, so ``rcon``
    is the empty string.

    :param pairs: a list of (list(str), ``Tree``) pairs, as generated by
        ``tree2semi_rel``
    :type pairs: list
    :param window: a threshold for the number of items to include in the
        left and right context
    :type window: int
    :param trace: if true, print a one-line summary of each relation found
    :type trace: bool
    :return: 'relation' dictionaries whose keys are 'lcon', 'subjclass',
        'subjtext', 'subjsym', 'filler', 'untagged_filler', 'objclass',
        'objtext', 'objsym' and 'rcon'
    :rtype: list(defaultdict)
    """
    result = []
    n_pairs = len(pairs)
    # Walk adjacent (subject, object) pairs by index instead of repeatedly
    # slicing ``pairs``; stop one short of the end so an object always exists.
    for i in range(n_pairs - 1):
        subj_pair = pairs[i]
        obj_pair = pairs[i + 1]
        # The words leading up to the *next* NE form the right context. The
        # last relation has no following pair, hence no right context.
        rcon_words = pairs[i + 2][0] if i + 2 < n_pairs else []

        reldict = defaultdict(str)
        reldict["lcon"] = _join(subj_pair[0][-window:])
        reldict["subjclass"] = subj_pair[1].label()
        reldict["subjtext"] = _join(subj_pair[1].leaves())
        reldict["subjsym"] = list2sym(subj_pair[1].leaves())
        reldict["filler"] = _join(obj_pair[0])
        reldict["untagged_filler"] = _join(obj_pair[0], untag=True)
        reldict["objclass"] = obj_pair[1].label()
        reldict["objtext"] = _join(obj_pair[1].leaves())
        reldict["objsym"] = list2sym(obj_pair[1].leaves())
        reldict["rcon"] = _join(rcon_words[:window])
        if trace:
            print(
                "(%s(%s, %s)"
                % (
                    reldict["untagged_filler"],
                    reldict["subjclass"],
                    reldict["objclass"],
                )
            )
        result.append(reldict)
    return result
def _resolve_ne_class(neclass, corpus, role):
    """
    Return ``neclass`` in the form used by ``corpus``, expanding abbreviations.

    :param neclass: the NE class label supplied by the caller (may be falsy)
    :param corpus: a key of ``NE_CLASSES``
    :param role: ``"subject"`` or ``"object"``; only used in the error message
    :raise ValueError: if the label is not valid for ``corpus`` even after
        expansion
    """
    if not neclass or neclass in NE_CLASSES[corpus]:
        return neclass
    expanded = _expand(neclass)
    if expanded in NE_CLASSES[corpus]:
        return expanded
    raise ValueError(
        "your value for the %s type has not been recognized: %s" % (role, neclass)
    )


def extract_rels(subjclass, objclass, doc, corpus="ace", pattern=None, window=10):
    """
    Filter the output of ``semi_rel2reldict`` according to specified NE classes and a filler pattern.

    The parameters ``subjclass`` and ``objclass`` can be used to restrict the
    Named Entities to particular types; the valid labels for each corpus are
    listed in ``NE_CLASSES``. Abbreviated labels ('LOC', 'ORG', 'PER') are
    expanded when the corpus uses the long form.

    :param subjclass: the class of the subject Named Entity.
    :type subjclass: str
    :param objclass: the class of the object Named Entity.
    :type objclass: str
    :param doc: input document
    :type doc: ieer document or a list of chunk trees
    :param corpus: name of the corpus to take as input; possible values are
        'ace', 'ieer' and 'conll2002'
    :type corpus: str
    :param pattern: a compiled regular expression for filtering the fillers of
        retrieved triples (applied with ``match``). If ``None``, fillers are
        not filtered by pattern.
    :type pattern: SRE_Pattern
    :param window: filters out fillers which exceed this threshold
        (counted in whitespace-separated tokens)
    :type window: int
    :return: see ``semi_rel2reldict``
    :rtype: list(defaultdict)
    :raise ValueError: if a class label or the corpus name is not recognized
    """
    subjclass = _resolve_ne_class(subjclass, corpus, "subject")
    objclass = _resolve_ne_class(objclass, corpus, "object")

    if corpus in ("ace", "conll2002"):
        pairs = tree2semi_rel(doc)
    elif corpus == "ieer":
        # IEER documents carry two separate chunk trees.
        pairs = tree2semi_rel(doc.text) + tree2semi_rel(doc.headline)
    else:
        raise ValueError("corpus type not recognized")

    def keep(reldict):
        filler = reldict["filler"]
        return (
            reldict["subjclass"] == subjclass
            and len(filler.split()) <= window
            # ``pattern`` is optional: without one, any filler is accepted.
            and (pattern is None or pattern.match(filler))
            and reldict["objclass"] == objclass
        )

    return [reldict for reldict in semi_rel2reldict(pairs) if keep(reldict)]
def rtuple(reldict, lcon=False, rcon=False):
    """
    Pretty print the reldict as an rtuple.

    The result has the form ``[SUBJ: 'text'] 'filler' [OBJ: 'text']`` with
    class labels abbreviated via ``class_abbrev``; the left and/or right
    context is added on request.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param lcon: if true, prefix the left context
    :type lcon: bool
    :param rcon: if true, append the right context
    :type rcon: bool
    :rtype: str
    """
    template = "[%s: %r] %r [%s: %r]"
    values = [
        class_abbrev(reldict["subjclass"]),
        reldict["subjtext"],
        reldict["filler"],
        class_abbrev(reldict["objclass"]),
        reldict["objtext"],
    ]
    if lcon:
        template = "...%r)" + template
        values.insert(0, reldict["lcon"])
    if rcon:
        template += "(%r..."
        values.append(reldict["rcon"])
    return template % tuple(values)
def clause(reldict, relsym):
    """
    Render the relation in clausal form, ``relsym('subjsym', 'objsym')``.

    The subject and object symbols are taken from the ``'subjsym'`` and
    ``'objsym'`` entries of ``reldict`` and shown via ``repr()``.

    :param reldict: a relation dictionary
    :type reldict: defaultdict
    :param relsym: a label for the relation
    :type relsym: str
    :rtype: str
    """
    subj = reldict["subjsym"]
    obj = reldict["objsym"]
    return f"{relsym!s}({subj!r}, {obj!r})"
####################################################### | |
# Demos of relation extraction with regular expressions | |
####################################################### | |
############################################ | |
# Example of in(ORG, LOC) | |
############################################ | |
def in_demo(trace=0, sql=True):
    """
    Select pairs of organizations and locations whose mentions occur with an
    intervening occurrence of the preposition "in".

    If the sql parameter is set to True, then the entity pairs are loaded into
    an in-memory database, and subsequently pulled out using an SQL "SELECT"
    query. If ``sqlite3`` cannot be imported, a warning is issued and the
    database steps are skipped.

    :param trace: if true, print each document number as it is processed
    :param sql: whether to also store and query the pairs via sqlite
    """
    from nltk.corpus import ieer

    # ``connection``/``cur`` stay None when sql is off or sqlite3 is missing,
    # so later steps can test for them explicitly.
    connection = None
    cur = None
    if sql:
        try:
            import sqlite3
        except ImportError:
            import warnings

            warnings.warn("Cannot import sqlite; sql flag will be ignored.")
        else:
            connection = sqlite3.connect(":memory:")
            cur = connection.cursor()
            cur.execute(
                """create table Locations
                (OrgName text, LocationName text, DocID text)"""
            )

    IN = re.compile(r".*\bin\b(?!\b.+ing)")

    try:
        print()
        print("IEER: in(ORG, LOC) -- just the clauses:")
        print("=" * 45)

        for file in ieer.fileids():
            for doc in ieer.parsed_docs(file):
                if trace:
                    print(doc.docno)
                    print("=" * 15)
                for rel in extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
                    print(clause(rel, relsym="IN"))
                    if cur is not None:
                        row = (rel["subjtext"], rel["objtext"], doc.docno)
                        cur.execute(
                            """insert into Locations
                            values (?, ?, ?)""",
                            row,
                        )

        if cur is not None:
            connection.commit()
            cur.execute(
                """select OrgName from Locations
                where LocationName = 'Atlanta'"""
            )
            print()
            print("Extract data from SQL table: ORGs in Atlanta")
            print("-" * 15)
            for row in cur:
                print(row)
    finally:
        # Release the in-memory database even if extraction fails part-way.
        if connection is not None:
            connection.close()
############################################ | |
# Example of has_role(PER, ORG)
############################################ | |
def roles_demo(trace=0):
    """
    Print PERSON/ORGANIZATION pairs from the IEER corpus whose filler matches
    a pattern of role words (e.g. "director", "chairman").

    :param trace: if true, print each document number and include the left
        and right context in every printed rtuple
    """
    from nltk.corpus import ieer

    # NOTE(review): the group opened by "(.*(" is closed after "librarian",
    # so the alternatives from "manager" onwards sit outside the ".* ... .*"
    # wrapper and only match at the very start of a filler -- confirm whether
    # that is intended before changing it.
    roles = r"""
    (.*( # assorted roles
    analyst|
    chair(wo)?man|
    commissioner|
    counsel|
    director|
    economist|
    editor|
    executive|
    foreman|
    governor|
    head|
    lawyer|
    leader|
    librarian).*)|
    manager|
    partner|
    president|
    producer|
    professor|
    researcher|
    spokes(wo)?man|
    writer|
    ,\sof\sthe?\s* # "X, of (the) Y"
    """
    ROLES = re.compile(roles, re.VERBOSE)

    print()
    print("IEER: has_role(PER, ORG) -- raw rtuples:")
    print("=" * 45)

    # Context is shown exactly when tracing is on.
    with_context = bool(trace)
    for file in ieer.fileids():
        for doc in ieer.parsed_docs(file):
            if with_context:
                print(doc.docno)
                print("=" * 15)
            matches = extract_rels("PER", "ORG", doc, corpus="ieer", pattern=ROLES)
            for rel in matches:
                print(rtuple(rel, lcon=with_context, rcon=with_context))
############################################## | |
### Show what's in the IEER Headlines | |
############################################## | |
def ieer_headlines():
    """
    Print the document number and headline of the first 20 IEER documents.

    Documents are read lazily, so only as many as are printed get parsed.
    """
    from itertools import islice

    from nltk.corpus import ieer

    print("IEER: First 20 Headlines")
    print("=" * 45)

    headlines = (
        (doc.docno, doc.headline)
        for file in ieer.fileids()
        for doc in ieer.parsed_docs(file)
    )
    # islice stops the generator after 20 items instead of building the
    # full list of every document in the corpus first.
    for docno, headline in islice(headlines, 20):
        print()
        print("%s:\n%s" % (docno, headline))
############################################# | |
## Dutch CONLL2002: take_on_role(PER, ORG)
############################################# | |
def conllned(trace=1):
    """
    Find the copula+'van' relation ('of') in the Dutch tagged training corpus
    from CoNLL 2002.

    :param trace: if true, include the left and right context in every
        printed rtuple
    """
    from nltk.corpus import conll2002

    vnv = """
    (
    is/V| # 3rd sing present and
    was/V| # past forms of the verb zijn ('be')
    werd/V| # and also present
    wordt/V # past of worden ('become)
    )
    .* # followed by anything
    van/Prep # followed by van ('of')
    """
    VAN = re.compile(vnv, re.VERBOSE)

    print()
    print("Dutch CoNLL2002: van(PER, ORG) -- raw rtuples with context:")
    print("=" * 45)

    # Context is shown exactly when tracing is on.
    with_context = bool(trace)
    for doc in conll2002.chunked_sents("ned.train"):
        matches = extract_rels(
            "PER", "ORG", doc, corpus="conll2002", pattern=VAN, window=10
        )
        for rel in matches:
            print(rtuple(rel, lcon=with_context, rcon=with_context))
############################################# | |
## Spanish CONLL2002: de(ORG, LOC)
############################################# | |
def conllesp():
    """
    Print the first 10 ORG/LOC clauses from the Spanish CoNLL 2002 training
    corpus whose filler matches 'de' or 'del' tagged as ``SP``.
    """
    from nltk.corpus import conll2002

    de = """
    .*
    (
    de/SP|
    del/SP
    )
    """
    DE = re.compile(de, re.VERBOSE)

    print()
    print("Spanish CoNLL2002: de(ORG, LOC) -- just the first 10 clauses:")
    print("=" * 45)

    # Gather every matching relation across the corpus, then show ten.
    rels = []
    for doc in conll2002.chunked_sents("esp.train"):
        rels.extend(
            extract_rels("ORG", "LOC", doc, corpus="conll2002", pattern=DE)
        )
    for found in rels[:10]:
        print(clause(found, relsym="DE"))
    print()
def ne_chunked():
    """
    Run the NLTK NE chunker over the first 1500 tagged Penn Treebank
    sentences and print PER/ORG relations whose filler contains a role word.

    Each printed line is the sentence index followed by the rtuple.
    """
    # Imported here so the demo also works when this module is imported,
    # not only when it is executed as a script.
    import nltk

    print()
    print("1500 Sentences from Penn Treebank, as processed by NLTK NE Chunker")
    print("=" * 45)
    ROLE = re.compile(
        r".*(chairman|president|trader|scientist|economist|analyst|partner).*"
    )
    for i, sent in enumerate(nltk.corpus.treebank.tagged_sents()[:1500]):
        chunked = nltk.ne_chunk(sent)
        rels = extract_rels(
            "PER", "ORG", chunked, corpus="ace", pattern=ROLE, window=7
        )
        for rel in rels:
            print(f"{i:<5}{rtuple(rel)}")
if __name__ == "__main__":
    # ``nltk`` must be importable as a module-level name here: ne_chunked()
    # refers to it as a global.
    import nltk
    from nltk.sem import relextract

    # Run every demo in turn, each with its keyword arguments.
    for run_demo, demo_kwargs in (
        (in_demo, {"trace": 0}),
        (roles_demo, {"trace": 0}),
        (conllned, {}),
        (conllesp, {}),
        (ieer_headlines, {}),
        (ne_chunked, {}),
    ):
        run_demo(**demo_kwargs)