Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: NomBank Corpus Reader | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Authors: Paul Bedaride <[email protected]> | |
# Edward Loper <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
from functools import total_ordering | |
from xml.etree import ElementTree | |
from nltk.corpus.reader.api import * | |
from nltk.corpus.reader.util import * | |
from nltk.internals import raise_unorderable_types | |
from nltk.tree import Tree | |
class NombankCorpusReader(CorpusReader): | |
""" | |
Corpus reader for the nombank corpus, which augments the Penn | |
Treebank with information about the predicate argument structure | |
of every noun instance. The corpus consists of two parts: the | |
predicate-argument annotations themselves, and a set of "frameset | |
files" which define the argument labels used by the annotations, | |
on a per-noun basis. Each "frameset file" contains one or more | |
predicates, such as ``'turn'`` or ``'turn_on'``, each of which is | |
divided into coarse-grained word senses called "rolesets". For | |
each "roleset", the frameset file provides descriptions of the | |
argument roles, along with examples. | |
""" | |
def __init__( | |
self, | |
root, | |
nomfile, | |
framefiles="", | |
nounsfile=None, | |
parse_fileid_xform=None, | |
parse_corpus=None, | |
encoding="utf8", | |
): | |
""" | |
:param root: The root directory for this corpus. | |
:param nomfile: The name of the file containing the predicate- | |
argument annotations (relative to ``root``). | |
:param framefiles: A list or regexp specifying the frameset | |
fileids for this corpus. | |
:param parse_fileid_xform: A transform that should be applied | |
to the fileids in this corpus. This should be a function | |
of one argument (a fileid) that returns a string (the new | |
fileid). | |
:param parse_corpus: The corpus containing the parse trees | |
corresponding to this corpus. These parse trees are | |
necessary to resolve the tree pointers used by nombank. | |
""" | |
# If framefiles is specified as a regexp, expand it. | |
if isinstance(framefiles, str): | |
self._fileids = find_corpus_fileids(root, framefiles) | |
self._fileids = list(framefiles) | |
# Initialize the corpus reader. | |
CorpusReader.__init__(self, root, framefiles, encoding) | |
# Record our nom file & nouns file. | |
self._nomfile = nomfile | |
self._nounsfile = nounsfile | |
self._parse_fileid_xform = parse_fileid_xform | |
self._parse_corpus = parse_corpus | |
def instances(self, baseform=None): | |
""" | |
:return: a corpus view that acts as a list of | |
``NombankInstance`` objects, one for each noun in the corpus. | |
""" | |
kwargs = {} | |
if baseform is not None: | |
kwargs["instance_filter"] = lambda inst: inst.baseform == baseform | |
return StreamBackedCorpusView( | |
self.abspath(self._nomfile), | |
lambda stream: self._read_instance_block(stream, **kwargs), | |
encoding=self.encoding(self._nomfile), | |
) | |
def lines(self): | |
""" | |
:return: a corpus view that acts as a list of strings, one for | |
each line in the predicate-argument annotation file. | |
""" | |
return StreamBackedCorpusView( | |
self.abspath(self._nomfile), | |
read_line_block, | |
encoding=self.encoding(self._nomfile), | |
) | |
def roleset(self, roleset_id): | |
""" | |
:return: the xml description for the given roleset. | |
""" | |
baseform = roleset_id.split(".")[0] | |
baseform = baseform.replace("perc-sign", "%") | |
baseform = baseform.replace("oneslashonezero", "1/10").replace( | |
"1/10", "1-slash-10" | |
) | |
framefile = "frames/%s.xml" % baseform | |
if framefile not in self.fileids(): | |
raise ValueError("Frameset file for %s not found" % roleset_id) | |
# n.b.: The encoding for XML fileids is specified by the file | |
# itself; so we ignore self._encoding here. | |
with self.abspath(framefile).open() as fp: | |
etree = ElementTree.parse(fp).getroot() | |
for roleset in etree.findall("predicate/roleset"): | |
if roleset.attrib["id"] == roleset_id: | |
return roleset | |
raise ValueError(f"Roleset {roleset_id} not found in {framefile}") | |
def rolesets(self, baseform=None): | |
""" | |
:return: list of xml descriptions for rolesets. | |
""" | |
if baseform is not None: | |
framefile = "frames/%s.xml" % baseform | |
if framefile not in self.fileids(): | |
raise ValueError("Frameset file for %s not found" % baseform) | |
framefiles = [framefile] | |
else: | |
framefiles = self.fileids() | |
rsets = [] | |
for framefile in framefiles: | |
# n.b.: The encoding for XML fileids is specified by the file | |
# itself; so we ignore self._encoding here. | |
with self.abspath(framefile).open() as fp: | |
etree = ElementTree.parse(fp).getroot() | |
rsets.append(etree.findall("predicate/roleset")) | |
return LazyConcatenation(rsets) | |
def nouns(self): | |
""" | |
:return: a corpus view that acts as a list of all noun lemmas | |
in this corpus (from the nombank.1.0.words file). | |
""" | |
return StreamBackedCorpusView( | |
self.abspath(self._nounsfile), | |
read_line_block, | |
encoding=self.encoding(self._nounsfile), | |
) | |
def _read_instance_block(self, stream, instance_filter=lambda inst: True): | |
block = [] | |
# Read 100 at a time. | |
for i in range(100): | |
line = stream.readline().strip() | |
if line: | |
inst = NombankInstance.parse( | |
line, self._parse_fileid_xform, self._parse_corpus | |
) | |
if instance_filter(inst): | |
block.append(inst) | |
return block | |
###################################################################### | |
# { Nombank Instance & related datatypes | |
###################################################################### | |
class NombankInstance: | |
def __init__( | |
self, | |
fileid, | |
sentnum, | |
wordnum, | |
baseform, | |
sensenumber, | |
predicate, | |
predid, | |
arguments, | |
parse_corpus=None, | |
): | |
self.fileid = fileid | |
"""The name of the file containing the parse tree for this | |
instance's sentence.""" | |
self.sentnum = sentnum | |
"""The sentence number of this sentence within ``fileid``. | |
Indexing starts from zero.""" | |
self.wordnum = wordnum | |
"""The word number of this instance's predicate within its | |
containing sentence. Word numbers are indexed starting from | |
zero, and include traces and other empty parse elements.""" | |
self.baseform = baseform | |
"""The baseform of the predicate.""" | |
self.sensenumber = sensenumber | |
"""The sense number of the predicate.""" | |
self.predicate = predicate | |
"""A ``NombankTreePointer`` indicating the position of this | |
instance's predicate within its containing sentence.""" | |
self.predid = predid | |
"""Identifier of the predicate.""" | |
self.arguments = tuple(arguments) | |
"""A list of tuples (argloc, argid), specifying the location | |
and identifier for each of the predicate's argument in the | |
containing sentence. Argument identifiers are strings such as | |
``'ARG0'`` or ``'ARGM-TMP'``. This list does *not* contain | |
the predicate.""" | |
self.parse_corpus = parse_corpus | |
"""A corpus reader for the parse trees corresponding to the | |
instances in this nombank corpus.""" | |
def roleset(self): | |
"""The name of the roleset used by this instance's predicate. | |
Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to | |
look up information about the roleset.""" | |
r = self.baseform.replace("%", "perc-sign") | |
r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero") | |
return f"{r}.{self.sensenumber}" | |
def __repr__(self): | |
return "<NombankInstance: {}, sent {}, word {}>".format( | |
self.fileid, | |
self.sentnum, | |
self.wordnum, | |
) | |
def __str__(self): | |
s = "{} {} {} {} {}".format( | |
self.fileid, | |
self.sentnum, | |
self.wordnum, | |
self.baseform, | |
self.sensenumber, | |
) | |
items = self.arguments + ((self.predicate, "rel"),) | |
for (argloc, argid) in sorted(items): | |
s += f" {argloc}-{argid}" | |
return s | |
def _get_tree(self): | |
if self.parse_corpus is None: | |
return None | |
if self.fileid not in self.parse_corpus.fileids(): | |
return None | |
return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum] | |
tree = property( | |
_get_tree, | |
doc=""" | |
The parse tree corresponding to this instance, or None if | |
the corresponding tree is not available.""", | |
) | |
def parse(s, parse_fileid_xform=None, parse_corpus=None): | |
pieces = s.split() | |
if len(pieces) < 6: | |
raise ValueError("Badly formatted nombank line: %r" % s) | |
# Divide the line into its basic pieces. | |
(fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5] | |
args = pieces[5:] | |
rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p] | |
if len(rel) != 1: | |
raise ValueError("Badly formatted nombank line: %r" % s) | |
# Apply the fileid selector, if any. | |
if parse_fileid_xform is not None: | |
fileid = parse_fileid_xform(fileid) | |
# Convert sentence & word numbers to ints. | |
sentnum = int(sentnum) | |
wordnum = int(wordnum) | |
# Parse the predicate location. | |
predloc, predid = rel[0].split("-", 1) | |
predicate = NombankTreePointer.parse(predloc) | |
# Parse the arguments. | |
arguments = [] | |
for arg in args: | |
argloc, argid = arg.split("-", 1) | |
arguments.append((NombankTreePointer.parse(argloc), argid)) | |
# Put it all together. | |
return NombankInstance( | |
fileid, | |
sentnum, | |
wordnum, | |
baseform, | |
sensenumber, | |
predicate, | |
predid, | |
arguments, | |
parse_corpus, | |
) | |
class NombankPointer: | |
""" | |
A pointer used by nombank to identify one or more constituents in | |
a parse tree. ``NombankPointer`` is an abstract base class with | |
three concrete subclasses: | |
- ``NombankTreePointer`` is used to point to single constituents. | |
- ``NombankSplitTreePointer`` is used to point to 'split' | |
constituents, which consist of a sequence of two or more | |
``NombankTreePointer`` pointers. | |
- ``NombankChainTreePointer`` is used to point to entire trace | |
chains in a tree. It consists of a sequence of pieces, which | |
can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers. | |
""" | |
def __init__(self): | |
if self.__class__ == NombankPointer: | |
raise NotImplementedError() | |
class NombankChainTreePointer(NombankPointer): | |
def __init__(self, pieces): | |
self.pieces = pieces | |
"""A list of the pieces that make up this chain. Elements may | |
be either ``NombankSplitTreePointer`` or | |
``NombankTreePointer`` pointers.""" | |
def __str__(self): | |
return "*".join("%s" % p for p in self.pieces) | |
def __repr__(self): | |
return "<NombankChainTreePointer: %s>" % self | |
def select(self, tree): | |
if tree is None: | |
raise ValueError("Parse tree not available") | |
return Tree("*CHAIN*", [p.select(tree) for p in self.pieces]) | |
class NombankSplitTreePointer(NombankPointer): | |
def __init__(self, pieces): | |
self.pieces = pieces | |
"""A list of the pieces that make up this chain. Elements are | |
all ``NombankTreePointer`` pointers.""" | |
def __str__(self): | |
return ",".join("%s" % p for p in self.pieces) | |
def __repr__(self): | |
return "<NombankSplitTreePointer: %s>" % self | |
def select(self, tree): | |
if tree is None: | |
raise ValueError("Parse tree not available") | |
return Tree("*SPLIT*", [p.select(tree) for p in self.pieces]) | |
class NombankTreePointer(NombankPointer): | |
""" | |
wordnum:height*wordnum:height*... | |
wordnum:height, | |
""" | |
def __init__(self, wordnum, height): | |
self.wordnum = wordnum | |
self.height = height | |
def parse(s): | |
# Deal with chains (xx*yy*zz) | |
pieces = s.split("*") | |
if len(pieces) > 1: | |
return NombankChainTreePointer( | |
[NombankTreePointer.parse(elt) for elt in pieces] | |
) | |
# Deal with split args (xx,yy,zz) | |
pieces = s.split(",") | |
if len(pieces) > 1: | |
return NombankSplitTreePointer( | |
[NombankTreePointer.parse(elt) for elt in pieces] | |
) | |
# Deal with normal pointers. | |
pieces = s.split(":") | |
if len(pieces) != 2: | |
raise ValueError("bad nombank pointer %r" % s) | |
return NombankTreePointer(int(pieces[0]), int(pieces[1])) | |
def __str__(self): | |
return f"{self.wordnum}:{self.height}" | |
def __repr__(self): | |
return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height) | |
def __eq__(self, other): | |
while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): | |
other = other.pieces[0] | |
if not isinstance(other, NombankTreePointer): | |
return self is other | |
return self.wordnum == other.wordnum and self.height == other.height | |
def __ne__(self, other): | |
return not self == other | |
def __lt__(self, other): | |
while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)): | |
other = other.pieces[0] | |
if not isinstance(other, NombankTreePointer): | |
return id(self) < id(other) | |
return (self.wordnum, -self.height) < (other.wordnum, -other.height) | |
def select(self, tree): | |
if tree is None: | |
raise ValueError("Parse tree not available") | |
return tree[self.treepos(tree)] | |
def treepos(self, tree): | |
""" | |
Convert this pointer to a standard 'tree position' pointer, | |
given that it points to the given tree. | |
""" | |
if tree is None: | |
raise ValueError("Parse tree not available") | |
stack = [tree] | |
treepos = [] | |
wordnum = 0 | |
while True: | |
# tree node: | |
if isinstance(stack[-1], Tree): | |
# Select the next child. | |
if len(treepos) < len(stack): | |
treepos.append(0) | |
else: | |
treepos[-1] += 1 | |
# Update the stack. | |
if treepos[-1] < len(stack[-1]): | |
stack.append(stack[-1][treepos[-1]]) | |
else: | |
# End of node's child list: pop up a level. | |
stack.pop() | |
treepos.pop() | |
# word node: | |
else: | |
if wordnum == self.wordnum: | |
return tuple(treepos[: len(treepos) - self.height - 1]) | |
else: | |
wordnum += 1 | |
stack.pop() | |