Spaces:
Sleeping
Sleeping
# Natural Language Toolkit: PropBank Corpus Reader | |
# | |
# Copyright (C) 2001-2023 NLTK Project | |
# Author: Edward Loper <[email protected]> | |
# URL: <https://www.nltk.org/> | |
# For license information, see LICENSE.TXT | |
import re | |
from functools import total_ordering | |
from xml.etree import ElementTree | |
from nltk.corpus.reader.api import * | |
from nltk.corpus.reader.util import * | |
from nltk.internals import raise_unorderable_types | |
from nltk.tree import Tree | |
class PropbankCorpusReader(CorpusReader):
    """
    Corpus reader for the propbank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every verb instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-verb basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        propfile,
        framefiles="",
        verbsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param propfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param verbsfile: The name of the file listing the verb lemmas
            of this corpus, as read by ``verbs()``; may be None.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by propbank.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        framefiles = list(framefiles)

        # ``verbsfile`` is optional; leave it out when it is not given so
        # that None is never registered as one of this corpus's fileids.
        fileids = [propfile]
        if verbsfile is not None:
            fileids.append(verbsfile)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, fileids + framefiles, encoding)

        # Record our frame fileids & prop file.
        self._propfile = propfile
        self._framefiles = framefiles
        self._verbsfile = verbsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :param baseform: If given, only instances whose roleset belongs
            to this baseform (the part of the roleset id before the
            first ``'.'``) are included.
        :return: a corpus view that acts as a list of
            ``PropbankInstance`` objects, one for each verb instance in
            the corpus.
        """
        kwargs = {}
        if baseform is not None:
            # Derive the lemma from the roleset id, exactly as roleset()
            # does, so the filter compares two strings.
            kwargs["instance_filter"] = (
                lambda inst: inst.roleset.split(".")[0] == baseform
            )
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._propfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._propfile),
            read_line_block,
            encoding=self.encoding(self._propfile),
        )

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier; the text before the
            first ``'.'`` selects the frameset file ``frames/<name>.xml``.
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file is not one of this
            corpus's frame files, or it contains no such roleset.
        """
        baseform = roleset_id.split(".")[0]
        framefile = "frames/%s.xml" % baseform
        if framefile not in self._framefiles:
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only the rolesets defined in
            ``frames/<baseform>.xml`` are returned; otherwise the
            rolesets of every frame file are returned.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` is given but its frameset
            file is not one of this corpus's frame files.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self._framefiles:
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self._framefiles

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def verbs(self):
        """
        :return: a corpus view that acts as a list of all verb lemmas
            in this corpus (from the verbs.txt file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._verbsfile),
            read_line_block,
            encoding=self.encoding(self._verbsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and parse each non-blank
        line into a ``PropbankInstance``.

        :param stream: The stream to read annotation lines from.
        :param instance_filter: A predicate; only instances for which
            it returns true are kept.
        :return: the list of parsed instances accepted by the filter.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = PropbankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block
###################################################################### | |
# { Propbank Instance & related datatypes | |
###################################################################### | |
class PropbankInstance:
    """
    A single predicate-argument annotation: the location of one
    predicate, the roleset it uses, and the locations and labels of
    its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        tagger,
        roleset,
        inflection,
        predicate,
        arguments,
        parse_corpus=None,
    ):
        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.tagger = tagger
        """An identifier for the tagger who tagged this instance; or
        ``'gold'`` if this is an adjudicated instance."""

        self.roleset = roleset
        """The name of the roleset used by this instance's predicate.
        Use ``propbank.roleset() <PropbankCorpusReader.roleset>`` to
        look up information about the roleset."""

        self.inflection = inflection
        """A ``PropbankInflection`` object describing the inflection of
        this instance's predicate."""

        self.predicate = predicate
        """A ``PropbankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.arguments = tuple(arguments)
        """A tuple of pairs (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This tuple does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this propbank corpus."""

    @property
    def baseform(self):
        """The baseform of the predicate."""
        return self.roleset.split(".")[0]

    @property
    def sensenumber(self):
        """The sense number of the predicate."""
        return self.roleset.split(".")[1]

    @property
    def predid(self):
        """Identifier of the predicate."""
        return "rel"

    def __repr__(self):
        return "<PropbankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        # Rebuild the annotation line: six header fields followed by the
        # arguments and the predicate ("rel"), ordered by location.
        header = "{} {} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.tagger,
            self.roleset,
            self.inflection,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        return header + "".join(
            f" {argloc}-{argid}" for (argloc, argid) in sorted(items)
        )

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``PropbankInstance`` from one annotation line.

        :param s: The line: six whitespace-separated header fields
            (fileid, sentence number, word number, tagger, roleset,
            inflection) followed by ``location-label`` items, exactly
            one of which ends in ``-rel`` and marks the predicate.
        :param parse_fileid_xform: Optional function applied to the
            fileid read from the line.
        :param parse_corpus: Optional parse-tree corpus, stored on the
            resulting instance.
        :raise ValueError: if the line has fewer than seven fields,
            does not contain exactly one ``-rel`` item, or contains an
            argument without a ``-`` separator.
        """
        pieces = s.split()
        if len(pieces) < 7:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, tagger, roleset, inflection) = pieces[:6]
        rel = [p for p in pieces[6:] if p.endswith("-rel")]
        args = [p for p in pieces[6:] if not p.endswith("-rel")]
        if len(rel) != 1:
            raise ValueError("Badly formatted propbank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the inflection
        inflection = PropbankInflection.parse(inflection)

        # Parse the predicate location (drop the trailing "-rel").
        predicate = PropbankTreePointer.parse(rel[0][:-4])

        # Parse the arguments.  Only the first "-" separates location
        # from label, since labels such as ARGM-TMP contain "-" themselves.
        arguments = []
        for arg in args:
            argloc, sep, argid = arg.partition("-")
            if not sep:
                raise ValueError("Badly formatted propbank line: %r" % s)
            arguments.append((PropbankTreePointer.parse(argloc), argid))

        # Put it all together.
        return PropbankInstance(
            fileid,
            sentnum,
            wordnum,
            tagger,
            roleset,
            inflection,
            predicate,
            arguments,
            parse_corpus,
        )
class PropbankPointer:
    """
    Abstract base class for the pointers propbank uses to identify one
    or more constituents in a parse tree.  There are three concrete
    subclasses:

      - ``PropbankTreePointer`` points to a single constituent.

      - ``PropbankSplitTreePointer`` points to a 'split' constituent,
        made up of a sequence of two or more ``PropbankTreePointer``
        pointers.

      - ``PropbankChainTreePointer`` points to an entire trace chain
        in a tree.  Its pieces can be ``PropbankTreePointer`` or
        ``PropbankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Subclasses may be instantiated freely; only the abstract base
        # itself refuses construction.
        if type(self) is not PropbankPointer:
            return
        raise NotImplementedError()
class PropbankChainTreePointer(PropbankPointer):
    """
    A pointer to a trace chain, written as its pieces joined by ``*``.

    :ivar pieces: A list of the pieces that make up this chain.
        Elements may be either ``PropbankSplitTreePointer`` or
        ``PropbankTreePointer`` pointers.
    """

    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        rendered = ["%s" % piece for piece in self.pieces]
        return "*".join(rendered)

    def __repr__(self):
        return "<PropbankChainTreePointer: %s>" % self

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents
        selected from ``tree`` by each piece of this chain.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*CHAIN*", selected)
class PropbankSplitTreePointer(PropbankPointer):
    """
    A pointer to a split constituent, written as its pieces joined by
    ``,``.

    :ivar pieces: A list of the pieces that make up this split
        constituent.  Elements are all ``PropbankTreePointer`` pointers.
    """

    def __init__(self, pieces):
        self.pieces = pieces

    def __str__(self):
        rendered = ["%s" % piece for piece in self.pieces]
        return ",".join(rendered)

    def __repr__(self):
        return "<PropbankSplitTreePointer: %s>" % self

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents
        selected from ``tree`` by each piece of this pointer.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        selected = []
        for piece in self.pieces:
            selected.append(piece.select(tree))
        return Tree("*SPLIT*", selected)
@total_ordering
class PropbankTreePointer(PropbankPointer):
    """
    A pointer to a single constituent, identified by the number of a
    word and a height above that word.  The textual forms handled by
    ``parse`` are::

        wordnum:height*wordnum:height*...
        wordnum:height,

    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string.

        :return: a ``PropbankChainTreePointer`` if ``s`` contains
            ``*``; otherwise a ``PropbankSplitTreePointer`` if it
            contains ``,``; otherwise a ``PropbankTreePointer`` built
            from ``wordnum:height``.
        :raise ValueError: if a simple pointer is not of the form
            ``wordnum:height``.
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return PropbankChainTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return PropbankSplitTreePointer(
                [PropbankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad propbank pointer %r" % s)
        return PropbankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "PropbankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # A chain or split pointer is compared through its first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # A chain or split pointer is compared through its first piece.
        while isinstance(other, (PropbankChainTreePointer, PropbankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, PropbankTreePointer):
            return id(self) < id(other)

        # Order by word number; for the same word, the greater height
        # sorts first.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer points to.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :raise ValueError: if ``tree`` is None.
        :raise IndexError: if the tree has no word numbered
            ``self.wordnum``.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        stack = [tree]
        treepos = []

        wordnum = 0
        # Depth-first walk over the tree, counting non-Tree nodes (words)
        # until the wanted one is reached.
        while stack:
            node = stack[-1]
            # tree node:
            if isinstance(node, Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(node):
                    stack.append(node[treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Drop the word's own index plus ``height`` further
                    # levels from the end of the path.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                wordnum += 1
                stack.pop()

        # The whole tree was walked without reaching the requested word.
        raise IndexError(
            "word number %d is out of range for the given tree" % self.wordnum
        )
class PropbankInflection:
    """
    The inflection of a predicate, stored as five one-character codes:
    form, tense, aspect, person and voice.  ``'-'`` (``NONE``) marks a
    slot that is not set.
    """

    # { Inflection Form
    INFINITIVE = "i"
    GERUND = "g"
    PARTICIPLE = "p"
    FINITE = "v"
    # { Inflection Tense
    FUTURE = "f"
    PAST = "p"
    PRESENT = "n"
    # { Inflection Aspect
    PERFECT = "p"
    PROGRESSIVE = "o"
    PERFECT_AND_PROGRESSIVE = "b"
    # { Inflection Person
    THIRD_PERSON = "3"
    # { Inflection Voice
    ACTIVE = "a"
    PASSIVE = "p"
    # { Inflection
    NONE = "-"
    # }

    def __init__(self, form="-", tense="-", aspect="-", person="-", voice="-"):
        self.form = form
        self.tense = tense
        self.aspect = aspect
        self.person = person
        self.voice = voice

    def __str__(self):
        return self.form + self.tense + self.aspect + self.person + self.voice

    def __repr__(self):
        return "<PropbankInflection: %s>" % self

    # One character class per slot, in the order form, tense, aspect,
    # person, voice.
    _VALIDATE = re.compile(r"[igpv\-][fpn\-][pob\-][3\-][ap\-]$")

    @staticmethod
    def parse(s):
        """
        Build a ``PropbankInflection`` from a five-character string.

        :param s: The inflection string; one code per slot, in the
            order form, tense, aspect, person, voice.
        :raise TypeError: if ``s`` is not a string.
        :raise ValueError: if ``s`` is not exactly five characters or
            uses a code that is not valid for its slot.
        """
        if not isinstance(s, str):
            raise TypeError("expected a string")
        if len(s) != 5 or not PropbankInflection._VALIDATE.match(s):
            raise ValueError("Bad propbank inflection string %r" % s)
        return PropbankInflection(*s)