# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Authors: Paul Bedaride
#          Edward Loper
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from functools import total_ordering
from xml.etree import ElementTree

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree


class NombankCorpusReader(CorpusReader):
    """
    Corpus reader for the nombank corpus, which augments the Penn
    Treebank with information about the predicate argument structure
    of every noun instance.  The corpus consists of two parts: the
    predicate-argument annotations themselves, and a set of "frameset
    files" which define the argument labels used by the annotations,
    on a per-noun basis.  Each "frameset file" contains one or more
    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is
    divided into coarse-grained word senses called "rolesets".  For
    each "roleset", the frameset file provides descriptions of the
    argument roles, along with examples.
    """

    def __init__(
        self,
        root,
        nomfile,
        framefiles="",
        nounsfile=None,
        parse_fileid_xform=None,
        parse_corpus=None,
        encoding="utf8",
    ):
        """
        :param root: The root directory for this corpus.
        :param nomfile: The name of the file containing the predicate-
            argument annotations (relative to ``root``).
        :param framefiles: A list or regexp specifying the frameset
            fileids for this corpus.
        :param nounsfile: The name of the file read by ``nouns()``
            (relative to ``root``), or None if there is none.
        :param parse_fileid_xform: A transform that should be applied
            to the fileids in this corpus.  This should be a function
            of one argument (a fileid) that returns a string (the new
            fileid).
        :param parse_corpus: The corpus containing the parse trees
            corresponding to this corpus.  These parse trees are
            necessary to resolve the tree pointers used by nombank.
        :param encoding: The encoding passed on to ``CorpusReader``.
        """
        # If framefiles is specified as a regexp, expand it.  The result
        # is kept in ``framefiles`` (rather than written to
        # ``self._fileids``) so that the very same list is handed to
        # ``CorpusReader.__init__`` below.
        if isinstance(framefiles, str):
            framefiles = find_corpus_fileids(root, framefiles)
        # Materialise once, so a one-shot iterable is not exhausted
        # before the base class sees it.
        framefiles = list(framefiles)

        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """
        :param baseform: If given, only instances whose ``baseform``
            equals this value are included.
        :return: a corpus view that acts as a list of
            ``NombankInstance`` objects, one for each noun in the corpus.
        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """
        :return: a corpus view that acts as a list of strings, one for
            each line in the predicate-argument annotation file.
        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """
        :param roleset_id: A roleset identifier of the form
            ``'<baseform>.<sense>'``.
        :return: the xml description for the given roleset.
        :raise ValueError: if the frameset file, or the roleset inside
            it, cannot be found.
        """
        # Map the escaped baseform used in roleset ids onto the spelling
        # used for the frameset file name.
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """
        :param baseform: If given, only the frameset file
            ``frames/<baseform>.xml`` is read; otherwise every fileid
            of this corpus is read.
        :return: list of xml descriptions for rolesets.
        :raise ValueError: if ``baseform`` is given but its frameset
            file is not among this corpus's fileids.
        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """
        :return: a corpus view that acts as a list of all noun lemmas
            in this corpus (from the nombank.1.0.words file).
        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        """
        Read up to 100 lines from ``stream`` and return the
        ``NombankInstance`` objects parsed from the non-blank ones that
        satisfy ``instance_filter``.
        """
        block = []

        # Read 100 at a time.
        for _ in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block


######################################################################
# { Nombank Instance & related datatypes
######################################################################


class NombankInstance:
    """
    A single annotated noun instance: the location of the predicate,
    its sense, and the locations and labels of its arguments.
    """

    def __init__(
        self,
        fileid,
        sentnum,
        wordnum,
        baseform,
        sensenumber,
        predicate,
        predid,
        arguments,
        parse_corpus=None,
    ):

        self.fileid = fileid
        """The name of the file containing the parse tree for this
        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.
        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its
        containing sentence.  Word numbers are indexed starting from
        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this
        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A tuple of pairs (argloc, argid), specifying the location
        and identifier for each of the predicate's argument in the
        containing sentence.  Argument identifiers are strings such as
        ``'ARG0'`` or ``'ARGM-TMP'``.  This sequence does *not* contain
        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the
        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.
        Use ``NombankCorpusReader.roleset()`` to look up information
        about the roleset."""
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        # An empty format string here would discard all three fields and
        # produce an empty repr.
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        """Return this instance in the annotation file's line format."""
        head = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        # The predicate is emitted alongside the arguments, labelled "rel".
        items = self.arguments + ((self.predicate, "rel"),)
        tail = "".join(f" {argloc}-{argid}" for (argloc, argid) in sorted(items))
        return head + tail

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""
        The parse tree corresponding to this instance, or None if
        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        """
        Build a ``NombankInstance`` from one line of the annotation file.

        :param s: The annotation line.
        :param parse_fileid_xform: Optional function applied to the
            fileid field before it is stored.
        :param parse_corpus: Stored on the resulting instance.
        :raise ValueError: if the line has fewer than six fields, or
            does not contain exactly one ``-rel`` field.
        """
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        # Separate the predicate ("-rel") field from the argument fields
        # without mutating the list being scanned; popping while
        # enumerating would skip the element following each match and
        # could let a second "-rel" field slip through undetected.
        rel = [p for p in pieces[5:] if "-rel" in p]
        args = [p for p in pieces[5:] if "-rel" not in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.
        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )


class NombankPointer:
    """
    A pointer used by nombank to identify one or more constituents in
    a parse tree.  ``NombankPointer`` is an abstract base class with
    three concrete subclasses:

    - ``NombankTreePointer`` is used to point to single constituents.
    - ``NombankSplitTreePointer`` is used to point to 'split'
      constituents, which consist of a sequence of two or more
      ``NombankTreePointer`` pointers.
    - ``NombankChainTreePointer`` is used to point to entire trace
      chains in a tree.  It consists of a sequence of pieces, which
      can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.
    """

    def __init__(self):
        # Only the base class itself refuses instantiation.
        if self.__class__ == NombankPointer:
            raise NotImplementedError()


class NombankChainTreePointer(NombankPointer):
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may
           be either ``NombankSplitTreePointer`` or
           ``NombankTreePointer`` pointers."""

    def __str__(self):
        return "*".join(str(p) for p in self.pieces)

    def __repr__(self):
        # ``"" % self`` would raise TypeError (argument not consumed).
        return f"<NombankChainTreePointer: {self}>"

    def select(self, tree):
        """
        Return a ``*CHAIN*`` tree whose children are the constituents
        selected by each piece.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])


class NombankSplitTreePointer(NombankPointer):
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements are
           all ``NombankTreePointer`` pointers."""

    def __str__(self):
        return ",".join(str(p) for p in self.pieces)

    def __repr__(self):
        # ``"" % self`` would raise TypeError (argument not consumed).
        return f"<NombankSplitTreePointer: {self}>"

    def select(self, tree):
        """
        Return a ``*SPLIT*`` tree whose children are the constituents
        selected by each piece.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])


@total_ordering
class NombankTreePointer(NombankPointer):
    """
    A pointer to a single constituent, written ``wordnum:height``.

    The textual pointer syntax accepted by ``parse`` is::

        wordnum:height*wordnum:height*...   (chain)
        wordnum:height,wordnum:height,...   (split)
        wordnum:height                      (single constituent)
    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        """
        Parse a pointer string into a ``NombankTreePointer``,
        ``NombankSplitTreePointer`` or ``NombankChainTreePointer``.

        :raise ValueError: if a simple pointer is not of the form
            ``wordnum:height``.
        """
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        # Chain and split pointers are compared through their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        # Chain and split pointers are compared through their first piece.
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)

        # Order by word number, then by decreasing height.
        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        """
        Return the constituent of ``tree`` that this pointer designates.

        :raise ValueError: if ``tree`` is None.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """
        Convert this pointer to a standard 'tree position' pointer,
        given that it points to the given tree.

        :raise ValueError: if ``tree`` is None.
        :raise IndexError: if the traversal exhausts the tree without
            reaching word number ``self.wordnum``.
        """
        if tree is None:
            raise ValueError("Parse tree not available")
        # ``stack`` holds the nodes on the path from the root to the
        # node currently being visited; ``treepos`` holds the child
        # indices taken along that path.
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    # Drop the leaf index plus ``height`` further levels.
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()