File size: 16,247 Bytes
d916065
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
# Natural Language Toolkit: NomBank Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Authors: Paul Bedaride <[email protected]>
#          Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

from functools import total_ordering
from xml.etree import ElementTree

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.internals import raise_unorderable_types
from nltk.tree import Tree


class NombankCorpusReader(CorpusReader):
    """

    Corpus reader for the nombank corpus, which augments the Penn

    Treebank with information about the predicate argument structure

    of every noun instance.  The corpus consists of two parts: the

    predicate-argument annotations themselves, and a set of "frameset

    files" which define the argument labels used by the annotations,

    on a per-noun basis.  Each "frameset file" contains one or more

    predicates, such as ``'turn'`` or ``'turn_on'``, each of which is

    divided into coarse-grained word senses called "rolesets".  For

    each "roleset", the frameset file provides descriptions of the

    argument roles, along with examples.

    """

    def __init__(

        self,

        root,

        nomfile,

        framefiles="",

        nounsfile=None,

        parse_fileid_xform=None,

        parse_corpus=None,

        encoding="utf8",

    ):
        """

        :param root: The root directory for this corpus.

        :param nomfile: The name of the file containing the predicate-

            argument annotations (relative to ``root``).

        :param framefiles: A list or regexp specifying the frameset

            fileids for this corpus.

        :param parse_fileid_xform: A transform that should be applied

            to the fileids in this corpus.  This should be a function

            of one argument (a fileid) that returns a string (the new

            fileid).

        :param parse_corpus: The corpus containing the parse trees

            corresponding to this corpus.  These parse trees are

            necessary to resolve the tree pointers used by nombank.

        """

        # If framefiles is specified as a regexp, expand it.
        if isinstance(framefiles, str):
            self._fileids = find_corpus_fileids(root, framefiles)
        self._fileids = list(framefiles)
        # Initialize the corpus reader.
        CorpusReader.__init__(self, root, framefiles, encoding)

        # Record our nom file & nouns file.
        self._nomfile = nomfile
        self._nounsfile = nounsfile
        self._parse_fileid_xform = parse_fileid_xform
        self._parse_corpus = parse_corpus

    def instances(self, baseform=None):
        """

        :return: a corpus view that acts as a list of

            ``NombankInstance`` objects, one for each noun in the corpus.

        """
        kwargs = {}
        if baseform is not None:
            kwargs["instance_filter"] = lambda inst: inst.baseform == baseform
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            lambda stream: self._read_instance_block(stream, **kwargs),
            encoding=self.encoding(self._nomfile),
        )

    def lines(self):
        """

        :return: a corpus view that acts as a list of strings, one for

            each line in the predicate-argument annotation file.

        """
        return StreamBackedCorpusView(
            self.abspath(self._nomfile),
            read_line_block,
            encoding=self.encoding(self._nomfile),
        )

    def roleset(self, roleset_id):
        """

        :return: the xml description for the given roleset.

        """
        baseform = roleset_id.split(".")[0]
        baseform = baseform.replace("perc-sign", "%")
        baseform = baseform.replace("oneslashonezero", "1/10").replace(
            "1/10", "1-slash-10"
        )
        framefile = "frames/%s.xml" % baseform
        if framefile not in self.fileids():
            raise ValueError("Frameset file for %s not found" % roleset_id)

        # n.b.: The encoding for XML fileids is specified by the file
        # itself; so we ignore self._encoding here.
        with self.abspath(framefile).open() as fp:
            etree = ElementTree.parse(fp).getroot()
        for roleset in etree.findall("predicate/roleset"):
            if roleset.attrib["id"] == roleset_id:
                return roleset
        raise ValueError(f"Roleset {roleset_id} not found in {framefile}")

    def rolesets(self, baseform=None):
        """

        :return: list of xml descriptions for rolesets.

        """
        if baseform is not None:
            framefile = "frames/%s.xml" % baseform
            if framefile not in self.fileids():
                raise ValueError("Frameset file for %s not found" % baseform)
            framefiles = [framefile]
        else:
            framefiles = self.fileids()

        rsets = []
        for framefile in framefiles:
            # n.b.: The encoding for XML fileids is specified by the file
            # itself; so we ignore self._encoding here.
            with self.abspath(framefile).open() as fp:
                etree = ElementTree.parse(fp).getroot()
            rsets.append(etree.findall("predicate/roleset"))
        return LazyConcatenation(rsets)

    def nouns(self):
        """

        :return: a corpus view that acts as a list of all noun lemmas

            in this corpus (from the nombank.1.0.words file).

        """
        return StreamBackedCorpusView(
            self.abspath(self._nounsfile),
            read_line_block,
            encoding=self.encoding(self._nounsfile),
        )

    def _read_instance_block(self, stream, instance_filter=lambda inst: True):
        block = []

        # Read 100 at a time.
        for i in range(100):
            line = stream.readline().strip()
            if line:
                inst = NombankInstance.parse(
                    line, self._parse_fileid_xform, self._parse_corpus
                )
                if instance_filter(inst):
                    block.append(inst)

        return block


######################################################################
# { Nombank Instance & related datatypes
######################################################################


class NombankInstance:
    def __init__(

        self,

        fileid,

        sentnum,

        wordnum,

        baseform,

        sensenumber,

        predicate,

        predid,

        arguments,

        parse_corpus=None,

    ):

        self.fileid = fileid
        """The name of the file containing the parse tree for this

        instance's sentence."""

        self.sentnum = sentnum
        """The sentence number of this sentence within ``fileid``.

        Indexing starts from zero."""

        self.wordnum = wordnum
        """The word number of this instance's predicate within its

        containing sentence.  Word numbers are indexed starting from

        zero, and include traces and other empty parse elements."""

        self.baseform = baseform
        """The baseform of the predicate."""

        self.sensenumber = sensenumber
        """The sense number of the predicate."""

        self.predicate = predicate
        """A ``NombankTreePointer`` indicating the position of this

        instance's predicate within its containing sentence."""

        self.predid = predid
        """Identifier of the predicate."""

        self.arguments = tuple(arguments)
        """A list of tuples (argloc, argid), specifying the location

        and identifier for each of the predicate's argument in the

        containing sentence.  Argument identifiers are strings such as

        ``'ARG0'`` or ``'ARGM-TMP'``.  This list does *not* contain

        the predicate."""

        self.parse_corpus = parse_corpus
        """A corpus reader for the parse trees corresponding to the

        instances in this nombank corpus."""

    @property
    def roleset(self):
        """The name of the roleset used by this instance's predicate.

        Use ``nombank.roleset() <NombankCorpusReader.roleset>`` to

        look up information about the roleset."""
        r = self.baseform.replace("%", "perc-sign")
        r = r.replace("1/10", "1-slash-10").replace("1-slash-10", "oneslashonezero")
        return f"{r}.{self.sensenumber}"

    def __repr__(self):
        return "<NombankInstance: {}, sent {}, word {}>".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
        )

    def __str__(self):
        s = "{} {} {} {} {}".format(
            self.fileid,
            self.sentnum,
            self.wordnum,
            self.baseform,
            self.sensenumber,
        )
        items = self.arguments + ((self.predicate, "rel"),)
        for (argloc, argid) in sorted(items):
            s += f" {argloc}-{argid}"
        return s

    def _get_tree(self):
        if self.parse_corpus is None:
            return None
        if self.fileid not in self.parse_corpus.fileids():
            return None
        return self.parse_corpus.parsed_sents(self.fileid)[self.sentnum]

    tree = property(
        _get_tree,
        doc="""

        The parse tree corresponding to this instance, or None if

        the corresponding tree is not available.""",
    )

    @staticmethod
    def parse(s, parse_fileid_xform=None, parse_corpus=None):
        pieces = s.split()
        if len(pieces) < 6:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Divide the line into its basic pieces.
        (fileid, sentnum, wordnum, baseform, sensenumber) = pieces[:5]

        args = pieces[5:]
        rel = [args.pop(i) for i, p in enumerate(args) if "-rel" in p]
        if len(rel) != 1:
            raise ValueError("Badly formatted nombank line: %r" % s)

        # Apply the fileid selector, if any.
        if parse_fileid_xform is not None:
            fileid = parse_fileid_xform(fileid)

        # Convert sentence & word numbers to ints.
        sentnum = int(sentnum)
        wordnum = int(wordnum)

        # Parse the predicate location.

        predloc, predid = rel[0].split("-", 1)
        predicate = NombankTreePointer.parse(predloc)

        # Parse the arguments.
        arguments = []
        for arg in args:
            argloc, argid = arg.split("-", 1)
            arguments.append((NombankTreePointer.parse(argloc), argid))

        # Put it all together.
        return NombankInstance(
            fileid,
            sentnum,
            wordnum,
            baseform,
            sensenumber,
            predicate,
            predid,
            arguments,
            parse_corpus,
        )


class NombankPointer:
    """

    A pointer used by nombank to identify one or more constituents in

    a parse tree.  ``NombankPointer`` is an abstract base class with

    three concrete subclasses:



    - ``NombankTreePointer`` is used to point to single constituents.

    - ``NombankSplitTreePointer`` is used to point to 'split'

      constituents, which consist of a sequence of two or more

      ``NombankTreePointer`` pointers.

    - ``NombankChainTreePointer`` is used to point to entire trace

      chains in a tree.  It consists of a sequence of pieces, which

      can be ``NombankTreePointer`` or ``NombankSplitTreePointer`` pointers.

    """

    def __init__(self):
        if self.__class__ == NombankPointer:
            raise NotImplementedError()


class NombankChainTreePointer(NombankPointer):
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements may

           be either ``NombankSplitTreePointer`` or

           ``NombankTreePointer`` pointers."""

    def __str__(self):
        return "*".join("%s" % p for p in self.pieces)

    def __repr__(self):
        return "<NombankChainTreePointer: %s>" % self

    def select(self, tree):
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*CHAIN*", [p.select(tree) for p in self.pieces])


class NombankSplitTreePointer(NombankPointer):
    def __init__(self, pieces):
        self.pieces = pieces
        """A list of the pieces that make up this chain.  Elements are

           all ``NombankTreePointer`` pointers."""

    def __str__(self):
        return ",".join("%s" % p for p in self.pieces)

    def __repr__(self):
        return "<NombankSplitTreePointer: %s>" % self

    def select(self, tree):
        if tree is None:
            raise ValueError("Parse tree not available")
        return Tree("*SPLIT*", [p.select(tree) for p in self.pieces])


@total_ordering
class NombankTreePointer(NombankPointer):
    """

    wordnum:height*wordnum:height*...

    wordnum:height,



    """

    def __init__(self, wordnum, height):
        self.wordnum = wordnum
        self.height = height

    @staticmethod
    def parse(s):
        # Deal with chains (xx*yy*zz)
        pieces = s.split("*")
        if len(pieces) > 1:
            return NombankChainTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with split args (xx,yy,zz)
        pieces = s.split(",")
        if len(pieces) > 1:
            return NombankSplitTreePointer(
                [NombankTreePointer.parse(elt) for elt in pieces]
            )

        # Deal with normal pointers.
        pieces = s.split(":")
        if len(pieces) != 2:
            raise ValueError("bad nombank pointer %r" % s)
        return NombankTreePointer(int(pieces[0]), int(pieces[1]))

    def __str__(self):
        return f"{self.wordnum}:{self.height}"

    def __repr__(self):
        return "NombankTreePointer(%d, %d)" % (self.wordnum, self.height)

    def __eq__(self, other):
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return self is other

        return self.wordnum == other.wordnum and self.height == other.height

    def __ne__(self, other):
        return not self == other

    def __lt__(self, other):
        while isinstance(other, (NombankChainTreePointer, NombankSplitTreePointer)):
            other = other.pieces[0]

        if not isinstance(other, NombankTreePointer):
            return id(self) < id(other)

        return (self.wordnum, -self.height) < (other.wordnum, -other.height)

    def select(self, tree):
        if tree is None:
            raise ValueError("Parse tree not available")
        return tree[self.treepos(tree)]

    def treepos(self, tree):
        """

        Convert this pointer to a standard 'tree position' pointer,

        given that it points to the given tree.

        """
        if tree is None:
            raise ValueError("Parse tree not available")
        stack = [tree]
        treepos = []

        wordnum = 0
        while True:
            # tree node:
            if isinstance(stack[-1], Tree):
                # Select the next child.
                if len(treepos) < len(stack):
                    treepos.append(0)
                else:
                    treepos[-1] += 1
                # Update the stack.
                if treepos[-1] < len(stack[-1]):
                    stack.append(stack[-1][treepos[-1]])
                else:
                    # End of node's child list: pop up a level.
                    stack.pop()
                    treepos.pop()
            # word node:
            else:
                if wordnum == self.wordnum:
                    return tuple(treepos[: len(treepos) - self.height - 1])
                else:
                    wordnum += 1
                    stack.pop()