# Natural Language Toolkit: Penn Treebank Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""

Corpus reader for corpora that consist of parenthesis-delineated parse trees.

"""

import re
import sys

from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tag import map_tag
from nltk.tree import Tree

# we use [^\s()]+ instead of \S+? to avoid matching ()
SORTTAGWRD = re.compile(r"\((\d+) ([^\s()]+) ([^\s()]+)\)")
TAGWORD = re.compile(r"\(([^\s()]+) ([^\s()]+)\)")
WORD = re.compile(r"\([^\s()]+ ([^\s()]+)\)")
EMPTY_BRACKETS = re.compile(r"\s*\(\s*\(")


class BracketParseCorpusReader(SyntaxCorpusReader):
    """

    Reader for corpora that consist of parenthesis-delineated parse trees,

    like those found in the "combined" section of the Penn Treebank,

    e.g. "(S (NP (DT the) (JJ little) (NN dog)) (VP (VBD barked)))".



    """

    def __init__(
        self,
        root,
        fileids,
        comment_char=None,
        detect_blocks="unindented_paren",
        encoding="utf8",
        tagset=None,
    ):
        """

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        :param comment_char: The character which can appear at the start of

            a line to indicate that the rest of the line is a comment.

        :param detect_blocks: The method that is used to find blocks

            in the corpus; can be 'unindented_paren' (every unindented

            parenthesis starts a new parse) or 'sexpr' (brackets are

            matched).

        :param tagset: The name of the tagset used by this corpus, to be used

            for normalizing or converting the POS tags returned by the

            ``tagged_...()`` methods.

        """
        SyntaxCorpusReader.__init__(self, root, fileids, encoding)
        self._comment_char = comment_char
        self._detect_blocks = detect_blocks
        self._tagset = tagset

    def _read_block(self, stream):
        if self._detect_blocks == "sexpr":
            return read_sexpr_block(stream, comment_char=self._comment_char)
        elif self._detect_blocks == "blankline":
            return read_blankline_block(stream)
        elif self._detect_blocks == "unindented_paren":
            # Tokens start with unindented left parens.
            toks = read_regexp_block(stream, start_re=r"^\(")
            # Strip any comments out of the tokens.
            if self._comment_char:
                toks = [
                    re.sub("(?m)^%s.*" % re.escape(self._comment_char), "", tok)
                    for tok in toks
                ]
            return toks
        else:
            assert 0, "bad block type"

    def _normalize(self, t):
        # Replace leaves of the form (!), (,), with (! !), (, ,)
        t = re.sub(r"\((.)\)", r"(\1 \1)", t)
        # Replace leaves of the form (tag word root) with (tag word)
        t = re.sub(r"\(([^\s()]+) ([^\s()]+) [^\s()]+\)", r"(\1 \2)", t)
        return t

    def _parse(self, t):
        try:
            tree = Tree.fromstring(self._normalize(t))
            # If there's an empty node at the top, strip it off
            if tree.label() == "" and len(tree) == 1:
                return tree[0]
            else:
                return tree

        except ValueError as e:
            sys.stderr.write("Bad tree detected; trying to recover...\n")
            # Try to recover, if we can:
            if e.args == ("mismatched parens",):
                for n in range(1, 5):
                    try:
                        v = Tree.fromstring(self._normalize(t + ")" * n))
                        sys.stderr.write(
                            "  Recovered by adding %d close paren(s)\n" % n
                        )
                        return v
                    except ValueError:
                        pass
            # Try something else:
            sys.stderr.write("  Recovered by returning a flat parse.\n")
            # sys.stderr.write(' '.join(t.split())+'\n')
            return Tree("S", self._tag(t))

    def _tag(self, t, tagset=None):
        tagged_sent = [(w, p) for (p, w) in TAGWORD.findall(self._normalize(t))]
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (w, p) in tagged_sent
            ]
        return tagged_sent

    def _word(self, t):
        return WORD.findall(self._normalize(t))


class CategorizedBracketParseCorpusReader(
    CategorizedCorpusReader, BracketParseCorpusReader
):
    """

    A reader for parsed corpora whose documents are

    divided into categories based on their file identifiers.

    @author: Nathan Schneider <[email protected]>

    """

    def __init__(self, *args, **kwargs):
        """

        Initialize the corpus reader.  Categorization arguments

        (C{cat_pattern}, C{cat_map}, and C{cat_file}) are passed to

        the L{CategorizedCorpusReader constructor

        <CategorizedCorpusReader.__init__>}.  The remaining arguments

        are passed to the L{BracketParseCorpusReader constructor

        <BracketParseCorpusReader.__init__>}.

        """
        CategorizedCorpusReader.__init__(self, kwargs)
        BracketParseCorpusReader.__init__(self, *args, **kwargs)

    def tagged_words(self, fileids=None, categories=None, tagset=None):
        return super().tagged_words(self._resolve(fileids, categories), tagset)

    def tagged_sents(self, fileids=None, categories=None, tagset=None):
        return super().tagged_sents(self._resolve(fileids, categories), tagset)

    def tagged_paras(self, fileids=None, categories=None, tagset=None):
        return super().tagged_paras(self._resolve(fileids, categories), tagset)

    def parsed_words(self, fileids=None, categories=None):
        return super().parsed_words(self._resolve(fileids, categories))

    def parsed_sents(self, fileids=None, categories=None):
        return super().parsed_sents(self._resolve(fileids, categories))

    def parsed_paras(self, fileids=None, categories=None):
        return super().parsed_paras(self._resolve(fileids, categories))


class AlpinoCorpusReader(BracketParseCorpusReader):
    """

    Reader for the Alpino Dutch Treebank.

    This corpus has a lexical breakdown structure embedded, as read by `_parse`

    Unfortunately this puts punctuation and some other words out of the sentence

    order in the xml element tree. This is no good for `tag_` and `word_`

    `_tag` and `_word` will be overridden to use a non-default new parameter 'ordered'

    to the overridden _normalize function. The _parse function can then remain

    untouched.

    """

    def __init__(self, root, encoding="ISO-8859-1", tagset=None):
        BracketParseCorpusReader.__init__(
            self,
            root,
            r"alpino\.xml",
            detect_blocks="blankline",
            encoding=encoding,
            tagset=tagset,
        )

    def _normalize(self, t, ordered=False):
        """Normalize the xml sentence element in t.

        The sentence elements <alpino_ds>, although embedded in a few overall

        xml elements, are separated by blank lines. That's how the reader can

        deliver them one at a time.

        Each sentence has a few category subnodes that are of no use to us.

        The remaining word nodes may or may not appear in the proper order.

        Each word node has attributes, among which:

        - begin : the position of the word in the sentence

        - pos   : Part of Speech: the Tag

        - word  : the actual word

        The return value is a string with all xml elementes replaced by

        clauses: either a cat clause with nested clauses, or a word clause.

        The order of the bracket clauses closely follows the xml.

        If ordered == True, the word clauses include an order sequence number.

        If ordered == False, the word clauses only have pos and word parts.

        """
        if t[:10] != "<alpino_ds":
            return ""
        # convert XML to sexpr notation
        t = re.sub(r'  <node .*? cat="(\w+)".*>', r"(\1", t)
        if ordered:
            t = re.sub(
                r'  <node .*?begin="(\d+)".*? pos="(\w+)".*? word="([^"]+)".*?/>',
                r"(\1 \2 \3)",
                t,
            )
        else:
            t = re.sub(r'  <node .*?pos="(\w+)".*? word="([^"]+)".*?/>', r"(\1 \2)", t)
        t = re.sub(r"  </node>", r")", t)
        t = re.sub(r"<sentence>.*</sentence>", r"", t)
        t = re.sub(r"</?alpino_ds.*>", r"", t)
        return t

    def _tag(self, t, tagset=None):
        tagged_sent = [
            (int(o), w, p)
            for (o, p, w) in SORTTAGWRD.findall(self._normalize(t, ordered=True))
        ]
        tagged_sent.sort()
        if tagset and tagset != self._tagset:
            tagged_sent = [
                (w, map_tag(self._tagset, tagset, p)) for (o, w, p) in tagged_sent
            ]
        else:
            tagged_sent = [(w, p) for (o, w, p) in tagged_sent]
        return tagged_sent

    def _word(self, t):
        """Return a correctly ordered list if words"""
        tagged_sent = self._tag(t)
        return [w for (w, p) in tagged_sent]