# Natural Language Toolkit: Chunked Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

A reader for corpora that contain chunked (and optionally tagged)

documents.

"""

import codecs
import os.path

import nltk
from nltk.chunk import tagstr2tree
from nltk.corpus.reader.api import *
from nltk.corpus.reader.bracket_parse import BracketParseCorpusReader
from nltk.corpus.reader.util import *
from nltk.tokenize import *
from nltk.tree import Tree


class ChunkedCorpusReader(CorpusReader):
    """

    Reader for chunked (and optionally tagged) corpora.  Paragraphs

    are split using a block reader.  They are then tokenized into

    sentences using a sentence tokenizer.  Finally, these sentences

    are parsed into chunk trees using a string-to-chunktree conversion

    function.  Each of these steps can be performed using a default

    function or a custom function.  By default, paragraphs are split

    on blank lines; sentences are listed one per line; and sentences

    are parsed into chunk trees using ``nltk.chunk.tagstr2tree``.

    """

    def __init__(
        self,
        root,
        fileids,
        extension="",
        str2chunktree=tagstr2tree,
        sent_tokenizer=RegexpTokenizer("\n", gaps=True),
        para_block_reader=read_blankline_block,
        encoding="utf8",
        tagset=None,
    ):
        """

        :param root: The root directory for this corpus.

        :param fileids: A list or regexp specifying the fileids in this corpus.

        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._cv_args = (str2chunktree, sent_tokenizer, para_block_reader, tagset)
        """Arguments for corpus views generated by this corpus: a tuple

        (str2chunktree, sent_tokenizer, para_block_tokenizer)"""
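
    # A hedged illustration of the customization hooks (not taken from the
    # NLTK sources): the default NP chunk label can be changed by wrapping
    # ``tagstr2tree``, and the sentence and paragraph splitting can be
    # swapped out in the same way, e.g.
    #
    #     reader = ChunkedCorpusReader(
    #         '/path/to/corpus', r'.*\.chunk',
    #         str2chunktree=lambda s, **kw: tagstr2tree(s, chunk_label='VP', **kw),
    #     )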

    def words(self, fileids=None):
        """

        :return: the given file(s) as a list of words

            and punctuation symbols.

        :rtype: list(str)

        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 0, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def sents(self, fileids=None):
        """

        :return: the given file(s) as a list of

            sentences or utterances, each encoded as a list of word

            strings.

        :rtype: list(list(str))

        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 0, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        """

        :return: the given file(s) as a list of

            paragraphs, each encoded as a list of sentences, which are

            in turn encoded as lists of word strings.

        :rtype: list(list(list(str)))

        """
        return concat(
            [
                ChunkedCorpusView(f, enc, 0, 1, 1, 0, *self._cv_args)
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )
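
    # A sketch of how the plain-text accessors nest, for the sample sentence
    # shown above (word lists only; tags and chunks are discarded):
    #
    #     reader.words()[:3]  # ['The', 'cat', 'sat']
    #     reader.sents()[0]   # ['The', 'cat', 'sat', 'on', 'the', 'mat', '.']
    #     reader.paras()[0]   # [['The', 'cat', 'sat', 'on', 'the', 'mat', '.'], ...]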

    def tagged_words(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of tagged

            words and punctuation symbols, encoded as tuples

            ``(word,tag)``.

        :rtype: list(tuple(str,str))

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_sents(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of

            sentences, each encoded as a list of ``(word,tag)`` tuples.



        :rtype: list(list(tuple(str,str)))

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def tagged_paras(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of

            paragraphs, each encoded as a list of sentences, which are

            in turn encoded as lists of ``(word,tag)`` tuples.

        :rtype: list(list(list(tuple(str,str))))

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 0, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )
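
    # The tagged accessors parallel the plain ones but keep the (word, tag)
    # pairs (a sketch for the same sample sentence):
    #
    #     reader.tagged_words()[0]  # ('The', 'DT')
    #     reader.tagged_sents()[0]  # [('The', 'DT'), ('cat', 'NN'), ('sat', 'VBD'), ...]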

    def chunked_words(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of tagged

            words and chunks.  Words are encoded as ``(word, tag)``

            tuples (if the corpus has tags) or word strings (if the

            corpus has no tags).  Chunks are encoded as depth-one

            trees over ``(word,tag)`` tuples or word strings.

        :rtype: list(tuple(str,str) and Tree)

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 0, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_sents(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of

            sentences, each encoded as a shallow Tree.  The leaves

            of these trees are encoded as ``(word, tag)`` tuples (if

            the corpus has tags) or word strings (if the corpus has no

            tags).

        :rtype: list(Tree)

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 0, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )

    def chunked_paras(self, fileids=None, tagset=None):
        """

        :return: the given file(s) as a list of

            paragraphs, each encoded as a list of sentences, which are

            in turn encoded as a shallow Tree.  The leaves of these

            trees are encoded as ``(word, tag)`` tuples (if the corpus

            has tags) or word strings (if the corpus has no tags).

        :rtype: list(list(Tree))

        """
        return concat(
            [
                ChunkedCorpusView(
                    f, enc, 1, 1, 1, 1, *self._cv_args, target_tagset=tagset
                )
                for (f, enc) in self.abspaths(fileids, True)
            ]
        )
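
    # The chunked accessors return shallow trees built by ``str2chunktree``.
    # For the sample sentence above, the default ``tagstr2tree`` would yield
    # roughly:
    #
    #     Tree('S', [Tree('NP', [('The', 'DT'), ('cat', 'NN')]),
    #                ('sat', 'VBD'), ('on', 'IN'),
    #                Tree('NP', [('the', 'DT'), ('mat', 'NN')]),
    #                ('.', '.')])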

    def _read_block(self, stream):
        return [tagstr2tree(t) for t in read_blankline_block(stream)]


class ChunkedCorpusView(StreamBackedCorpusView):
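    # A corpus view for chunked corpora.  The four boolean flags select the
    # format produced by ``read_block()``: ``tagged`` keeps (word, tag)
    # pairs, ``chunked`` keeps chunk trees, and ``group_by_sent`` /
    # ``group_by_para`` control whether tokens are grouped into sentences
    # and paragraphs.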
    def __init__(
        self,
        fileid,
        encoding,
        tagged,
        group_by_sent,
        group_by_para,
        chunked,
        str2chunktree,
        sent_tokenizer,
        para_block_reader,
        source_tagset=None,
        target_tagset=None,
    ):
        StreamBackedCorpusView.__init__(self, fileid, encoding=encoding)
        self._tagged = tagged
        self._group_by_sent = group_by_sent
        self._group_by_para = group_by_para
        self._chunked = chunked
        self._str2chunktree = str2chunktree
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader
        self._source_tagset = source_tagset
        self._target_tagset = target_tagset

    def read_block(self, stream):
        block = []
        for para_str in self._para_block_reader(stream):
            para = []
            for sent_str in self._sent_tokenizer.tokenize(para_str):
                sent = self._str2chunktree(
                    sent_str,
                    source_tagset=self._source_tagset,
                    target_tagset=self._target_tagset,
                )

                # If requested, throw away the tags.
                if not self._tagged:
                    sent = self._untag(sent)

                # If requested, throw away the chunks.
                if not self._chunked:
                    sent = sent.leaves()

                # Add the sentence to `para`.
                if self._group_by_sent:
                    para.append(sent)
                else:
                    para.extend(sent)

            # Add the paragraph to `block`.
            if self._group_by_para:
                block.append(para)
            else:
                block.extend(para)

        # Return the block
        return block

    def _untag(self, tree):
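        # A sketch of the effect: a subtree such as
        #     Tree('NP', [('the', 'DT'), ('cat', 'NN')])
        # is rewritten in place to
        #     Tree('NP', ['the', 'cat'])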
        for i, child in enumerate(tree):
            if isinstance(child, Tree):
                self._untag(child)
            elif isinstance(child, tuple):
                tree[i] = child[0]
            else:
                raise ValueError("expected child to be Tree or tuple")
        return tree