# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
#         Edward Loper <[email protected]>
#         Nitin Madnani <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""
A reader for corpora that consist of plaintext documents.
"""

import nltk.data
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *


class PlaintextCorpusReader(CorpusReader):
    """
    Reader for corpora that consist of plaintext documents.  Paragraphs
    are assumed to be split using blank lines.  Sentences and words can
    be tokenized using the default tokenizers, or by custom tokenizers
    specified as parameters to the constructor.

    This corpus reader can be customized (e.g., to skip preface
    sections of specific document formats) by creating a subclass and
    overriding the ``CorpusView`` class variable.
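
    A minimal usage sketch (the root path is illustrative and assumes a
    locally available plaintext corpus); ``words()``, ``sents()`` and
    ``paras()`` return word tokens, tokenized sentences and tokenized
    paragraphs respectively:

        >>> from nltk.corpus.reader import PlaintextCorpusReader
        >>> reader = PlaintextCorpusReader('/path/to/corpus', '.*') # doctest: +SKIP
        >>> words = reader.words() # doctest: +SKIP
        >>> sents = reader.sents() # doctest: +SKIP
        >>> paras = reader.paras() # doctest: +SKIP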
    """

    CorpusView = StreamBackedCorpusView
    """The corpus view class used by this reader.  Subclasses of
       ``PlaintextCorpusReader`` may specify alternative corpus view
       classes (e.g., to skip the preface sections of documents.)"""

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
        para_block_reader=read_blankline_block,
        encoding="utf8",
    ):
        r"""
        Construct a new plaintext corpus reader for a set of documents
        located at the given root directory.  Example usage:

            >>> root = '/usr/local/share/nltk_data/corpora/webtext/'
            >>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP

        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking sentences or
            paragraphs into words.
        :param sent_tokenizer: Tokenizer for breaking paragraphs
            into sentences.
        :param para_block_reader: The block reader used to divide the
            corpus into paragraph blocks.
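
        Both tokenizers can be overridden.  For example, a corpus that is
        already tokenized with one sentence per line could be read with the
        whitespace and line tokenizers from ``nltk.tokenize`` (the fileid
        pattern is illustrative):

            >>> from nltk.tokenize import WhitespaceTokenizer, LineTokenizer
            >>> reader = PlaintextCorpusReader(
            ...     root, '.*\.txt',
            ...     word_tokenizer=WhitespaceTokenizer(),
            ...     sent_tokenizer=LineTokenizer()) # doctest: +SKIP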
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        self._word_tokenizer = word_tokenizer
        self._sent_tokenizer = sent_tokenizer
        self._para_block_reader = para_block_reader

    def words(self, fileids=None):
        """
        :return: the given file(s) as a list of words
            and punctuation symbols.
        :rtype: list(str)
        """
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def sents(self, fileids=None):
        """
        :return: the given file(s) as a list of
            sentences or utterances, each encoded as a list of word
            strings.
        :rtype: list(list(str))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")

        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def paras(self, fileids=None):
        """
        :return: the given file(s) as a list of
            paragraphs, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        if self._sent_tokenizer is None:
            raise ValueError("No sentence tokenizer for this corpus")

        return concat(
            [
                self.CorpusView(path, self._read_para_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_word_block(self, stream):
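        # Tokenize the stream in fixed-size blocks; the corpus view calls this
        # repeatedly, so the whole file never has to be read into memory at once.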
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(self._word_tokenizer.tokenize(stream.readline()))
        return words

    def _read_sent_block(self, stream):
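        # Each call consumes the next paragraph block(s) from the stream and
        # returns their sentences as a flat list of word-token lists.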
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return sents

    def _read_para_block(self, stream):
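        # Like _read_sent_block, but keeps the sentences of each paragraph
        # grouped together in a nested list.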
        paras = []
        for para in self._para_block_reader(stream):
            paras.append(
                [
                    self._word_tokenizer.tokenize(sent)
                    for sent in self._sent_tokenizer.tokenize(para)
                ]
            )
        return paras


class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
    """
    A reader for plaintext corpora whose documents are divided into
    categories based on their file identifiers.
    """

    def __init__(self, *args, **kwargs):
        """
        Initialize the corpus reader.  Categorization arguments
        (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
        the ``CategorizedCorpusReader`` constructor.  The remaining arguments
        are passed to the ``PlaintextCorpusReader`` constructor.
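
        For example, a corpus laid out with one directory per category could
        be read with a ``cat_pattern`` that maps each fileid to its top-level
        directory (the root path and patterns are illustrative; ``categories()``
        and the ``categories=`` keyword come from ``CategorizedCorpusReader``):

            >>> reader = CategorizedPlaintextCorpusReader(
            ...     '/path/to/corpus', '.*', cat_pattern=r'(neg|pos)/.*') # doctest: +SKIP
            >>> reader.categories() # doctest: +SKIP
            >>> reader.words(categories=['pos']) # doctest: +SKIP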
        """
        CategorizedCorpusReader.__init__(self, kwargs)
        PlaintextCorpusReader.__init__(self, *args, **kwargs)


# FIXME: Is there a better way? How can we avoid hardcoding this?
#       Possibly, add a language kwarg to CategorizedPlaintextCorpusReader to
#       override the `sent_tokenizer`.
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
    def __init__(self, *args, **kwargs):
        CategorizedCorpusReader.__init__(self, kwargs)
        kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
            "tokenizers/punkt/portuguese.pickle"
        )
        PlaintextCorpusReader.__init__(self, *args, **kwargs)


class EuroparlCorpusReader(PlaintextCorpusReader):

    """
    Reader for Europarl corpora that consist of plaintext documents.
    Documents are divided into chapters instead of paragraphs as
    for regular plaintext documents. Chapters are separated using blank
    lines. Everything is inherited from ``PlaintextCorpusReader`` except
    that:

    - Since the corpus is pre-processed and pre-tokenized, the
      word tokenizer should just split the line at whitespaces.
    - For the same reason, the sentence tokenizer should just
      split the paragraph at line breaks.
    - There is a new 'chapters()' method that returns chapters instead
      of paragraphs.
    - The 'paras()' method inherited from PlaintextCorpusReader is
      made non-functional to remove any confusion between chapters
      and paragraphs for Europarl.
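
    A short usage sketch (the root path is illustrative); ``words()`` splits
    lines at whitespace and ``chapters()`` groups tokenized sentences by chapter:

        >>> reader = EuroparlCorpusReader('/path/to/europarl', '.*') # doctest: +SKIP
        >>> chapters = reader.chapters() # doctest: +SKIP
        >>> words = reader.words() # doctest: +SKIP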
    """

    def _read_word_block(self, stream):
        words = []
        for i in range(20):  # Read 20 lines at a time.
            words.extend(stream.readline().split())
        return words

    def _read_sent_block(self, stream):
        sents = []
        for para in self._para_block_reader(stream):
            sents.extend([sent.split() for sent in para.splitlines()])
        return sents

    def _read_para_block(self, stream):
        paras = []
        for para in self._para_block_reader(stream):
            paras.append([sent.split() for sent in para.splitlines()])
        return paras

    def chapters(self, fileids=None):
        """
        :return: the given file(s) as a list of
            chapters, each encoded as a list of sentences, which are
            in turn encoded as lists of word strings.
        :rtype: list(list(list(str)))
        """
        return concat(
            [
                self.CorpusView(fileid, self._read_para_block, encoding=enc)
                for (fileid, enc) in self.abspaths(fileids, True)
            ]
        )

    def paras(self, fileids=None):
        raise NotImplementedError(
            "The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
        )