Spaces:
Sleeping
Sleeping
File size: 8,456 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 |
# Natural Language Toolkit: Plaintext Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Steven Bird <[email protected]>
# Edward Loper <[email protected]>
# Nitin Madnani <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A reader for corpora that consist of plaintext documents.
"""
import nltk.data
from nltk.corpus.reader.api import *
from nltk.corpus.reader.util import *
from nltk.tokenize import *
class PlaintextCorpusReader(CorpusReader):
"""
Reader for corpora that consist of plaintext documents. Paragraphs
are assumed to be split using blank lines. Sentences and words can
be tokenized using the default tokenizers, or by custom tokenizers
specified as parameters to the constructor.
This corpus reader can be customized (e.g., to skip preface
sections of specific document formats) by creating a subclass and
overriding the ``CorpusView`` class variable.
"""
CorpusView = StreamBackedCorpusView
"""The corpus view class used by this reader. Subclasses of
``PlaintextCorpusReader`` may specify alternative corpus view
classes (e.g., to skip the preface sections of documents.)"""
def __init__(
self,
root,
fileids,
word_tokenizer=WordPunctTokenizer(),
sent_tokenizer=nltk.data.LazyLoader("tokenizers/punkt/english.pickle"),
para_block_reader=read_blankline_block,
encoding="utf8",
):
r"""
Construct a new plaintext corpus reader for a set of documents
located at the given root directory. Example usage:
>>> root = '/usr/local/share/nltk_data/corpora/webtext/'
>>> reader = PlaintextCorpusReader(root, '.*\.txt') # doctest: +SKIP
:param root: The root directory for this corpus.
:param fileids: A list or regexp specifying the fileids in this corpus.
:param word_tokenizer: Tokenizer for breaking sentences or
paragraphs into words.
:param sent_tokenizer: Tokenizer for breaking paragraphs
into words.
:param para_block_reader: The block reader used to divide the
corpus into paragraph blocks.
"""
CorpusReader.__init__(self, root, fileids, encoding)
self._word_tokenizer = word_tokenizer
self._sent_tokenizer = sent_tokenizer
self._para_block_reader = para_block_reader
def words(self, fileids=None):
"""
:return: the given file(s) as a list of words
and punctuation symbols.
:rtype: list(str)
"""
return concat(
[
self.CorpusView(path, self._read_word_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)
]
)
def sents(self, fileids=None):
"""
:return: the given file(s) as a list of
sentences or utterances, each encoded as a list of word
strings.
:rtype: list(list(str))
"""
if self._sent_tokenizer is None:
raise ValueError("No sentence tokenizer for this corpus")
return concat(
[
self.CorpusView(path, self._read_sent_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)
]
)
def paras(self, fileids=None):
"""
:return: the given file(s) as a list of
paragraphs, each encoded as a list of sentences, which are
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
if self._sent_tokenizer is None:
raise ValueError("No sentence tokenizer for this corpus")
return concat(
[
self.CorpusView(path, self._read_para_block, encoding=enc)
for (path, enc, fileid) in self.abspaths(fileids, True, True)
]
)
def _read_word_block(self, stream):
words = []
for i in range(20): # Read 20 lines at a time.
words.extend(self._word_tokenizer.tokenize(stream.readline()))
return words
def _read_sent_block(self, stream):
sents = []
for para in self._para_block_reader(stream):
sents.extend(
[
self._word_tokenizer.tokenize(sent)
for sent in self._sent_tokenizer.tokenize(para)
]
)
return sents
def _read_para_block(self, stream):
paras = []
for para in self._para_block_reader(stream):
paras.append(
[
self._word_tokenizer.tokenize(sent)
for sent in self._sent_tokenizer.tokenize(para)
]
)
return paras
class CategorizedPlaintextCorpusReader(CategorizedCorpusReader, PlaintextCorpusReader):
"""
A reader for plaintext corpora whose documents are divided into
categories based on their file identifiers.
"""
def __init__(self, *args, **kwargs):
"""
Initialize the corpus reader. Categorization arguments
(``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
the ``CategorizedCorpusReader`` constructor. The remaining arguments
are passed to the ``PlaintextCorpusReader`` constructor.
"""
CategorizedCorpusReader.__init__(self, kwargs)
PlaintextCorpusReader.__init__(self, *args, **kwargs)
# FIXME: Is there a better way? How to not hardcode this?
# Possibly, add a language kwargs to CategorizedPlaintextCorpusReader to
# override the `sent_tokenizer`.
class PortugueseCategorizedPlaintextCorpusReader(CategorizedPlaintextCorpusReader):
def __init__(self, *args, **kwargs):
CategorizedCorpusReader.__init__(self, kwargs)
kwargs["sent_tokenizer"] = nltk.data.LazyLoader(
"tokenizers/punkt/portuguese.pickle"
)
PlaintextCorpusReader.__init__(self, *args, **kwargs)
class EuroparlCorpusReader(PlaintextCorpusReader):
"""
Reader for Europarl corpora that consist of plaintext documents.
Documents are divided into chapters instead of paragraphs as
for regular plaintext documents. Chapters are separated using blank
lines. Everything is inherited from ``PlaintextCorpusReader`` except
that:
- Since the corpus is pre-processed and pre-tokenized, the
word tokenizer should just split the line at whitespaces.
- For the same reason, the sentence tokenizer should just
split the paragraph at line breaks.
- There is a new 'chapters()' method that returns chapters instead
instead of paragraphs.
- The 'paras()' method inherited from PlaintextCorpusReader is
made non-functional to remove any confusion between chapters
and paragraphs for Europarl.
"""
def _read_word_block(self, stream):
words = []
for i in range(20): # Read 20 lines at a time.
words.extend(stream.readline().split())
return words
def _read_sent_block(self, stream):
sents = []
for para in self._para_block_reader(stream):
sents.extend([sent.split() for sent in para.splitlines()])
return sents
def _read_para_block(self, stream):
paras = []
for para in self._para_block_reader(stream):
paras.append([sent.split() for sent in para.splitlines()])
return paras
def chapters(self, fileids=None):
"""
:return: the given file(s) as a list of
chapters, each encoded as a list of sentences, which are
in turn encoded as lists of word strings.
:rtype: list(list(list(str)))
"""
return concat(
[
self.CorpusView(fileid, self._read_para_block, encoding=enc)
for (fileid, enc) in self.abspaths(fileids, True)
]
)
def paras(self, fileids=None):
raise NotImplementedError(
"The Europarl corpus reader does not support paragraphs. Please use chapters() instead."
)
|