# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Pierpaolo Pantone <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Pros and Cons dataset.
- Pros and Cons dataset information -
Contact: Bing Liu, [email protected]
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".
Proceedings of the 22nd International Conference on Computational Linguistics
(Coling-2008), Manchester, 18-22 August, 2008.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing
Opinions on the Web". Proceedings of the 14th international World Wide Web
conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.
"""

import re

from nltk.corpus.reader.api import *
from nltk.tokenize import *


class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
"""
Reader for the Pros and Cons sentence dataset.
>>> from nltk.corpus import pros_cons
>>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE
[['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',
'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],
...]
>>> pros_cons.words('IntegratedPros.txt')
['Easy', 'to', 'use', ',', 'economical', '!', ...]
"""
CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        :param kwargs: additional parameters passed to CategorizedCorpusReader.
        """
        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
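
    # A minimal construction sketch (comments only, not executed): the ``root``
    # path, file names, and ``cat_pattern`` below are assumptions about the
    # usual corpus layout, in which the category is read off the fileid; in
    # practice the corpus is normally accessed via ``nltk.corpus.pros_cons``.
    #
    #     reader = ProsConsCorpusReader(
    #         root="/path/to/pros_cons",
    #         fileids=r"Integrated(Cons|Pros)\.txt",
    #         cat_pattern=r"Integrated(Cons|Pros)\.txt",
    #     )
    #     reader.sents(categories="Pros")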

    def sents(self, fileids=None, categories=None):
        """
        Return all sentences in the corpus or in the specified files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :param categories: a list specifying the categories whose sentences
            have to be returned.
        :return: the given file(s) as a list of sentences. Each sentence is
            tokenized using the specified word_tokenizer.
        :rtype: list(list(str))
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )
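
    # Sketch (assuming the fileid-to-category layout described above): the
    # following two calls would return the same sentences:
    #
    #     pros_cons.sents(categories='Pros')
    #     pros_cons.sents('IntegratedPros.txt')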

    def words(self, fileids=None, categories=None):
        """
        Return all words and punctuation symbols in the corpus or in the specified
        files/categories.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :param categories: a list specifying the categories whose words have
            to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # readline() returns '' at end of stream.
                continue
            # Each data line holds one sentence wrapped in <Pros>...</Pros> or
            # <Cons>...</Cons>; group(2) is the sentence text between the tags.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents
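
    # Illustration (not executed): a corpus line wraps a single sentence in
    # <Pros>...</Pros> or <Cons>...</Cons> tags, e.g.
    #
    #     <Cons>East batteries! On-off switch too easy to maneuver.</Cons>
    #
    # The regex above captures the text between the tags, and the word
    # tokenizer turns it into ['East', 'batteries', '!', 'On', '-', 'off', ...].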

    def _read_word_block(self, stream):
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
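

# A minimal usage sketch, assuming the 'pros_cons' corpus data has been
# installed (e.g. via ``nltk.download('pros_cons')``); ``nltk.corpus.pros_cons``
# is the standard loader that wraps this reader.
if __name__ == "__main__":
    from nltk.corpus import pros_cons

    # Show the available categories and a few tokenized 'Pros' sentences.
    print(pros_cons.categories())
    for sent in pros_cons.sents(categories="Pros")[:3]:
        print(sent)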