Spaces:
Sleeping
Sleeping
File size: 4,230 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 |
# Natural Language Toolkit: Opinion Lexicon Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Pierpaolo Pantone <24alsecondo@gmail.com>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
CorpusReader for the Opinion Lexicon.
Opinion Lexicon information
===========================
Authors: Minqing Hu and Bing Liu, 2004.
Department of Computer Science
University of Illinois at Chicago
Contact: Bing Liu, liub@cs.uic.edu
https://www.cs.uic.edu/~liub
Distributed with permission.
Related papers:
- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery
& Data Mining (KDD-04), Aug 22-25, 2004, Seattle, Washington, USA.
- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and
Comparing Opinions on the Web". Proceedings of the 14th International World
Wide Web conference (WWW-2005), May 10-14, 2005, Chiba, Japan.
"""
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus.reader.api import *
class IgnoreReadmeCorpusView(StreamBackedCorpusView):
    """
    Corpus view that starts reading after the corpus file's initial readme
    block, so the readme never shows up among the view's tokens.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The underlying stream must be open before it can be read from.
        self._open()
        # Consume the leading readme block and discard what was read.
        read_blankline_block(self._stream)
        # The first block of real content begins wherever the stream now is.
        readme_end = self._stream.tell()
        self._filepos = [readme_end]
class OpinionLexiconCorpusReader(WordListCorpusReader):
    """
    Reader for Liu and Hu opinion lexicon.  Blank lines and readme are ignored.

        >>> from nltk.corpus import opinion_lexicon
        >>> opinion_lexicon.words()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    The OpinionLexiconCorpusReader provides shortcuts to retrieve positive/negative
    words:

        >>> opinion_lexicon.negative()
        ['2-faced', '2-faces', 'abnormal', 'abolish', ...]

    Note that words from `words()` method are sorted by file id, not alphabetically:

        >>> opinion_lexicon.words()[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort', 'aborted']
        >>> sorted(opinion_lexicon.words())[0:10] # doctest: +NORMALIZE_WHITESPACE
        ['2-faced', '2-faces', 'a+', 'abnormal', 'abolish', 'abominable', 'abominably',
        'abominate', 'abomination', 'abort']
    """

    # View class used for every file; it skips the readme at the top of a file.
    CorpusView = IgnoreReadmeCorpusView

    def words(self, fileids=None):
        """
        Return all words in the opinion lexicon. Note that these words are not
        sorted in alphabetical order.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.  ``None`` selects every file of the
            corpus; a single string is treated as one file id.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        """
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        # One view per file, concatenated in the order abspaths() yields them.
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, _fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def positive(self):
        """
        Return all positive words in alphabetical order.

        :return: a list of positive words.
        :rtype: list(str)
        """
        return self.words("positive-words.txt")

    def negative(self):
        """
        Return all negative words in alphabetical order.

        :return: a list of negative words.
        :rtype: list(str)
        """
        return self.words("negative-words.txt")

    def _read_word_block(self, stream):
        """
        Read up to 20 lines from ``stream`` and return the words found on them.

        Each line is stripped of surrounding whitespace.  Lines that are empty
        after stripping are skipped, so blank lines never produce an empty
        "word".  Reading stops early once the end of the stream is reached.

        :param stream: an open stream positioned at the start of a block.
        :return: the words read; may be empty at end of file or when every
            line read was blank.
        :rtype: list(str)
        """
        words = []
        for _ in range(20):  # Read 20 lines at a time.
            line = stream.readline()
            if not line:
                # readline() returns '' only at end of file: nothing left to read.
                break
            word = line.strip()
            if word:
                words.append(word)
        return words
|