# Natural Language Toolkit: Pros and Cons Corpus Reader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Pierpaolo Pantone <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT

"""

CorpusReader for the Pros and Cons dataset.



- Pros and Cons dataset information -



Contact: Bing Liu, [email protected]

        https://www.cs.uic.edu/~liub



Distributed with permission.



Related papers:



- Murthy Ganapathibhotla and Bing Liu. "Mining Opinions in Comparative Sentences".

    Proceedings of the 22nd International Conference on Computational Linguistics

    (Coling-2008), Manchester, 18-22 August, 2008.



- Bing Liu, Minqing Hu and Junsheng Cheng. "Opinion Observer: Analyzing and Comparing

    Opinions on the Web". Proceedings of the 14th international World Wide Web

    conference (WWW-2005), May 10-14, 2005, in Chiba, Japan.

"""
import re

from nltk.corpus.reader.api import *
from nltk.tokenize import *


class ProsConsCorpusReader(CategorizedCorpusReader, CorpusReader):
    """

    Reader for the Pros and Cons sentence dataset.



        >>> from nltk.corpus import pros_cons

        >>> pros_cons.sents(categories='Cons') # doctest: +NORMALIZE_WHITESPACE

        [['East', 'batteries', '!', 'On', '-', 'off', 'switch', 'too', 'easy',

        'to', 'maneuver', '.'], ['Eats', '...', 'no', ',', 'GULPS', 'batteries'],

        ...]

        >>> pros_cons.words('IntegratedPros.txt')

        ['Easy', 'to', 'use', ',', 'economical', '!', ...]

    """

    CorpusView = StreamBackedCorpusView

    def __init__(
        self,
        root,
        fileids,
        word_tokenizer=WordPunctTokenizer(),
        encoding="utf8",
        **kwargs
    ):
        """

        :param root: The root directory for the corpus.

        :param fileids: a list or regexp specifying the fileids in the corpus.

        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs

            into words. Default: `WhitespaceTokenizer`

        :param encoding: the encoding that should be used to read the corpus.

        :param kwargs: additional parameters passed to CategorizedCorpusReader.

        """

        CorpusReader.__init__(self, root, fileids, encoding)
        CategorizedCorpusReader.__init__(self, kwargs)
        self._word_tokenizer = word_tokenizer
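    # CategorizedCorpusReader also supplies self._resolve(), used by sents()
    # and words() below to map a ``categories`` argument onto fileids.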

    def sents(self, fileids=None, categories=None):
        """

        Return all sentences in the corpus or in the specified files/categories.



        :param fileids: a list or regexp specifying the ids of the files whose

            sentences have to be returned.

        :param categories: a list specifying the categories whose sentences

            have to be returned.

        :return: the given file(s) as a list of sentences. Each sentence is

            tokenized using the specified word_tokenizer.

        :rtype: list(list(str))

        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_sent_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

    def words(self, fileids=None, categories=None):
        """

        Return all words and punctuation symbols in the corpus or in the specified

        files/categories.



        :param fileids: a list or regexp specifying the ids of the files whose

            words have to be returned.

        :param categories: a list specifying the categories whose words have

            to be returned.

        :return: the given file(s) as a list of words and punctuation symbols.

        :rtype: list(str)

        """
        fileids = self._resolve(fileids, categories)
        if fileids is None:
            fileids = self._fileids
        elif isinstance(fileids, str):
            fileids = [fileids]
        return concat(
            [
                self.CorpusView(path, self._read_word_block, encoding=enc)
                for (path, enc, fileid) in self.abspaths(fileids, True, True)
            ]
        )

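    # Each corpus file is expected to hold one tagged sentence per line, in the
    # form matched by the regex in _read_sent_block. This sample line is
    # illustrative (inferred from the doctest above, not quoted from the data):
    #
    #   <Pros>Easy to use, economical!</Pros>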
    def _read_sent_block(self, stream):
        sents = []
        for i in range(20):  # Read up to 20 lines at a time.
            line = stream.readline()
            if not line:  # readline() returns '' only at EOF, so stop early.
                break
            # Keep only lines of the form <Pros>...</Pros> or <Cons>...</Cons>;
            # group(2) is the sentence text between the tags.
            sent = re.match(r"^(?!\n)\s*<(Pros|Cons)>(.*)</(?:Pros|Cons)>", line)
            if sent:
                sents.append(self._word_tokenizer.tokenize(sent.group(2).strip()))
        return sents

    def _read_word_block(self, stream):
        words = []
        for sent in self._read_sent_block(stream):
            words.extend(sent)
        return words
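

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the original module):
# constructing the reader directly over a local copy of the dataset. The root
# path and the cat_map are assumptions; the file names match the doctest in
# ProsConsCorpusReader above.
if __name__ == "__main__":
    reader = ProsConsCorpusReader(
        root="/path/to/pros_cons",  # assumed location of the corpus files
        fileids=r"Integrated(Pros|Cons)\.txt",
        cat_map={
            "IntegratedPros.txt": ["Pros"],  # assumed fileid-to-category map
            "IntegratedCons.txt": ["Cons"],
        },
    )
    print(reader.sents(categories="Pros")[:2])
    print(reader.words("IntegratedCons.txt")[:10])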