Spaces:
Sleeping
Sleeping
File size: 4,626 Bytes
d916065 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 |
# Natural Language Toolkit: SentiWordNet
#
# Copyright (C) 2001-2023 NLTK Project
# Author: Christopher Potts <[email protected]>
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
An NLTK interface for SentiWordNet.

SentiWordNet is a lexical resource for opinion mining.
SentiWordNet assigns to each synset of WordNet three
sentiment scores: positivity, negativity, and objectivity.

For details about SentiWordNet see:
http://sentiwordnet.isti.cnr.it/

>>> from nltk.corpus import sentiwordnet as swn
>>> print(swn.senti_synset('breakdown.n.03'))
<breakdown.n.03: PosScore=0.0 NegScore=0.25>
>>> list(swn.senti_synsets('slow'))
[SentiSynset('decelerate.v.01'), SentiSynset('slow.v.02'),\
SentiSynset('slow.v.03'), SentiSynset('slow.a.01'),\
SentiSynset('slow.a.02'), SentiSynset('dense.s.04'),\
SentiSynset('slow.a.04'), SentiSynset('boring.s.01'),\
SentiSynset('dull.s.08'), SentiSynset('slowly.r.01'),\
SentiSynset('behind.r.03')]
>>> happy = swn.senti_synsets('happy', 'a')
>>> happy0 = list(happy)[0]
>>> happy0.pos_score()
0.875
>>> happy0.neg_score()
0.0
>>> happy0.obj_score()
0.125
"""
import re
from nltk.corpus.reader import CorpusReader
class SentiWordNetCorpusReader(CorpusReader):
    """
    Corpus reader for a single SentiWordNet data file.

    The file is parsed once, at construction time, into ``self._db``: a
    dict mapping ``(pos, offset)`` keys to ``(pos_score, neg_score)``
    float pairs. The lookup methods combine those scores with synsets
    obtained from ``nltk.corpus.wordnet``.
    """

    def __init__(self, root, fileids, encoding="utf-8"):
        """
        Construct a new SentiWordNet Corpus Reader, using data from
        the specified file.

        :param root: forwarded unchanged to ``CorpusReader.__init__``.
        :param fileids: forwarded unchanged to ``CorpusReader.__init__``;
            must resolve to exactly one file.
        :param encoding: forwarded unchanged to ``CorpusReader.__init__``.
        :raises ValueError: if the number of resolved files is not one, or
            if the data file contains a malformed line.
        """
        super().__init__(root, fileids, encoding=encoding)
        if len(self._fileids) != 1:
            raise ValueError("Exactly one file must be specified")
        self._db = {}
        self._parse_src_file()

    def _parse_src_file(self):
        """
        Read the data file and fill ``self._db``.

        Lines whose first non-whitespace character is ``#`` are skipped.
        Every other line is split on runs of tabs and must yield exactly
        six fields: pos, offset, positive score, negative score, synset
        terms and gloss. Only the first four are stored; rows with an
        empty pos or offset are ignored.

        :raises ValueError: if a non-comment line does not split into six
            fields (the message carries the 1-based line number in the
            file), or if an offset/score cannot be converted to a number.
        """
        comment_re = re.compile(r"^\s*#")

        # Close the stream as soon as the content has been read.
        with self.open(self._fileids[0]) as stream:
            lines = stream.read().splitlines()

        # Enumerate the physical lines (comments included) so that the
        # number reported in an error matches the position in the file.
        for lineno, line in enumerate(lines, start=1):
            if comment_re.search(line):
                continue
            fields = [field.strip() for field in re.split(r"\t+", line)]
            try:
                pos, offset, pos_score, neg_score, _synset_terms, _gloss = fields
            except ValueError as e:
                # A wrong field count is the only failure unpacking can
                # produce; anything else (e.g. KeyboardInterrupt) propagates.
                raise ValueError(
                    f"Line {lineno} formatted incorrectly: {line}\n"
                ) from e
            if pos and offset:
                self._db[(pos, int(offset))] = (float(pos_score), float(neg_score))

    def senti_synset(self, *vals):
        """
        Return the ``SentiSynset`` for one synset.

        Two call forms are supported:

        - ``senti_synset(pos, offset)``: used when ``(pos, offset)`` is a
          key of the parsed database.
        - ``senti_synset(name)``: otherwise the first argument is handed
          to ``wn.synset`` and the resulting synset's pos and offset are
          looked up in the database.

        :return: a ``SentiSynset``, or ``None`` when the synset resolved
            from a name has no entry in the database.

        Any exception raised by the WordNet lookups propagates unchanged.
        """
        from nltk.corpus import wordnet as wn

        if vals in self._db:
            pos_score, neg_score = self._db[vals]
            pos, offset = vals
            if pos == "s":
                pos = "a"
            synset = wn.synset_from_pos_and_offset(pos, offset)
            return SentiSynset(pos_score, neg_score, synset)

        synset = wn.synset(vals[0])
        pos = synset.pos()
        # "s" is looked up under "a" -- presumably the data file keys
        # adjective satellites as plain adjectives; confirm against the data.
        if pos == "s":
            pos = "a"
        scores = self._db.get((pos, synset.offset()))
        if scores is None:
            return None
        pos_score, neg_score = scores
        return SentiSynset(pos_score, neg_score, synset)

    def senti_synsets(self, string, pos=None):
        """
        Return the ``SentiSynset`` objects for every WordNet synset of a word.

        :param string: the word, passed to ``wn.synsets``.
        :param pos: optional part-of-speech, passed to ``wn.synsets``.
        :return: a lazy ``filter`` iterator over the ``SentiSynset`` objects;
            synsets without a database entry are dropped. Wrap it in
            ``list()`` to index it or to iterate more than once.
        """
        from nltk.corpus import wordnet as wn

        # The lookups run eagerly; only the dropping of None results is lazy.
        sentis = [
            self.senti_synset(synset.name()) for synset in wn.synsets(string, pos)
        ]
        return filter(None, sentis)

    def all_senti_synsets(self):
        """
        Yield a ``SentiSynset`` for every entry of the parsed database,
        in the database's iteration order.
        """
        from nltk.corpus import wordnet as wn

        for (pos, offset), (pos_score, neg_score) in self._db.items():
            synset = wn.synset_from_pos_and_offset(pos, offset)
            yield SentiSynset(pos_score, neg_score, synset)
class SentiSynset:
    """
    A synset paired with its sentiment scores.

    The positive and negative scores are stored as given; the objective
    score is computed once as ``1.0 - (pos_score + neg_score)``. The
    wrapped synset is available as the public attribute ``synset``.
    """

    def __init__(self, pos_score, neg_score, synset):
        """Store the synset and its scores and derive the objective score."""
        self.synset = synset
        self._pos_score = pos_score
        self._neg_score = neg_score
        self._obj_score = 1.0 - (pos_score + neg_score)

    def pos_score(self):
        """Return the positive score."""
        return self._pos_score

    def neg_score(self):
        """Return the negative score."""
        return self._neg_score

    def obj_score(self):
        """Return the objective score."""
        return self._obj_score

    def __str__(self):
        """Prints just the Pos/Neg scores for now."""
        prefix = "<" + self.synset.name() + ": "
        scores = "PosScore=%s NegScore=%s" % (self._pos_score, self._neg_score)
        return prefix + scores + ">"

    def __repr__(self):
        """Return the synset's own repr prefixed with ``Senti``."""
        return f"Senti{self.synset!r}"
|