# Natural Language Toolkit: vader
#
# Copyright (C) 2001-2023 NLTK Project
# Author: C.J. Hutto <[email protected]>
#         Ewan Klein <[email protected]> (modifications)
#         Pierpaolo Pantone <[email protected]> (modifications)
#         George Berry <[email protected]> (modifications)
#         Malavika Suresh <[email protected]> (modifications)
# URL: <https://www.nltk.org/>
# For license information, see LICENSE.TXT
#
# Modifications to the original VADER code have been made in order to
# integrate it into NLTK. These have involved changes to
# ensure Python 3 compatibility, and refactoring to achieve greater modularity.
"""
If you use the VADER sentiment analysis tools, please cite:

Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
Sentiment Analysis of Social Media Text. Eighth International Conference on
Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
"""
import math
import re
import string
from itertools import product

import nltk.data
from nltk.util import pairwise

class VaderConstants:
    """
    A class to keep the Vader lists and constants.
    """

    ##Constants##
    # (empirically derived mean sentiment intensity rating increase for booster words)
    B_INCR = 0.293
    B_DECR = -0.293

    # (empirically derived mean sentiment intensity rating increase for using
    # ALLCAPs to emphasize a word)
    C_INCR = 0.733
    N_SCALAR = -0.74

    NEGATE = {
        "aint",
        "arent",
        "cannot",
        "cant",
        "couldnt",
        "darent",
        "didnt",
        "doesnt",
        "ain't",
        "aren't",
        "can't",
        "couldn't",
        "daren't",
        "didn't",
        "doesn't",
        "dont",
        "hadnt",
        "hasnt",
        "havent",
        "isnt",
        "mightnt",
        "mustnt",
        "neither",
        "don't",
        "hadn't",
        "hasn't",
        "haven't",
        "isn't",
        "mightn't",
        "mustn't",
        "neednt",
        "needn't",
        "never",
        "none",
        "nope",
        "nor",
        "not",
        "nothing",
        "nowhere",
        "oughtnt",
        "shant",
        "shouldnt",
        "uhuh",
        "wasnt",
        "werent",
        "oughtn't",
        "shan't",
        "shouldn't",
        "uh-uh",
        "wasn't",
        "weren't",
        "without",
        "wont",
        "wouldnt",
        "won't",
        "wouldn't",
        "rarely",
        "seldom",
        "despite",
    }
    # booster/dampener 'intensifiers' or 'degree adverbs'
    # https://en.wiktionary.org/wiki/Category:English_degree_adverbs
    BOOSTER_DICT = {
        "absolutely": B_INCR,
        "amazingly": B_INCR,
        "awfully": B_INCR,
        "completely": B_INCR,
        "considerably": B_INCR,
        "decidedly": B_INCR,
        "deeply": B_INCR,
        "effing": B_INCR,
        "enormously": B_INCR,
        "entirely": B_INCR,
        "especially": B_INCR,
        "exceptionally": B_INCR,
        "extremely": B_INCR,
        "fabulously": B_INCR,
        "flipping": B_INCR,
        "flippin": B_INCR,
        "fricking": B_INCR,
        "frickin": B_INCR,
        "frigging": B_INCR,
        "friggin": B_INCR,
        "fully": B_INCR,
        "fucking": B_INCR,
        "greatly": B_INCR,
        "hella": B_INCR,
        "highly": B_INCR,
        "hugely": B_INCR,
        "incredibly": B_INCR,
        "intensely": B_INCR,
        "majorly": B_INCR,
        "more": B_INCR,
        "most": B_INCR,
        "particularly": B_INCR,
        "purely": B_INCR,
        "quite": B_INCR,
        "really": B_INCR,
        "remarkably": B_INCR,
        "so": B_INCR,
        "substantially": B_INCR,
        "thoroughly": B_INCR,
        "totally": B_INCR,
        "tremendously": B_INCR,
        "uber": B_INCR,
        "unbelievably": B_INCR,
        "unusually": B_INCR,
        "utterly": B_INCR,
        "very": B_INCR,
        "almost": B_DECR,
        "barely": B_DECR,
        "hardly": B_DECR,
        "just enough": B_DECR,
        "kind of": B_DECR,
        "kinda": B_DECR,
        "kindof": B_DECR,
        "kind-of": B_DECR,
        "less": B_DECR,
        "little": B_DECR,
        "marginally": B_DECR,
        "occasionally": B_DECR,
        "partly": B_DECR,
        "scarcely": B_DECR,
        "slightly": B_DECR,
        "somewhat": B_DECR,
        "sort of": B_DECR,
        "sorta": B_DECR,
        "sortof": B_DECR,
        "sort-of": B_DECR,
    }
    # check for special case idioms using a sentiment-laden keyword known to SAGE
    SPECIAL_CASE_IDIOMS = {
        "the shit": 3,
        "the bomb": 3,
        "bad ass": 1.5,
        "yeah right": -2,
        "cut the mustard": 2,
        "kiss of death": -1.5,
        "hand to mouth": -2,
    }

    # for removing punctuation
    REGEX_REMOVE_PUNCTUATION = re.compile(f"[{re.escape(string.punctuation)}]")

    PUNC_LIST = [
        ".",
        "!",
        "?",
        ",",
        ";",
        ":",
        "-",
        "'",
        '"',
        "!!",
        "!!!",
        "??",
        "???",
        "?!?",
        "!?!",
        "?!?!",
        "!?!?",
    ]

    def __init__(self):
        pass
    def negated(self, input_words, include_nt=True):
        """
        Determine if input contains negation words
        """
        neg_words = self.NEGATE
        if any(word.lower() in neg_words for word in input_words):
            return True
        if include_nt:
            if any("n't" in word.lower() for word in input_words):
                return True
        for first, second in pairwise(input_words):
            if second.lower() == "least" and first.lower() != "at":
                return True
        return False
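    # Illustrative behaviour, derived directly from the logic above (not part
    # of the original module):
    #   negated(["not", "good"])       -> True   ("not" is in NEGATE)
    #   negated(["couldn't", "agree"]) -> True   (contains "n't")
    #   negated(["at", "least"])       -> False  ("at least" is excluded)
    #   negated(["the", "least"])      -> True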
    def normalize(self, score, alpha=15):
        """
        Normalize the score to be between -1 and 1 using an alpha that
        approximates the max expected value
        """
        norm_score = score / math.sqrt((score * score) + alpha)
        return norm_score
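    # Worked example (illustrative comment): with the default alpha=15,
    # normalize(4) = 4 / sqrt(16 + 15) = 4 / sqrt(31) ≈ 0.7184, and
    # normalize(-2) = -2 / sqrt(4 + 15) ≈ -0.4588; scores are squashed into
    # the open interval (-1, 1).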
    def scalar_inc_dec(self, word, valence, is_cap_diff):
        """
        Check if the preceding words increase, decrease, or negate/nullify the
        valence
        """
        scalar = 0.0
        word_lower = word.lower()
        if word_lower in self.BOOSTER_DICT:
            scalar = self.BOOSTER_DICT[word_lower]
            if valence < 0:
                scalar *= -1
            # check if booster/dampener word is in ALLCAPS (while others aren't)
            if word.isupper() and is_cap_diff:
                if valence > 0:
                    scalar += self.C_INCR
                else:
                    scalar -= self.C_INCR
        return scalar
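    # Illustrative behaviour (comment added for clarity, derived from the code):
    #   scalar_inc_dec("very", 2.0, False)   -> 0.293   (booster, B_INCR)
    #   scalar_inc_dec("barely", 2.0, False) -> -0.293  (dampener, B_DECR)
    #   scalar_inc_dec("very", -2.0, False)  -> -0.293  (sign follows the valence)
    #   scalar_inc_dec("VERY", 2.0, True)    -> 0.293 + 0.733 = 1.026 (ALL CAPS boost)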

class SentiText:
    """
    Identify sentiment-relevant string-level properties of input text.
    """

    def __init__(self, text, punc_list, regex_remove_punctuation):
        if not isinstance(text, str):
            text = str(text.encode("utf-8"))
        self.text = text
        self.PUNC_LIST = punc_list
        self.REGEX_REMOVE_PUNCTUATION = regex_remove_punctuation
        self.words_and_emoticons = self._words_and_emoticons()
        # doesn't separate words from
        # adjacent punctuation (keeps emoticons & contractions)
        self.is_cap_diff = self.allcap_differential(self.words_and_emoticons)

    def _words_plus_punc(self):
        """
        Returns mapping of form:
        {
            'cat,': 'cat',
            ',cat': 'cat',
        }
        """
        no_punc_text = self.REGEX_REMOVE_PUNCTUATION.sub("", self.text)
        # removes punctuation (but loses emoticons & contractions)
        words_only = no_punc_text.split()
        # remove singletons
        words_only = {w for w in words_only if len(w) > 1}
        # the product gives ('cat', ',') and (',', 'cat')
        punc_before = {"".join(p): p[1] for p in product(self.PUNC_LIST, words_only)}
        punc_after = {"".join(p): p[0] for p in product(words_only, self.PUNC_LIST)}
        words_punc_dict = punc_before
        words_punc_dict.update(punc_after)
        return words_punc_dict
    def _words_and_emoticons(self):
        """
        Removes leading and trailing punctuation.
        Leaves contractions and most emoticons.
        Does not preserve punc-plus-letter emoticons (e.g. :D)
        """
        wes = self.text.split()
        words_punc_dict = self._words_plus_punc()
        wes = [we for we in wes if len(we) > 1]
        for i, we in enumerate(wes):
            if we in words_punc_dict:
                wes[i] = words_punc_dict[we]
        return wes
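    # Illustrative behaviour (comment added for clarity): for the text
    # "The book was GREAT!! :)" the tokens come out as
    # ["The", "book", "was", "GREAT", ":)"] -- the trailing "!!" is stripped
    # via the punctuation mapping, while the ":)" emoticon survives intact.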
    def allcap_differential(self, words):
        """
        Check whether just some words in the input are ALL CAPS

        :param list words: The words to inspect
        :returns: `True` if some but not all items in `words` are ALL CAPS
        """
        is_different = False
        allcap_words = 0
        for word in words:
            if word.isupper():
                allcap_words += 1
        cap_differential = len(words) - allcap_words
        if 0 < cap_differential < len(words):
            is_different = True
        return is_different
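    # Illustrative behaviour (comment added for clarity):
    #   allcap_differential(["GREAT", "movie"]) -> True   (mixed casing)
    #   allcap_differential(["GREAT", "MOVIE"]) -> False  (everything is caps)
    #   allcap_differential(["great", "movie"]) -> False  (nothing is caps)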

class SentimentIntensityAnalyzer:
    """
    Give a sentiment intensity score to sentences.
    """

    def __init__(
        self,
        lexicon_file="sentiment/vader_lexicon.zip/vader_lexicon/vader_lexicon.txt",
    ):
        self.lexicon_file = nltk.data.load(lexicon_file)
        self.lexicon = self.make_lex_dict()
        self.constants = VaderConstants()

    def make_lex_dict(self):
        """
        Convert lexicon file to a dictionary
        """
        lex_dict = {}
        for line in self.lexicon_file.split("\n"):
            (word, measure) = line.strip().split("\t")[0:2]
            lex_dict[word] = float(measure)
        return lex_dict
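    # Lexicon format note (comment added for clarity; the values below are
    # illustrative, not actual lexicon entries): each line of vader_lexicon.txt
    # is tab-separated, with the token first and its mean valence second, e.g.
    #   "good\t1.9\t..."  ->  lex_dict["good"] == 1.9
    # Only the first two fields are used here.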
    def polarity_scores(self, text):
        """
        Return a dict of sentiment scores for the input text. Positive values
        indicate positive valence, negative values indicate negative valence.

        :note: Hashtags are not taken into consideration (e.g. #BAD is neutral). If you
            are interested in processing the text in the hashtags too, then we recommend
            preprocessing your data to remove the #, after which the hashtag text may be
            matched as if it was a normal word in the sentence.
        """
        sentitext = SentiText(
            text, self.constants.PUNC_LIST, self.constants.REGEX_REMOVE_PUNCTUATION
        )
        sentiments = []
        words_and_emoticons = sentitext.words_and_emoticons

        # use enumerate rather than list.index() so that a repeated word is
        # scored at its own position instead of its first occurrence's
        for i, item in enumerate(words_and_emoticons):
            valence = 0
            if (
                i < len(words_and_emoticons) - 1
                and item.lower() == "kind"
                and words_and_emoticons[i + 1].lower() == "of"
            ) or item.lower() in self.constants.BOOSTER_DICT:
                sentiments.append(valence)
                continue
            sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)

        sentiments = self._but_check(words_and_emoticons, sentiments)
        return self.score_valence(sentiments, text)
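    # Note (comment added for clarity): booster words themselves ("very",
    # "so", ...) and the bigram "kind of" are appended with valence 0 above;
    # their effect is applied later, when a following lexicon word is scored
    # via scalar_inc_dec().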
    def sentiment_valence(self, valence, sentitext, item, i, sentiments):
        is_cap_diff = sentitext.is_cap_diff
        words_and_emoticons = sentitext.words_and_emoticons
        item_lowercase = item.lower()
        if item_lowercase in self.lexicon:
            # get the sentiment valence
            valence = self.lexicon[item_lowercase]

            # check if sentiment laden word is in ALL CAPS (while others aren't)
            if item.isupper() and is_cap_diff:
                if valence > 0:
                    valence += self.constants.C_INCR
                else:
                    valence -= self.constants.C_INCR

            for start_i in range(0, 3):
                if (
                    i > start_i
                    and words_and_emoticons[i - (start_i + 1)].lower()
                    not in self.lexicon
                ):
                    # dampen the scalar modifier of preceding words and emoticons
                    # (excluding the ones that immediately precede the item) based
                    # on their distance from the current item.
                    s = self.constants.scalar_inc_dec(
                        words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                    )
                    if start_i == 1 and s != 0:
                        s = s * 0.95
                    if start_i == 2 and s != 0:
                        s = s * 0.9
                    valence = valence + s
                    valence = self._never_check(
                        valence, words_and_emoticons, start_i, i
                    )
                    if start_i == 2:
                        valence = self._idioms_check(valence, words_and_emoticons, i)

                        # future work: consider other sentiment-laden idioms
                        # other_idioms =
                        # {"back handed": -2, "blow smoke": -2, "blowing smoke": -2,
                        #  "upper hand": 1, "break a leg": 2,
                        #  "cooking with gas": 2, "in the black": 2, "in the red": -2,
                        #  "on the ball": 2, "under the weather": -2}

            valence = self._least_check(valence, words_and_emoticons, i)
        sentiments.append(valence)
        return sentiments
    def _least_check(self, valence, words_and_emoticons, i):
        # check for negation case using "least"
        if (
            i > 1
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            if (
                words_and_emoticons[i - 2].lower() != "at"
                and words_and_emoticons[i - 2].lower() != "very"
            ):
                valence = valence * self.constants.N_SCALAR
        elif (
            i > 0
            and words_and_emoticons[i - 1].lower() not in self.lexicon
            and words_and_emoticons[i - 1].lower() == "least"
        ):
            valence = valence * self.constants.N_SCALAR
        return valence
    def _but_check(self, words_and_emoticons, sentiments):
        words_and_emoticons = [w_e.lower() for w_e in words_and_emoticons]
        but = {"but"} & set(words_and_emoticons)
        if but:
            bi = words_and_emoticons.index(next(iter(but)))
            for sidx, sentiment in enumerate(sentiments):
                if sidx < bi:
                    sentiments[sidx] = sentiment * 0.5
                elif sidx > bi:
                    sentiments[sidx] = sentiment * 1.5
        return sentiments
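    # Worked example (illustrative comment): for tokens
    # ["good", "but", "boring"] with per-token sentiments [1.9, 0, -1.3],
    # "but" sits at index 1, so the result is [0.95, 0, -1.95] -- everything
    # before "but" is halved and everything after it is amplified by 1.5.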
    def _idioms_check(self, valence, words_and_emoticons, i):
        onezero = f"{words_and_emoticons[i - 1]} {words_and_emoticons[i]}"

        twoonezero = "{} {} {}".format(
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
            words_and_emoticons[i],
        )

        twoone = f"{words_and_emoticons[i - 2]} {words_and_emoticons[i - 1]}"

        threetwoone = "{} {} {}".format(
            words_and_emoticons[i - 3],
            words_and_emoticons[i - 2],
            words_and_emoticons[i - 1],
        )

        threetwo = "{} {}".format(
            words_and_emoticons[i - 3], words_and_emoticons[i - 2]
        )

        sequences = [onezero, twoonezero, twoone, threetwoone, threetwo]

        for seq in sequences:
            if seq in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[seq]
                break

        if len(words_and_emoticons) - 1 > i:
            zeroone = f"{words_and_emoticons[i]} {words_and_emoticons[i + 1]}"
            if zeroone in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroone]
        if len(words_and_emoticons) - 1 > i + 1:
            zeroonetwo = "{} {} {}".format(
                words_and_emoticons[i],
                words_and_emoticons[i + 1],
                words_and_emoticons[i + 2],
            )
            if zeroonetwo in self.constants.SPECIAL_CASE_IDIOMS:
                valence = self.constants.SPECIAL_CASE_IDIOMS[zeroonetwo]

        # check for booster/dampener bi-grams such as 'sort of' or 'kind of'
        if (
            threetwo in self.constants.BOOSTER_DICT
            or twoone in self.constants.BOOSTER_DICT
        ):
            valence = valence + self.constants.B_DECR
        return valence
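    # Illustrative behaviour (comment added for clarity): if the window around
    # the current token contains "yeah right", the valence is overridden with
    # SPECIAL_CASE_IDIOMS["yeah right"] == -2; if "sort of" or "kind of"
    # appears two or three tokens back, B_DECR (-0.293) is added on top.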
    def _never_check(self, valence, words_and_emoticons, start_i, i):
        if start_i == 0:
            if self.constants.negated([words_and_emoticons[i - 1]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 1:
            if words_and_emoticons[i - 2] == "never" and (
                words_and_emoticons[i - 1] == "so"
                or words_and_emoticons[i - 1] == "this"
            ):
                valence = valence * 1.5
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        if start_i == 2:
            if (
                words_and_emoticons[i - 3] == "never"
                and (
                    words_and_emoticons[i - 2] == "so"
                    or words_and_emoticons[i - 2] == "this"
                )
                or (
                    words_and_emoticons[i - 1] == "so"
                    or words_and_emoticons[i - 1] == "this"
                )
            ):
                valence = valence * 1.25
            elif self.constants.negated([words_and_emoticons[i - (start_i + 1)]]):
                valence = valence * self.constants.N_SCALAR
        return valence
    def _punctuation_emphasis(self, sum_s, text):
        # add emphasis from exclamation points and question marks
        ep_amplifier = self._amplify_ep(text)
        qm_amplifier = self._amplify_qm(text)
        punct_emph_amplifier = ep_amplifier + qm_amplifier
        return punct_emph_amplifier

    def _amplify_ep(self, text):
        # check for added emphasis resulting from exclamation points (up to 4 of them)
        ep_count = text.count("!")
        if ep_count > 4:
            ep_count = 4
        # (empirically derived mean sentiment intensity rating increase for
        # exclamation points)
        ep_amplifier = ep_count * 0.292
        return ep_amplifier
    def _amplify_qm(self, text):
        # check for added emphasis resulting from question marks (2 or 3+)
        qm_count = text.count("?")
        qm_amplifier = 0
        if qm_count > 1:
            if qm_count <= 3:
                # (empirically derived mean sentiment intensity rating increase for
                # question marks)
                qm_amplifier = qm_count * 0.18
            else:
                qm_amplifier = 0.96
        return qm_amplifier
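    # Illustrative values (comment added for clarity): each "!" contributes
    # 0.292, capped at four exclamation points (max 1.168); "?" contributes
    # 0 for a single question mark, 0.36 for two, 0.54 for three, and a flat
    # 0.96 for four or more.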
    def _sift_sentiment_scores(self, sentiments):
        # want separate positive versus negative sentiment scores
        pos_sum = 0.0
        neg_sum = 0.0
        neu_count = 0
        for sentiment_score in sentiments:
            if sentiment_score > 0:
                pos_sum += (
                    float(sentiment_score) + 1
                )  # compensates for neutral words that are counted as 1
            if sentiment_score < 0:
                neg_sum += (
                    float(sentiment_score) - 1
                )  # when used with math.fabs(), compensates for neutrals
            if sentiment_score == 0:
                neu_count += 1
        return pos_sum, neg_sum, neu_count
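    # Worked example (illustrative comment): for sentiments [1.5, -0.5, 0.0]
    # this returns pos_sum = 1.5 + 1 = 2.5, neg_sum = -0.5 - 1 = -1.5, and
    # neu_count = 1.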
    def score_valence(self, sentiments, text):
        if sentiments:
            sum_s = float(sum(sentiments))
            # compute and add emphasis from punctuation in text
            punct_emph_amplifier = self._punctuation_emphasis(sum_s, text)
            if sum_s > 0:
                sum_s += punct_emph_amplifier
            elif sum_s < 0:
                sum_s -= punct_emph_amplifier

            compound = self.constants.normalize(sum_s)
            # discriminate between positive, negative and neutral sentiment scores
            pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)

            if pos_sum > math.fabs(neg_sum):
                pos_sum += punct_emph_amplifier
            elif pos_sum < math.fabs(neg_sum):
                neg_sum -= punct_emph_amplifier

            total = pos_sum + math.fabs(neg_sum) + neu_count
            pos = math.fabs(pos_sum / total)
            neg = math.fabs(neg_sum / total)
            neu = math.fabs(neu_count / total)

        else:
            compound = 0.0
            pos = 0.0
            neg = 0.0
            neu = 0.0

        sentiment_dict = {
            "neg": round(neg, 3),
            "neu": round(neu, 3),
            "pos": round(pos, 3),
            "compound": round(compound, 4),
        }

        return sentiment_dict
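    # Worked example (illustrative comment, continuing the numbers above):
    # with sentiments [1.5, -0.5, 0.0] and no "!" or "?" in the text,
    # sum_s = 1.0, so compound = 1 / sqrt(1 + 15) = 0.25. The sifted sums are
    # pos_sum = 2.5, neg_sum = -1.5, neu_count = 1, so total = 5.0 and the
    # returned dict is {'neg': 0.3, 'neu': 0.2, 'pos': 0.5, 'compound': 0.25}.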