Pushpa committed on
Commit
ebddec3
·
1 Parent(s): 0f5c896

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +825 -0
app.py ADDED
@@ -0,0 +1,825 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Survey_Analysis_v_3_2_86.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS
8
+ """
9
+
10
+ #1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
11
+ #2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert
12
+
13
# Third-party dependencies must be installed before running this module,
# e.g. from a shell or a requirements.txt file:
#     pip install streamlit pygal squarify
# The original "!pip install ..." / "pip install ..." lines are IPython/shell
# commands and are a SyntaxError inside a plain .py file.
import streamlit
19
+
20
+ # Commented out IPython magic to ensure Python compatibility.
21
+ import numpy as np
22
+ import pandas as pd
23
+ import seaborn as sns
24
+ import matplotlib.pyplot as plt
25
+ import plotly.express as px
26
+ import plotly.graph_objects as go
27
+
28
+
29
+ import pygal as py
30
+ import squarify as sq
31
+ import matplotlib
32
+ plt.rcParams["figure.figsize"] = (20,15)
33
+ matplotlib.rc('xtick', labelsize=7)
34
+ matplotlib.rc('ytick', labelsize=7)
35
+
36
+ font = {'family' : 'normal',
37
+ 'weight' : 'bold',
38
+ 'size' : 5}
39
+
40
+ matplotlib.rc('font', **font)
41
+ from sklearn.feature_extraction.text import CountVectorizer
42
+ import warnings
43
+ warnings.filterwarnings("ignore", category=FutureWarning)
44
+ # %matplotlib inline
45
+
46
# Load the sentiment dataset. The CSV is read with its first row as the
# header; that row is then appended back as a regular record and the two
# columns are renamed to 'sentiment' and 'news'.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")

# The values pandas picked up as column names.
col1 = df.keys()[0]
col2 = df.keys()[1]

# Re-insert those values as a data row. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used; ignore_index renumbers the rows exactly as
# append(..., ignore_index=True) did.
df2 = pd.DataFrame([[col1, col2]], columns=[col1, col2])
df = pd.concat([df, df2], ignore_index=True)

# set_axis returns a new frame by default; the removed `inplace` keyword is
# no longer passed.
df = df.set_axis(['sentiment', 'news'], axis=1)

# The former df.replace("neutral", "neutral") call replaced a value with
# itself and has been dropped.
60
+
61
+ sns.countplot(y="sentiment",data=df)
62
+
63
+ df.isnull().sum()
64
+
65
+ from textblob import TextBlob
66
+
67
+ def preprocess(ReviewText):
68
+ ReviewText = ReviewText.str.replace("(<br/>)", "")
69
+ ReviewText = ReviewText.str.replace('(<a).*(>).*(</a>)', '')
70
+ ReviewText = ReviewText.str.replace('(&amp)', '')
71
+ ReviewText = ReviewText.str.replace('(&gt)', '')
72
+ ReviewText = ReviewText.str.replace('(&lt)', '')
73
+ ReviewText = ReviewText.str.replace('(\xa0)', ' ')
74
+ return ReviewText
75
+ df['Review Text'] = preprocess(df['news'])
76
+
77
+ df['polarity'] = df['news'].map(lambda text: TextBlob(text).sentiment.polarity)
78
+ df['news_len'] = df['news'].astype(str).apply(len)
79
+ df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))
80
+
81
+ df
82
+
83
+ print('top 4 random reviews with the highest positive sentiment polarity: \n')
84
+
85
+ df1=df.drop_duplicates(subset=['Review Text'])
86
+
87
+ cl = df1.loc[df1.polarity == 1, ['Review Text']].sample(4).values
88
+ for c in cl:
89
+ print(c[0])
90
+
91
+ print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
92
+ cl1 = df.loc[df.polarity == 0, ['Review Text']].sample(5).values
93
+ for c in cl1:
94
+ print(c[0])
95
+
96
+ print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
97
+ cl3 = df.loc[df.polarity <= -0.80, ['Review Text']].sample(5).values
98
+ for c in cl3:
99
+ print(c[0])
100
+
101
# Box plot of the TextBlob polarity scores. The column is passed by keyword:
# on recent seaborn releases the first positional parameter is `data`, so the
# previous positional Series combined with data=df raised a TypeError.
sns.boxplot(x=df["polarity"], palette="rainbow")
102
+
103
+ df['polarity'].plot(
104
+ kind='hist',
105
+ bins=50,
106
+ color="peru",
107
+ title='Sentiment Polarity Distribution');plt.show()
108
+
109
+ p_s=df[df["polarity"]>0].count()["sentiment"]
110
+ neu_s=df[df["polarity"]==0].count()["sentiment"]
111
+ neg_s=df[df["polarity"]<0].count()["sentiment"]
112
+
113
+ # Setting labels for items in Chart
114
+ sentiment = ['positive_sentiment',"neutral_sentiment","negative_sentiment"]
115
+
116
+ # Setting size in Chart based on
117
+ # given values
118
+ values = [p_s,neu_s,neg_s]
119
+
120
+ # colors
121
+ colors = ['#FF0000', 'olive', '#FFFF00']
122
+ # explosion
123
+ explode = (0.05, 0.05, 0.05)
124
+
125
+ # Pie Chart
126
+ plt.pie(values, colors=colors, labels=sentiment,
127
+ autopct='%1.1f%%', pctdistance=0.85,
128
+ explode=explode)
129
+
130
+ # draw circle
131
+ centre_circle = plt.Circle((0, 0), 0.70, fc='white')
132
+ fig = plt.gcf()
133
+
134
+ # Adding Circle in Pie chart
135
+ fig.gca().add_artist(centre_circle)
136
+
137
+ # Adding Title of chart
138
+ plt.title('count of polarity as per sentiment')
139
+
140
+ # Displaying Chart
141
+ plt.show()
142
+
143
+ df.plot.box(y=["word_count"],color="hotpink")
144
+
145
+ df['word_count'].plot(
146
+ kind='hist',
147
+ bins=100,
148
+ color="orange",
149
+ title='Review Text Word Count Distribution');plt.show()
150
+
151
+ sns.boxenplot(x="news_len",data=df)
152
+ plt.show()
153
+
154
# Histogram of the character length of each news item (news_len). The title
# previously repeated the word-count chart's title.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution',
)
plt.show()
+ title='Review Text Word Count Distribution');plt.show()
159
+
160
+ fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
161
+ marginal_x="box", marginal_y="violin",
162
+ title="Click on the legend items!")
163
+ fig.show()
164
+
165
+ def get_top_n_words(corpus, n=None):
166
+ vec = CountVectorizer().fit(corpus)
167
+ bag_of_words = vec.transform(corpus)
168
+ sum_words = bag_of_words.sum(axis=0)
169
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
170
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
171
+ return words_freq[:n]
172
+ common_words = get_top_n_words(df['Review Text'], 20)
173
+ for word, freq in common_words:
174
+ print(word, freq)
175
+ df1 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
176
+ df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
177
+ kind='bar',title='Top 20 words in review before removing stop words')
178
+ df1
179
+
180
+ def get_top_n_words(corpus, n=None):
181
+ vec = CountVectorizer(stop_words = 'english').fit(corpus)
182
+ bag_of_words = vec.transform(corpus)
183
+ sum_words = bag_of_words.sum(axis=0)
184
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
185
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
186
+ return words_freq[:n]
187
+ common_words = get_top_n_words(df['Review Text'], 20)
188
+ for word, freq in common_words:
189
+ print(word, freq)
190
+ df2 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
191
+ df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(kind='bar', title='Top 20 words in review after removing stop words')
192
+
193
+ def get_top_n_bigram(corpus, n=None):
194
+ vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
195
+ bag_of_words = vec.transform(corpus)
196
+ sum_words = bag_of_words.sum(axis=0)
197
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
198
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
199
+ return words_freq[:n]
200
+ common_words = get_top_n_bigram(df['Review Text'], 20)
201
+ for word, freq in common_words:
202
+ print(word, freq)
203
+ df3 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
204
+ df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
205
+ kind='bar',title='Top 20 bigrams in review before removing stop words')
206
+
207
+ def get_top_n_bigram(corpus, n=None):
208
+ vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
209
+ bag_of_words = vec.transform(corpus)
210
+ sum_words = bag_of_words.sum(axis=0)
211
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
212
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
213
+ return words_freq[:n]
214
+ common_words = get_top_n_bigram(df['Review Text'], 20)
215
+ for word, freq in common_words:
216
+ print(word, freq)
217
+ df4 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
218
+ df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
219
+ kind='bar', title='Top 20 bigrams in review after removing stop words')
220
+
221
+ def get_top_n_trigram(corpus, n=None):
222
+ vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
223
+ bag_of_words = vec.transform(corpus)
224
+ sum_words = bag_of_words.sum(axis=0)
225
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
226
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
227
+ return words_freq[:n]
228
+ common_words = get_top_n_trigram(df['Review Text'], 20)
229
+ for word, freq in common_words:
230
+ print(word, freq)
231
+ df5 = pd.DataFrame(common_words, columns = ['ReviewText' , 'count'])
232
+ df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
233
+ kind='bar', title='Top 20 trigrams in review before removing stop words')
234
+
235
+ def get_top_n_trigram(corpus, n=None):
236
+ vec = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
237
+ bag_of_words = vec.transform(corpus)
238
+ sum_words = bag_of_words.sum(axis=0)
239
+ words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
240
+ words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
241
+ return words_freq[:n]
242
+ common_words = get_top_n_trigram(df['Review Text'], 20)
243
+ for word, freq in common_words:
244
+ print(word, freq)
245
+ df6 = pd.DataFrame(common_words, columns = ['ReviewText' ,'count'])
246
+ df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
247
+ kind='bar', title='Top 20 trigrams in review after removing stop words')
248
+
249
+ import nltk
250
+ nltk.download('punkt')
251
+ nltk.download('wordnet')
252
+ nltk.download('omw-1.4')
253
+ nltk.download('averaged_perceptron_tagger')
254
+
255
+ #import nltk
256
+ blob = TextBlob(str(df['Review Text']))
257
+ pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])
258
+ pos_df = pos_df.pos.value_counts()[:20]
259
+ pos_df.plot(
260
+ kind='bar',
261
+ title='Top 20 Part-of-speech tagging for review corpus')
262
+
263
+ y0 = df.loc[df['sentiment'] == 'positive']['polarity']
264
+ y1 = df.loc[df['sentiment'] == 'negative']['polarity']
265
+ y2 = df.loc[df['sentiment'] == 'neutral']['polarity']
266
+
267
+ trace0 = go.Box(
268
+ y=y0,
269
+ name = 'positive',
270
+ marker = dict(
271
+ color = 'rgb(214, 12, 140)',
272
+ )
273
+ )
274
+ trace1 = go.Box(
275
+ y=y1,
276
+ name = 'negative',
277
+ marker = dict(
278
+ color = 'rgb(0, 128, 128)',
279
+ )
280
+ )
281
+ trace2 = go.Box(
282
+ y=y2,
283
+ name = 'neutral',
284
+ marker = dict(
285
+ color = 'rgb(10, 140, 208)',
286
+ )
287
+ )
288
+ data = [trace0, trace1, trace2]
289
+ layout = go.Layout(
290
+ title = "Polarity Boxplot according to sentiment"
291
+ )
292
+
293
+ go.Figure(data=data,layout=layout)
294
+
295
+ y0 = df.loc[df['sentiment'] == 'positive']['news_len']
296
+ y1 = df.loc[df['sentiment'] == 'negative']['news_len']
297
+ y2 = df.loc[df['sentiment'] == 'neutral']['news_len']
298
+
299
+
300
+ trace0 = go.Box(
301
+ y=y0,
302
+ name = 'positive',
303
+ marker = dict(
304
+ color = 'rgb(214, 12, 140)',
305
+ )
306
+ )
307
+ trace1 = go.Box(
308
+ y=y1,
309
+ name = 'negative',
310
+ marker = dict(
311
+ color = 'rgb(0, 128, 128)',
312
+ )
313
+ )
314
+ trace2 = go.Box(
315
+ y=y2,
316
+ name = 'neutral',
317
+ marker = dict(
318
+ color = 'rgb(10, 140, 208)',
319
+ )
320
+ )
321
+ data = [trace0, trace1, trace2]
322
+ layout = go.Layout(
323
+ title = "news length Boxplot by sentiment"
324
+ )
325
+ go.Figure(data=data,layout=layout)
326
+
327
+ xp = df.loc[df['sentiment'] == "positive", 'polarity']
328
+ xneu = df.loc[df['sentiment'] == "neutral", 'polarity']
329
+ xneg= df.loc[df['sentiment'] == "negative", 'polarity']
330
+
331
+ trace1 = go.Histogram(
332
+ x=xp, name='positive',
333
+ opacity=0.75
334
+ )
335
+ trace2 = go.Histogram(
336
+ x=xneu, name = 'neutral',
337
+ opacity=0.75
338
+ )
339
+ trace3 = go.Histogram(
340
+ x=xneg, name = 'negative',
341
+ opacity=0.75
342
+ )
343
+ data = [trace1, trace2,trace3]
344
+ layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
345
+ go.Figure(data=data, layout=layout)
346
+
347
+ trace1 = go.Scatter(
348
+ x=df['polarity'], y=df['news_len'], mode='markers', name='points',
349
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
350
+ )
351
+ trace2 = go.Histogram2dContour(
352
+ x=df['polarity'], y=df['news_len'], name='density', ncontours=50,
353
+ colorscale='Hot', reversescale=True, showscale=False
354
+ )
355
+ trace3 = go.Histogram(
356
+ x=df['polarity'], name='Sentiment polarity density',
357
+ marker=dict(color='rgb(102,0,0)'),
358
+ yaxis='y2'
359
+ )
360
+ trace4 = go.Histogram(
361
+ y=df['news_len'], name='news length density', marker=dict(color='rgb(102,0,0)'),
362
+ xaxis='x2'
363
+ )
364
+ data = [trace1, trace2, trace3, trace4]
365
+
366
+ layout = go.Layout(
367
+ showlegend=False,
368
+ autosize=False,
369
+ width=600,
370
+ height=550,
371
+ xaxis=dict(
372
+ domain=[0, 0.85],
373
+ showgrid=False,
374
+ zeroline=False
375
+ ),
376
+ yaxis=dict(
377
+ domain=[0, 0.85],
378
+ showgrid=False,
379
+ zeroline=False
380
+ ),
381
+ margin=dict(
382
+ t=50
383
+ ),
384
+ hovermode='x unified',
385
+ bargap=0,
386
+ xaxis2=dict(
387
+ domain=[0.85, 1],
388
+ showgrid=False,
389
+ zeroline=False
390
+ ),
391
+ yaxis2=dict(
392
+ domain=[0.85, 1],
393
+ showgrid=False,
394
+ zeroline=False
395
+ )
396
+ )
397
+
398
+ go.Figure(data=data, layout=layout)
399
+
400
+ trace1 = go.Scatter(
401
+ x=df['polarity'], y=df['word_count'], mode='markers', name='points',
402
+ marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
403
+ )
404
+ trace2 = go.Histogram2dContour(
405
+ x=df['polarity'], y=df['word_count'], name='density', ncontours=20,
406
+ colorscale='Hot', reversescale=True, showscale=False
407
+ )
408
+ trace3 = go.Histogram(
409
+ x=df['polarity'], name='Sentiment polarity density',
410
+ marker=dict(color='rgb(102,0,0)'),
411
+ yaxis='y2'
412
+ )
413
+ trace4 = go.Histogram(
414
+ y=df['word_count'], name='word count density', marker=dict(color='rgb(112,0,0)'),
415
+ xaxis='x2'
416
+ )
417
+ data = [trace1, trace2, trace3, trace4]
418
+
419
+ layout = go.Layout(
420
+ showlegend=False,
421
+ autosize=False,
422
+ width=600,
423
+ height=550,
424
+ xaxis=dict(
425
+ domain=[0, 0.85],
426
+ showgrid=False,
427
+ zeroline=False
428
+ ),
429
+ yaxis=dict(
430
+ domain=[0, 0.85],
431
+ showgrid=False,
432
+ zeroline=False
433
+ ),
434
+ margin=dict(
435
+ t=50
436
+ ),
437
+ hovermode='closest',
438
+ bargap=0,
439
+ xaxis2=dict(
440
+ domain=[0.85, 1],
441
+ showgrid=False,
442
+ zeroline=False
443
+ ),
444
+ yaxis2=dict(
445
+ domain=[0.85, 1],
446
+ showgrid=False,
447
+ zeroline=False
448
+ )
449
+ )
450
+
451
+ go.Figure(data=data, layout=layout)
452
+
453
# Requires the scattertext and spacy packages; install them beforehand:
#     pip install scattertext spacy
# (A bare "pip install ..." line is a shell command and a SyntaxError in a
# Python module.)
456
+
457
+ import scattertext as st
458
+ import spacy
459
+ nlp = spacy.blank("en")
460
+ nlp.add_pipe('sentencizer')
461
+ #nlp.add_pipe(nlp.create_pipe('sentencizer'))
462
+ corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
463
+ print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))
464
+
465
+ term_freq_df = corpus.get_term_freq_df()
466
+ term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
467
+ list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20])
468
+
469
+ term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
470
+ list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20])
471
+
472
+ term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
473
+ list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20])
474
+
475
+ from sklearn.feature_extraction.text import TfidfVectorizer
476
+ from sklearn.decomposition import TruncatedSVD
477
+ from collections import Counter
478
+
479
+ tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
480
+ reindexed_data = df['Review Text'].values
481
+ document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
482
+ n_topics = 10
483
+ lsa_model = TruncatedSVD(n_components=n_topics)
484
+ lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
485
+
486
+ def get_keys(topic_matrix):
487
+ '''
488
+ returns an integer list of predicted topic
489
+ categories for a given topic matrix
490
+ '''
491
+ keys = topic_matrix.argmax(axis=1).tolist()
492
+ return keys
493
+
494
+ def keys_to_counts(keys):
495
+ '''
496
+ returns a tuple of topic categories and their
497
+ accompanying magnitudes for a given list of keys
498
+ '''
499
+ count_pairs = Counter(keys).items()
500
+ categories = [pair[0] for pair in count_pairs]
501
+ counts = [pair[1] for pair in count_pairs]
502
+ return (categories, counts)
503
+
504
+ lsa_keys = get_keys(lsa_topic_matrix)
505
+ lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
506
+
507
+ def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer):
508
+ '''
509
+ returns a list of n_topic strings, where each string contains the n most common
510
+ words in a predicted category, in order
511
+ '''
512
+ top_word_indices = []
513
+ for topic in range(n_topics):
514
+ temp_vector_sum = 0
515
+ for i in range(len(keys)):
516
+ if keys[i] == topic:
517
+ temp_vector_sum += document_term_matrix[i]
518
+ temp_vector_sum = temp_vector_sum.toarray()
519
+ top_n_word_indices = np.flip(np.argsort(temp_vector_sum)[0][-n:],0)
520
+ top_word_indices.append(top_n_word_indices)
521
+ top_words = []
522
+ for topic in top_word_indices:
523
+ topic_words = []
524
+ for index in topic:
525
+ temp_word_vector = np.zeros((1,document_term_matrix.shape[1]))
526
+ temp_word_vector[:,index] = 1
527
+ the_word = tfidf_vectorizer.inverse_transform(temp_word_vector)[0][0]
528
+ topic_words.append(the_word.encode('ascii').decode('utf-8'))
529
+ top_words.append(" ".join(topic_words))
530
+ return top_words
531
+
532
+ top_lsa=get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
533
+
534
+ for i in range(len(top_lsa)):
535
+ print("Topic {}: ".format(i+1), top_lsa[i])
536
+
537
+ top_3_words = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
538
+ labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]
539
+ fig, ax = plt.subplots(figsize=(16,8))
540
+ ax.bar(lsa_categories, lsa_counts,color="skyblue");
541
+ ax.set_xticks(lsa_categories,);
542
+ ax.set_xticklabels(labels, rotation=45, rotation_mode='default',color="olive");
543
+ ax.set_ylabel('Number of review text on topics');
544
+ ax.set_title('Count of LSA topics');
545
+ plt.show();
546
+
547
+ """#---2----"""
548
+
549
+ df['sentiment'].value_counts()
550
+
551
+ from sklearn.model_selection import train_test_split
552
+ train,eva = train_test_split(df,test_size = 0.2)
553
+
554
# Requires the simpletransformers package; install it beforehand:
#     pip install simpletransformers
# ("!pip install ..." is IPython-only syntax and a SyntaxError in a .py file.)
555
+
556
+ from simpletransformers.classification import ClassificationModel
557
+
558
+ # Create a Transformer Model BERT
559
+ model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)
560
+
561
+ # 0,1,2 : positive,negative,neutral
562
+ def making_label(st):
563
+ if(st=='positive'):
564
+ return 0
565
+ elif(st=='neutral'):
566
+ return 2
567
+ else:
568
+ return 1
569
+
570
+ train['label'] = train['sentiment'].apply(making_label)
571
+ eva['label'] = eva['sentiment'].apply(making_label)
572
+ print(train.shape)
573
+
574
+ train_df = pd.DataFrame({
575
+ 'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
576
+ 'label': train['label'][:1500]
577
+ })
578
+
579
+ eval_df = pd.DataFrame({
580
+ 'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
581
+ 'label': eva['label'][-400:]
582
+ })
583
+
584
+ model.train_model(train_df)
585
+
586
+ result, model_outputs, wrong_predictions = model.eval_model(eval_df)
587
+
588
+ result
589
+
590
+ model_outputs
591
+
592
+ len(wrong_predictions)
593
+
594
# Turn the raw per-class scores returned by eval_model into class ids.
lst = [int(np.argmax(arr)) for arr in model_outputs]

true = eval_df['label'].tolist()
predicted = lst

# Importing the submodule explicitly guarantees sklearn.metrics is loaded;
# "import sklearn" alone does not.
import sklearn.metrics

# Class ids follow making_label: 0 = positive, 1 = negative, 2 = neutral.
# The report previously listed the names as positive/neutral/negative, which
# swapped the negative and neutral rows.
label_ids = [0, 1, 2]
label_names = ['positive', 'negative', 'neutral']

# labels= fixes the matrix at 3x3 even when a class is absent from the
# evaluation slice.
mat = sklearn.metrics.confusion_matrix(true, predicted, labels=label_ids)

df_cm = pd.DataFrame(mat, index=label_names, columns=label_names)

# fmt='d' prints the counts as plain integers.
sns.heatmap(df_cm, annot=True, fmt='d')
plt.show()

print(sklearn.metrics.classification_report(
    true, predicted, labels=label_ids, target_names=label_names))

# A bare expression shows nothing outside a notebook, so the score is printed.
print(sklearn.metrics.accuracy_score(true, predicted))
613
+
614
+ #Give your statement
615
+ def get_result(statement):
616
+ result = model.predict([statement])
617
+ pos = np.where(result[1][0] == np.amax(result[1][0]))
618
+ pos = int(pos[0])
619
+ sentiment_dict = {0:'positive',1:'negative',2:'neutral'}
620
+ print(sentiment_dict[pos])
621
+ return
622
+
623
+ ## neutral statement
624
+ get_result("According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .")
625
+
626
+ ## positive statement
627
+ get_result("According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .")
628
+
629
+ ## negative statement
630
+ get_result('Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .')
631
+
632
+ get_result("This company is growing like anything with 23% profit every year")
633
+
634
+ get_result("This company is not able to make any profit but make very less profit in last quarter")
635
+
636
+ get_result("The doctor treated well and the patient was very healthy")
637
+
638
+ get_result("the act of politicians is to serve and help needy and not to create ruck suck")
639
+
640
+ get_result("American burger is too good. Can't resisit to go and have one")
641
+
642
+ get_result("GDP per capita increased to double in India from 2013")
643
+
644
+ get_result("Indian economy is doing very good and will become super power one day.")
645
+
646
+ get_result("Indian economy is doing very good and will create millions of jobs in coming years")
647
+
648
+ get_result("Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years")
649
+
650
+ get_result("Indian economy is doing very good.Indian economy is not doing very good ")
651
+
652
+ get_result("Indian economy is not doing very good. Indian economy will bounce back to become leading economy")
653
+
654
+ get_result("Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export")
655
+
656
+ get_result("The stock market of Indian economy is dangling too much")
657
+
658
+ """#VADER"""
659
+
660
# Requires the vaderSentiment package; install it beforehand:
#     pip install vaderSentiment
# ("!pip install ..." is IPython-only syntax and a SyntaxError in a .py file.)
661
+
662
+ from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
663
+
664
+ obj = SentimentIntensityAnalyzer()
665
+
666
+ sentence = "Ram is really good "
667
+ sentiment_dict = obj.polarity_scores(sentence)
668
+ print(sentiment_dict)
669
+
670
+ #check this
671
+ sentence = "Ram is better "
672
+ sentiment_dict = obj.polarity_scores(sentence)
673
+ print(sentiment_dict)
674
+
675
+ sentence = "Rahul is really bad"
676
+ sentiment_dict = obj.polarity_scores(sentence)
677
+ print(sentiment_dict)
678
+
679
+ #punctuation
680
+ print(obj.polarity_scores('Ram is good boy'))
681
+ print(obj.polarity_scores('Ram is good boy!'))
682
+ print(obj.polarity_scores('Ram is good boy!!'))
683
+
684
+ #capitalization
685
+ print(obj.polarity_scores('Ram is good'))
686
+ print(obj.polarity_scores('Ram is GOOD'))
687
+
688
+ #degree
689
+ print(obj.polarity_scores('Ram is good'))
690
+ print(obj.polarity_scores('Ram is better'))
691
+ print(obj.polarity_scores('Ram is best'))
692
+
693
+ print(obj.polarity_scores('Ram is bad'))
694
+ print(obj.polarity_scores('Ram is worse'))
695
+ print(obj.polarity_scores('Ram is worst'))
696
+
697
+ #conjunction
698
+ print(obj.polarity_scores('Ram is good'))
699
+ print(obj.polarity_scores('Ram is good, but he is also naughty sometimes'))
700
+
701
+ #slang
702
+ print(obj.polarity_scores("That Hotel"))
703
+ print(obj.polarity_scores("That Hotel SUX"))
704
+ print(obj.polarity_scores("That Hotel SUCKS"))
705
+
706
+ #emoticons
707
+ print(obj.polarity_scores("Your :) is the most beautiful thing I have ever seen"))
708
+ print(obj.polarity_scores("Your smile is the most beautiful thing I have ever seen"))
709
+
710
+ print(obj.polarity_scores("Your :( is the worst thing I have ever seen"))
711
+ print(obj.polarity_scores("Your smile is the worst thing I have ever seen"))
712
+
713
+ #https://360digitmg.com/blog/bert-variants-and-their-differences
714
+ #https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
715
+
716
+ """#3.a Using FINBERT Model"""
717
+
718
+ #PPT
719
+ #https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
720
+
721
+ from transformers import BertTokenizer, BertForSequenceClassification, pipeline
722
+
723
+ # tested in transformers==4.18.0
724
+ import transformers
725
+ transformers.__version__
726
+
727
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
728
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
729
+
730
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
731
+ results = nlp(['growth is strong and we have plenty of liquidity.',
732
+ 'there is a shortage of capital, and we need extra financing.',
733
+ 'formulation patents might protect Vasotec to a limited extent.'])
734
+
735
+ results
736
+
737
+ """#FINBERT ESG"""
738
+
739
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-esg',num_labels=4)
740
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-esg')
741
+
742
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
743
+ results = nlp(['Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
744
+ 'Rhonda has been volunteering for several years for a variety of charitable community programs.',
745
+ 'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
746
+ 'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.'])
747
+
748
+ results
749
+
750
+ """#FINBERT Classification"""
751
+
752
+ finbert = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-fls',num_labels=3)
753
+ tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-fls')
754
+
755
+ nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
756
+ results = nlp(['we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
757
+ 'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
758
+ 'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in'])
759
+
760
+ results
761
+
762
+ X = df['Review Text'].to_list()
763
+ y = df['sentiment'].to_list()
764
+
765
+ from transformers import BertTokenizer, BertForSequenceClassification
766
+
767
+ finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
768
+ tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
769
+
770
+ labels = {0:'neutral', 1:'positive',2:'negative'}
771
+
772
+ sent_val = list()
773
+ for x in X:
774
+ inputs = tokenizer_whole(x, return_tensors="pt", padding=True)
775
+ outputs = finbert_whole(**inputs)[0]
776
+
777
+ val = labels[np.argmax(outputs.detach().numpy())]
778
+ print(x, '---->', val)
779
+ print('#######################################################')
780
+ sent_val.append(val)
781
+
782
+ from sklearn.metrics import accuracy_score
783
+ print(accuracy_score(y, sent_val))
784
+
785
+ """#Using DISTILBERT"""
786
+
787
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification

tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# The label map below has three entries, so the classification head is
# created with three outputs instead of relying on the library default.
# NOTE(review): "distilbert-base-uncased" is a base checkpoint; its
# classification head is presumably untrained here, so the accuracy below is
# only a baseline - confirm whether fine-tuning was intended.
model_distilbert = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=3
)

labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert = []
for x in X:
    # truncation=True keeps over-long texts within the model's input limit.
    inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True, truncation=True)
    outputs = model_distilbert(**inputs)[0]

    val = labels[int(np.argmax(outputs.detach().numpy()))]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert.append(val)

from sklearn.metrics import accuracy_score
# Score this model's own predictions; the previous call scored sent_val,
# i.e. the FinBERT predictions, again.
print(accuracy_score(y, sent_val_bert))
806
+
807
+ """#Bert"""
808
+
809
from transformers import BertTokenizer, BertForSequenceClassification

# "bert-base-uncased" is a BERT checkpoint, so it is loaded with the BERT
# classes; the DistilBERT classes used before do not match its architecture.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
# Three outputs to match the three-entry label map below.
# NOTE(review): the classification head of a base checkpoint is presumably
# untrained here, so the accuracy is only a baseline - confirm whether
# fine-tuning was intended.
model_bert = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=3
)

labels = {0: 'neutral', 1: 'positive', 2: 'negative'}

sent_val_bert1 = []
for x in X:
    # truncation=True keeps over-long texts within the model's input limit.
    inputs = tokenizer_bert(x, return_tensors="pt", padding=True, truncation=True)
    outputs = model_bert(**inputs)[0]

    val = labels[int(np.argmax(outputs.detach().numpy()))]
    print(x, '---->', val)
    print('#######################################################')
    sent_val_bert1.append(val)

from sklearn.metrics import accuracy_score
# Score this model's own predictions; the previous call scored sent_val,
# i.e. the FinBERT predictions, again.
print(accuracy_score(y, sent_val_bert1))