# bert / app.py
# Pushpa's picture
# Update app.py
# c2cf2c0
# raw
# history blame
# 26.5 kB
# -*- coding: utf-8 -*-
"""Survey_Analysis_v_3_2_86.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1VOlSQ6kva-BiGfJc7b3BwlKBegP13tdS
"""
#1 - https://www.kaggle.com/code/ramjasmaurya/financial-sentiment-analysis
#2 - https://www.kaggle.com/code/adarshbiradar/sentiment-analysis-using-bert
import streamlit
#pip install pygal
#pip install squarify
# Commented out IPython magic to ensure Python compatibility.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import pygal as py
import squarify as sq
import matplotlib
import torch
# Global Matplotlib defaults shared by every chart drawn below.
plt.rcParams["figure.figsize"] = (20, 15)
matplotlib.rc('xtick', labelsize=7)
matplotlib.rc('ytick', labelsize=7)
# 'normal' is a font weight/style value, not a font family: Matplotlib cannot
# resolve it and falls back to its default family while emitting "findfont"
# warnings.  Name the generic family explicitly instead.
font = {'family': 'sans-serif',
        'weight': 'bold',
        'size': 5}
matplotlib.rc('font', **font)
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
# %matplotlib inline
# Load the labelled headlines.  The file appears to have no header row: its
# first record ends up as the column names and is re-added as data below.
df = pd.read_csv("/content/gen-data.csv", engine="python", encoding="ISO-8859-1")
df
col1 = df.keys()[0]
col2 = df.keys()[1]
col2
# Put the record that was consumed as the header back into the frame.
# DataFrame.append and set_axis(inplace=...) no longer exist in pandas 2.x, so
# use pd.concat and the non-mutating set_axis, which work on old and new pandas.
df2 = pd.DataFrame([[col1, col2]], columns=list([col1, col2]), index=[4845])
df = pd.concat([df, df2], ignore_index=True).set_axis(['sentiment', 'news'], axis=1)
df
# NOTE(review): this replacement maps "neutral" to itself and therefore changes
# nothing; kept as-is because the intended source value is not visible here.
df = df.replace("neutral", "neutral")
# Class balance of the sentiment labels.
sns.countplot(y="sentiment", data=df)
# Show the chart now; later seaborn/pandas calls draw on the current axes and
# would otherwise paint over it.
plt.show()
df.isnull().sum()
from textblob import TextBlob
def preprocess(ReviewText):
    """Strip HTML leftovers from a string Series and return the cleaned Series.

    Removes ``<br/>`` tags, whole ``<a ...>...</a>`` anchors and the ``&amp``,
    ``&gt`` and ``&lt`` entity prefixes, and turns non-breaking spaces into
    ordinary spaces.

    Args:
        ReviewText: pandas Series of strings (missing values pass through).

    Returns:
        A new Series of the same length with the substitutions applied.
    """
    # Every pattern is a regular expression.  ``regex=True`` is passed
    # explicitly because pandas >= 2.0 treats the pattern as a literal string
    # by default, which would make all of these substitutions silent no-ops.
    substitutions = (
        (r"(<br/>)", ""),
        (r"(<a).*(>).*(</a>)", ""),
        (r"(&amp)", ""),
        (r"(&gt)", ""),
        (r"(&lt)", ""),
        ("(\xa0)", " "),
    )
    for pattern, replacement in substitutions:
        ReviewText = ReviewText.str.replace(pattern, replacement, regex=True)
    return ReviewText
# Derived text features used by the exploratory charts below.
df['Review Text'] = preprocess(df['news'])
# Coerce to str before scoring, as the two length features below already do,
# so a missing or non-string cell does not make TextBlob raise.
df['polarity'] = df['news'].map(lambda text: TextBlob(str(text)).sentiment.polarity)
df['news_len'] = df['news'].astype(str).apply(len)
df['word_count'] = df['news'].apply(lambda x: len(str(x).split()))
df
# Print a few example reviews per polarity band.  DataFrame.sample raises when
# asked for more rows than exist, so each sample size is capped at the number
# of matching rows (an empty band simply prints nothing).
print('top 4 random reviews with the highest positive sentiment polarity: \n')
df1 = df.drop_duplicates(subset=['Review Text'])
_most_positive = df1.loc[df1.polarity == 1, ['Review Text']]
cl = _most_positive.sample(min(4, len(_most_positive))).values
for c in cl:
    print(c[0])
print('5 random reviews with the most neutral sentiment(zero) polarity: \n')
_neutral = df.loc[df.polarity == 0, ['Review Text']]
cl1 = _neutral.sample(min(5, len(_neutral))).values
for c in cl1:
    print(c[0])
print('5 reviews with the most negative polarity having polarity lesser than -0.80: \n')
_most_negative = df.loc[df.polarity <= -0.80, ['Review Text']]
cl3 = _most_negative.sample(min(5, len(_most_negative))).values
for c in cl3:
    print(c[0])
# Box plot of the polarity scores.  The column is passed by name: handing a
# Series positionally together with data= conflicts on current seaborn, where
# the first positional parameter is `data`.
sns.boxplot(x="polarity", data=df, palette="rainbow")
# Show before plotting again; Series.plot reuses the current axes and would
# otherwise draw the histogram on top of the box plot.
plt.show()
df['polarity'].plot(
    kind='hist',
    bins=50,
    color="peru",
    title='Sentiment Polarity Distribution')
plt.show()
# Number of labelled rows whose TextBlob polarity is positive, zero or negative.
p_s = df.loc[df["polarity"] > 0, "sentiment"].count()
neu_s = df.loc[df["polarity"] == 0, "sentiment"].count()
neg_s = df.loc[df["polarity"] < 0, "sentiment"].count()

# Wedge labels, sizes, colours and offsets, all in the same order.
sentiment = ['positive_sentiment', "neutral_sentiment", "negative_sentiment"]
values = [p_s, neu_s, neg_s]
colors = ['#FF0000', 'olive', '#FFFF00']
explode = (0.05, 0.05, 0.05)

# Pie chart with percentage annotations placed near the rim.
plt.pie(
    values,
    labels=sentiment,
    colors=colors,
    explode=explode,
    autopct='%1.1f%%',
    pctdistance=0.85,
)

# A white disc over the centre turns the pie into a donut.
centre_circle = plt.Circle((0, 0), 0.70, fc='white')
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('count of polarity as per sentiment')
# Display the chart.
plt.show()
# Distribution of words per headline.  Each chart is shown before the next one
# is drawn, because Series.plot and seaborn reuse the current axes and would
# otherwise stack the charts on top of each other.
df.plot.box(y=["word_count"], color="hotpink")
plt.show()
df['word_count'].plot(
    kind='hist',
    bins=100,
    color="orange",
    title='Review Text Word Count Distribution')
plt.show()
# Distribution of characters per headline.
sns.boxenplot(x="news_len", data=df)
plt.show()
# The title used to be a copy of the word-count one although this histogram
# shows the character length column.
df['news_len'].plot(
    kind='hist',
    bins=50,
    color="lightblue",
    title='Review Text Length Distribution')
plt.show()
# Length against word count, coloured by label, with marginal distributions.
fig = px.scatter(df, x="news_len", y="word_count", color="sentiment",
                 marginal_x="box", marginal_y="violin",
                 title="Click on the legend items!")
fig.show()
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens of ``corpus`` as ``(token, count)`` pairs.

    Counts come from a default ``CountVectorizer`` fitted on ``corpus``.  Pairs
    are ordered by descending count; ``n=None`` returns every token.
    """
    vectorizer = CountVectorizer().fit(corpus)
    # Column totals of the document-term matrix: one count per vocabulary entry.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (token, term_totals[0, column])
        for token, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent tokens, stop words included.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df1 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df1.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review before removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next bar chart be drawn over it.
plt.show()
df1
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent non-stop-word tokens of ``corpus``.

    Same as a plain token count, except that the ``CountVectorizer`` is built
    with ``stop_words='english'``.  The result is a list of ``(token, count)``
    pairs in descending count order; ``n=None`` returns every token.
    """
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    # Column totals of the document-term matrix: one count per vocabulary entry.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (token, term_totals[0, column])
        for token, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent tokens once English stop words are excluded.
common_words = get_top_n_words(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df2 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df2.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 words in review after removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next bar chart be drawn over it.
plt.show()
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent two-word sequences of ``corpus``.

    A ``CountVectorizer`` with ``ngram_range=(2, 2)`` does the counting.  The
    result is a list of ``(bigram, count)`` pairs in descending count order;
    ``n=None`` returns every bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    # Column totals of the document-term matrix: one count per bigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (bigram, term_totals[0, column])
        for bigram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent bigrams, stop words included.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df3 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df3.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review before removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next bar chart be drawn over it.
plt.show()
def get_top_n_bigram(corpus, n=None):
    """Return the ``n`` most frequent bigrams of ``corpus`` without stop words.

    The ``CountVectorizer`` uses ``ngram_range=(2, 2)`` and
    ``stop_words='english'``.  The result is a list of ``(bigram, count)``
    pairs in descending count order; ``n=None`` returns every bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    # Column totals of the document-term matrix: one count per bigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (bigram, term_totals[0, column])
        for bigram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent bigrams once English stop words are excluded.
common_words = get_top_n_bigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df4 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df4.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 bigrams in review after removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next bar chart be drawn over it.
plt.show()
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent three-word sequences of ``corpus``.

    A ``CountVectorizer`` with ``ngram_range=(3, 3)`` does the counting.  The
    result is a list of ``(trigram, count)`` pairs in descending count order;
    ``n=None`` returns every trigram.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    # Column totals of the document-term matrix: one count per trigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (trigram, term_totals[0, column])
        for trigram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent trigrams, stop words included.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df5 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df5.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review before removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next bar chart be drawn over it.
plt.show()
def get_top_n_trigram(corpus, n=None):
    """Return the ``n`` most frequent trigrams of ``corpus`` without stop words.

    The ``CountVectorizer`` uses ``ngram_range=(3, 3)`` and
    ``stop_words='english'``.  The result is a list of ``(trigram, count)``
    pairs in descending count order; ``n=None`` returns every trigram.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english').fit(corpus)
    # Column totals of the document-term matrix: one count per trigram.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = [
        (trigram, term_totals[0, column])
        for trigram, column in vectorizer.vocabulary_.items()
    ]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]
# Twenty most frequent trigrams once English stop words are excluded.
common_words = get_top_n_trigram(df['Review Text'], 20)
for word, freq in common_words:
    print(word, freq)
df6 = pd.DataFrame(common_words, columns=['ReviewText', 'count'])
df6.groupby('ReviewText').sum()['count'].sort_values(ascending=False).plot(
    kind='bar', title='Top 20 trigrams in review after removing stop words')
# Series.plot draws on the current axes when a figure is already open, so show
# this chart now instead of letting the next chart be drawn over it.
plt.show()
import nltk
# Corpora and models fetched for the part-of-speech tagging below.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
# Tag the actual review text.  str(Series) only yields pandas' printed summary
# (index labels, an elided middle and the Name/dtype footer), so tagging it
# would count tags over that printout instead of over the corpus.
blob = TextBlob(" ".join(df['Review Text'].astype(str)))
pos_df = pd.DataFrame(blob.tags, columns=['word', 'pos'])
# Keep the 20 most common tags.
pos_df = pos_df.pos.value_counts()[:20]
pos_df.plot(
    kind='bar',
    title='Top 20 Part-of-speech tagging for review corpus')
# Series.plot reuses the current axes; show this chart before anything else is drawn.
plt.show()
# Polarity scores split by the labelled sentiment, one box per class.
y0 = df.loc[df['sentiment'] == 'positive']['polarity']
y1 = df.loc[df['sentiment'] == 'negative']['polarity']
y2 = df.loc[df['sentiment'] == 'neutral']['polarity']
trace0 = go.Box(
    y=y0,
    name='positive',
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=y1,
    name='negative',
    marker=dict(color='rgb(0, 128, 128)'),
)
trace2 = go.Box(
    y=y2,
    name='neutral',
    marker=dict(color='rgb(10, 140, 208)'),
)
data = [trace0, trace1, trace2]
layout = go.Layout(
    title="Polarity Boxplot according to sentiment"
)
# A bare go.Figure(...) expression is only rendered by a notebook cell; in a
# script the figure is built and discarded.  Show it explicitly, as the
# px.scatter figure earlier in this file already does.
go.Figure(data=data, layout=layout).show()
# Headline length in characters split by the labelled sentiment, one box per class.
y0 = df.loc[df['sentiment'] == 'positive']['news_len']
y1 = df.loc[df['sentiment'] == 'negative']['news_len']
y2 = df.loc[df['sentiment'] == 'neutral']['news_len']
trace0 = go.Box(
    y=y0,
    name='positive',
    marker=dict(color='rgb(214, 12, 140)'),
)
trace1 = go.Box(
    y=y1,
    name='negative',
    marker=dict(color='rgb(0, 128, 128)'),
)
trace2 = go.Box(
    y=y2,
    name='neutral',
    marker=dict(color='rgb(10, 140, 208)'),
)
data = [trace0, trace1, trace2]
layout = go.Layout(
    title="news length Boxplot by sentiment"
)
# A bare go.Figure(...) expression is only rendered by a notebook cell; in a
# script the figure is built and discarded.  Show it explicitly, as the
# px.scatter figure earlier in this file already does.
go.Figure(data=data, layout=layout).show()
# Overlaid polarity histograms, one per labelled sentiment class.
xp = df.loc[df['sentiment'] == "positive", 'polarity']
xneu = df.loc[df['sentiment'] == "neutral", 'polarity']
xneg = df.loc[df['sentiment'] == "negative", 'polarity']
trace1 = go.Histogram(
    x=xp, name='positive',
    opacity=0.75
)
trace2 = go.Histogram(
    x=xneu, name='neutral',
    opacity=0.75
)
trace3 = go.Histogram(
    x=xneg, name='negative',
    opacity=0.75
)
data = [trace1, trace2, trace3]
layout = go.Layout(barmode='overlay', title='Distribution of Sentiment polarity')
# A bare go.Figure(...) expression is only rendered by a notebook cell; in a
# script the figure is built and discarded.  Show it explicitly, as the
# px.scatter figure earlier in this file already does.
go.Figure(data=data, layout=layout).show()
# Joint view of polarity (x) and headline length (y): a scatter with a density
# contour in the main panel and a marginal histogram along each edge.
trace1 = go.Scatter(
    x=df['polarity'], y=df['news_len'], mode='markers', name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
)
trace2 = go.Histogram2dContour(
    x=df['polarity'], y=df['news_len'], name='density', ncontours=50,
    colorscale='Hot', reversescale=True, showscale=False
)
# The marginals are bound to the secondary axes defined in the layout below.
trace3 = go.Histogram(
    x=df['polarity'], name='Sentiment polarity density',
    marker=dict(color='rgb(102,0,0)'),
    yaxis='y2'
)
trace4 = go.Histogram(
    y=df['news_len'], name='news length density', marker=dict(color='rgb(102,0,0)'),
    xaxis='x2'
)
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    # Main panel takes the lower-left 85 % of the canvas ...
    xaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    yaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    margin=dict(t=50),
    hovermode='x unified',
    bargap=0,
    # ... and the marginal histograms take the remaining 15 % strips.
    xaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
    yaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
)
# A bare go.Figure(...) expression is only rendered by a notebook cell; in a
# script the figure is built and discarded.  Show it explicitly, as the
# px.scatter figure earlier in this file already does.
go.Figure(data=data, layout=layout).show()
# Joint view of polarity (x) and word count (y): a scatter with a density
# contour in the main panel and a marginal histogram along each edge.
trace1 = go.Scatter(
    x=df['polarity'], y=df['word_count'], mode='markers', name='points',
    marker=dict(color='rgb(102,0,0)', size=2, opacity=0.4)
)
trace2 = go.Histogram2dContour(
    x=df['polarity'], y=df['word_count'], name='density', ncontours=20,
    colorscale='Hot', reversescale=True, showscale=False
)
# The marginals are bound to the secondary axes defined in the layout below.
trace3 = go.Histogram(
    x=df['polarity'], name='Sentiment polarity density',
    marker=dict(color='rgb(102,0,0)'),
    yaxis='y2'
)
trace4 = go.Histogram(
    y=df['word_count'], name='word count density', marker=dict(color='rgb(112,0,0)'),
    xaxis='x2'
)
data = [trace1, trace2, trace3, trace4]
layout = go.Layout(
    showlegend=False,
    autosize=False,
    width=600,
    height=550,
    # Main panel takes the lower-left 85 % of the canvas ...
    xaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    yaxis=dict(domain=[0, 0.85], showgrid=False, zeroline=False),
    margin=dict(t=50),
    hovermode='closest',
    bargap=0,
    # ... and the marginal histograms take the remaining 15 % strips.
    xaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
    yaxis2=dict(domain=[0.85, 1], showgrid=False, zeroline=False),
)
# A bare go.Figure(...) expression is only rendered by a notebook cell; in a
# script the figure is built and discarded.  Show it explicitly, as the
# px.scatter figure earlier in this file already does.
go.Figure(data=data, layout=layout).show()
#pip install scattertext
#pip install spacy
import scattertext as st
import spacy
# Blank English spaCy pipeline whose only added component is the sentencizer.
nlp = spacy.blank("en")
nlp.add_pipe('sentencizer')
#nlp.add_pipe(nlp.create_pipe('sentencizer'))
# Build a scattertext corpus from the cleaned text, using the sentiment label
# as the document category.
corpus = st.CorpusFromPandas(df, category_col='sentiment', text_col='Review Text', nlp=nlp).build()
# First 20 index entries of the scaled F-scores computed against scattertext's
# background corpus (presumably terms characteristic of this dataset — confirm
# the ordering against the scattertext docs).
print(list(corpus.get_scaled_f_scores_vs_background().index[:20]))
term_freq_df = corpus.get_term_freq_df()
# For each category: store its scaled F-score per term as a new column, then
# list the 20 highest-scoring terms.  The bare list(...) expressions only
# display in a notebook; in a script their value is discarded.
term_freq_df['positive_sentiment'] = corpus.get_scaled_f_scores('positive')
list(term_freq_df.sort_values(by='positive_sentiment', ascending=False).index[:20])
term_freq_df['neutral_sentiment'] = corpus.get_scaled_f_scores('neutral')
list(term_freq_df.sort_values(by='neutral_sentiment', ascending=False).index[:20])
term_freq_df['negative_sentiment'] = corpus.get_scaled_f_scores('negative')
list(term_freq_df.sort_values(by='negative_sentiment', ascending=False).index[:20])
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from collections import Counter
# TF-IDF representation of the cleaned text with English stop words removed.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True, smooth_idf=True)
reindexed_data = df['Review Text'].values
document_term_matrix = tfidf_vectorizer.fit_transform(reindexed_data)
# Latent semantic analysis: project the TF-IDF matrix onto n_topics components.
# NOTE(review): no random_state is passed to TruncatedSVD, so topic assignments
# may differ between runs — confirm whether reproducibility matters here.
n_topics = 10
lsa_model = TruncatedSVD(n_components=n_topics)
lsa_topic_matrix = lsa_model.fit_transform(document_term_matrix)
def get_keys(topic_matrix):
    """Return, for every row of ``topic_matrix``, the index of its largest entry.

    The result is a plain Python list with one integer per row, i.e. the
    predicted topic of each document.
    """
    dominant_topics = topic_matrix.argmax(axis=1)
    return dominant_topics.tolist()
def keys_to_counts(keys):
    """Tally how often each topic key occurs.

    Returns a ``(categories, counts)`` tuple of two parallel lists: the
    distinct keys in order of first appearance, and the number of occurrences
    of each.
    """
    tally = Counter(keys)
    return (list(tally.keys()), list(tally.values()))
# Dominant LSA component per document, and how many documents fall on each one.
lsa_keys = get_keys(lsa_topic_matrix)
lsa_categories, lsa_counts = keys_to_counts(lsa_keys)
def get_top_n_words(n, keys, document_term_matrix, tfidf_vectorizer, num_topics=None):
    """Return, per topic, a string with the ``n`` heaviest words of that topic.

    For every topic the rows of ``document_term_matrix`` assigned to it are
    summed, and the ``n`` terms with the largest summed weight are joined with
    spaces, heaviest first.

    Args:
        n: number of words to report for each topic.
        keys: topic index assigned to each document (one entry per matrix row).
        document_term_matrix: document-by-term weight matrix (sparse or dense).
        tfidf_vectorizer: fitted vectorizer; only its ``vocabulary_`` mapping
            (term -> column index) is used.
        num_topics: number of topics to report.  Defaults to the module-level
            ``n_topics`` so existing four-argument calls behave as before.

    Returns:
        A list of ``num_topics`` strings.  A topic with no documents yields an
        empty string instead of raising.
    """
    if num_topics is None:
        num_topics = n_topics

    # Column index -> term.  One dictionary lookup per word replaces building a
    # one-hot vector and calling inverse_transform for each word.
    index_to_word = {
        column: word for word, column in tfidf_vectorizer.vocabulary_.items()
    }
    keys_array = np.asarray(keys)

    top_words = []
    for topic in range(num_topics):
        member_rows = np.flatnonzero(keys_array == topic)
        if member_rows.size == 0:
            # No document landed on this topic; previously this crashed on
            # calling .toarray() on the integer accumulator.
            top_words.append("")
            continue
        weights = np.asarray(document_term_matrix[member_rows].sum(axis=0)).ravel()
        top_n_word_indices = np.flip(np.argsort(weights)[-n:], 0)
        # Words are kept as-is: the former ASCII encode/decode round-trip
        # raised UnicodeEncodeError on any accented token.
        top_words.append(" ".join(index_to_word[index] for index in top_n_word_indices))
    return top_words
# Three heaviest words for every LSA topic.
top_lsa = get_top_n_words(3, lsa_keys, document_term_matrix, tfidf_vectorizer)
for i, topic_words in enumerate(top_lsa):
    print("Topic {}: ".format(i+1), topic_words)
# Same arguments as the call above, so reuse its result instead of rescanning
# the whole document-term matrix a second time.
top_3_words = top_lsa
labels = ['Topic {}: \n'.format(i+1) + top_3_words[i] for i in lsa_categories]
# Bar chart of how many documents fall on each topic.
fig, ax = plt.subplots(figsize=(16,8))
ax.bar(lsa_categories, lsa_counts, color="skyblue")
ax.set_xticks(lsa_categories)
ax.set_xticklabels(labels, rotation=45, rotation_mode='default', color="olive")
ax.set_ylabel('Number of review text on topics')
ax.set_title('Count of LSA topics')
plt.show()
"""#---2----"""
# Class distribution of the labels before splitting.
df['sentiment'].value_counts()
from sklearn.model_selection import train_test_split
# 80/20 train/evaluation split.
# NOTE(review): no random_state or stratify is given, so the split differs
# between runs — confirm whether that is intended.
train,eva = train_test_split(df,test_size = 0.2)
#pip install simpletransformers
from simpletransformers.classification import ClassificationModel
# Create a Transformer Model BERT
# Three output classes, CPU only (use_cuda=False).
model = ClassificationModel('bert', 'bert-base-cased', num_labels=3, args={'reprocess_input_data': True, 'overwrite_output_dir': True},use_cuda=False)
# Label ids produced by making_label: 0 = positive, 1 = negative (and any other value), 2 = neutral
def making_label(st):
    """Map a sentiment name to its integer class id.

    ``'positive'`` -> 0, ``'neutral'`` -> 2, and every other value
    (including ``'negative'``) -> 1.
    """
    if st == 'positive':
        return 0
    if st == 'neutral':
        return 2
    return 1
# Attach the integer class id to both splits.
train['label'] = train['sentiment'].apply(making_label)
eva['label'] = eva['sentiment'].apply(making_label)
print(train.shape)
# Frames with the two columns the model is fed ('text', 'label').  Only the
# first 1500 training rows and the last 400 evaluation rows are used, and the
# pattern r'\n' in the text is replaced by a space.
train_df = pd.DataFrame({
'text': train['news'][:1500].replace(r'\n', ' ', regex=True),
'label': train['label'][:1500]
})
eval_df = pd.DataFrame({
'text': eva['news'][-400:].replace(r'\n', ' ', regex=True),
'label': eva['label'][-400:]
})
# Fine-tune on the training subset, then score the evaluation subset.
model.train_model(train_df)
result, model_outputs, wrong_predictions = model.eval_model(eval_df)
# Bare expressions below only display in a notebook.
result
model_outputs
len(wrong_predictions)
# Predicted class id per evaluation row: the index of the largest model output.
lst = [np.argmax(arr) for arr in model_outputs]
true = eval_df['label'].tolist()
predicted = lst
# Importing the submodule explicitly guarantees sklearn.metrics is loaded.
import sklearn.metrics
# Class ids and their names in id order, matching making_label
# (0 = positive, 1 = negative, 2 = neutral).  The report used to list
# 'neutral' and 'negative' the other way round, mislabelling both rows.
label_ids = [0, 1, 2]
label_names = ['positive', 'negative', 'neutral']
# Passing labels= keeps the matrix 3x3 even if a class never occurs.
mat = sklearn.metrics.confusion_matrix(true, predicted, labels=label_ids)
mat
df_cm = pd.DataFrame(mat, index=label_names, columns=label_names)
sns.heatmap(df_cm, annot=True)
plt.show()
print(sklearn.metrics.classification_report(true, predicted, labels=label_ids, target_names=label_names))
sklearn.metrics.accuracy_score(true, predicted)
#Give your statement
def get_result(statement):
    """Print the sentiment the fine-tuned module-level ``model`` assigns to ``statement``.

    Args:
        statement: a single sentence to classify.

    Returns:
        None. The predicted label ('positive', 'negative' or 'neutral') is
        printed.
    """
    result = model.predict([statement])
    # result[1][0] is used as the per-class score vector of the single input.
    scores = result[1][0]
    # argmax yields one plain index even when two scores tie; the previous
    # int(np.where(...)[0]) raised on ties and relies on converting a
    # one-element array to int, which newer NumPy versions deprecate.
    pos = int(np.argmax(scores))
    sentiment_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    print(sentiment_dict[pos])
    return
# Spot checks of the fine-tuned classifier, run in the listed order.
_example_statements = (
    # neutral statement
    "According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing .",
    # positive statement
    "According to the company 's updated strategy for the years 2009-2012 , Basware targets a long-term net sales growth in the range of 20 % -40 % with an operating profit margin of 10 % -20 % of net sales .",
    # negative statement
    'Sales in Finland decreased by 2.0 % , and international sales decreased by 9.3 % in terms of euros , and by 15.1 % in terms of local currencies .',
    # free-form statements from outside the training data
    "This company is growing like anything with 23% profit every year",
    "This company is not able to make any profit but make very less profit in last quarter",
    "The doctor treated well and the patient was very healthy",
    "the act of politicians is to serve and help needy and not to create ruck suck",
    "American burger is too good. Can't resisit to go and have one",
    "GDP per capita increased to double in India from 2013",
    "Indian economy is doing very good and will become super power one day.",
    "Indian economy is doing very good and will create millions of jobs in coming years",
    "Indian economy is not doing very good and need urgent reforms but we are pretty sure it will be very good in coming years",
    "Indian economy is doing very good.Indian economy is not doing very good ",
    "Indian economy is not doing very good. Indian economy will bounce back to become leading economy",
    "Indian economy is not doing very good. Urgent reforms is required to create new jobs and improve export",
    "The stock market of Indian economy is dangling too much",
)
for _statement in _example_statements:
    get_result(_statement)
"""#VADER"""
#pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
obj = SentimentIntensityAnalyzer()

# Three warm-up sentences; the second one is flagged "check this" in the notebook.
for sentence in ("Ram is really good ", "Ram is better ", "Rahul is really bad"):
    sentiment_dict = obj.polarity_scores(sentence)
    print(sentiment_dict)

# Groups of sentences that vary one feature each, so the printed scores can be
# compared within a group.
_vader_examples = (
    # punctuation
    ('Ram is good boy', 'Ram is good boy!', 'Ram is good boy!!'),
    # capitalization
    ('Ram is good', 'Ram is GOOD'),
    # degree
    ('Ram is good', 'Ram is better', 'Ram is best',
     'Ram is bad', 'Ram is worse', 'Ram is worst'),
    # conjunction
    ('Ram is good', 'Ram is good, but he is also naughty sometimes'),
    # slang
    ("That Hotel", "That Hotel SUX", "That Hotel SUCKS"),
    # emoticons
    ("Your :) is the most beautiful thing I have ever seen",
     "Your smile is the most beautiful thing I have ever seen",
     "Your :( is the worst thing I have ever seen",
     "Your smile is the worst thing I have ever seen"),
)
for _group in _vader_examples:
    for _text in _group:
        print(obj.polarity_scores(_text))
#https://360digitmg.com/blog/bert-variants-and-their-differences
#https://simpletransformers.ai/docs/classification-specifics/#supported-model-types Official reference
"""#3.a Using FINBERT Model"""
#PPT
#https://medium.com/@benjamin_joesy/finbert-financial-sentiment-analysis-with-bert-acf695b64ac6
from transformers import BertTokenizer, BertForSequenceClassification, pipeline
# tested in transformers==4.18.0
import transformers
transformers.__version__
# Three-label FinBERT tone checkpoint wrapped in a text-classification pipeline.
_tone_checkpoint = 'yiyanghkust/finbert-tone'
finbert = BertForSequenceClassification.from_pretrained(_tone_checkpoint, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(_tone_checkpoint)
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
# Sample sentences to classify.
_tone_sentences = [
    'growth is strong and we have plenty of liquidity.',
    'there is a shortage of capital, and we need extra financing.',
    'formulation patents might protect Vasotec to a limited extent.',
]
results = nlp(_tone_sentences)
results
"""#FINBERT ESG"""
# Four-label FinBERT ESG checkpoint wrapped in a text-classification pipeline.
_esg_checkpoint = 'yiyanghkust/finbert-esg'
finbert = BertForSequenceClassification.from_pretrained(_esg_checkpoint, num_labels=4)
tokenizer = BertTokenizer.from_pretrained(_esg_checkpoint)
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
# Sample sentences to classify.
_esg_sentences = [
    'Managing and working to mitigate the impact our operations have on the environment is a core element of our business.',
    'Rhonda has been volunteering for several years for a variety of charitable community programs.',
    'Cabot\'s annual statements are audited annually by an independent registered public accounting firm.',
    'As of December 31, 2012, the 2011 Term Loan had a principal balance of $492.5 million.',
]
results = nlp(_esg_sentences)
results
"""#FINBERT Classification"""
# Three-label FinBERT FLS checkpoint wrapped in a text-classification pipeline.
_fls_checkpoint = 'yiyanghkust/finbert-fls'
finbert = BertForSequenceClassification.from_pretrained(_fls_checkpoint, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(_fls_checkpoint)
nlp = pipeline("text-classification", model=finbert, tokenizer=tokenizer)
# Sample sentences to classify.
_fls_sentences = [
    'we expect the age of our fleet to enhance availability and reliability due to reduced downtime for repairs.',
    'on an equivalent unit of production basis, general and administrative expenses declined 24 percent from 1994 to $.67 per boe.',
    'we will continue to assess the need for a valuation allowance against deferred tax assets considering all available evidence obtained in',
]
results = nlp(_fls_sentences)
results
# Score every cleaned headline with the pre-trained FinBERT tone model and
# compare the predictions with the dataset labels.
X = df['Review Text'].to_list()
y = df['sentiment'].to_list()
from transformers import BertTokenizer, BertForSequenceClassification
finbert_whole = BertForSequenceClassification.from_pretrained('yiyanghkust/finbert-tone',num_labels=3)
tokenizer_whole = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')
# Output index -> label name used for this checkpoint.
labels = {0:'neutral', 1:'positive',2:'negative'}
sent_val = list()
# Inference only: no_grad avoids building an autograd graph for every sentence.
with torch.no_grad():
    for x in X:
        # Truncate to the 512-token limit of BERT so that an unusually long
        # text cannot crash the forward pass.
        inputs = tokenizer_whole(x, return_tensors="pt", padding=True,
                                 truncation=True, max_length=512)
        outputs = finbert_whole(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val.append(val)
from sklearn.metrics import accuracy_score
print(accuracy_score(y, sent_val))
"""#Using DISTILBERT"""
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
tokenizer_distilbert = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# num_labels=3 gives the head one output per entry of `labels`; the default
# two-output head could never predict index 2.
# NOTE(review): this is the base checkpoint, not one fine-tuned for sentiment,
# so its classification head is untrained — treat the score as a baseline.
model_distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=3)
labels = {0:'neutral', 1:'positive',2:'negative'}
sent_val_bert = list()
# Inference only: no_grad avoids building an autograd graph for every sentence.
with torch.no_grad():
    for x in X:
        inputs = tokenizer_distilbert(x, return_tensors="pt", padding=True,
                                      truncation=True, max_length=512)
        outputs = model_distilbert(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val_bert.append(val)
from sklearn.metrics import accuracy_score
# Score this model's own predictions; the previous code re-scored `sent_val`,
# i.e. the FinBERT predictions from the section above.
print(accuracy_score(y, sent_val_bert))
"""#Bert"""
# Load the BERT checkpoint with the BERT classes.  The DistilBERT classes used
# before expect a different architecture and do not load these weights properly.
tokenizer_bert = BertTokenizer.from_pretrained("bert-base-uncased")
# num_labels=3 gives the head one output per entry of `labels`.
# NOTE(review): this is the base checkpoint, not one fine-tuned for sentiment,
# so its classification head is untrained — treat the score as a baseline.
model_bert = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
labels = {0:'neutral', 1:'positive',2:'negative'}
sent_val_bert1 = list()
# Inference only: no_grad avoids building an autograd graph for every sentence.
with torch.no_grad():
    for x in X:
        inputs = tokenizer_bert(x, return_tensors="pt", padding=True,
                                truncation=True, max_length=512)
        outputs = model_bert(**inputs)[0]
        val = labels[int(np.argmax(outputs.detach().numpy()))]
        print(x, '---->', val)
        print('#######################################################')
        sent_val_bert1.append(val)
from sklearn.metrics import accuracy_score
# Score this model's own predictions; the previous code re-scored `sent_val`,
# i.e. the FinBERT predictions from an earlier section.
print(accuracy_score(y, sent_val_bert1))