Spaces:
Runtime error
Runtime error
File size: 3,377 Bytes
df00128 3328b56 df00128 3328b56 df00128 3328b56 df00128 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 |
import numpy as np # For linear algebra
import pandas as pd # Data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # For Visualisation
import seaborn as sns # For Visualisation
from bs4 import BeautifulSoup # For Text Parsing
# # IMPORTING DATASET
# Read the raw reviews and immediately discard every row that has a missing
# value in any column, so the steps below never see NaN.
data = pd.read_csv('Reviews.csv').dropna()
# data
# # DATA PREPROCESSING & VISUALISATION
#data.isnull().sum()
#data.shape
score_unique = data['Score'].unique()
#print(score_unique)
# Map each star rating onto a three-class sentiment label:
# 0 -> NEGATIVE REVIEW (Score < 3)
# 1 -> NEUTRAL REVIEW  (Score == 3)
# 2 -> POSITIVE REVIEW (Score > 3)
# Rows with missing values were dropped above, so every score lands in exactly
# one bucket and `a` always has one label per row.
scores = data['Score'].to_numpy()
labels = np.select([scores < 3, scores == 3], [0, 1], default=2)
a = labels.tolist()
# Per-class totals; minlength=3 keeps the unpacking valid even when a class
# has no rows at all.
r_0, r_1, r_2 = np.bincount(labels, minlength=3).tolist()
# print('Negative Reviews:',r_0)
# print('Neutral Reviews:',r_1)
# print('Positive Reviews:',r_2)
# sns.countplot(a)
# plt.xlabel('Reviews', color = 'red')
# plt.ylabel('Count', color = 'red')
# plt.xticks([0,1,2],['Negative','Neutral','Positive'])
# plt.title('COUNT PLOT', color = 'r')
# plt.show()
data['sentiment'] = a
#data
#data
final_dataset = data[['Text', 'sentiment']]
#final_dataset
# Keep only the clearly positive (2) and clearly negative (0) reviews;
# neutral reviews (1) are not used for training.
data_p = final_dataset[final_dataset['sentiment'] == 2]
data_n = final_dataset[final_dataset['sentiment'] == 0]
#len(data_p), len(data_n)
# Draw 5000 rows from each class, with replacement (the same row can be picked
# more than once). The bounds come from the actual frame sizes rather than
# hard-coded row counts, so a different or re-filtered CSV cannot index out of
# range, and row 0 is now eligible too.
# NOTE: np.random.randint raises ValueError if either class is empty.
datap = data_p.iloc[np.random.randint(0, len(data_p), 5000), :]
datan = data_n.iloc[np.random.randint(0, len(data_n), 5000), :]
#len(datan), len(datap)
data = pd.concat([datap, datan])
#len(data)
# Collapse the labels to binary: 0 stays negative, 2 becomes 1 (positive).
# Only 0 and 2 are present after the filtering above.
c = [1 if label == 2 else 0 for label in data['sentiment']]
data['sentiment'] = c
def strip_html(text):
    """Return the text content of *text* with its HTML markup removed.

    The input is parsed with BeautifulSoup's "html.parser" backend and the
    result of ``get_text()`` on the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()
# Store the HTML-free text in a new 'review' column, then discard the raw
# 'Text' column, which is no longer needed.
data['review'] = data['Text'].apply(strip_html)
data = data.drop(columns=['Text'])
#data.head()
import nltk #Natural Language Processing Toolkit
def punc_clean(text):
    """Return *text* with every ASCII punctuation character removed.

    The characters removed are exactly those in ``string.punctuation``; all
    other characters (letters, digits, whitespace, non-ASCII) are kept in
    their original order.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    import string as st

    # A deletion table lets str.translate drop the characters in one pass
    # instead of testing and re-joining every character in Python.
    return text.translate(str.maketrans('', '', st.punctuation))
# Strip punctuation from every review, element by element.
data['review'] = data['review'].map(punc_clean)
#data.head(2)
from functools import lru_cache


@lru_cache(maxsize=1)
def _stopwords_keeping_not():
    """Return NLTK's English stop words, minus 'not', as a frozenset.

    The corpus is read once and cached, so calling remove_stopword on many
    rows does not reload the word list each time.
    """
    return frozenset(nltk.corpus.stopwords.words('english')) - {'not'}


def remove_stopword(text):
    """Return *text* with English stop words removed.

    The text is tokenised with ``nltk.word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the rest are joined with single
    spaces. 'not' is deliberately kept, presumably because negation matters
    for sentiment.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stopword = _stopwords_keeping_not()
    kept = [w for w in nltk.word_tokenize(text) if w not in stopword]
    return ' '.join(kept)
#data['review'] = data['review'].apply(remove_stopword)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# TF-IDF features over unigrams and bigrams; min_df=1 keeps every term that
# occurs in at least one review.
vectr = TfidfVectorizer(min_df=1, ngram_range=(1, 2))
# Learn the vocabulary and build the document-term matrix in a single call.
vect_X = vectr.fit_transform(data['review'])

# Train a logistic-regression classifier on the binary sentiment labels.
# `clf` is bound to whatever fit() returns, as in the original script.
model = LogisticRegression()
clf = model.fit(vect_X, data['sentiment'])
#clf.score(vect_X,data['sentiment'])*100
# # PREDICTION
# clf.predict(vectr.transform(['''Nice look and build quality with moderately fast everything such as refresh rate, display quality, sound, processing, gaming experience and many more .. I didn't find any lagging or heating issue..And battery health I won't say great but I'll take that, Only cons I can say about it is camera.. sharpening picture a little much at day light and low light photo you have to compromise.''']))
# clf.predict(vectr.transform(['''Phone has bugs , and screen quality is poor , Avoid realme. Gaming was just over hyped''']))
# clf.predict(vectr.transform(['''No lags found super speed and very good performance nice phone in this budget''']))
|