danielcd99 committed on
Commit
14536de
1 Parent(s): ae778b8

feat: added main files

Files changed (2)
  1. app.py +30 -0
  2. preprocess_data.py +81 -0
app.py ADDED
@@ -0,0 +1,30 @@
+import streamlit as st
+import pandas as pd
+from preprocess_data import preprocess_text, get_stopwords
+from datasets import load_dataset
+
+dataset = load_dataset('danielcd99/imdb')
+
+# Convert each dataset split to a pandas DataFrame
+dataframes = {}
+for split in dataset.keys():
+    dataframes[split] = dataset[split].to_pandas()
+
+TITLE_TEXT = "IMDB reviews"
+DESCRIPTION_TEXT = (
+    "This is an application for our NLP course project. We use the IMDb "
+    "reviews dataset, with 50,000 comments split between positive and "
+    "negative (the dataset is balanced). Through this interface you can "
+    "see how the examples in our test set were classified by a BERT "
+    "model trained for this task."
+)
+
+st.title(TITLE_TEXT)
+st.write(DESCRIPTION_TEXT)
+
+if st.button('Find examples!'):
+    # Sample five reviews from the test split, as described above
+    df = dataframes['test'].sample(5)
+    get_stopwords()  # make sure the NLTK stopword list is available
+    df['preprocessed_review'] = df['review'].apply(preprocess_text)
+    cols = ['review', 'preprocessed_review', 'sentiment']
+    st.table(df[cols])
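For reference: with streamlit, pandas, datasets, and nltk installed, the app above should be launchable locally with the standard Streamlit CLI command "streamlit run app.py" (the environment setup itself is not part of this commit).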
preprocess_data.py ADDED
@@ -0,0 +1,81 @@
+import re
+import nltk
+from nltk.corpus import stopwords
+from nltk.stem import PorterStemmer
+
+
+def lowercase_text(text):
+    return text.lower()
+
+def remove_html(text):
+    # Strip HTML tags such as <br />
+    return re.sub(r'<[^<]+?>', '', text)
+
+def remove_url(text):
+    return re.sub(r'http[s]?://\S+|www\.\S+', '', text)
+
+def remove_punctuations(text):
+    # Replace every ASCII punctuation character with a space
+    punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
+    for char in punctuations:
+        text = text.replace(char, ' ')
+    return text
+
+def remove_emojis(text):
+    # Unicode ranges covering common emoji and pictograph blocks
+    emojis = re.compile("["
+                        u"\U0001F600-\U0001F64F"  # emoticons
+                        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
+                        u"\U0001F680-\U0001F6FF"  # transport & map symbols
+                        u"\U0001F1E0-\U0001F1FF"  # flags
+                        u"\U00002500-\U00002BEF"
+                        u"\U00002702-\U000027B0"  # dingbats
+                        u"\U000024C2-\U0001F251"
+                        u"\U0001f926-\U0001f937"
+                        u"\U00010000-\U0010ffff"
+                        u"\u2640-\u2642"
+                        u"\u2600-\u2B55"
+                        u"\u200d"
+                        u"\u23cf"
+                        u"\u23e9"
+                        u"\u231a"
+                        u"\ufe0f"
+                        u"\u3030"
+                        "]+", re.UNICODE)
+    return emojis.sub('', text)
+
+def remove_stop_words(text):
+    # Drop English stopwords (requires nltk's 'stopwords' corpus)
+    stop_words = set(stopwords.words('english'))
+    return ' '.join(word for word in text.split() if word not in stop_words)
+
+def stem_words(text):
+    # Reduce each word to its Porter stem
+    stemmer = PorterStemmer()
+    return ' '.join(stemmer.stem(word) for word in text.split())
+
+def get_stopwords():
+    # Download the NLTK stopword list (needed once per environment)
+    nltk.download('stopwords')
+
+def preprocess_text(text):
+    # Full cleaning pipeline: lowercase, strip markup, URLs,
+    # punctuation, emojis and stopwords, then stem
+    text = lowercase_text(text)
+    text = remove_html(text)
+    text = remove_url(text)
+    text = remove_punctuations(text)
+    text = remove_emojis(text)
+    text = remove_stop_words(text)
+    text = stem_words(text)
+    return text
+
+if __name__ == "__main__":
+    pass
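For illustration, a minimal sketch of how preprocess_text behaves end to end. The sample review below is made up, not taken from the dataset, and the exact output depends on NLTK's stopword list and Porter stemmer:

    from preprocess_data import preprocess_text, get_stopwords

    get_stopwords()  # fetch the NLTK stopword list first

    sample = "This movie was <br /> AMAZING!!! Loved every minute... 10/10"
    print(preprocess_text(sample))
    # roughly: "movi amaz love everi minut 10 10"

The HTML tag, punctuation, and stopwords ("this", "was") are removed, and the surviving words are lowercased and stemmed, which matches how app.py builds the preprocessed_review column.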