HamidRezaei committed on
Commit
bcb1984
·
verified ·
1 Parent(s): aec4a54

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -0
app.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Hugging Face's logo
2
+ Hugging Face
3
+ Search models, datasets, users...
4
+ Models
5
+ Datasets
6
+ Spaces
7
+ Posts
8
+ Docs
9
+ Solutions
10
+ Pricing
11
+
12
+
13
+
14
+ Spaces:
15
+
16
+ Asa-AI-Lab
17
+ /
18
+ Offensive-Detection-Space
19
+
20
+ private
21
+
22
+ Logs
23
+ App
24
+ Files
25
+ Community
26
+ Settings
27
+ Offensive-Detection-Space
28
+ /
29
+ app.py
30
+
31
+ hafez97's picture
32
+ hafez97
33
+ Update app.py
34
+ b244916
35
+ verified
36
+ 13 days ago
37
+ raw
38
+
39
+ Copy download link
40
+ history
41
+ blame
42
+ edit
43
+ delete
44
+
45
+ 2.96 kB
46
+ import streamlit as st
47
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
48
+ import os
49
+ import torch
50
+
51
+ from cleantext import clean
52
+ import hazm
53
+ import re
54
+
55
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span removed.

    The match is non-greedy, so each removed span ends at the first ``>``
    that follows its ``<``. ``.`` does not match a newline here, so a
    ``<``/``>`` pair separated by a line break is left in place.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
59
+
60
+
61
# Characters removed by ``cleaning``: emoji, pictographs, directional
# isolates and assorted symbols. Compiled once at import time rather than on
# every call.
# NOTE(review): the \U000024C2-\U0001F251 and \U00010000-\U0010ffff ranges are
# much wider than emoji alone. They are kept unchanged so the text fed to the
# model does not shift -- confirm against the training-time preprocessing
# before narrowing them.
_WEIRD_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "\U0001f926-\U0001f937"
    "\U00010000-\U0010ffff"
    "\u200d"
    "\u2640-\u2642"
    "\u2600-\u2B55"
    "\u23cf"
    "\u23e9"
    "\u231a"
    "\u3030"
    "\ufe0f"
    "\u2069"
    "\u2066"
    # U+200C (zero-width non-joiner) is intentionally NOT in this class --
    # presumably because Persian spelling relies on it; confirm before adding.
    "\u2068"
    "\u2067"
    "]+",
    flags=re.UNICODE,
)


def cleaning(text):
    """Clean and normalize raw text before it is tokenized.

    Steps, in order:
      1. strip leading/trailing whitespace;
      2. run ``clean`` (from ``cleantext``) with ``clean_all``, ``punct``,
         ``stopwords``, ``stemming`` and ``extra_spaces`` enabled;
      3. remove ``<...>`` spans via ``cleanhtml``;
      4. apply ``hazm.Normalizer().normalize``;
      5. delete every character matched by ``_WEIRD_PATTERN``;
      6. drop ``#`` characters and collapse each whitespace run to one space.

    Args:
        text: The input string.

    Returns:
        The cleaned string. A single leading or trailing space can remain
        after step 6, because no final strip is applied.
    """
    text = text.strip()

    # regular cleaning
    text = clean(
        text,
        clean_all=True,
        punct=True,
        stopwords=True,
        stemming=True,
        extra_spaces=True,
    )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns
    text = _WEIRD_PATTERN.sub("", text)

    # removing hashtags, then extra spaces; the raw string avoids the
    # invalid "\s" escape sequence that newer Python versions warn about
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text
112
+
113
access_token = os.getenv('ACCESS_TOKEN')


@st.cache_resource
def _load_model(model_id, token):
    """Load the tokenizer and classifier once and reuse them across reruns.

    Streamlit re-executes this script on every widget interaction; without
    caching, both ``from_pretrained`` calls would run again each time.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_id, token=token)
    return loaded_tokenizer, loaded_model


tokenizer, model = _load_model(
    "HamidRezaei/Persian-Offensive-Language-Detection-Lora", access_token
)

st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
# The button's return value is not checked below: clicking it only triggers a
# rerun, and scoring happens whenever the text area is non-empty.
button = st.button("send")

if prompt:
    normalized_prompt = cleaning(prompt)

    # truncation=True keeps over-long messages within the model's maximum
    # input length instead of failing inside the forward pass.
    encoding = tokenizer(normalized_prompt, return_tensors="pt", truncation=True)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits

    # apply sigmoid + threshold
    # NOTE(review): .item() requires a single-element tensor, so this assumes
    # the model emits exactly one logit -- confirm against the model config.
    probs = torch.sigmoid(logits.squeeze().cpu())
    score = probs.item()
    st.markdown(f"Offensive: score {score}" if score > 0.5 else f"Not Offensive: score {score}")
135
+