File size: 2,890 Bytes
4bf6cb6
 
801d065
 
 
4bf6cb6
 
 
7a5f863
 
4bf6cb6
 
 
 
 
 
 
 
 
801d065
4bf6cb6
f7ce05d
4bf6cb6
 
801d065
4bf6cb6
 
f7ce05d
4bf6cb6
 
 
 
 
801d065
 
 
 
 
 
 
ab99c79
801d065
f7ce05d
 
 
 
4bf6cb6
 
 
2b0aaf5
801d065
 
2b0aaf5
801d065
4bf6cb6
f7ce05d
 
4bf6cb6
f7ce05d
801d065
f7ce05d
2b0aaf5
801d065
2b0aaf5
801d065
 
 
f7ce05d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast
import torch
import re
import string
import pickle
import streamlit as st

# Text-normalization helper applied to every comment before vectorization.
def clean(text):
    """Lowercase *text* and blank out links, @mentions, #hashtags and digits.

    Each removed token is replaced with a single space; remaining
    punctuation and letters are left untouched.
    """
    normalized = text.lower()
    # Order matters only for overlap safety: URLs first, then the rest.
    for pattern in (r'http\S+', r'@\w+', r'#\w+', r'\d+'):
        normalized = re.sub(pattern, ' ', normalized)
    return normalized

# Load the trained ML classifier (pickled scikit-learn estimator).
# NOTE(review): pickle.load executes arbitrary code from the file — safe only
# because these weight files ship with the app; never load untrusted pickles.
model_filename = 'model_comments_weights.pkl'
with open(model_filename, 'rb') as file:
    model = pickle.load(file)

# Load the fitted vectorizer. The pickle fully restores the fitted
# CountVectorizer, so instantiating a fresh (unfitted) one beforehand —
# as the original code did — was a dead assignment and has been removed.
vectorizer_filename = 'vectorizer_comments_weights.pkl'
with open(vectorizer_filename, 'rb') as file:
    vectorizer = pickle.load(file)

# Pre-trained ruBERT sentiment model (blanchefort/rubert-base-cased-sentiment-rusentiment).
# Streamlit re-executes the whole script on every widget interaction, so the
# tokenizer/model load is wrapped in st.cache_resource to download and build
# them only once per server process instead of on every rerun.
@st.cache_resource
def _load_rubert():
    """Return the (tokenizer, model) pair for the ruBERT sentiment checkpoint."""
    tokenizer = BertTokenizerFast.from_pretrained(
        'blanchefort/rubert-base-cased-sentiment-rusentiment')
    bert = AutoModelForSequenceClassification.from_pretrained(
        'blanchefort/rubert-base-cased-sentiment-rusentiment', return_dict=True)
    return tokenizer, bert

# Module-level names kept identical to the original for downstream code.
tokenizer_bert, model_bert = _load_rubert()




st.title("SafeTalk")
st.write("Your Personal Comment Filter is an innovative application that harnesses the power of AI to distinguish toxic comments from the rest.") 
st.write("Empowering users to navigate online discussions with confidence, SafeTalk ensures a more constructive and respectful online community by identifying and flagging harmful content.")
user_review = st.text_input("Enter your comment:", "")
user_review_clean = clean(user_review)
user_features = vectorizer.transform([user_review_clean])
prediction = model.predict(user_features)
inputs = tokenizer_bert(user_review_clean, max_length=512, padding=True, truncation=True, return_tensors='pt')
outputs = model_bert(**inputs)
prediction_bert = torch.nn.functional.softmax(outputs.logits, dim=1)
prediction_bert = torch.argmax(prediction_bert, dim=1).numpy()
st.write("Comment by ML model:", user_review)

if prediction == 0:
    st.markdown("<p style='color: green;'>Non-toxic comment</p>", unsafe_allow_html=True)
else:
    st.markdown("<p style='color: red;'>Toxic comment</p>", unsafe_allow_html=True)
st.write("Comment by RuBERT:", user_review)

if prediction_bert == 0:
    st.markdown("<p style='color: green;'>Controversial comment</p>", unsafe_allow_html=True)
elif prediction_bert == 1:
    st.markdown("<p style='color: red;'>Non-toxic comment</p>", unsafe_allow_html=True)
else:
    st.markdown("<p style='color: red;'>Toxic comment</p>", unsafe_allow_html=True)