"""SafeTalk: a Streamlit page that labels a user comment as toxic or not.

The same cleaned text is scored twice: once by a pickled classifier fed by a
pickled vectorizer, and once by a pretrained RuBERT sequence classifier.
"""
import pickle
import re
import string

import streamlit as st
import torch
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizerFast


def clean(text):
    """Return *text* lower-cased with links, mentions, hashtags and digits blanked.

    Every match is replaced by a single space (not deleted), so neighbouring
    words are never glued together.
    """
    text = text.lower()  # lower case
    text = re.sub(r'http\S+', " ", text)  # drop links
    text = re.sub(r'@\w+', ' ', text)  # drop user mentions
    text = re.sub(r'#\w+', ' ', text)  # drop hashtags
    text = re.sub(r'\d+', ' ', text)  # drop numbers
    return text


# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load artifacts that come from a trusted training run.

# Load the ML model weights.
model_filename = 'model_comments_weights.pkl'
with open(model_filename, 'rb') as file:
    model = pickle.load(file)

# Load the fitted vectorizer.
vectorizer_filename = 'vectorizer_comments_weights.pkl'
with open(vectorizer_filename, 'rb') as file:
    vectorizer = pickle.load(file)

# Pretrained RuBERT model; tokenizer and classifier share one checkpoint.
tokenizer_bert = BertTokenizerFast.from_pretrained('blanchefort/rubert-base-cased-sentiment-rusentiment')
model_bert = AutoModelForSequenceClassification.from_pretrained('blanchefort/rubert-base-cased-sentiment-rusentiment', return_dict=True)


def predict(text):
    """Return the RuBERT class index for *text* as a NumPy array.

    The text is tokenized (truncated to 512 tokens), passed through
    ``model_bert``, and the arg-max over the softmaxed logits is returned.
    A single string yields a one-element array.
    """
    inputs = tokenizer_bert(text, max_length=512, padding=True, truncation=True, return_tensors='pt')
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model_bert(**inputs)
    predicted = torch.nn.functional.softmax(outputs.logits, dim=1)
    predicted = torch.argmax(predicted, dim=1).numpy()
    return predicted


# The application itself.
st.title("SafeTalk")
st.write("Your Personal Comment Filter is an innovative application that harnesses the power of AI to distinguish toxic comments from the rest.")
st.write("Empowering users to navigate online discussions with confidence, SafeTalk ensures a more constructive and respectful online community by identifying and flagging harmful content.")

user_review = st.text_input("Enter your comment:", "")
user_review_clean = clean(user_review)

# Exactly one comment is scored, so each model yields exactly one label.
user_features = vectorizer.transform([user_review_clean])
prediction = model.predict(user_features)
prediction_bert = predict(user_review_clean)

st.write("Comment by ML model:", user_review)
if prediction[0] == 0:
    st.markdown("\n\nNon-toxic comment\n\n", unsafe_allow_html=True)
else:
    st.markdown("\n\nToxic comment\n\n", unsafe_allow_html=True)

st.write("Comment by RuBERT:", user_review)
# NOTE(review): the 0/1/other -> label mapping below is taken as written;
# confirm it against the checkpoint's id2label configuration.
if prediction_bert[0] == 0:
    st.markdown("\n\nControversial comment\n\n", unsafe_allow_html=True)
elif prediction_bert[0] == 1:
    st.markdown("\n\nNon-toxic comment\n\n", unsafe_allow_html=True)
else:
    st.markdown("\n\nToxic comment\n\n", unsafe_allow_html=True)