import re

import streamlit as st
import torch
from cleantext import clean
import hazm
from transformers import AutoTokenizer, AutoModelForSequenceClassification


def cleanhtml(raw_html):
    """Strip HTML tags from the input string."""
    cleanr = re.compile("<.*?>")
    return re.sub(cleanr, "", raw_html)


def cleaning(text):
    """Normalize Persian text: general cleaning, HTML stripping,
    hazm normalization, emoji/symbol removal, and whitespace collapsing."""
    text = text.strip()

    # regular cleaning (cleantext: punctuation, stopwords, stemming, extra spaces)
    text = clean(
        text,
        clean_all=True,
        punct=True,
        stopwords=True,
        stemming=True,
        extra_spaces=True,
    )

    # cleaning HTML tags
    text = cleanhtml(text)

    # normalizing with hazm (Persian-specific normalization)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks, etc.)
    weird_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u200d"
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\u3030"
        u"\ufe0f"
        u"\u2069"
        u"\u2066"
        # u"\u200c"  # ZWNJ is meaningful in Persian, so it is kept
        u"\u2068"
        u"\u2067"
        "]+",
        flags=re.UNICODE,
    )
    text = weird_pattern.sub(r"", text)

    # removing hashtag marks and collapsing extra spaces
    text = re.sub("#", "", text)
    text = re.sub(r"\s+", " ", text)

    return text


tokenizer = AutoTokenizer.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection")
model = AutoModelForSequenceClassification.from_pretrained("HamidRezaei/Persian-Offensive-Language-Detection")

st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
button = st.button("send")

if button and prompt:
    normalized_prompt = cleaning(prompt)
    encoding = tokenizer(normalized_prompt, return_tensors="pt")
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits

    # apply sigmoid + threshold
    # (assumes the classification head emits a single logit, so .item() is valid)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(logits.squeeze().cpu())
    score = probs.item()

    st.markdown(f"Offensive: score {score:.3f}" if score > 0.5 else f"Not Offensive: score {score:.3f}")
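
# ---------------------------------------------------------------------------
# Usage note (a minimal sketch; "app.py" is an assumed filename, not from the
# source, and "cleantext" refers to the PyPI package whose clean() accepts the
# clean_all/punct/stopwords flags used above):
#
#   $ pip install streamlit transformers torch cleantext hazm
#   $ streamlit run app.py
#
# Streamlit serves the app locally (http://localhost:8501 by default) and
# re-executes this whole script on every interaction, so the model above is
# rebuilt on each rerun. On Streamlit >= 1.18 the load could be wrapped in a
# cached helper, e.g. (load_model is a hypothetical name):
#
#   @st.cache_resource
#   def load_model():
#       name = "HamidRezaei/Persian-Offensive-Language-Detection"
#       return (AutoTokenizer.from_pretrained(name),
#               AutoModelForSequenceClassification.from_pretrained(name))
# ---------------------------------------------------------------------------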