Hugging Face's logo
Hugging Face
Search models, datasets, users...
Models
Datasets
Spaces
Posts
Docs
Solutions
Pricing


Spaces:

Asa-AI-Lab
/
Offensive-Detection-Space

private

Logs
App
Files
Community
Settings
Offensive-Detection-Space
/
app.py

hafez97's picture
hafez97
Update app.py
b244916
verified
13 days ago
raw

Copy download link
history
blame
edit
delete

2.96 kB
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import os
import torch

from cleantext import clean
import hazm
import re

def cleanhtml(raw_html):
    """Return *raw_html* with every ``<...>`` span removed.

    The match is non-greedy, so each tag-like span is dropped on its own and
    the text between two tags is kept. ``.`` does not match a newline here,
    so a span whose ``<`` and ``>`` sit on different lines is left untouched.
    """
    return re.sub('<.*?>', '', raw_html)


# Characters stripped by ``cleaning``: emoji and pictograph ranges plus a few
# invisible formatting marks. Compiled once at import time instead of on
# every call. U+200C (zero-width non-joiner) is intentionally NOT in the
# class -- presumably because Persian spelling relies on it; confirm before
# adding it.
_WEIRD_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            u"\U0001f926-\U0001f937"
                            u'\U00010000-\U0010ffff'
                            u"\u200d"
                            u"\u2640-\u2642"
                            u"\u2600-\u2B55"
                            u"\u23cf"
                            u"\u23e9"
                            u"\u231a"
                            u"\u3030"
                            u"\ufe0f"
                            u"\u2069"
                            u"\u2066"
                            u"\u2068"
                            u"\u2067"
                            "]+", flags=re.UNICODE)


def cleaning(text):
    """Normalize a raw user message before it is tokenized.

    Steps, applied in this order:

    1. Strip leading/trailing whitespace.
    2. Run ``cleantext.clean`` with the flags below (what each flag does is
       defined by that library).
    3. Remove ``<...>`` spans via ``cleanhtml``.
    4. Apply ``hazm.Normalizer().normalize``.
    5. Delete every character matched by ``_WEIRD_PATTERN``.
    6. Drop ``#`` characters and collapse each whitespace run to one space.

    Args:
        text: The raw message string.

    Returns:
        The cleaned string.
    """
    text = text.strip()

    # regular cleaning
    text = clean(text,
                 clean_all=True,
                 punct=True,
                 stopwords=True,
                 stemming=True,
                 extra_spaces=True
                 )

    # cleaning htmls
    text = cleanhtml(text)

    # normalizing
    text = hazm.Normalizer().normalize(text)

    # removing weird patterns (emoji, pictographs, directional marks)
    text = _WEIRD_PATTERN.sub('', text)

    # removing hashtags, then collapsing extra whitespace; the pattern is a
    # raw string so that ``\s`` is not an invalid string escape
    text = text.replace("#", "")
    text = re.sub(r"\s+", " ", text)

    return text

@st.cache_resource
def _load_classifier(token):
    """Load the tokenizer and classification model, cached across reruns.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps one loaded copy per distinct ``token`` so the
    weights are not reloaded each time.

    Args:
        token: Hugging Face access token, or ``None`` when the environment
            variable is unset.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    repo_id = "HamidRezaei/Persian-Offensive-Language-Detection-Lora"
    loaded_tokenizer = AutoTokenizer.from_pretrained(repo_id, token=token)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(repo_id, token=token)
    return loaded_tokenizer, loaded_model


access_token = os.getenv('ACCESS_TOKEN')
tokenizer, model = _load_classifier(access_token)

st.title("Offensive or Not?")
prompt = st.text_area(label="Send a message")
# The click result is not consulted below; pressing the button simply makes
# Streamlit rerun the script with the current text-area content.
button = st.button("send")

if prompt:
    normalized_prompt = cleaning(prompt)

    # truncation=True caps the input at the tokenizer's configured maximum
    # length so an over-long message does not fail inside the model
    # (assumes the tokenizer defines a model max length -- TODO confirm).
    encoding = tokenizer(normalized_prompt, return_tensors="pt", truncation=True)
    encoding = {k: v.to(model.device) for k, v in encoding.items()}

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**encoding)
    logits = outputs.logits

    # apply sigmoid + threshold
    # NOTE(review): .item() requires exactly one value, so this assumes the
    # model emits a single logit per input -- confirm against the model config.
    probs = torch.sigmoid(logits.squeeze().cpu())
    score = probs.item()
    st.markdown(f"Offensive: score {score}" if score > 0.5 else f"Not Offensive: score {score}")