import os
os.system("pip install git+https://github.com/openai/whisper.git")
import evaluate
import gradio as gr
import torch
import whisper
import classify
from whisper.tokenizer import get_tokenizer
from speechbrain.pretrained.interfaces import foreign_class
from transformers import pipeline, WhisperTokenizer
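
# Note: besides the Whisper install above, this app assumes evaluate, gradio, torch,
# speechbrain, and transformers are already available (presumably via the Space's
# requirements.txt), and that classify.py sits next to this file.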


# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model

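# Cache of loaded Whisper models keyed by model name, so the misophonia branch
# does not reload weights on every request.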
model_cache = {}

# Building the prediction function for Gradio
# Mapping from the SpeechBrain IEMOCAP emotion codes to display labels
emo_dict = {
    'sad': 'Sad', 
    'hap': 'Happy',
    'ang': 'Anger',
    'neu': 'Neutral'
}

# static classes for now, but it would be best to have the user select from multiple, and to enter their own
class_options = {
    "racism": ["racism", "hate speech", "bigotry", "racially targeted", "racially diminutive", "racial slur", "ethnic slur", "ethnic hate", "pro-white nationalism"],
    "LGBTQ+ hate": ["gay slur", "trans slur", "homophobic slur", "transphobia", "anti-LGBTQ+", "hate speech"],
    "sexually explicit": ["sexually explicit", "sexually coercive", "sexual exploitation", "vulgar", "raunchy", "sexually demeaning", "sexual violence", "victim blaming"],
    "misophonia": ["chewing", "breathing", "mouthsounds", "popping", "sneezing", "yawning", "smacking", "sniffling", "panting"]
}

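# Whisper large ASR pipeline used to transcribe uploaded audio before any text analysis.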
pipe = pipeline("automatic-speech-recognition", model="openai/whisper-large")

# Prediction function wired to the Gradio interface defined below
def classify_toxicity(audio_file, text_input, classify_anxiety):
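    """Analyze uploaded audio (or typed text) for the selected category.

    Audio is transcribed with Whisper. For the toxicity-style categories this scores
    toxicity, zero-shot labels, and speech emotion; for "misophonia" it scores the
    audio against trigger-sound classes using the helpers in classify.py.
    """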
    # Transcribe the audio file using Whisper ASR
    if audio_file is not None:
        transcribed_text = pipe(audio_file)["text"]
        
        #### Emotion classification (audio only) ####
        emotion_classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
        out_prob, score, index, text_lab = emotion_classifier.classify_file(audio_file)

    else:
        transcribed_text = text_input
        text_lab = None  # no audio input, so no speech-emotion prediction
    if classify_anxiety != "misophonia":
        #### Toxicity Classifier ####
            
        toxicity_module = evaluate.load("toxicity", "facebook/roberta-hate-speech-dynabench-r4-target", module_type="measurement")
        #toxicity_module = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection', module_type="measurement")
    
        toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
     
        toxicity_score = toxicity_results["toxicity"][0]
        print(toxicity_score)
    
        #### Text classification #####
    
        device = 0 if torch.cuda.is_available() else -1

        text_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=device)
    
        sequence_to_classify = transcribed_text
        print(classify_anxiety, class_options)
        candidate_labels = class_options.get(classify_anxiety, [])
        # classification_output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
        classification_output = text_classifier(sequence_to_classify, candidate_labels, multi_label=True)
        print(classification_output)
    
        #### Combine results ####
        # The interface below has a single output textbox, so pack everything into one string.
        emotion_label = emo_dict[text_lab[0]] if text_lab is not None else "N/A"
        return (
            f"Toxicity score: {toxicity_score:.4f}\n"
            f"Category scores: {classification_output}\n"
            f"Emotion: {emotion_label}\n"
            f"Transcription: {transcribed_text}"
        )
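    # Misophonia path: compare the audio against trigger-sound class names using
    # Whisper's audio features and internal language model (helpers in classify.py).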
    else:
        # Use the misophonia trigger-sound labels defined in class_options as the classes.
        class_names = class_options.get(classify_anxiety, classify_anxiety.split(","))
        # tokenizer = get_tokenizer(multilingual=".en" not in model_name)
        tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-large")
        model_name = "large"

        if model_name not in model_cache:
            model = whisper.load_model(model_name)
            model_cache[model_name] = model
        else:
            model = model_cache[model_name]

        internal_lm_average_logprobs = classify.calculate_internal_lm_average_logprobs(
            model=model,
            class_names=class_names,
            tokenizer=tokenizer,
        )
        audio_features = classify.calculate_audio_features(audio_file, model)
        average_logprobs = classify.calculate_average_logprobs(
            model=model,
            audio_features=audio_features,
            class_names=class_names,
            tokenizer=tokenizer,
        )
        average_logprobs -= internal_lm_average_logprobs
        scores = average_logprobs.softmax(-1).tolist()
        return {class_name: score for class_name, score in zip(class_names, scores)}
     
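# Gradio UI: category radio, audio/text inputs, and a single results textbox. The radio
# is named anxiety_class rather than "classify" so it does not shadow the classify module.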
with gr.Blocks() as iface:
    with gr.Column():
        anxiety_class = gr.Radio(["racism", "LGBTQ+ hate", "sexually explicit", "misophonia"], label="Category")
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
        submit_btn = gr.Button("Run")
    with gr.Column():
        out_text = gr.Textbox(label="Results")
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, text, anxiety_class], outputs=out_text)

iface.launch()