import os
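# Whisper is installed at startup but is only exercised by the commented-out
# experiment inside classify_toxicity; live transcription uses the transformers pipeline below.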
os.system("pip install git+https://github.com/openai/whisper.git")
import evaluate
from evaluate.utils import launch_gradio_widget
import gradio as gr
import torch
from speechbrain.pretrained.interfaces import foreign_class
from transformers import AutoModelForSequenceClassification, pipeline, RobertaForSequenceClassification, RobertaTokenizer, AutoTokenizer
# pull in emotion detection
# --- Add element for specification
# pull in text classification
# --- Add custom labels
# --- Associate labels with radio elements
# add logic to initiate mock notification when detected
# pull in misophonia-specific model
# Building prediction function for gradio
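# Map the SpeechBrain IEMOCAP emotion codes to display labels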
emotion_dict = {
'sad': 'Sad',
'hap': 'Happy',
'ang': 'Anger',
'neu': 'Neutral'
}
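# With no model argument, the transformers ASR pipeline loads its default English
# checkpoint (facebook/wav2vec2-base-960h at the time of writing)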
pipe = pipeline("automatic-speech-recognition")
# Prediction function: receives the audio file, text, and selected class from the Gradio interface
def classify_toxicity(audio_file, text_input, classify_anxiety):
    # Transcribe the audio file using the ASR pipeline
    if audio_file is not None:
        '''whisper_model = WhisperModel.from_pretrained("openai/whisper-base")
        feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
        transcription_results = whisper_model.compute(uploaded=audio_file)
        audio = whisper.load_audio(audio_file)
        mel = whisper.log_mel_spectrogram(audio).to(model.device)
        _, probs = model.detect_language(mel)
        options = whisper.DecodingOptions(fp16=False)
        result = whisper.decode(model, mel, options)
        # Extract the transcribed text
        # transcribed_text = transcription_results["transcription"]
        '''
        # model = whisper.load_model("base")
        # transcribed_text = model.transcribe(audio_file)
        transcribed_text = pipe(audio_file)["text"]
        #### Emotion classification ####
        emotion_classifier = foreign_class(source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP", pymodule_file="custom_interface.py", classname="CustomEncoderWav2vec2Classifier")
        # audio_file is already a filepath string (type="filepath"), so pass it directly
        out_prob, score, index, text_lab = emotion_classifier.classify_file(audio_file)
    else:
        transcribed_text = text_input
        text_lab = None  # no audio, so no emotion prediction
    #### Toxicity Classifier ####
    toxicity_module = evaluate.load("toxicity", "facebook/roberta-hate-speech-dynabench-r4-target")
    # toxicity_module = evaluate.load("toxicity", 'DaNLP/da-electra-hatespeech-detection', module_type="measurement")
    toxicity_results = toxicity_module.compute(predictions=[transcribed_text])
    toxicity_score = toxicity_results["toxicity"][0]
    print(toxicity_score)
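    # Mock notification for the TODO above: a minimal sketch.
    # The 0.5 cutoff is an assumed threshold, not a tuned value.
    if toxicity_score > 0.5:
        print(f"Notification: potentially harmful content detected (toxicity={toxicity_score:.2f})")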
    #### Text classification #####
    # Run zero-shot classification on GPU when available
    device = 0 if torch.cuda.is_available() else -1
    text_classifier = pipeline("zero-shot-classification", model="MoritzLaurer/mDeBERTa-v3-base-mnli-xnli", device=device)
    sequence_to_classify = transcribed_text
    candidate_labels = classify_anxiety
    # classification_output = classifier(sequence_to_classify, candidate_labels, multi_label=False)
    classification_output = text_classifier(sequence_to_classify, candidate_labels, multi_label=False)
    print(classification_output)
    #### Emotion label (classified above when audio was provided) ####
    emotion = emotion_dict[text_lab[0]] if text_lab is not None else "N/A"
    return toxicity_score, classification_output, emotion, transcribed_text
    # return f"Toxicity Score ({available_models[selected_model]}): {toxicity_score:.4f}"
with gr.Blocks() as iface:
    with gr.Column():
        classify = gr.Radio(["racial identity hate", "LGBTQ+ hate", "sexually explicit", "misophonia"])
    with gr.Column():
        aud_input = gr.Audio(source="upload", type="filepath", label="Upload Audio File")
        text = gr.Textbox(label="Enter Text", placeholder="Enter text here...")
        submit_btn = gr.Button(value="Run")
    with gr.Column():
        out_toxicity = gr.Textbox(label="Toxicity Score")
        out_class = gr.Textbox(label="Classification Output")
        out_emotion = gr.Textbox(label="Emotion")
        out_transcript = gr.Textbox(label="Transcription")
    # classify_toxicity returns four values, so wire it to four output components
    submit_btn.click(fn=classify_toxicity, inputs=[aud_input, text, classify], outputs=[out_toxicity, out_class, out_emotion, out_transcript])
iface.launch()