import gradio as gr
import torch
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import librosa

# Base XLSR-53 checkpoint: a self-supervised speech encoder with no emotion head.
# The 7-way classification head created by num_labels=7 is randomly initialized,
# so the model needs to be fine-tuned on an emotion dataset (or replaced with an
# already fine-tuned checkpoint) before its predictions are meaningful.
model_name = "facebook/wav2vec2-large-xlsr-53"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=7)
model.eval()

# The base checkpoint ships no CTC tokenizer, so the feature extractor alone is
# used to prepare inputs (Wav2Vec2Processor would also expect a tokenizer).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)

# Map the 7 class indices produced by the classifier to emotion labels.
emotion_map = {
    0: "Neutral",
    1: "Happy",
    2: "Angry",
    3: "Sad",
    4: "Surprised",
    5: "Fearful",
    6: "Disgusted",
}

def recognize_emotion(audio):
    # Gradio passes the recording as a file path; None means nothing was recorded.
    if audio is None:
        return "No audio received."

    # Load the recording and resample it to the 16 kHz rate wav2vec 2.0 expects.
    audio_input, _ = librosa.load(audio, sr=16000)

    # Convert the raw waveform into model-ready tensors.
    inputs = feature_extractor(audio_input, sampling_rate=16000, return_tensors="pt", padding=True)

    # Run the classifier without tracking gradients.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Pick the highest-scoring class and translate it to its emotion label.
    predicted_class = torch.argmax(logits, dim=-1).item()
    return emotion_map[predicted_class]

# gr.inputs was removed in recent Gradio releases; gr.Audio with a sources list
# is the current way to request microphone input as a file path.
iface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs="text",
    title="Speech Emotion Recognition",
    description="Identify the emotion in the speech: Happy, Sad, Angry, Surprised, Neutral, Fearful, or Disgusted.",
)
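
# A minimal way to exercise the pipeline without the web UI, assuming a local
# recording at the hypothetical path "sample.wav":
# print(recognize_emotion("sample.wav"))
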
iface.launch()