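"""Gradio app for audio emotion recognition.

Record or upload speech; the app classifies its emotional content with a
wav2vec2 model fine-tuned for English speech emotion recognition and shows
per-emotion confidence scores as text and as a bar plot.
"""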
import gradio as gr
import numpy as np
import pandas as pd  # needed to feed gr.BarPlot a DataFrame
import torch
import librosa
from transformers import pipeline

class EmotionRecognizer:
    def __init__(self):
        # Load the audio-classification pipeline, on GPU if one is available.
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1
        )
        # wav2vec2-based models expect 16 kHz mono input.
        self.sample_rate = 16000
    def process_audio(self, audio_input):
        try:
            # Gradio's "numpy" audio type yields a (sample_rate, data) tuple,
            # where data is typically an int16 array.
            sample_rate, audio_data = audio_input

            # Convert stereo to mono if necessary.
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)

            # Convert to float32 and normalize peaks to [-1, 1].
            audio_data = audio_data.astype(np.float32)
            peak = np.max(np.abs(audio_data))
            if peak > 1.0:
                audio_data = audio_data / peak

            # Resample to the model's expected rate if necessary.
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(
                    y=audio_data,
                    orig_sr=sample_rate,
                    target_sr=self.sample_rate
                )

            # Pad clips shorter than one second; keep at most ten seconds.
            if len(audio_data) < self.sample_rate:
                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
            elif len(audio_data) > 10 * self.sample_rate:
                audio_data = audio_data[:10 * self.sample_rate]

            # Run the classifier; the pipeline accepts a dict with "array"
            # and "sampling_rate" keys.
            result = self.classifier({"array": audio_data, "sampling_rate": self.sample_rate})
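
            # `result` is a list of {"label", "score"} dicts sorted by score;
            # the labels and values below are illustrative only:
            #   [{"label": "happy", "score": 0.71}, {"label": "neutral", "score": 0.12}, ...]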
            # Format results as readable text.
            emotions_text = "\n".join(
                f"{pred['label']}: {pred['score'] * 100:.2f}%"
                for pred in result
            )

            # gr.BarPlot expects a DataFrame whose columns match the plot's
            # x/y settings, not a plain dict.
            plot_data = pd.DataFrame({
                "Emotion": [pred["label"] for pred in result],
                "Confidence": [pred["score"] * 100 for pred in result],
            })

            return emotions_text, plot_data
        except Exception as e:
            print(f"Error details: {e}")
            return f"Error processing audio: {e}", None

def create_interface():
    recognizer = EmotionRecognizer()

    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
        return recognizer.process_audio(audio)

    with gr.Blocks() as interface:
        gr.Markdown("# Audio Emotion Recognition")
        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"],
                )
                analyze_btn = gr.Button("Analyze Emotion")
                gr.Markdown("Note: Audio will be automatically converted to mono and resampled if needed.")
            with gr.Column():
                output_text = gr.Textbox(
                    label="Results",
                    lines=5
                )
                # x/y name the DataFrame columns produced in process_audio.
                output_plot = gr.BarPlot(
                    x="Emotion",
                    y="Confidence",
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)"
                )

        analyze_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot]
        )

    return interface

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
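
# Note: running this app requires gradio, numpy, pandas, torch, librosa, and
# transformers to be installed (e.g. listed in the Space's requirements.txt);
# version pins are left unspecified here.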