# app.py
import gradio as gr
import librosa
import numpy as np
import os
import tempfile
from collections import Counter
from speechbrain.inference.interfaces import foreign_class
import io
import matplotlib.pyplot as plt
import librosa.display
import soundfile as sf  # For writing temporary WAV files
from PIL import Image  # For image conversion
# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
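# NOISEREDUCE_AVAILABLE gates the optional denoising step in preprocess_audio below
# (noisereduce performs spectral-gating noise reduction).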
# Mapping from emotion labels to emojis
emotion_to_emoji = {
    "angry": "😠",
    "happy": "😊",
    "sad": "😢",
    "neutral": "😐",
    "excited": "😃",
    "fear": "😨",
    "disgust": "🤢",
    "surprise": "😲"
}
def add_emoji_to_label(label):
    """Append an emoji corresponding to the emotion label."""
    emoji = emotion_to_emoji.get(label.lower(), "")
    return f"{label.capitalize()} {emoji}"
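# Example: add_emoji_to_label("happy") -> "Happy 😊"; labels without a mapping get no emoji.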
# Load the pre-trained SpeechBrain classifier
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
)
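# classifier.classify_file returns a 4-tuple (probabilities, score, index, text label list);
# the code below unpacks the last element and takes its first entry as the predicted label.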
def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
    - Convert to 16 kHz mono.
    - Optionally apply noise reduction.
    - Peak-normalize the audio.
    Saves the processed audio to a temporary WAV file and returns its path;
    the caller is responsible for deleting that file.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(temp_file.name, y, sr)
    return temp_file.name
def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For longer audio files, split the audio into overlapping segments, predict each segment,
    and return the majority-voted emotion label.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label[0]
    step = segment_duration - overlap
    if step <= 0:
        # Guard against overlap >= segment_duration, which would otherwise produce no segments.
        step = segment_duration
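    # With the defaults (3.0 s segments, 1.0 s overlap) a 10 s clip is windowed at
    # start times 0, 2, 4, and 6 s.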
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        predictions.append(label[0])
        os.remove(temp_file)
        os.remove(seg)
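    # Majority vote across segment predictions; ties resolve to the label encountered first.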
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common
def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    Predict emotion from an audio file and return the emotion with an emoji.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            result = classifier.classify_file(temp_file)
            os.remove(temp_file)
            if isinstance(result, tuple) and len(result) > 3:
                label = result[3][0]  # Extract the predicted emotion label from the tuple
            else:
                label = str(result)
        return add_emoji_to_label(label.lower())
    except Exception as e:
        return f"Error processing file: {str(e)}"
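# Any failure (unreadable file, model error) is returned as text so it appears in the UI
# instead of raising.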
def plot_waveform(audio_file):
    """
    Generate and return a waveform plot image (as a PIL Image) for the given audio file.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    plt.figure(figsize=(10, 3))
    librosa.display.waveshow(y, sr=sr)
    plt.title("Waveform")
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)
    return Image.open(buf)
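# The plot is rendered to an in-memory PNG and returned as a PIL Image so it can feed
# gr.Image(type="pil") in the UI below.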
def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
    """
    Run emotion prediction and generate a waveform plot.
    Returns a tuple: (emotion label with emoji, waveform image as a PIL Image).
    """
    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
    waveform = plot_waveform(audio_file)
    return emotion, waveform
with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition</h1>")
    gr.Markdown(
        "Upload an audio file, and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
        "The prediction is accompanied by an emoji in the output, and you can also view the audio's waveform. "
        "Use the options below to adjust ensemble prediction and noise reduction settings."
    )
    with gr.Tabs():
        with gr.TabItem("Emotion Recognition"):
            with gr.Row():
                audio_input = gr.Audio(type="filepath", label="Upload Audio")
                use_ensemble = gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False)
                apply_noise_reduction = gr.Checkbox(label="Apply Noise Reduction", value=False)
            with gr.Row():
                segment_duration = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=3.0, label="Segment Duration (s)")
                overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
            predict_button = gr.Button("Predict Emotion")
            result_text = gr.Textbox(label="Predicted Emotion")
            waveform_image = gr.Image(label="Audio Waveform", type="pil")
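            # The five inputs are passed positionally and must match predict_and_plot's signature.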
            predict_button.click(
                predict_and_plot,
                inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
                outputs=[result_text, waveform_image]
            )
        with gr.TabItem("About"):
            gr.Markdown("""
            **Enhanced Emotion Recognition App**

            - **Model:** SpeechBrain's wav2vec2 model fine-tuned on IEMOCAP for emotion recognition.
            - **Features:**
                - Ensemble prediction for long audio files.
                - Optional noise reduction.
                - Visualization of the audio waveform.
                - Emoji representation of the predicted emotion in the output.

            **Credits:**
            - [SpeechBrain](https://speechbrain.github.io)
            - [Gradio](https://gradio.app)
            """)
if __name__ == "__main__":
    demo.launch()