# app.py
import gradio as gr
import librosa
import numpy as np
import os
import tempfile
from collections import Counter
from speechbrain.inference.interfaces import foreign_class
import io
import matplotlib.pyplot as plt
import librosa.display
import soundfile as sf  # For writing temporary WAV files
from PIL import Image  # For image conversion
# Try to import noisereduce (if not available, noise reduction will be skipped)
try:
    import noisereduce as nr
    NOISEREDUCE_AVAILABLE = True
except ImportError:
    NOISEREDUCE_AVAILABLE = False
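# NOISEREDUCE_AVAILABLE gates the optional denoising step in preprocess_audio below
# (noisereduce performs spectral-gating noise reduction).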
# Mapping from emotion labels to emojis
emotion_to_emoji = {
    "angry": "😠",
    "happy": "😊",
    "sad": "😢",
    "neutral": "😐",
    "excited": "😃",
    "fear": "😨",
    "disgust": "🤢",
    "surprise": "😲"
}
def add_emoji_to_label(label):
    """Append an emoji corresponding to the emotion label."""
    emoji = emotion_to_emoji.get(label.lower(), "")
    return f"{label.capitalize()} {emoji}"
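# Example: add_emoji_to_label("happy") -> "Happy 😊"; labels without a mapping get no emoji.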
# Load the pre-trained SpeechBrain classifier
classifier = foreign_class(
    source="speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    run_opts={"device": "cpu"}  # Change to {"device": "cuda"} if GPU is available
)
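# classifier.classify_file returns a 4-tuple (probabilities, score, index, text label list);
# the code below unpacks the last element and takes its first entry as the predicted label.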
def preprocess_audio(audio_file, apply_noise_reduction=False):
    """
    Load and preprocess the audio file:
    - Convert to 16 kHz mono.
    - Optionally apply noise reduction.
    - Peak-normalize the audio.
    Saves the processed audio to a temporary WAV file and returns its path;
    the caller is responsible for deleting that file.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    if apply_noise_reduction and NOISEREDUCE_AVAILABLE:
        y = nr.reduce_noise(y=y, sr=sr)
    if np.max(np.abs(y)) > 0:
        y = y / np.max(np.abs(y))
    temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
    sf.write(temp_file.name, y, sr)
    return temp_file.name
def ensemble_prediction(audio_file, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    For longer audio files, split the audio into overlapping segments, predict each segment,
    and return the majority-voted emotion label.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    total_duration = librosa.get_duration(y=y, sr=sr)
    if total_duration <= segment_duration:
        temp_file = preprocess_audio(audio_file, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        os.remove(temp_file)
        return label[0]
    step = segment_duration - overlap
    if step <= 0:
        # Guard against overlap >= segment_duration, which would otherwise produce no segments.
        step = segment_duration
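    # With the defaults (3.0 s segments, 1.0 s overlap) a 10 s clip is windowed at
    # start times 0, 2, 4, and 6 s.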
    segments = []
    for start in np.arange(0, total_duration - segment_duration + 0.001, step):
        start_sample = int(start * sr)
        end_sample = int((start + segment_duration) * sr)
        segment_audio = y[start_sample:end_sample]
        temp_seg = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
        sf.write(temp_seg.name, segment_audio, sr)
        segments.append(temp_seg.name)
    predictions = []
    for seg in segments:
        temp_file = preprocess_audio(seg, apply_noise_reduction)
        _, _, _, label = classifier.classify_file(temp_file)
        predictions.append(label[0])
        os.remove(temp_file)
        os.remove(seg)
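    # Majority vote across segment predictions; ties resolve to the label encountered first.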
    vote = Counter(predictions)
    most_common = vote.most_common(1)[0][0]
    return most_common
def predict_emotion(audio_file, use_ensemble=False, apply_noise_reduction=False, segment_duration=3.0, overlap=1.0):
    """
    Predict emotion from an audio file and return the emotion with an emoji.
    """
    try:
        if use_ensemble:
            label = ensemble_prediction(audio_file, apply_noise_reduction, segment_duration, overlap)
        else:
            temp_file = preprocess_audio(audio_file, apply_noise_reduction)
            result = classifier.classify_file(temp_file)
            os.remove(temp_file)
            if isinstance(result, tuple) and len(result) > 3:
                label = result[3][0]  # Extract the predicted emotion label from the tuple
            else:
                label = str(result)
        return add_emoji_to_label(label.lower())
    except Exception as e:
        return f"Error processing file: {str(e)}"
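# Any failure (unreadable file, model error) is returned as text so it appears in the UI
# instead of raising.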
def plot_waveform(audio_file):
    """
    Generate and return a waveform plot image (as a PIL Image) for the given audio file.
    """
    y, sr = librosa.load(audio_file, sr=16000, mono=True)
    plt.figure(figsize=(10, 3))
    librosa.display.waveshow(y, sr=sr)
    plt.title("Waveform")
    buf = io.BytesIO()
    plt.savefig(buf, format="png")
    plt.close()
    buf.seek(0)
    return Image.open(buf)
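# The plot is rendered to an in-memory PNG and returned as a PIL Image so it can feed
# gr.Image(type="pil") in the UI below.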
def predict_and_plot(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap):
    """
    Run emotion prediction and generate a waveform plot.
    Returns a tuple: (emotion label with emoji, waveform image as a PIL Image).
    """
    emotion = predict_emotion(audio_file, use_ensemble, apply_noise_reduction, segment_duration, overlap)
    waveform = plot_waveform(audio_file)
    return emotion, waveform
with gr.Blocks(css=".gradio-container {background-color: #f7f7f7; font-family: Arial;}") as demo:
    gr.Markdown("<h1 style='text-align: center;'>Enhanced Emotion Recognition</h1>")
    gr.Markdown(
        "Upload an audio file, and the model will predict the emotion using a wav2vec2 model fine-tuned on IEMOCAP data. "
        "The prediction is accompanied by an emoji in the output, and you can also view the audio's waveform. "
        "Use the options below to adjust ensemble prediction and noise reduction settings."
    )
    with gr.Tabs():
        with gr.TabItem("Emotion Recognition"):
            with gr.Row():
                audio_input = gr.Audio(type="filepath", label="Upload Audio")
                use_ensemble = gr.Checkbox(label="Use Ensemble Prediction (for long audio)", value=False)
                apply_noise_reduction = gr.Checkbox(label="Apply Noise Reduction", value=False)
            with gr.Row():
                segment_duration = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=3.0, label="Segment Duration (s)")
                overlap = gr.Slider(minimum=0.0, maximum=5.0, step=0.5, value=1.0, label="Segment Overlap (s)")
            predict_button = gr.Button("Predict Emotion")
            result_text = gr.Textbox(label="Predicted Emotion")
            waveform_image = gr.Image(label="Audio Waveform", type="pil")
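            # The five inputs are passed positionally and must match predict_and_plot's signature.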
            predict_button.click(
                predict_and_plot,
                inputs=[audio_input, use_ensemble, apply_noise_reduction, segment_duration, overlap],
                outputs=[result_text, waveform_image]
            )
        with gr.TabItem("About"):
            gr.Markdown("""
            **Enhanced Emotion Recognition App**

            - **Model:** SpeechBrain's wav2vec2 model fine-tuned on IEMOCAP for emotion recognition.
            - **Features:**
                - Ensemble prediction for long audio files.
                - Optional noise reduction.
                - Visualization of the audio waveform.
                - Emoji representation of the predicted emotion in the output.

            **Credits:**
            - [SpeechBrain](https://speechbrain.github.io)
            - [Gradio](https://gradio.app)
            """)
if __name__ == "__main__":
    demo.launch()