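"""Gradio app for audio emotion recognition.

Record or upload speech; the app classifies its emotional content with a
wav2vec2 model fine-tuned for English speech emotion recognition and shows
per-emotion confidence scores as text and as a bar plot.
"""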
import gradio as gr
import numpy as np
import pandas as pd  # needed to feed gr.BarPlot a DataFrame
import torch
import librosa
from transformers import pipeline

class EmotionRecognizer:
    def __init__(self):
        # Load the audio-classification pipeline, on GPU if one is available.
        self.classifier = pipeline(
            "audio-classification",
            model="ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition",
            device=0 if torch.cuda.is_available() else -1
        )
        # wav2vec2-based models expect 16 kHz mono input.
        self.sample_rate = 16000
    def process_audio(self, audio_input):
        try:
            # Gradio's "numpy" audio type yields a (sample_rate, data) tuple,
            # where data is typically an int16 array.
            sample_rate, audio_data = audio_input

            # Convert stereo to mono if necessary.
            if audio_data.ndim > 1:
                audio_data = np.mean(audio_data, axis=1)

            # Convert to float32 and normalize peaks to [-1, 1].
            audio_data = audio_data.astype(np.float32)
            peak = np.max(np.abs(audio_data))
            if peak > 1.0:
                audio_data = audio_data / peak

            # Resample to the model's expected rate if necessary.
            if sample_rate != self.sample_rate:
                audio_data = librosa.resample(
                    y=audio_data,
                    orig_sr=sample_rate,
                    target_sr=self.sample_rate
                )

            # Pad clips shorter than one second; keep at most ten seconds.
            if len(audio_data) < self.sample_rate:
                audio_data = np.pad(audio_data, (0, self.sample_rate - len(audio_data)))
            elif len(audio_data) > 10 * self.sample_rate:
                audio_data = audio_data[:10 * self.sample_rate]

            # Run the classifier; the pipeline accepts a dict with "array"
            # and "sampling_rate" keys.
            result = self.classifier({"array": audio_data, "sampling_rate": self.sample_rate})
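
            # `result` is a list of {"label", "score"} dicts sorted by score;
            # the labels and values below are illustrative only:
            #   [{"label": "happy", "score": 0.71}, {"label": "neutral", "score": 0.12}, ...]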
            # Format results as readable text.
            emotions_text = "\n".join(
                f"{pred['label']}: {pred['score'] * 100:.2f}%"
                for pred in result
            )

            # gr.BarPlot expects a DataFrame whose columns match the plot's
            # x/y settings, not a plain dict.
            plot_data = pd.DataFrame({
                "Emotion": [pred["label"] for pred in result],
                "Confidence": [pred["score"] * 100 for pred in result],
            })

            return emotions_text, plot_data
        except Exception as e:
            print(f"Error details: {e}")
            return f"Error processing audio: {e}", None

def create_interface():
    recognizer = EmotionRecognizer()

    def process_audio_file(audio):
        if audio is None:
            return "Please provide an audio input.", None
        return recognizer.process_audio(audio)

    with gr.Blocks() as interface:
        gr.Markdown("# Audio Emotion Recognition")
        gr.Markdown("Record or upload audio to analyze the emotional content. The model works best with clear speech in English.")

        with gr.Row():
            with gr.Column():
                audio_input = gr.Audio(
                    label="Upload or Record Audio",
                    type="numpy",
                    sources=["microphone", "upload"],
                )
                analyze_btn = gr.Button("Analyze Emotion")
                gr.Markdown("Note: Audio will be automatically converted to mono and resampled if needed.")
            with gr.Column():
                output_text = gr.Textbox(
                    label="Results",
                    lines=5
                )
                # x/y name the DataFrame columns produced in process_audio.
                output_plot = gr.BarPlot(
                    x="Emotion",
                    y="Confidence",
                    title="Emotion Confidence Scores",
                    x_title="Emotions",
                    y_title="Confidence (%)"
                )

        analyze_btn.click(
            fn=process_audio_file,
            inputs=[audio_input],
            outputs=[output_text, output_plot]
        )

    return interface

if __name__ == "__main__":
    demo = create_interface()
    demo.launch(share=True)
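
# Note: running this app requires gradio, numpy, pandas, torch, librosa, and
# transformers to be installed (e.g. listed in the Space's requirements.txt);
# version pins are left unspecified here.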