import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings
import os
from scipy.stats import kurtosis, skew

warnings.filterwarnings('ignore')
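
# Pipeline overview: librosa extracts prosodic features from the raw audio,
# openai/whisper-tiny transcribes the speech, and the transcription is then
# scored by the j-hartmann/emotion-english-distilroberta-base text classifier.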

def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio"""
    try:
        features = {}

        # 1. Pitch (F0) Features
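        # librosa.piptrack returns two (n_freq_bins, n_frames) arrays; for each
        # frame we keep the pitch in the bin with the largest magnitude, then
        # drop unvoiced frames (pitch == 0) before computing statistics.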
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        f0_contour = []
        for t in range(pitches.shape[1]):
            pitches_at_t = pitches[:, t]
            mags = magnitudes[:, t]
            pitch_index = mags.argmax()
            f0_contour.append(pitches_at_t[pitch_index])
        f0_contour = np.array(f0_contour)
        f0_contour = f0_contour[f0_contour > 0]  # Remove zero (unvoiced) pitches

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        # 2. Energy/Intensity Features
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        # 3. Rhythm Features
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]
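        # Note: this is a beat-tempo estimate in BPM, not a literal speech rate;
        # newer librosa releases expose the same estimator as
        # librosa.feature.rhythm.tempo and deprecate librosa.beat.tempo.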

        # 4. Voice Quality Features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. MFCC Features
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features
    except Exception as e:
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None
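
# The returned dict holds 35 entries: 3 pitch stats, 3 energy stats, tempo,
# 2 spectral means, and mean/std for each of the 13 MFCCs, e.g.
# extract_prosodic_features(waveform, 16000)['pitch_mean'] -> average F0 in Hz.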

def create_feature_plots(features):
    """Create visualizations for audio features"""
    try:
        # Create a single figure holding all feature traces
        fig = go.Figure()

        # 1. Pitch Features
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range']
        }
        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))

        # 2. Energy Features
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range']
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))

        # 3. MFCC Plot
        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green'
        ))

        # Update layout
        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group'
        )
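        # gr.HTML expects an HTML string, so the figure is serialized with
        # plotly.js embedded (include_plotlyjs=True). Some Gradio versions do not
        # execute script tags injected through gr.HTML; if the plots render
        # blank, returning the Figure object to a gr.Plot output is an alternative.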
        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error in create_feature_plots: {str(e)}")
        return None

def load_models():
    """Initialize and load all required models"""
    global processor, whisper_model, emotion_tokenizer, emotion_model
    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
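        # This checkpoint is a 7-class classifier (anger, disgust, fear, joy,
        # neutral, sadness, surprise); the label order is read from the model
        # config at inference time rather than hard-coded.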

        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False
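
# Loading happens once at startup (see the load_models() call below the
# function definitions), so individual requests only pay for inference,
# not for model download and initialization.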

def create_emotion_plot(emotions):
    """Create emotion analysis visualization"""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)'
            )
        ])
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )
        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating emotion plot: {str(e)}")
        return None

def analyze_audio(audio_input):
    """Main function to analyze audio input"""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None
        print(f"Processing audio input: {type(audio_input)}")

        # Handle audio input (gr.Audio below is configured with type="filepath",
        # so this is normally a plain path string)
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]  # Get file path from tuple
        else:
            audio_path = audio_input
        print(f"Loading audio from path: {audio_path}")

        # Load audio
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")
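        # librosa.load resamples to 16 kHz mono, which matches the sampling rate
        # the Whisper feature extractor expects.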

        # Extract voice features
        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        # Create feature plots
        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None

        # Transcribe audio
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
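        # Without forced decoder ids Whisper auto-detects the spoken language;
        # since the emotion classifier is English-only, non-English recordings
        # may produce unreliable emotion scores.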

        # Analyze emotions
        print("Analyzing emotions...")
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Read label names from the model config so they stay aligned with the
        # logit order (the model has 7 classes, including 'disgust', which the
        # previous hard-coded 6-label list silently dropped)
        emotion_labels = [emotion_model.config.id2label[i] for i in range(emotion_model.config.num_labels)]
        emotion_scores = {
            label: float(score)
            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        # Create emotion visualization
        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None

        # Create analysis summary
        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
"""
        return summary, emotion_viz, feature_viz
    except Exception as e:
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None

# Load models at startup
print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")

# Create Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis")
    ],
    title="Voice Analysis System",
    description="""
    This application analyzes voice recordings to extract various characteristics:

    1. Voice Features:
       - Pitch analysis
       - Energy patterns
       - Speech rate
       - Voice quality

    2. Emotional Content:
       - Emotion detection
       - Emotional intensity

    3. Speech Content:
       - Text transcription

    Upload an audio file or record directly through your microphone.
    """,
    examples=None,
    cache_examples=False
)
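
# On a Gradio Space this file is executed directly, so launch() starts the
# server; when running locally, passing share=True to launch() also creates a
# temporary public link. debug=True keeps full tracebacks in the logs, which
# helps diagnose startup and runtime errors.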

if __name__ == "__main__":
    demo.launch(debug=True)