import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
import warnings
import os
from scipy.stats import kurtosis, skew
warnings.filterwarnings('ignore')
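
# Global handles for the models; load_models() fills these in at startup.
processor = None
whisper_model = None
emotion_tokenizer = None
emotion_model = None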
def extract_prosodic_features(waveform, sr):
    """Extract prosodic features from audio."""
    try:
        features = {}

        # 1. Pitch (F0) features: piptrack returns per-frame pitch candidates;
        # keep the candidate with the strongest magnitude in each frame.
        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
        f0_contour = []
        for t in range(pitches.shape[1]):
            pitch_index = magnitudes[:, t].argmax()
            f0_contour.append(pitches[pitch_index, t])
        f0_contour = np.array(f0_contour)
        f0_contour = f0_contour[f0_contour > 0]  # Drop unvoiced (zero-pitch) frames

        if len(f0_contour) > 0:
            features['pitch_mean'] = np.mean(f0_contour)
            features['pitch_std'] = np.std(f0_contour)
            features['pitch_range'] = np.ptp(f0_contour)
        else:
            features['pitch_mean'] = 0
            features['pitch_std'] = 0
            features['pitch_range'] = 0

        # 2. Energy/intensity features (frame-level RMS)
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_range'] = np.ptp(rms)

        # 3. Rhythm features (global tempo estimate in BPM)
        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
        features['tempo'] = tempo[0]

        # 4. Voice quality features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. MFCC features (mean and std of the first 13 coefficients)
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        for i in range(13):
            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
            features[f'mfcc_{i}_std'] = np.std(mfccs[i])

        return features
    except Exception as e:
        print(f"Error in extract_prosodic_features: {str(e)}")
        return None
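
# Illustrative usage (not part of the app): run feature extraction on a local file.
# "sample.wav" is a placeholder path, not a file shipped with this Space.
#
#   y, sr = librosa.load("sample.wav", sr=16000)
#   feats = extract_prosodic_features(y, sr)
#   print(feats["pitch_mean"], feats["tempo"])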
def create_feature_plots(features):
    """Create visualizations for audio features."""
    try:
        # Single figure: grouped bars for pitch/energy stats plus an MFCC line.
        # Note that pitch (Hz), RMS energy, and MFCC values share one y-axis.
        fig = go.Figure()

        # 1. Pitch features
        pitch_data = {
            'Mean': features['pitch_mean'],
            'Std Dev': features['pitch_std'],
            'Range': features['pitch_range']
        }
        fig.add_trace(go.Bar(
            name='Pitch Features',
            x=list(pitch_data.keys()),
            y=list(pitch_data.values()),
            marker_color='blue'
        ))

        # 2. Energy features
        energy_data = {
            'Mean': features['energy_mean'],
            'Std Dev': features['energy_std'],
            'Range': features['energy_range']
        }
        fig.add_trace(go.Bar(
            name='Energy Features',
            x=[f"Energy {k}" for k in energy_data.keys()],
            y=list(energy_data.values()),
            marker_color='red'
        ))

        # 3. MFCC means
        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
        fig.add_trace(go.Scatter(
            name='MFCC Coefficients',
            y=mfcc_means,
            mode='lines+markers',
            marker_color='green'
        ))

        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
            height=600,
            barmode='group'
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error in create_feature_plots: {str(e)}")
        return None
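
# The returned HTML embeds its own copy of plotly.js (include_plotlyjs=True), so the
# gr.HTML outputs defined below can render it without any extra page assets.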
def load_models():
    """Initialize and load all required models."""
    global processor, whisper_model, emotion_tokenizer, emotion_model
    try:
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

        whisper_model.to("cpu")
        emotion_model.to("cpu")

        print("Models loaded successfully!")
        return True
    except Exception as e:
        print(f"Error loading models: {str(e)}")
        return False
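
# Optional sketch (illustrative, not used here): pick a device dynamically when a GPU
# is available instead of pinning both models to the CPU as load_models() does above.
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   whisper_model.to(device)
#   emotion_model.to(device)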
def create_emotion_plot(emotions):
    """Create emotion analysis visualization."""
    try:
        fig = go.Figure(data=[
            go.Bar(
                x=list(emotions.keys()),
                y=list(emotions.values()),
                marker_color='rgb(55, 83, 109)'
            )
        ])
        fig.update_layout(
            title='Emotion Analysis',
            xaxis_title='Emotion',
            yaxis_title='Score',
            yaxis_range=[0, 1],
            template='plotly_white',
            height=400
        )
        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating emotion plot: {str(e)}")
        return None
def analyze_audio(audio_input):
    """Main function to analyze audio input."""
    try:
        if audio_input is None:
            return "Please provide an audio input", None, None

        print(f"Processing audio input: {type(audio_input)}")

        # With type="filepath" Gradio passes a string path; keep a small guard in
        # case a (path, ...) tuple is ever supplied instead.
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input
        print(f"Loading audio from path: {audio_path}")

        # Load audio, resampled to 16 kHz mono as expected by Whisper
        waveform, sr = librosa.load(audio_path, sr=16000)
        print(f"Audio loaded: {waveform.shape}, SR: {sr}")

        # Extract voice features
        print("Extracting voice features...")
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Error extracting voice features", None, None

        # Create feature plots
        print("Creating feature visualizations...")
        feature_viz = create_feature_plots(features)
        if feature_viz is None:
            return "Error creating feature visualizations", None, None

        # Transcribe audio
        print("Transcribing audio...")
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Analyze emotions in the transcribed text
        print("Analyzing emotions...")
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512
        )
        with torch.no_grad():
            emotion_outputs = emotion_model(**emotion_inputs)
            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

        # Read the label set from the model config rather than hard-coding it, so the
        # scores stay aligned with whatever classes the checkpoint actually predicts.
        emotion_scores = {
            emotion_model.config.id2label[i]: float(score)
            for i, score in enumerate(emotions[0].cpu().numpy())
        }

        # Create emotion visualization
        emotion_viz = create_emotion_plot(emotion_scores)
        if emotion_viz is None:
            return "Error creating emotion visualization", None, None

        # Create analysis summary
        summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
"""
        return summary, emotion_viz, feature_viz
    except Exception as e:
        error_msg = f"Error in audio analysis: {str(e)}"
        print(error_msg)
        return error_msg, None, None
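
# Illustrative only: the same pipeline can be exercised without the Gradio UI once the
# models are loaded. "sample.wav" below is a placeholder path, not a bundled file.
#
#   load_models()
#   summary, emotion_html, feature_html = analyze_audio("sample.wav")
#   print(summary)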
# Load models at startup
print("Initializing application...")
if not load_models():
    raise RuntimeError("Failed to load required models")

# Create Gradio interface
demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotion Analysis"),
        gr.HTML(label="Voice Feature Analysis")
    ],
    title="Voice Analysis System",
    description="""
    This application analyzes voice recordings to extract various characteristics:

    1. Voice Features:
       - Pitch analysis
       - Energy patterns
       - Speech rate
       - Voice quality
    2. Emotional Content:
       - Emotion detection
       - Emotional intensity
    3. Speech Content:
       - Text transcription

    Upload an audio file or record directly through your microphone.
    """,
    examples=None,
    cache_examples=False
)

if __name__ == "__main__":
    demo.launch(share=True)