import gradio as gr
import torch
from transformers import (
    WhisperProcessor,
    WhisperForConditionalGeneration,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
import librosa
import numpy as np
import plotly.graph_objects as go


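# ModelManager loads the speech-to-text model (Whisper base) and the text
# emotion classifier once, and keeps them on a single torch device
# (GPU when available, otherwise CPU).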
class ModelManager:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.models = {}
        self.tokenizers = {}
        self.processors = {}
        self.load_models()

    def load_models(self):
        print("Loading Whisper model...")
        self.processors['whisper'] = WhisperProcessor.from_pretrained("openai/whisper-base")
        self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(self.device)

        print("Loading emotion model...")
        self.tokenizers['emotion'] = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base").to(self.device)


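# AudioProcessor resamples incoming audio to 16 kHz (the rate Whisper expects)
# and extracts basic acoustic features (MFCCs and RMS energy).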
class AudioProcessor:
    def __init__(self):
        self.sample_rate = 16000
        self.n_mfcc = 13

    def process_audio(self, audio_path):
        waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
        return waveform, self._extract_features(waveform)

    def _extract_features(self, waveform):
        return {
            'mfcc': librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc),
            'energy': librosa.feature.rms(y=waveform)[0]
        }


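# Analyzer ties the pieces together: load the audio, transcribe it with
# Whisper, then classify the transcription's emotions with the DistilRoBERTa
# classifier.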
class Analyzer:
    def __init__(self):
        print("Initializing Analyzer...")
        self.model_manager = ModelManager()
        self.audio_processor = AudioProcessor()
        print("Analyzer initialization complete")

    def analyze(self, audio_path):
        print(f"Processing audio file: {audio_path}")

        waveform, features = self.audio_processor.process_audio(audio_path)  # acoustic features currently unused downstream

        print("Transcribing audio...")
        # Pass the sampling rate explicitly so the feature extractor does not
        # have to guess it (librosa already resampled the audio to 16 kHz).
        input_features = self.model_manager.processors['whisper'](
            waveform,
            sampling_rate=self.audio_processor.sample_rate,
            return_tensors="pt"
        ).input_features.to(self.model_manager.device)
        predicted_ids = self.model_manager.models['whisper'].generate(input_features)
        transcription = self.model_manager.processors['whisper'].batch_decode(predicted_ids, skip_special_tokens=True)[0]

        print("Analyzing emotions...")
        # Tokenized inputs must live on the same device as the model.
        inputs = self.model_manager.tokenizers['emotion'](
            transcription, return_tensors="pt", padding=True, truncation=True
        ).to(self.model_manager.device)
        with torch.no_grad():
            outputs = self.model_manager.models['emotion'](**inputs)
        emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)

        # Take the label order from the model config so scores stay aligned
        # with the classifier's output heads (this checkpoint has seven
        # labels, including 'disgust').
        id2label = self.model_manager.models['emotion'].config.id2label
        emotion_scores = {
            id2label[i]: float(score)
            for i, score in enumerate(emotions[0])
        }

        return {
            'transcription': transcription,
            'emotions': emotion_scores
        }


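# Turn the per-label emotion probabilities into a simple plotly bar chart.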
def create_emotion_plot(emotions):
    fig = go.Figure(data=[
        go.Bar(x=list(emotions.keys()), y=list(emotions.values()))
    ])
    fig.update_layout(
        title='Emotion Analysis',
        yaxis_title='Probability',
        yaxis_range=[0, 1]
    )
    # Return the figure itself; gr.Plot renders plotly figures natively.
    return fig


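# Build the analyzer once at startup so both models are loaded before the UI
# begins serving requests.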
print("Initializing application...") |
|
analyzer = Analyzer() |
|
|
|
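# Gradio callback: receives the recorded audio as a file path and returns the
# transcription text plus the emotion chart.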
def process_audio(audio_file):
    try:
        print(f"Processing audio file: {audio_file}")
        results = analyzer.analyze(audio_file)

        return (
            results['transcription'],
            create_emotion_plot(results['emotions'])
        )
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        # Surface the error message in the transcription box and leave the plot empty.
        return str(e), None


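# With type="filepath", Gradio saves the microphone recording to a temporary
# file and passes its path to process_audio.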
print("Creating Gradio interface...") |
|
interface = gr.Interface( |
|
fn=process_audio, |
|
inputs=gr.Audio(source="microphone", type="filepath"), |
|
outputs=[ |
|
gr.Textbox(label="Transcription"), |
|
gr.HTML(label="Emotion Analysis") |
|
], |
|
title="Vocal Biomarker Analysis", |
|
description="Analyze voice for emotional indicators" |
|
) |
|
|
|
print("Launching application...") |
|
if __name__ == "__main__": |
|
interface.launch() |
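    # launch() serves the app locally and prints the URL; pass share=True to
    # get a temporary public Gradio link.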