import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go
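
# Pipeline overview (descriptive note, matches the code below):
#   1. AudioProcessor loads the recording at 16 kHz and extracts MFCC/energy features.
#   2. Whisper (openai/whisper-base) transcribes the speech to text.
#   3. A DistilRoBERTa classifier (j-hartmann/emotion-english-distilroberta-base)
#      scores the transcription across emotion categories.
#   4. The scores are rendered as a Plotly bar chart inside a Gradio interface.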

class ModelManager:
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.models = {}
        self.tokenizers = {}
        self.processors = {}
        self.load_models()

    def load_models(self):
        print("Loading Whisper model...")
        self.processors['whisper'] = WhisperProcessor.from_pretrained("openai/whisper-base")
        self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(self.device)
        print("Loading emotion model...")
        self.tokenizers['emotion'] = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base").to(self.device)
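
# Note: both checkpoints are fetched (or read from the local cache) once at startup
# and moved to the GPU when CUDA is available; on CPU-only machines the app still
# runs, just more slowly.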

class AudioProcessor:
    def __init__(self):
        self.sample_rate = 16000
        self.n_mfcc = 13

    def process_audio(self, audio_path):
        waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
        return waveform, self._extract_features(waveform)

    def _extract_features(self, waveform):
        return {
            'mfcc': librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc),
            'energy': librosa.feature.rms(y=waveform)[0]
        }
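
# For a clip resampled to 16 kHz, process_audio returns the raw waveform plus a
# feature dict: 'mfcc' has shape (13, frames) and 'energy' has shape (frames,).
# These features are computed here but not consumed elsewhere in this version.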

class Analyzer:
    def __init__(self):
        print("Initializing Analyzer...")
        self.model_manager = ModelManager()
        self.audio_processor = AudioProcessor()
        print("Analyzer initialization complete")

    def analyze(self, audio_path):
        print(f"Processing audio file: {audio_path}")
        # Process audio
        waveform, features = self.audio_processor.process_audio(audio_path)
        # Transcribe
        print("Transcribing audio...")
        inputs = self.model_manager.processors['whisper'](
            waveform,
            sampling_rate=self.audio_processor.sample_rate,
            return_tensors="pt"
        ).input_features.to(self.model_manager.device)
        predicted_ids = self.model_manager.models['whisper'].generate(inputs)
        transcription = self.model_manager.processors['whisper'].batch_decode(
            predicted_ids, skip_special_tokens=True
        )[0]
        # Analyze emotions (inputs must live on the same device as the model)
        print("Analyzing emotions...")
        inputs = self.model_manager.tokenizers['emotion'](
            transcription, return_tensors="pt", padding=True, truncation=True
        ).to(self.model_manager.device)
        with torch.no_grad():
            outputs = self.model_manager.models['emotion'](**inputs)
        emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        # Use the model's own id2label mapping so labels always line up with the
        # classifier head (this checkpoint has 7 classes, including 'disgust').
        id2label = self.model_manager.models['emotion'].config.id2label
        emotion_scores = {
            id2label[i]: float(score)
            for i, score in enumerate(emotions[0])
        }
        return {
            'transcription': transcription,
            'emotions': emotion_scores
        }
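
# Illustrative standalone usage (the file name "sample.wav" is hypothetical and
# not part of this app):
#
#   analyzer = Analyzer()
#   result = analyzer.analyze("sample.wav")
#   print(result["transcription"])   # recognized text
#   print(result["emotions"])        # {label: probability}, summing to ~1.0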

def create_emotion_plot(emotions):
    fig = go.Figure(data=[
        go.Bar(x=list(emotions.keys()), y=list(emotions.values()))
    ])
    fig.update_layout(
        title='Emotion Analysis',
        yaxis_range=[0, 1]
    )
    return fig.to_html()
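
# create_emotion_plot serializes the Plotly bar chart to an HTML string, which is
# what the gr.HTML output component defined below expects.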
print("Initializing application...")
analyzer = Analyzer()

def process_audio(audio_file):
    try:
        print(f"Processing audio file: {audio_file}")
        results = analyzer.analyze(audio_file)
        return (
            results['transcription'],
            create_emotion_plot(results['emotions'])
        )
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        return str(e), "Error in analysis"
print("Creating Gradio interface...")
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.HTML(label="Emotion Analysis")
    ],
    title="Vocal Biomarker Analysis",
    description="Analyze voice for emotional indicators"
)
print("Launching application...")

if __name__ == "__main__":
    interface.launch()
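
# launch() serves the app on a local URL; passing share=True (interface.launch(share=True))
# can additionally create a temporary public Gradio link if needed.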