import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
import librosa
import numpy as np
import plotly.graph_objects as go

class ModelManager:
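    """Load the Whisper speech-to-text model and the text emotion classifier once and keep them on the selected device."""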
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.models = {}
        self.tokenizers = {}
        self.processors = {}
        self.load_models()
        
    def load_models(self):
        print("Loading Whisper model...")
        self.processors['whisper'] = WhisperProcessor.from_pretrained("openai/whisper-base")
        self.models['whisper'] = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base").to(self.device)
        
        print("Loading emotion model...")
        self.tokenizers['emotion'] = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        self.models['emotion'] = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base").to(self.device)

class AudioProcessor:
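    """Resample incoming audio to 16 kHz and extract basic acoustic features (MFCCs and RMS energy)."""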
    def __init__(self):
        self.sample_rate = 16000
        self.n_mfcc = 13
        
    def process_audio(self, audio_path):
        waveform, sr = librosa.load(audio_path, sr=self.sample_rate)
        return waveform, self._extract_features(waveform)
        
    def _extract_features(self, waveform):
        return {
            'mfcc': librosa.feature.mfcc(y=waveform, sr=self.sample_rate, n_mfcc=self.n_mfcc),
            'energy': librosa.feature.rms(y=waveform)[0]
        }

class Analyzer:
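    """Run the full pipeline: load the audio, transcribe it with Whisper, then classify emotions in the transcript."""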
    def __init__(self):
        print("Initializing Analyzer...")
        self.model_manager = ModelManager()
        self.audio_processor = AudioProcessor()
        print("Analyzer initialization complete")
        
    def analyze(self, audio_path):
        print(f"Processing audio file: {audio_path}")
        # Process audio
        waveform, features = self.audio_processor.process_audio(audio_path)
        
        # Transcribe
        print("Transcribing audio...")
        inputs = self.model_manager.processors['whisper'](
            waveform, sampling_rate=self.audio_processor.sample_rate, return_tensors="pt"
        ).input_features.to(self.model_manager.device)
        with torch.no_grad():
            predicted_ids = self.model_manager.models['whisper'].generate(inputs)
        transcription = self.model_manager.processors['whisper'].batch_decode(predicted_ids, skip_special_tokens=True)[0]
        
        # Analyze emotions
        print("Analyzing emotions...")
        inputs = self.model_manager.tokenizers['emotion'](transcription, return_tensors="pt", padding=True, truncation=True)
        inputs = {k: v.to(self.model_manager.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model_manager.models['emotion'](**inputs)
        emotions = torch.nn.functional.softmax(outputs.logits, dim=-1)
        
        # Read the label names from the model config so every class the model predicts is covered
        id2label = self.model_manager.models['emotion'].config.id2label
        emotion_scores = {
            id2label[i]: float(score)
            for i, score in enumerate(emotions[0])
        }
        
        return {
            'transcription': transcription,
            'emotions': emotion_scores
        }

def create_emotion_plot(emotions):
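    """Render the emotion scores as a Plotly bar chart and return it as embeddable HTML."""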
    fig = go.Figure(data=[
        go.Bar(x=list(emotions.keys()), y=list(emotions.values()))
    ])
    fig.update_layout(
        title='Emotion Analysis',
        yaxis_range=[0, 1]
    )
    return fig.to_html()

print("Initializing application...")
analyzer = Analyzer()

def process_audio(audio_file):
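    """Gradio callback: return the transcription and the emotion chart HTML for a recorded audio file."""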
    try:
        print(f"Processing audio file: {audio_file}")
        results = analyzer.analyze(audio_file)
        
        return (
            results['transcription'],
            create_emotion_plot(results['emotions'])
        )
    except Exception as e:
        print(f"Error processing audio: {str(e)}")
        return str(e), "Error in analysis"

print("Creating Gradio interface...")
interface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.HTML(label="Emotion Analysis")
    ],
    title="Vocal Biomarker Analysis",
    description="Analyze voice for emotional indicators"
)

print("Launching application...")
if __name__ == "__main__":
    interface.launch()