invincible-jha committed
Commit 3f4b577 · verified · 1 Parent(s): b3d1df8

Update app.py

Files changed (1)
  1. app.py +34 -118
app.py CHANGED
@@ -1,7 +1,4 @@
  # app.py - Voice Analysis System with Clinical Interpretation
- # This application provides comprehensive voice analysis with mental health insights
- # using voice biomarkers, emotion detection, and clinical interpretation.
-
  import gradio as gr
  import torch
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -15,13 +12,13 @@ from scipy.stats import kurtosis, skew
  from anthropic import Anthropic
  from dotenv import load_dotenv

- # Load environment variables for API keys
+ # Load environment variables
  load_dotenv()

- # Suppress warnings for cleaner output
+ # Suppress warnings
  warnings.filterwarnings('ignore')

- # Initialize global model variables
+ # Initialize global variables
  processor = None
  whisper_model = None
  emotion_tokenizer = None
@@ -29,28 +26,21 @@ emotion_model = None
  clinical_analyzer = None

  def load_models():
- """Load and initialize speech recognition and emotion analysis models.
-
- This function handles the initialization of both Whisper (for speech recognition)
- and the emotion detection model, setting them up for CPU-based inference.
-
- Returns:
- bool: True if all models loaded successfully, False otherwise
- """
+ """Initialize and load required ML models."""
  global processor, whisper_model, emotion_tokenizer, emotion_model

  try:
- # Initialize speech recognition (Whisper) model
+ # Load Whisper model
  print("Loading Whisper model...")
  processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
  whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

- # Initialize emotion detection model
+ # Load emotion model
  print("Loading emotion model...")
  emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
  emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

- # Set models to CPU for consistent performance
+ # Set device
  device = "cpu"
  whisper_model.to(device)
  emotion_model.to(device)
@@ -62,48 +52,37 @@ def load_models():
  return False

  def extract_prosodic_features(waveform, sr):
- """Extract voice features including pitch, energy, and rhythm patterns.
-
- Args:
- waveform (numpy.ndarray): Audio signal data
- sr (int): Sampling rate of the audio
-
- Returns:
- dict: Dictionary containing extracted features or None if extraction fails
- """
+ """Extract voice features from audio data."""
  try:
- # Input validation
  if waveform is None or len(waveform) == 0:
  return None

  features = {}

- # Pitch analysis with enhanced accuracy
+ # Pitch analysis
  try:
  pitches, magnitudes = librosa.piptrack(
  y=waveform,
  sr=sr,
- fmin=50, # Minimum human voice frequency
- fmax=2000, # Maximum human voice frequency
- n_mels=128, # Frequency resolution
+ fmin=50,
+ fmax=2000,
+ n_mels=128,
  hop_length=512,
  win_length=2048
  )

- # Extract valid pitch contour
  f0_contour = [
  pitches[magnitudes[:, t].argmax(), t]
  for t in range(pitches.shape[1])
  if 50 <= pitches[magnitudes[:, t].argmax(), t] <= 2000
  ]

- # Calculate pitch statistics
  if f0_contour:
  features['pitch_mean'] = float(np.mean(f0_contour))
  features['pitch_std'] = float(np.std(f0_contour))
  features['pitch_range'] = float(np.ptp(f0_contour))
  else:
- features['pitch_mean'] = 160.0 # Default adult pitch
+ features['pitch_mean'] = 160.0
  features['pitch_std'] = 0.0
  features['pitch_range'] = 0.0

@@ -111,7 +90,7 @@ def extract_prosodic_features(waveform, sr):
  print(f"Pitch extraction error: {e}")
  features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

- # Energy analysis with noise handling
+ # Energy analysis
  try:
  rms = librosa.feature.rms(
  y=waveform,
@@ -129,7 +108,7 @@ def extract_prosodic_features(waveform, sr):
  print(f"Energy extraction error: {e}")
  features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

- # Rhythm analysis with tempo validation
+ # Rhythm analysis
  try:
  onset_env = librosa.onset.onset_strength(
  y=waveform,
@@ -145,7 +124,6 @@ def extract_prosodic_features(waveform, sr):
  aggregate=None
  )[0]

- # Validate tempo within normal speech range (40-240 BPM)
  features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0

  except Exception as e:
@@ -158,31 +136,21 @@ def extract_prosodic_features(waveform, sr):
  return None

  class ClinicalVoiceAnalyzer:
- """Analyze voice characteristics for psychological indicators."""
+ """Clinical voice analysis and interpretation."""

  def __init__(self):
- """Initialize the clinical analyzer with API and reference ranges."""
+ """Initialize analyzer with API and reference ranges."""
  self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
  self.model = "claude-3-opus-20240229"
- # Define normal ranges for voice metrics based on clinical research
  self.reference_ranges = {
- 'pitch': {'min': 150, 'max': 400}, # Hz
- 'tempo': {'min': 90, 'max': 130}, # BPM
+ 'pitch': {'min': 150, 'max': 400},
+ 'tempo': {'min': 90, 'max': 130},
  'energy': {'min': 0.01, 'max': 0.05}
  }
  print("Clinical analyzer ready")

  def analyze_voice_metrics(self, features, emotions, transcription):
- """Generate clinical insights from voice and emotion data.
-
- Args:
- features (dict): Extracted voice features
- emotions (dict): Detected emotion scores
- transcription (str): Speech content
-
- Returns:
- str: Formatted clinical analysis or backup analysis if API fails
- """
+ """Analyze voice metrics and generate clinical insights."""
  try:
  prompt = self._create_clinical_prompt(features, emotions, transcription)
  response = self.anthropic.messages.create(
@@ -196,7 +164,7 @@ class ClinicalVoiceAnalyzer:
  return self._generate_backup_analysis(features, emotions)

  def _create_clinical_prompt(self, features, emotions, transcription):
- """Create detailed prompt for clinical analysis."""
+ """Create clinical analysis prompt."""
  return f"""As a clinical voice analysis expert, provide a psychological assessment of:

  Voice Metrics:
@@ -219,11 +187,11 @@ Provide:
  5. Clinical recommendations"""

  def _format_analysis(self, analysis):
- """Format the clinical analysis output."""
+ """Format clinical analysis output."""
  return f"\nClinical Assessment:\n{analysis}"

  def _generate_backup_analysis(self, features, emotions):
- """Generate basic analysis when API is unavailable."""
+ """Generate backup analysis when API fails."""
  dominant_emotion = max(emotions.items(), key=lambda x: x[1])
  pitch_status = (
  "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max']
@@ -239,14 +207,7 @@ Basic Voice Analysis (API Unavailable):
  - Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""

  def create_feature_plots(features):
- """Create interactive visualizations of voice features.
-
- Args:
- features (dict): Dictionary of extracted voice features
-
- Returns:
- str: HTML representation of the interactive plots
- """
+ """Create visualizations for voice features."""
  try:
  fig = go.Figure()

@@ -285,7 +246,6 @@ def create_feature_plots(features):
  marker=dict(size=15, color='green')
  ))

- # Layout configuration
  fig.update_layout(
  title='Voice Feature Analysis',
  showlegend=True,
@@ -302,14 +262,7 @@ def create_feature_plots(features):
  return None

  def create_emotion_plot(emotions):
- """Create visualization of emotional analysis.
-
- Args:
- emotions (dict): Dictionary of emotion scores
-
- Returns:
- str: HTML representation of the emotion plot
- """
+ """Create visualization for emotion analysis."""
  try:
  fig = go.Figure(data=[
  go.Bar(
@@ -335,46 +288,29 @@ def create_emotion_plot(emotions):
  return None

  def analyze_audio(audio_input):
- """Process audio input and generate comprehensive analysis.
-
- This is the main function that coordinates the entire analysis pipeline,
- including feature extraction, emotion detection, and clinical interpretation.
-
- Args:
- audio_input: Audio file path or tuple containing audio data
-
- Returns:
- tuple: (analysis_summary, emotion_visualization, feature_visualization)
- """
+ """Main function for audio analysis."""
  try:
- # Validate input
  if audio_input is None:
  return "Please provide an audio input", None, None

- # Load audio
  audio_path = audio_input[0] if isinstance(audio_input, tuple) else audio_input
  waveform, sr = librosa.load(audio_path, sr=16000, duration=30)

- # Validate duration
  duration = len(waveform) / sr
  if duration < 0.5:
  return "Audio too short (minimum 0.5 seconds needed)", None, None

- # Extract features
  features = extract_prosodic_features(waveform, sr)
  if features is None:
  return "Feature extraction failed", None, None

- # Generate visualizations
  feature_viz = create_feature_plots(features)

- # Perform speech recognition
  inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
  with torch.no_grad():
  predicted_ids = whisper_model.generate(inputs)
  transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

- # Analyze emotions
  emotion_inputs = emotion_tokenizer(
  transcription,
  return_tensors="pt",
@@ -387,7 +323,6 @@ def analyze_audio(audio_input):
  emotion_outputs = emotion_model(**emotion_inputs)
  emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

- # Process emotion scores
  emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
  emotion_scores = {
  label: float(score)
@@ -396,7 +331,6 @@ def analyze_audio(audio_input):

  emotion_viz = create_emotion_plot(emotion_scores)

- # Generate clinical analysis
  global clinical_analyzer
  if clinical_analyzer is None:
  clinical_analyzer = ClinicalVoiceAnalyzer()
@@ -405,20 +339,7 @@ def analyze_audio(audio_input):
  features, emotion_scores, transcription
  )

- # Create comprehensive summary
- summary = f"""Voice Analysis Summary:
-
- Speech Content:
- {transcription}
-
- Voice Characteristics:
- - Average Pitch: {features['pitch_mean']:.2f} Hz
- - Pitch Variation: {features['pitch_std']:.2f} Hz
- - Speech Rate (Tempo): {features['tempo']:.2f} BPM
- - Voice Energy: {features['energy_mean']:.4f}
-
- Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
- Emotion# Continue from previous summary string
+ # Create summary with fixed string formatting
  summary = f"""Voice Analysis Summary:

  Speech Content:
@@ -435,8 +356,8 @@ Emotion Confidence: {max(emotion_scores.values()):.2%}

  Recording Duration: {duration:.2f} seconds

- {clinical_analysis}"""
-
+ {clinical_analysis}
+ """
  return summary, emotion_viz, feature_viz

  except Exception as e:
@@ -444,19 +365,16 @@ Recording Duration: {duration:.2f} seconds
  print(error_msg)
  return error_msg, None, None

- # Application initialization and Gradio interface setup
+ # Application initialization
  try:
  print("===== Application Startup =====")

- # Load required models
  if not load_models():
  raise RuntimeError("Model loading failed")

- # Initialize clinical analyzer
  clinical_analyzer = ClinicalVoiceAnalyzer()
  print("Clinical analyzer initialized")

- # Define the interface description
  description = """This application provides comprehensive voice analysis with clinical insights:

  1. Voice Features:
@@ -483,7 +401,6 @@ For optimal results:

  Upload an audio file or record directly through your microphone."""

- # Create Gradio interface
  demo = gr.Interface(
  fn=analyze_audio,
  inputs=gr.Audio(
@@ -509,13 +426,12 @@ Upload an audio file or record directly through your microphone."""
  theme="default"
  )

- # Launch the interface with additional configuration
  if __name__ == "__main__":
  demo.launch(
- server_name="0.0.0.0", # Allow external access
- server_port=7860, # Default Gradio port
- share=False, # Disable public URL generation
- debug=False # Disable debug mode in production
+ server_name="0.0.0.0",
+ server_port=7860,
+ share=False,
+ debug=False
  )

  except Exception as e:
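
For reference, here is a minimal, self-contained sketch of the summary-building pattern the updated analyze_audio relies on: a single triple-quoted f-string, closed exactly once, with the clinical text interpolated at the end, replacing the duplicated summary block visible in the removed lines above. This sketch is not part of the commit, and the sample values are hypothetical.

# Illustrative sketch only; hypothetical values, not output of app.py.
features = {'pitch_mean': 182.4, 'pitch_std': 21.7, 'tempo': 112.0, 'energy_mean': 0.0234}
emotion_scores = {'joy': 0.62, 'neutral': 0.21, 'sadness': 0.17}
transcription = "Example transcription text."
duration = 4.8
clinical_analysis = "\nClinical Assessment:\n(placeholder)"

summary = f"""Voice Analysis Summary:

Speech Content:
{transcription}

Voice Characteristics:
- Average Pitch: {features['pitch_mean']:.2f} Hz
- Pitch Variation: {features['pitch_std']:.2f} Hz
- Speech Rate (Tempo): {features['tempo']:.2f} BPM
- Voice Energy: {features['energy_mean']:.4f}

Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
Emotion Confidence: {max(emotion_scores.values()):.2%}

Recording Duration: {duration:.2f} seconds

{clinical_analysis}
"""
print(summary)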