invincible-jha committed on
Commit 4d5d3b7
1 Parent(s): 363bda3

Upload app.py

Files changed (1)
  1. app.py +208 -68
app.py CHANGED
@@ -4,8 +4,11 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
 import librosa
 import numpy as np
 import plotly.graph_objects as go
 import warnings
 import os
 warnings.filterwarnings('ignore')

 # Global variables for models
@@ -37,87 +40,185 @@ def load_models():
         print(f"Error loading models: {str(e)}")
         return False

-def process_audio(audio_input):
-    """Process audio file and extract waveform"""
     try:
-        print(f"Audio input received: {type(audio_input)}")

-        # Handle tuple input from Gradio
-        if isinstance(audio_input, tuple):
-            print(f"Audio input is tuple: {audio_input[0]}, {audio_input[1]}")
-            audio_path = audio_input[0]  # Get the file path
-        else:
-            audio_path = audio_input
-
-        print(f"Processing audio from path: {audio_path}")

-        # Verify file exists
-        if not os.path.exists(audio_path):
-            raise FileNotFoundError(f"Audio file not found at {audio_path}")
-
-        # Load and resample audio
-        print("Loading audio file with librosa...")
-        waveform, sr = librosa.load(audio_path, sr=16000)
-        print(f"Audio loaded successfully. Shape: {waveform.shape}, SR: {sr}")

-        return waveform
     except Exception as e:
-        print(f"Error processing audio: {str(e)}")
-        raise

-def create_emotion_plot(emotions):
-    """Create plotly visualization for emotion scores"""
     try:
-        fig = go.Figure(data=[
-            go.Bar(
-                x=list(emotions.keys()),
-                y=list(emotions.values()),
-                marker_color='rgb(55, 83, 109)'
             )
-        ])

         fig.update_layout(
-            title='Emotion Analysis',
-            xaxis_title='Emotion',
-            yaxis_title='Score',
-            yaxis_range=[0, 1],
-            template='plotly_white',
-            height=400
         )

         return fig.to_html(include_plotlyjs=True)
     except Exception as e:
-        print(f"Error creating plot: {str(e)}")
-        return "Error creating visualization"

 def analyze_audio(audio_input):
     """Main function to analyze audio input"""
     try:
         if audio_input is None:
             print("No audio input provided")
-            return "No audio file provided", "Please provide an audio file"

         print(f"Received audio input: {audio_input}")

-        # Process audio
-        waveform = process_audio(audio_input)

-        if waveform is None or len(waveform) == 0:
-            return "Error: Invalid audio file", "Please provide a valid audio file"

         # Transcribe audio
         print("Transcribing audio...")
-        inputs = processor(waveform, sampling_rate=16000, return_tensors="pt").input_features

         with torch.no_grad():
             predicted_ids = whisper_model.generate(inputs)
             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-        print(f"Transcription completed: {transcription}")
-
-        if not transcription or transcription.isspace():
-            return "No speech detected in audio", "Unable to analyze emotions without speech"
-
         # Analyze emotions
         print("Analyzing emotions...")
         inputs = emotion_tokenizer(
@@ -138,21 +239,38 @@ def analyze_audio(audio_input):
             for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
         }

-        print(f"Emotion analysis completed: {emotion_scores}")
-
-        # Create visualization
         emotion_viz = create_emotion_plot(emotion_scores)

-        return transcription, emotion_viz

-    except FileNotFoundError as e:
-        error_msg = f"Audio file not found: {str(e)}"
-        print(error_msg)
-        return error_msg, "Please provide a valid audio file"
     except Exception as e:
         error_msg = f"Error analyzing audio: {str(e)}"
         print(error_msg)
-        return error_msg, "Error in analysis"

 # Load models at startup
 print("Initializing application...")
@@ -168,23 +286,45 @@ demo = gr.Interface(
         label="Audio Input"
     ),
     outputs=[
-        gr.Textbox(label="Transcription"),
-        gr.HTML(label="Emotion Analysis")
     ],
-    title="Vocal Emotion Analysis",
     description="""
-    This app analyzes voice recordings to:
-    1. Transcribe speech to text
-    2. Detect emotions in the speech

     Upload an audio file or record directly through your microphone.
     """,
     article="""
-    Models used:
-    - Speech recognition: Whisper (tiny)
-    - Emotion detection: DistilRoBERTa

-    Note: Processing may take a few moments depending on the length of the audio.
     """,
     examples=None,
     cache_examples=False
 
@@ -4,8 +4,11 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
 import librosa
 import numpy as np
 import plotly.graph_objects as go
+from plotly.subplots import make_subplots
 import warnings
 import os
+import pandas as pd
+from scipy.stats import kurtosis, skew
 warnings.filterwarnings('ignore')

 # Global variables for models
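The new scipy.stats import supplies kurtosis and skew, which are applied to the frame-wise RMS energy further down in extract_voice_features. A quick standalone illustration of what those two statistics report on an energy curve (the rms values below are invented for the example; scipy's default Fisher/excess definition of kurtosis is assumed):

    import numpy as np
    from scipy.stats import kurtosis, skew

    # Illustrative RMS energy track: mostly quiet frames with a few loud bursts.
    rms = np.array([0.01, 0.02, 0.01, 0.30, 0.02, 0.01, 0.25, 0.01])

    # skew > 0 means most of the energy sits in a few loud frames;
    # kurtosis describes how heavy the tails are relative to a Gaussian.
    print(f"energy_skewness: {skew(rms):.2f}")
    print(f"energy_kurtosis: {kurtosis(rms):.2f}")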
 
@@ -37,87 +40,185 @@ def load_models():
         print(f"Error loading models: {str(e)}")
         return False

+def extract_voice_features(waveform, sr):
+    """Extract comprehensive voice features for health analysis"""
+    features = {}
+
     try:
+        # 1. Fundamental Frequency (F0) Statistics
+        f0, voiced_flag, _ = librosa.pyin(waveform,
+                                          fmin=librosa.note_to_hz('C2'),
+                                          fmax=librosa.note_to_hz('C7'))
+        f0_valid = f0[voiced_flag]
+        features['f0_mean'] = np.mean(f0_valid)
+        features['f0_std'] = np.std(f0_valid)
+        features['f0_range'] = np.ptp(f0_valid)

+        # 2. Jitter (F0 Variation)
+        if len(f0_valid) > 1:
+            f0_diff = np.diff(f0_valid)
+            features['jitter'] = np.mean(np.abs(f0_diff))
+            features['jitter_percent'] = (features['jitter'] / features['f0_mean']) * 100

+        # 3. Shimmer (Amplitude Variation)
+        amplitude_envelope = np.abs(librosa.stft(waveform))
+        features['shimmer'] = np.mean(np.std(amplitude_envelope, axis=1))
+
+        # 4. Spectral Features
+        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
+        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
+        features['spectral_centroid_std'] = np.std(spectral_centroids)
+
+        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
+        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
+
+        # 5. Voice Quality Measures
+        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
+        features['mfcc_means'] = np.mean(mfccs, axis=1)
+        features['mfcc_stds'] = np.std(mfccs, axis=1)
+
+        # 6. Rhythm and Timing
+        tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
+        features['speech_rate'] = tempo
+
+        # 7. Energy Features
+        rms = librosa.feature.rms(y=waveform)[0]
+        features['energy_mean'] = np.mean(rms)
+        features['energy_std'] = np.std(rms)
+        features['energy_kurtosis'] = kurtosis(rms)
+        features['energy_skewness'] = skew(rms)

+        # 8. Pause Analysis
+        silence_threshold = 0.01
+        is_silence = rms < silence_threshold
+        silence_regions = librosa.effects.split(waveform, top_db=20)
+        features['pause_count'] = len(silence_regions)
+        features['average_pause_duration'] = np.mean([r[1] - r[0] for r in silence_regions]) / sr
+
+        return features, True
     except Exception as e:
+        print(f"Error extracting voice features: {str(e)}")
+        return {}, False

+def create_voice_analysis_plots(features):
+    """Create comprehensive visualization of voice analysis"""
     try:
+        # Create subplot figure
+        fig = make_subplots(
+            rows=2, cols=2,
+            subplot_titles=(
+                'Fundamental Frequency Analysis',
+                'Voice Quality Measures',
+                'Energy and Rhythm Analysis',
+                'MFCC Analysis'
             )
+        )
+
+        # 1. F0 Analysis Plot
+        f0_metrics = {
+            'Mean F0': features['f0_mean'],
+            'F0 Std Dev': features['f0_std'],
+            'F0 Range': features['f0_range'],
+            'Jitter %': features['jitter_percent']
+        }
+        fig.add_trace(
+            go.Bar(
+                x=list(f0_metrics.keys()),
+                y=list(f0_metrics.values()),
+                name='F0 Metrics'
+            ),
+            row=1, col=1
+        )
+
+        # 2. Voice Quality Plot
+        quality_metrics = {
+            'Shimmer': features['shimmer'],
+            'Spectral Centroid': features['spectral_centroid_mean'] / 1000,  # Scale for visibility
+            'Spectral Rolloff': features['spectral_rolloff_mean'] / 1000  # Scale for visibility
+        }
+        fig.add_trace(
+            go.Bar(
+                x=list(quality_metrics.keys()),
+                y=list(quality_metrics.values()),
+                name='Voice Quality'
+            ),
+            row=1, col=2
+        )

+        # 3. Energy and Rhythm Plot
+        energy_metrics = {
+            'Energy Mean': features['energy_mean'],
+            'Energy Std': features['energy_std'],
+            'Speech Rate': features['speech_rate'] / 10,  # Scale for visibility
+            'Pause Count': features['pause_count']
+        }
+        fig.add_trace(
+            go.Bar(
+                x=list(energy_metrics.keys()),
+                y=list(energy_metrics.values()),
+                name='Energy & Rhythm'
+            ),
+            row=2, col=1
+        )
+
+        # 4. MFCC Analysis Plot
+        fig.add_trace(
+            go.Scatter(
+                y=features['mfcc_means'],
+                mode='lines+markers',
+                name='MFCC Coefficients'
+            ),
+            row=2, col=2
+        )
+
+        # Update layout
         fig.update_layout(
+            height=800,
+            showlegend=False,
+            title_text="Comprehensive Voice Analysis",
         )

         return fig.to_html(include_plotlyjs=True)
     except Exception as e:
+        print(f"Error creating voice analysis plots: {str(e)}")
+        return "Error creating visualizations"

 def analyze_audio(audio_input):
     """Main function to analyze audio input"""
     try:
         if audio_input is None:
             print("No audio input provided")
+            return "No audio file provided", "Please provide an audio file", ""

         print(f"Received audio input: {audio_input}")

+        # Load and process audio
+        if isinstance(audio_input, tuple):
+            audio_path = audio_input[0]
+        else:
+            audio_path = audio_input
+
+        # Load audio with original sampling rate
+        waveform, sr = librosa.load(audio_path, sr=None)

+        # Extract voice features
+        voice_features, success = extract_voice_features(waveform, sr)
+        if not success:
+            return "Error extracting voice features", "Analysis failed", ""
+
+        # Create voice analysis visualization
+        voice_analysis_html = create_voice_analysis_plots(voice_features)

         # Transcribe audio
         print("Transcribing audio...")
+        # Resample for Whisper model
+        waveform_16k = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
+        inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt").input_features

         with torch.no_grad():
             predicted_ids = whisper_model.generate(inputs)
             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

         # Analyze emotions
         print("Analyzing emotions...")
         inputs = emotion_tokenizer(
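One edge case worth noting in extract_voice_features above: librosa.pyin marks unvoiced frames with NaN, so f0[voiced_flag] can be empty on silent or very short clips, and the mean/std/ptp calls then return NaN. A minimal standalone sketch of the same F0 and jitter steps with a guard for that case, passing the native sampling rate to pyin explicitly (sample.wav is a placeholder path, not a file in this repo):

    import librosa
    import numpy as np

    # Placeholder input; any mono speech recording will do.
    waveform, sr = librosa.load("sample.wav", sr=None)

    # pyin with the same C2-C7 pitch range used in extract_voice_features above.
    f0, voiced_flag, _ = librosa.pyin(waveform,
                                      fmin=librosa.note_to_hz('C2'),
                                      fmax=librosa.note_to_hz('C7'),
                                      sr=sr)
    f0_valid = f0[voiced_flag]

    if f0_valid.size > 1:
        f0_mean = float(np.mean(f0_valid))
        jitter = float(np.mean(np.abs(np.diff(f0_valid))))
        print(f"f0_mean: {f0_mean:.2f} Hz, jitter_percent: {100 * jitter / f0_mean:.2f}%")
    else:
        # Too few voiced frames to compute pitch statistics; skip instead of returning NaN.
        print("Not enough voiced frames for F0/jitter estimates.")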
 
@@ -138,21 +239,38 @@ def analyze_audio(audio_input):
             for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
         }

+        # Create emotion visualization
         emotion_viz = create_emotion_plot(emotion_scores)

+        # Generate analysis summary
+        summary = f"""Voice Analysis Summary:
+
+Speech Characteristics:
+- Fundamental Frequency (Pitch): {voice_features['f0_mean']:.2f} Hz (average)
+- Jitter: {voice_features['jitter_percent']:.2f}% (voice stability)
+- Speech Rate: {voice_features['speech_rate']:.2f} BPM
+- Number of Pauses: {voice_features['pause_count']}
+- Average Pause Duration: {voice_features['average_pause_duration']:.2f} seconds
+
+Voice Quality Indicators:
+- Shimmer: {voice_features['shimmer']:.4f} (amplitude variation)
+- Energy Distribution: {voice_features['energy_skewness']:.2f} (skewness)
+- Spectral Centroid: {voice_features['spectral_centroid_mean']:.2f} Hz
+
+Emotional Content:
+- Primary Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
+- Emotional Variability: {np.std(list(emotion_scores.values())):.2f}
+
+Speech Content:
+{transcription}
+"""
+
+        return summary, emotion_viz, voice_analysis_html

     except Exception as e:
         error_msg = f"Error analyzing audio: {str(e)}"
         print(error_msg)
+        return error_msg, "Error in analysis", ""

 # Load models at startup
 print("Initializing application...")
 
@@ -168,23 +286,45 @@ demo = gr.Interface(
         label="Audio Input"
     ),
     outputs=[
+        gr.Textbox(label="Analysis Summary", lines=10),
+        gr.HTML(label="Emotional Analysis"),
+        gr.HTML(label="Voice Biomarker Analysis")
     ],
+    title="Comprehensive Vocal Biomarker Analysis",
     description="""
+    This application performs comprehensive analysis of voice recordings to extract potential health-related biomarkers:

+    1. Speech Characteristics:
+       - Fundamental frequency analysis
+       - Voice stability measures (jitter, shimmer)
+       - Speech rate and rhythm
+
+    2. Voice Quality Analysis:
+       - Spectral features
+       - Energy distribution
+       - MFCC analysis
+
+    3. Emotional Content:
+       - Emotion detection
+       - Emotional stability analysis
+
+    4. Speech Content:
+       - Text transcription
+       - Pause analysis
+
     Upload an audio file or record directly through your microphone.
     """,
     article="""
+    ### About Vocal Biomarkers
+    Vocal biomarkers are measurable indicators in the human voice that can potentially indicate various health conditions.
+    This analysis focuses on several key aspects:
+
+    - **Voice Quality**: Changes in voice quality can indicate respiratory or neurological conditions
+    - **Prosody**: Speech rhythm and timing can be indicators of cognitive function
+    - **Emotional Content**: Emotional patterns can be relevant to mental health assessment
+    - **Acoustic Features**: Specific acoustic patterns may correlate with various health conditions

+    Note: This is a demonstration tool and should not be used for medical diagnosis.
     """,
     examples=None,
     cache_examples=False
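gr.Interface maps a function's returned tuple onto the outputs list by position, so analyze_audio's (summary, emotion_viz, voice_analysis_html) must stay aligned with the Textbox/HTML/HTML components declared above. A stripped-down sketch of that wiring with a stub in place of analyze_audio (the stub, its strings, and the type="filepath" audio setting are illustrative assumptions, not the app's exact configuration):

    import gradio as gr

    def analyze_stub(audio_path):
        # Returns (summary, emotion_viz_html, voice_analysis_html) in the same order as the outputs list.
        return "summary text", "<p>emotion plot</p>", "<p>voice biomarker plot</p>"

    demo = gr.Interface(
        fn=analyze_stub,
        inputs=gr.Audio(type="filepath", label="Audio Input"),
        outputs=[
            gr.Textbox(label="Analysis Summary", lines=10),
            gr.HTML(label="Emotional Analysis"),
            gr.HTML(label="Voice Biomarker Analysis"),
        ],
    )

    if __name__ == "__main__":
        demo.launch()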