invincible-jha committed
Commit: 4d5d3b7
Parent(s): 363bda3
Upload app.py

app.py CHANGED
@@ -4,8 +4,11 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration, Auto
@@ -37,87 +40,185 @@ def load_models():
@@ -138,21 +239,38 @@ def analyze_audio(audio_input):
@@ -168,23 +286,45 @@ demo = gr.Interface(

Removed by this commit (old-file side of the hunks above): the previous in-function audio-path handling, the earlier create_emotion_plot layout options, the transcription emptiness check, the two-value returns of analyze_audio together with its FileNotFoundError handler, and the old two-output Gradio interface that only transcribed speech and detected emotions. The new-file side of each hunk follows.

(new file, lines 4-14)

import librosa
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
import os
import pandas as pd
from scipy.stats import kurtosis, skew
warnings.filterwarnings('ignore')

# Global variables for models

(new file, lines 40-224)

        print(f"Error loading models: {str(e)}")
        return False

def extract_voice_features(waveform, sr):
    """Extract comprehensive voice features for health analysis"""
    features = {}

    try:
        # 1. Fundamental Frequency (F0) Statistics
        f0, voiced_flag, _ = librosa.pyin(waveform,
                                          fmin=librosa.note_to_hz('C2'),
                                          fmax=librosa.note_to_hz('C7'))
        f0_valid = f0[voiced_flag]
        features['f0_mean'] = np.mean(f0_valid)
        features['f0_std'] = np.std(f0_valid)
        features['f0_range'] = np.ptp(f0_valid)

        # 2. Jitter (F0 Variation)
        if len(f0_valid) > 1:
            f0_diff = np.diff(f0_valid)
            features['jitter'] = np.mean(np.abs(f0_diff))
            features['jitter_percent'] = (features['jitter'] / features['f0_mean']) * 100

        # 3. Shimmer (Amplitude Variation)
        amplitude_envelope = np.abs(librosa.stft(waveform))
        features['shimmer'] = np.mean(np.std(amplitude_envelope, axis=1))

        # 4. Spectral Features
        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
        features['spectral_centroid_std'] = np.std(spectral_centroids)

        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)

        # 5. Voice Quality Measures
        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
        features['mfcc_means'] = np.mean(mfccs, axis=1)
        features['mfcc_stds'] = np.std(mfccs, axis=1)

        # 6. Rhythm and Timing
        tempo, _ = librosa.beat.beat_track(y=waveform, sr=sr)
        features['speech_rate'] = tempo

        # 7. Energy Features
        rms = librosa.feature.rms(y=waveform)[0]
        features['energy_mean'] = np.mean(rms)
        features['energy_std'] = np.std(rms)
        features['energy_kurtosis'] = kurtosis(rms)
        features['energy_skewness'] = skew(rms)

        # 8. Pause Analysis
        silence_threshold = 0.01
        is_silence = rms < silence_threshold
        silence_regions = librosa.effects.split(waveform, top_db=20)
        features['pause_count'] = len(silence_regions)
        features['average_pause_duration'] = np.mean([r[1] - r[0] for r in silence_regions]) / sr

        return features, True
    except Exception as e:
        print(f"Error extracting voice features: {str(e)}")
        return {}, False
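
# A minimal sketch (helper name and "sample.wav" are assumptions, not part of the
# committed app flow) of exercising extract_voice_features on its own. Note that
# librosa.effects.split returns intervals of non-silent audio, so 'pause_count'
# above counts voiced segments rather than silent gaps, and 'jitter_percent' is
# only set when at least two voiced frames are found.
def _voice_feature_demo(path="sample.wav"):
    y, sr = librosa.load(path, sr=None)  # native sampling rate, as in analyze_audio below
    feats, ok = extract_voice_features(y, sr)
    if ok:
        print(f"Mean F0: {feats['f0_mean']:.1f} Hz, "
              f"jitter: {feats.get('jitter_percent', float('nan')):.2f}%")
    return feats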

def create_voice_analysis_plots(features):
    """Create comprehensive visualization of voice analysis"""
    try:
        # Create subplot figure
        fig = make_subplots(
            rows=2, cols=2,
            subplot_titles=(
                'Fundamental Frequency Analysis',
                'Voice Quality Measures',
                'Energy and Rhythm Analysis',
                'MFCC Analysis'
            )
        )

        # 1. F0 Analysis Plot
        f0_metrics = {
            'Mean F0': features['f0_mean'],
            'F0 Std Dev': features['f0_std'],
            'F0 Range': features['f0_range'],
            'Jitter %': features['jitter_percent']
        }
        fig.add_trace(
            go.Bar(
                x=list(f0_metrics.keys()),
                y=list(f0_metrics.values()),
                name='F0 Metrics'
            ),
            row=1, col=1
        )

        # 2. Voice Quality Plot
        quality_metrics = {
            'Shimmer': features['shimmer'],
            'Spectral Centroid': features['spectral_centroid_mean'] / 1000,  # Scale for visibility
            'Spectral Rolloff': features['spectral_rolloff_mean'] / 1000  # Scale for visibility
        }
        fig.add_trace(
            go.Bar(
                x=list(quality_metrics.keys()),
                y=list(quality_metrics.values()),
                name='Voice Quality'
            ),
            row=1, col=2
        )

        # 3. Energy and Rhythm Plot
        energy_metrics = {
            'Energy Mean': features['energy_mean'],
            'Energy Std': features['energy_std'],
            'Speech Rate': features['speech_rate'] / 10,  # Scale for visibility
            'Pause Count': features['pause_count']
        }
        fig.add_trace(
            go.Bar(
                x=list(energy_metrics.keys()),
                y=list(energy_metrics.values()),
                name='Energy & Rhythm'
            ),
            row=2, col=1
        )

        # 4. MFCC Analysis Plot
        fig.add_trace(
            go.Scatter(
                y=features['mfcc_means'],
                mode='lines+markers',
                name='MFCC Coefficients'
            ),
            row=2, col=2
        )

        # Update layout
        fig.update_layout(
            height=800,
            showlegend=False,
            title_text="Comprehensive Voice Analysis",
        )

        return fig.to_html(include_plotlyjs=True)
    except Exception as e:
        print(f"Error creating voice analysis plots: {str(e)}")
        return "Error creating visualizations"

def analyze_audio(audio_input):
    """Main function to analyze audio input"""
    try:
        if audio_input is None:
            print("No audio input provided")
            return "No audio file provided", "Please provide an audio file", ""

        print(f"Received audio input: {audio_input}")

        # Load and process audio
        if isinstance(audio_input, tuple):
            audio_path = audio_input[0]
        else:
            audio_path = audio_input

        # Load audio with original sampling rate
        waveform, sr = librosa.load(audio_path, sr=None)

        # Extract voice features
        voice_features, success = extract_voice_features(waveform, sr)
        if not success:
            return "Error extracting voice features", "Analysis failed", ""

        # Create voice analysis visualization
        voice_analysis_html = create_voice_analysis_plots(voice_features)

        # Transcribe audio
        print("Transcribing audio...")
        # Resample for Whisper model
        waveform_16k = librosa.resample(waveform, orig_sr=sr, target_sr=16000)
        inputs = processor(waveform_16k, sampling_rate=16000, return_tensors="pt").input_features

        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Analyze emotions
        print("Analyzing emotions...")
        inputs = emotion_tokenizer(

(new file, lines 239-276)

            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
        }

        # Create emotion visualization
        emotion_viz = create_emotion_plot(emotion_scores)

        # Generate analysis summary
        summary = f"""Voice Analysis Summary:

Speech Characteristics:
- Fundamental Frequency (Pitch): {voice_features['f0_mean']:.2f} Hz (average)
- Jitter: {voice_features['jitter_percent']:.2f}% (voice stability)
- Speech Rate: {voice_features['speech_rate']:.2f} BPM
- Number of Pauses: {voice_features['pause_count']}
- Average Pause Duration: {voice_features['average_pause_duration']:.2f} seconds

Voice Quality Indicators:
- Shimmer: {voice_features['shimmer']:.4f} (amplitude variation)
- Energy Distribution: {voice_features['energy_skewness']:.2f} (skewness)
- Spectral Centroid: {voice_features['spectral_centroid_mean']:.2f} Hz

Emotional Content:
- Primary Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
- Emotional Variability: {np.std(list(emotion_scores.values())):.2f}

Speech Content:
{transcription}
"""

        return summary, emotion_viz, voice_analysis_html

    except Exception as e:
        error_msg = f"Error analyzing audio: {str(e)}"
        print(error_msg)
        return error_msg, "Error in analysis", ""
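
# A quick sketch of driving analyze_audio outside the Gradio UI; "sample.wav" is an
# assumed local recording and this helper is never called by the app. The three
# return values match the three interface outputs declared below.
def _analyze_file_demo(path="sample.wav"):
    summary, emotion_html, voice_html = analyze_audio(path)
    print(summary)
    return emotion_html, voice_html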

# Load models at startup
print("Initializing application...")

(new file, lines 286-330)

        label="Audio Input"
    ),
    outputs=[
        gr.Textbox(label="Analysis Summary", lines=10),
        gr.HTML(label="Emotional Analysis"),
        gr.HTML(label="Voice Biomarker Analysis")
    ],
    title="Comprehensive Vocal Biomarker Analysis",
    description="""
    This application performs a comprehensive analysis of voice recordings to extract potential health-related biomarkers:

    1. Speech Characteristics:
       - Fundamental frequency analysis
       - Voice stability measures (jitter, shimmer)
       - Speech rate and rhythm

    2. Voice Quality Analysis:
       - Spectral features
       - Energy distribution
       - MFCC analysis

    3. Emotional Content:
       - Emotion detection
       - Emotional stability analysis

    4. Speech Content:
       - Text transcription
       - Pause analysis

    Upload an audio file or record directly through your microphone.
    """,
    article="""
    ### About Vocal Biomarkers
    Vocal biomarkers are measurable characteristics of the human voice that may indicate various health conditions.
    This analysis focuses on several key aspects:

    - **Voice Quality**: Changes in voice quality can indicate respiratory or neurological conditions
    - **Prosody**: Speech rhythm and timing can be indicators of cognitive function
    - **Emotional Content**: Emotional patterns can be relevant to mental health assessment
    - **Acoustic Features**: Specific acoustic patterns may correlate with various health conditions

    Note: This is a demonstration tool and should not be used for medical diagnosis.
    """,
    examples=None,
    cache_examples=False
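
The jitter figure reported in the analysis summary is simply the mean absolute frame-to-frame F0 change expressed as a percentage of the mean F0, as computed in extract_voice_features. With illustrative numbers (assumed values, not taken from any real recording):

    f0_mean, mean_abs_f0_diff = 120.0, 1.2               # Hz, illustrative values
    jitter_percent = mean_abs_f0_diff / f0_mean * 100    # = 1.0, i.e. 1.0% jitter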