invincible-jha
committed on
Update app.py
app.py
CHANGED
@@ -1,7 +1,4 @@
# app.py - Voice Analysis System with Clinical Interpretation
-# This application provides comprehensive voice analysis with mental health insights
-# using voice biomarkers, emotion detection, and clinical interpretation.
-
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -15,13 +12,13 @@ from scipy.stats import kurtosis, skew
from anthropic import Anthropic
from dotenv import load_dotenv

-# Load environment variables
+# Load environment variables
load_dotenv()

-# Suppress warnings
+# Suppress warnings
warnings.filterwarnings('ignore')

-# Initialize global
+# Initialize global variables
processor = None
whisper_model = None
emotion_tokenizer = None
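The `load_dotenv()` call above expects the Anthropic key to come from the environment. A minimal sketch of that convention, assuming the standard python-dotenv behaviour and the `ANTHROPIC_API_KEY` variable name used later in this file:

```python
# .env (kept out of version control) would contain a line such as:
# ANTHROPIC_API_KEY=sk-...
import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the current working directory into os.environ
api_key = os.getenv("ANTHROPIC_API_KEY")
if not api_key:
    raise RuntimeError("ANTHROPIC_API_KEY is not set")
```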
@@ -29,28 +26,21 @@ emotion_model = None
clinical_analyzer = None

def load_models():
-    """
-
-    This function handles the initialization of both Whisper (for speech recognition)
-    and the emotion detection model, setting them up for CPU-based inference.
-
-    Returns:
-        bool: True if all models loaded successfully, False otherwise
-    """
+    """Initialize and load required ML models."""
    global processor, whisper_model, emotion_tokenizer, emotion_model

    try:
-        #
+        # Load Whisper model
        print("Loading Whisper model...")
        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

-        #
+        # Load emotion model
        print("Loading emotion model...")
        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")

-        # Set
+        # Set device
        device = "cpu"
        whisper_model.to(device)
        emotion_model.to(device)
@@ -62,48 +52,37 @@ def load_models():
        return False

def extract_prosodic_features(waveform, sr):
-    """Extract voice features
-
-    Args:
-        waveform (numpy.ndarray): Audio signal data
-        sr (int): Sampling rate of the audio
-
-    Returns:
-        dict: Dictionary containing extracted features or None if extraction fails
-    """
+    """Extract voice features from audio data."""
    try:
-        # Input validation
        if waveform is None or len(waveform) == 0:
            return None

        features = {}

-        # Pitch analysis
+        # Pitch analysis
        try:
            pitches, magnitudes = librosa.piptrack(
                y=waveform,
                sr=sr,
-                fmin=50,
-                fmax=2000,
-                n_mels=128,
+                fmin=50,
+                fmax=2000,
+                n_mels=128,
                hop_length=512,
                win_length=2048
            )

-            # Extract valid pitch contour
            f0_contour = [
                pitches[magnitudes[:, t].argmax(), t]
                for t in range(pitches.shape[1])
                if 50 <= pitches[magnitudes[:, t].argmax(), t] <= 2000
            ]

-            # Calculate pitch statistics
            if f0_contour:
                features['pitch_mean'] = float(np.mean(f0_contour))
                features['pitch_std'] = float(np.std(f0_contour))
                features['pitch_range'] = float(np.ptp(f0_contour))
            else:
-                features['pitch_mean'] = 160.0
+                features['pitch_mean'] = 160.0
                features['pitch_std'] = 0.0
                features['pitch_range'] = 0.0

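For context on the hunk above, here is a self-contained sketch of the same frame-wise pitch selection with `librosa.piptrack`, restricted to parameters documented for piptrack; the synthetic test tone is illustrative only:

```python
import librosa
import numpy as np

# A 1-second 220 Hz tone stands in for recorded speech
sr = 16000
t = np.linspace(0, 1.0, sr, endpoint=False)
y = 0.5 * np.sin(2 * np.pi * 220 * t).astype(np.float32)

# piptrack returns per-frame pitch candidates and their magnitudes
pitches, magnitudes = librosa.piptrack(y=y, sr=sr, fmin=50, fmax=2000, hop_length=512)

# Keep the strongest candidate per frame, discarding values outside 50-2000 Hz
f0 = [
    pitches[magnitudes[:, i].argmax(), i]
    for i in range(pitches.shape[1])
    if 50 <= pitches[magnitudes[:, i].argmax(), i] <= 2000
]
print(f"mean f0 = {np.mean(f0):.1f} Hz" if f0 else "no voiced frames detected")
```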
@@ -111,7 +90,7 @@ def extract_prosodic_features(waveform, sr):
            print(f"Pitch extraction error: {e}")
            features.update({'pitch_mean': 160.0, 'pitch_std': 0.0, 'pitch_range': 0.0})

-        # Energy analysis
+        # Energy analysis
        try:
            rms = librosa.feature.rms(
                y=waveform,
@@ -129,7 +108,7 @@ def extract_prosodic_features(waveform, sr):
            print(f"Energy extraction error: {e}")
            features.update({'energy_mean': 0.02, 'energy_std': 0.0, 'energy_range': 0.0})

-        # Rhythm analysis
+        # Rhythm analysis
        try:
            onset_env = librosa.onset.onset_strength(
                y=waveform,
@@ -145,7 +124,6 @@ def extract_prosodic_features(waveform, sr):
                aggregate=None
            )[0]

-            # Validate tempo within normal speech range (40-240 BPM)
            features['tempo'] = float(tempo) if 40 <= tempo <= 240 else 120.0

        except Exception as e:
@@ -158,31 +136,21 @@ def extract_prosodic_features(waveform, sr):
        return None

class ClinicalVoiceAnalyzer:
-    """
+    """Clinical voice analysis and interpretation."""

    def __init__(self):
-        """Initialize
+        """Initialize analyzer with API and reference ranges."""
        self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
        self.model = "claude-3-opus-20240229"
-        # Define normal ranges for voice metrics based on clinical research
        self.reference_ranges = {
-            'pitch': {'min': 150, 'max': 400},
-            'tempo': {'min': 90, 'max': 130},
+            'pitch': {'min': 150, 'max': 400},
+            'tempo': {'min': 90, 'max': 130},
            'energy': {'min': 0.01, 'max': 0.05}
        }
        print("Clinical analyzer ready")

    def analyze_voice_metrics(self, features, emotions, transcription):
-        """
-
-        Args:
-            features (dict): Extracted voice features
-            emotions (dict): Detected emotion scores
-            transcription (str): Speech content
-
-        Returns:
-            str: Formatted clinical analysis or backup analysis if API fails
-        """
+        """Analyze voice metrics and generate clinical insights."""
        try:
            prompt = self._create_clinical_prompt(features, emotions, transcription)
            response = self.anthropic.messages.create(
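The `messages.create(` call is cut off at the hunk boundary. A hedged sketch of the Anthropic Messages API shape it presumably relies on; the model string is copied from `__init__`, while `max_tokens` and the prompt placeholder are assumptions not visible in the diff:

```python
from anthropic import Anthropic

client = Anthropic()  # picks up ANTHROPIC_API_KEY from the environment
response = client.messages.create(
    model="claude-3-opus-20240229",   # same model string as in __init__
    max_tokens=1000,                  # assumed value, not shown in the diff
    messages=[{"role": "user", "content": "<prompt from _create_clinical_prompt>"}],
)
analysis_text = response.content[0].text  # the Messages API returns a list of content blocks
```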
@@ -196,7 +164,7 @@ class ClinicalVoiceAnalyzer:
            return self._generate_backup_analysis(features, emotions)

    def _create_clinical_prompt(self, features, emotions, transcription):
-        """Create
+        """Create clinical analysis prompt."""
        return f"""As a clinical voice analysis expert, provide a psychological assessment of:

Voice Metrics:
@@ -219,11 +187,11 @@ Provide:
5. Clinical recommendations"""

    def _format_analysis(self, analysis):
-        """Format
+        """Format clinical analysis output."""
        return f"\nClinical Assessment:\n{analysis}"

    def _generate_backup_analysis(self, features, emotions):
-        """Generate
+        """Generate backup analysis when API fails."""
        dominant_emotion = max(emotions.items(), key=lambda x: x[1])
        pitch_status = (
            "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max']
@@ -239,14 +207,7 @@ Basic Voice Analysis (API Unavailable):
- Primary Emotion: {dominant_emotion[0]} ({dominant_emotion[1]:.1%} confidence)"""

def create_feature_plots(features):
-    """Create
-
-    Args:
-        features (dict): Dictionary of extracted voice features
-
-    Returns:
-        str: HTML representation of the interactive plots
-    """
+    """Create visualizations for voice features."""
    try:
        fig = go.Figure()

@@ -285,7 +246,6 @@ def create_feature_plots(features):
            marker=dict(size=15, color='green')
        ))

-        # Layout configuration
        fig.update_layout(
            title='Voice Feature Analysis',
            showlegend=True,
@@ -302,14 +262,7 @@ def create_feature_plots(features):
        return None

def create_emotion_plot(emotions):
-    """Create visualization
-
-    Args:
-        emotions (dict): Dictionary of emotion scores
-
-    Returns:
-        str: HTML representation of the emotion plot
-    """
+    """Create visualization for emotion analysis."""
    try:
        fig = go.Figure(data=[
            go.Bar(
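Both plotting helpers build Plotly figures that are handed to Gradio as HTML. A minimal, stand-alone sketch of the `go.Bar` pattern used for the emotion chart; the scores dict here is made up for illustration:

```python
import plotly.graph_objects as go

emotion_scores = {"anger": 0.05, "fear": 0.10, "joy": 0.55,
                  "neutral": 0.20, "sadness": 0.07, "surprise": 0.03}

fig = go.Figure(data=[
    go.Bar(x=list(emotion_scores.keys()), y=list(emotion_scores.values()))
])
fig.update_layout(title="Emotion Analysis", yaxis_title="Score", yaxis_range=[0, 1])

html = fig.to_html(include_plotlyjs="cdn")  # HTML string suitable for a gr.HTML output
```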
@@ -335,46 +288,29 @@ def create_emotion_plot(emotions):
        return None

def analyze_audio(audio_input):
-    """
-
-    This is the main function that coordinates the entire analysis pipeline,
-    including feature extraction, emotion detection, and clinical interpretation.
-
-    Args:
-        audio_input: Audio file path or tuple containing audio data
-
-    Returns:
-        tuple: (analysis_summary, emotion_visualization, feature_visualization)
-    """
+    """Main function for audio analysis."""
    try:
-        # Validate input
        if audio_input is None:
            return "Please provide an audio input", None, None

-        # Load audio
        audio_path = audio_input[0] if isinstance(audio_input, tuple) else audio_input
        waveform, sr = librosa.load(audio_path, sr=16000, duration=30)

-        # Validate duration
        duration = len(waveform) / sr
        if duration < 0.5:
            return "Audio too short (minimum 0.5 seconds needed)", None, None

-        # Extract features
        features = extract_prosodic_features(waveform, sr)
        if features is None:
            return "Feature extraction failed", None, None

-        # Generate visualizations
        feature_viz = create_feature_plots(features)

-        # Perform speech recognition
        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
        with torch.no_grad():
            predicted_ids = whisper_model.generate(inputs)
        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

-        # Analyze emotions
        emotion_inputs = emotion_tokenizer(
            transcription,
            return_tensors="pt",
@@ -387,7 +323,6 @@ def analyze_audio(audio_input):
        emotion_outputs = emotion_model(**emotion_inputs)
        emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)

-        # Process emotion scores
        emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
        emotion_scores = {
            label: float(score)
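The hard-coded `emotion_labels` list has to match the checkpoint's label order exactly. A hedged sketch of the same scoring step driven by the model's own `config.id2label`, which removes that coupling; the sample sentence is arbitrary:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

name = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSequenceClassification.from_pretrained(name)

inputs = tokenizer("I am feeling great today", return_tensors="pt", truncation=True)
with torch.no_grad():
    probs = torch.nn.functional.softmax(model(**inputs).logits, dim=-1)[0]

# Use the label mapping shipped with the checkpoint instead of a hand-written list
scores = {model.config.id2label[i]: float(p) for i, p in enumerate(probs)}
print(max(scores, key=scores.get), scores)
```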
@@ -396,7 +331,6 @@

        emotion_viz = create_emotion_plot(emotion_scores)

-        # Generate clinical analysis
        global clinical_analyzer
        if clinical_analyzer is None:
            clinical_analyzer = ClinicalVoiceAnalyzer()
@@ -405,20 +339,7 @@
            features, emotion_scores, transcription
        )

-        # Create
-        summary = f"""Voice Analysis Summary:
-
-Speech Content:
-{transcription}
-
-Voice Characteristics:
-- Average Pitch: {features['pitch_mean']:.2f} Hz
-- Pitch Variation: {features['pitch_std']:.2f} Hz
-- Speech Rate (Tempo): {features['tempo']:.2f} BPM
-- Voice Energy: {features['energy_mean']:.4f}
-
-Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
-Emotion# Continue from previous summary string
+        # Create summary with fixed string formatting
        summary = f"""Voice Analysis Summary:

Speech Content:
@@ -435,8 +356,8 @@ Emotion Confidence: {max(emotion_scores.values()):.2%}

Recording Duration: {duration:.2f} seconds

-{clinical_analysis}
-
+{clinical_analysis}
+"""
        return summary, emotion_viz, feature_viz

    except Exception as e:
@@ -444,19 +365,16 @@ Recording Duration: {duration:.2f} seconds
        print(error_msg)
        return error_msg, None, None

-# Application initialization
+# Application initialization
try:
    print("===== Application Startup =====")

-    # Load required models
    if not load_models():
        raise RuntimeError("Model loading failed")

-    # Initialize clinical analyzer
    clinical_analyzer = ClinicalVoiceAnalyzer()
    print("Clinical analyzer initialized")

-    # Define the interface description
    description = """This application provides comprehensive voice analysis with clinical insights:

1. Voice Features:
@@ -483,7 +401,6 @@ For optimal results:

Upload an audio file or record directly through your microphone."""

-    # Create Gradio interface
    demo = gr.Interface(
        fn=analyze_audio,
        inputs=gr.Audio(
@@ -509,13 +426,12 @@ Upload an audio file or record directly through your microphone."""
        theme="default"
    )

-    # Launch the interface with additional configuration
    if __name__ == "__main__":
        demo.launch(
-            server_name="0.0.0.0",
-            server_port=7860,
-            share=False,
-            debug=False
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=False,
+            debug=False
        )

except Exception as e:
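For reference, a minimal sketch of the Gradio wiring these launch arguments imply. The output component types and labels are assumptions, since the `outputs=` portion of `gr.Interface(` falls outside the visible hunks:

```python
import gradio as gr

def analyze_audio(audio_path):
    # Placeholder for the real pipeline: returns (summary_text, emotion_html, feature_html)
    return "summary", "<div>emotions</div>", "<div>features</div>"

demo = gr.Interface(
    fn=analyze_audio,
    inputs=gr.Audio(type="filepath"),  # assumption: the handler receives a file path
    outputs=[gr.Textbox(label="Analysis"), gr.HTML(label="Emotions"), gr.HTML(label="Features")],
    title="Voice Analysis System",
)

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860, share=False, debug=False)
```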