invincible-jha committed
Commit: 978a6ce
Parent: e666e44

Upload 2 files

Files changed (2)
  1. app.py +79 -257
  2. requirements.txt +12 -1
app.py CHANGED
@@ -1,239 +1,82 @@
-import gradio as gr
-import torch
-from transformers import WhisperProcessor, WhisperForConditionalGeneration, AutoModelForSequenceClassification, AutoTokenizer
-import librosa
-import numpy as np
-import plotly.graph_objects as go
-import warnings
 import os
-from scipy.stats import kurtosis, skew
-warnings.filterwarnings('ignore')
-
-def extract_prosodic_features(waveform, sr):
-    """Extract prosodic features from audio"""
-    try:
-        features = {}
-
-        # 1. Pitch (F0) Features
-        pitches, magnitudes = librosa.piptrack(y=waveform, sr=sr)
-        f0_contour = []
-        for t in range(pitches.shape[1]):
-            pitches_at_t = pitches[:, t]
-            mags = magnitudes[:, t]
-            pitch_index = mags.argmax()
-            f0_contour.append(pitches[pitch_index, t])
-        f0_contour = np.array(f0_contour)
-        f0_contour = f0_contour[f0_contour > 0]  # Remove zero pitches
-
-        if len(f0_contour) > 0:
-            features['pitch_mean'] = np.mean(f0_contour)
-            features['pitch_std'] = np.std(f0_contour)
-            features['pitch_range'] = np.ptp(f0_contour)
-        else:
-            features['pitch_mean'] = 0
-            features['pitch_std'] = 0
-            features['pitch_range'] = 0
-
-        # 2. Energy/Intensity Features
-        rms = librosa.feature.rms(y=waveform)[0]
-        features['energy_mean'] = np.mean(rms)
-        features['energy_std'] = np.std(rms)
-        features['energy_range'] = np.ptp(rms)
-
-        # 3. Rhythm Features
-        onset_env = librosa.onset.onset_strength(y=waveform, sr=sr)
-        tempo = librosa.beat.tempo(onset_envelope=onset_env, sr=sr)
-        features['tempo'] = tempo[0]
-
-        # 4. Voice Quality Features
-        spectral_centroids = librosa.feature.spectral_centroid(y=waveform, sr=sr)[0]
-        features['spectral_centroid_mean'] = np.mean(spectral_centroids)
-
-        spectral_rolloff = librosa.feature.spectral_rolloff(y=waveform, sr=sr)[0]
-        features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
-
-        # 5. MFCC Features
-        mfccs = librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=13)
-        for i in range(13):
-            features[f'mfcc_{i}_mean'] = np.mean(mfccs[i])
-            features[f'mfcc_{i}_std'] = np.std(mfccs[i])
-
-        return features
-
-    except Exception as e:
-        print(f"Error in extract_prosodic_features: {str(e)}")
-        return None
+from anthropic import Anthropic
+import gradio as gr
+# ... (your existing imports)

-def create_feature_plots(features):
-    """Create visualizations for audio features"""
-    try:
-        # Create main figure with subplots
-        fig = go.Figure()
-
-        # 1. Pitch Features
-        pitch_data = {
-            'Mean': features['pitch_mean'],
-            'Std Dev': features['pitch_std'],
-            'Range': features['pitch_range']
-        }
-
-        fig.add_trace(go.Bar(
-            name='Pitch Features',
-            x=list(pitch_data.keys()),
-            y=list(pitch_data.values()),
-            marker_color='blue'
-        ))
-
-        # 2. Energy Features
-        energy_data = {
-            'Mean': features['energy_mean'],
-            'Std Dev': features['energy_std'],
-            'Range': features['energy_range']
+class ClinicalVoiceAnalyzer:
+    def __init__(self):
+        # Initialize without the API key first
+        self.anthropic = None
+        self.model = "claude-3-opus-20240229"
+        self.api_key = os.getenv('ANTHROPIC_API_KEY')
+
+        # Reference ranges remain the same
+        self.reference_ranges = {
+            'pitch': {'min': 150, 'max': 400},
+            'tempo': {'min': 90, 'max': 130},
+            'energy': {'min': 0.01, 'max': 0.05}
         }

-        fig.add_trace(go.Bar(
-            name='Energy Features',
-            x=[f"Energy {k}" for k in energy_data.keys()],
-            y=list(energy_data.values()),
-            marker_color='red'
-        ))
-
-        # 3. MFCC Plot
-        mfcc_means = [features[f'mfcc_{i}_mean'] for i in range(13)]
-        fig.add_trace(go.Scatter(
-            name='MFCC Coefficients',
-            y=mfcc_means,
-            mode='lines+markers',
-            marker_color='green'
-        ))
-
-        # Update layout
-        fig.update_layout(
-            title='Voice Feature Analysis',
-            showlegend=True,
-            height=600,
-            barmode='group'
-        )
-
-        return fig.to_html(include_plotlyjs=True)
-
-    except Exception as e:
-        print(f"Error in create_feature_plots: {str(e)}")
-        return None
-
-def load_models():
-    """Initialize and load all required models"""
-    global processor, whisper_model, emotion_tokenizer, emotion_model
+        # Initialize Anthropic client if API key is available
+        self._initialize_anthropic()

-    try:
-        print("Loading Whisper model...")
-        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
-        whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
-
-        print("Loading emotion model...")
-        emotion_tokenizer = AutoTokenizer.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
-        emotion_model = AutoModelForSequenceClassification.from_pretrained("j-hartmann/emotion-english-distilroberta-base")
-
-        whisper_model.to("cpu")
-        emotion_model.to("cpu")
-
-        print("Models loaded successfully!")
-        return True
-    except Exception as e:
-        print(f"Error loading models: {str(e)}")
-        return False
+    def _initialize_anthropic(self):
+        """Safely initialize the Anthropic client"""
+        try:
+            if self.api_key:
+                self.anthropic = Anthropic(api_key=self.api_key)
+                print("Anthropic client initialized successfully")
+            else:
+                print("Warning: ANTHROPIC_API_KEY not found in environment variables")
+        except Exception as e:
+            print(f"Error initializing Anthropic client: {str(e)}")
+            self.anthropic = None

-def create_emotion_plot(emotions):
-    """Create emotion analysis visualization"""
-    try:
-        fig = go.Figure(data=[
-            go.Bar(
-                x=list(emotions.keys()),
-                y=list(emotions.values()),
-                marker_color='rgb(55, 83, 109)'
+    def generate_clinical_analysis(self, voice_features):
+        """Generate clinical analysis with fallback behavior"""
+        if not self.anthropic:
+            return self._generate_fallback_analysis(voice_features), {}
+
+        try:
+            prompt = self._construct_analysis_prompt(voice_features)
+            response = self.anthropic.messages.create(
+                model=self.model,
+                max_tokens=1000,
+                messages=[{
+                    "role": "user",
+                    "content": prompt
+                }]
             )
-        ])
+            return response.content, self._parse_clinical_response(response.content)
+        except Exception as e:
+            print(f"Error in clinical analysis: {str(e)}")
+            return self._generate_fallback_analysis(voice_features), {}
+
+    def _generate_fallback_analysis(self, features):
+        """Generate basic analysis when Anthropic API is unavailable"""
+        pitch_status = "elevated" if features['pitch_mean'] > self.reference_ranges['pitch']['max'] else "normal"
+        tempo_status = "elevated" if features['tempo'] > self.reference_ranges['tempo']['max'] else "normal"

-        fig.update_layout(
-            title='Emotion Analysis',
-            xaxis_title='Emotion',
-            yaxis_title='Score',
-            yaxis_range=[0, 1],
-            template='plotly_white',
-            height=400
-        )
+        return f"""Basic Voice Analysis:

-        return fig.to_html(include_plotlyjs=True)
-    except Exception as e:
-        print(f"Error creating emotion plot: {str(e)}")
-        return None
+Pitch Analysis: {pitch_status} ({features['pitch_mean']:.2f} Hz)
+Speech Rate: {tempo_status} ({features['tempo']:.2f} BPM)
+Energy Level: {features['energy_mean']:.4f}
+
+Note: This is a basic analysis. For detailed clinical interpretation, please ensure the Anthropic API key is configured."""

+# ... (rest of your ClinicalVoiceAnalyzer methods remain the same)
+
+# Modified analyze_audio function
 def analyze_audio(audio_input):
-    """Main function to analyze audio input"""
     try:
-        if audio_input is None:
-            return "Please provide an audio input", None, None
-
-        print(f"Processing audio input: {type(audio_input)}")
-
-        # Handle audio input
-        if isinstance(audio_input, tuple):
-            audio_path = audio_input[0]  # Get file path from tuple
-        else:
-            audio_path = audio_input
-
-        print(f"Loading audio from path: {audio_path}")
+        # Your existing audio processing code...

-        # Load audio
-        waveform, sr = librosa.load(audio_path, sr=16000)
-        print(f"Audio loaded: {waveform.shape}, SR: {sr}")
+        # Initialize clinical analyzer with graceful fallback
+        clinical_analyzer = ClinicalVoiceAnalyzer()
+        clinical_analysis, clinical_insights = clinical_analyzer.generate_clinical_analysis(features)

-        # Extract voice features
-        print("Extracting voice features...")
-        features = extract_prosodic_features(waveform, sr)
-        if features is None:
-            return "Error extracting voice features", None, None
-
-        # Create feature plots
-        print("Creating feature visualizations...")
-        feature_viz = create_feature_plots(features)
-        if feature_viz is None:
-            return "Error creating feature visualizations", None, None
-
-        # Transcribe audio
-        print("Transcribing audio...")
-        inputs = processor(waveform, sampling_rate=sr, return_tensors="pt").input_features
-
-        with torch.no_grad():
-            predicted_ids = whisper_model.generate(inputs)
-        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-        # Analyze emotions
-        print("Analyzing emotions...")
-        emotion_inputs = emotion_tokenizer(
-            transcription,
-            return_tensors="pt",
-            padding=True,
-            truncation=True,
-            max_length=512
-        )
-
-        with torch.no_grad():
-            emotion_outputs = emotion_model(**emotion_inputs)
-            emotions = torch.nn.functional.softmax(emotion_outputs.logits, dim=-1)
-
-        emotion_labels = ['anger', 'fear', 'joy', 'neutral', 'sadness', 'surprise']
-        emotion_scores = {
-            label: float(score)
-            for label, score in zip(emotion_labels, emotions[0].cpu().numpy())
-        }
-
-        # Create emotion visualization
-        emotion_viz = create_emotion_plot(emotion_scores)
-        if emotion_viz is None:
-            return "Error creating emotion visualization", None, None
-
-        # Create analysis summary
+        # Create enhanced summary
         summary = f"""Voice Analysis Summary:

 Speech Content:
@@ -246,21 +89,20 @@ Voice Characteristics:
 - Voice Energy: {features['energy_mean']:.4f}

 Dominant Emotion: {max(emotion_scores.items(), key=lambda x: x[1])[0]}
+
+Clinical Analysis:
+{clinical_analysis}
 """
-
-        return summary, emotion_viz, feature_viz
+        return summary, emotion_viz, feature_viz, clinical_insights

     except Exception as e:
         error_msg = f"Error in audio analysis: {str(e)}"
         print(error_msg)
-        return error_msg, None, None
+        return error_msg, None, None, None

-# Load models at startup
-print("Initializing application...")
-if not load_models():
-    raise RuntimeError("Failed to load required models")
+# ... (rest of your existing code)

-# Create Gradio interface
+# Modified Gradio interface
 demo = gr.Interface(
     fn=analyze_audio,
     inputs=gr.Audio(
@@ -269,32 +111,12 @@ demo = gr.Interface(
         label="Audio Input"
     ),
     outputs=[
-        gr.Textbox(label="Analysis Summary", lines=10),
+        gr.Textbox(label="Analysis Summary", lines=15),
         gr.HTML(label="Emotion Analysis"),
-        gr.HTML(label="Voice Feature Analysis")
+        gr.HTML(label="Voice Feature Analysis"),
+        gr.JSON(label="Clinical Insights")
     ],
-    title="Voice Analysis System",
-    description="""
-    This application analyzes voice recordings to extract various characteristics:
-
-    1. Voice Features:
-       - Pitch analysis
-       - Energy patterns
-       - Speech rate
-       - Voice quality
-
-    2. Emotional Content:
-       - Emotion detection
-       - Emotional intensity
-
-    3. Speech Content:
-       - Text transcription
-
-    Upload an audio file or record directly through your microphone.
-    """,
-    examples=None,
-    cache_examples=False
-)
-
-if __name__ == "__main__":
-    demo.launch(share=True)
+    title="Advanced Voice Analysis System",
+    description="""This system provides comprehensive voice analysis with clinical interpretation.
+    Upload an audio file or record directly through your microphone."""
+)
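
Note: the hunk above calls two helpers whose bodies the commit leaves unchanged and does not display, _construct_analysis_prompt and _parse_clinical_response (see the "rest of your ClinicalVoiceAnalyzer methods remain the same" marker). For orientation only, here is a minimal sketch of what such methods could look like, indented as they would sit inside ClinicalVoiceAnalyzer; the prompt wording, the returned keys, and the keyword checks are illustrative assumptions, not the committed implementation.

    def _construct_analysis_prompt(self, features):
        """Illustrative sketch: turn the extracted voice features into a text prompt."""
        return (
            "Summarize these voice measurements against the reference ranges "
            "in plain, non-diagnostic language.\n"
            f"Pitch: {features['pitch_mean']:.2f} Hz "
            f"(reference {self.reference_ranges['pitch']['min']}-{self.reference_ranges['pitch']['max']} Hz)\n"
            f"Tempo: {features['tempo']:.2f} BPM "
            f"(reference {self.reference_ranges['tempo']['min']}-{self.reference_ranges['tempo']['max']} BPM)\n"
            f"Energy: {features['energy_mean']:.4f} "
            f"(reference {self.reference_ranges['energy']['min']}-{self.reference_ranges['energy']['max']})"
        )

    def _parse_clinical_response(self, response_content):
        """Illustrative sketch: reduce the model reply to a small dict for the gr.JSON output."""
        text = str(response_content)
        return {
            "summary": text[:500],
            "mentions_pitch": "pitch" in text.lower(),
            "mentions_tempo": "tempo" in text.lower() or "rate" in text.lower(),
            "mentions_energy": "energy" in text.lower(),
        }

Whatever _parse_clinical_response returns is what the new fourth output displays: analyze_audio passes it back as clinical_insights, which the interface renders with gr.JSON(label="Clinical Insights"). When no API key is configured, generate_clinical_analysis skips both helpers and returns the fallback text with an empty dict.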
requirements.txt CHANGED
@@ -1,3 +1,4 @@
+# Core dependencies with existing versions
 gradio==3.50.2
 torch==2.1.0
 transformers==4.35.2
@@ -5,4 +6,14 @@ librosa==0.10.1
 numpy==1.24.3
 plotly==5.18.0
 soundfile==0.12.1
-scipy==1.11.3
+scipy==1.11.3
+
+# New dependencies for Anthropic integration
+anthropic==0.3.11
+python-dotenv==1.0.0
+requests>=2.31.0
+
+# Additional utilities that enhance stability
+tqdm>=4.66.1
+regex>=2023.8.8
+tenacity>=8.2.3
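
Note: the new python-dotenv dependency pairs with the os.getenv('ANTHROPIC_API_KEY') lookup in ClinicalVoiceAnalyzer.__init__, but the diff does not show where (or whether) load_dotenv() is called, so the snippet below is a sketch of one common local setup rather than the committed wiring. On a Hugging Face Space the same variable would typically be supplied as a repository secret rather than a .env file.

# .env (kept out of version control)
# ANTHROPIC_API_KEY=your-key-here

# Near the top of app.py, before ClinicalVoiceAnalyzer() is first constructed:
import os
from dotenv import load_dotenv

load_dotenv()  # merge variables from a local .env file into os.environ, if present

if not os.getenv("ANTHROPIC_API_KEY"):
    # The analyzer still runs without a key; it falls back to _generate_fallback_analysis
    print("ANTHROPIC_API_KEY not set; clinical analysis will use the basic fallback")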