greg0rs committed on
Commit
76b58e9
·
verified ·
1 Parent(s): 5dae976

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -143
app.py CHANGED
@@ -133,24 +133,28 @@ def clean_word_for_phonemes(word: str) -> str:
133
  return cleaned
134
 
135
  def load_whisperx_models():
136
- """Load WhisperX models lazily - ENGLISH ONLY"""
137
  global whisperx_model, whisperx_align_model, whisperx_metadata
138
 
139
  if whisperx_model is None:
140
- log("Loading WhisperX models (English only)...")
141
  try:
142
- # Use float32 compute type for CPU compatibility
143
- whisperx_model = whisperx.load_model("base", device="cpu", compute_type="float32", language="en")
 
 
 
144
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
145
- log("WhisperX models loaded successfully (English forced)")
 
146
  except Exception as e:
147
  log(f"Error loading WhisperX models: {e}")
148
- # Fallback: try with smaller model
149
  try:
150
- log("Trying fallback with tiny model (English only)...")
151
- whisperx_model = whisperx.load_model("tiny", device="cpu", compute_type="float32", language="en")
152
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
153
- log("WhisperX models loaded with fallback (tiny model, English forced)")
154
  except Exception as fallback_error:
155
  log(f"Fallback also failed: {fallback_error}")
156
  raise
@@ -177,6 +181,54 @@ def normalize_phoneme_string(s: str) -> str:
177
 
178
  return normalized
179
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  def calculate_similarity(detected: str, expected: str) -> float:
181
  """Calculate similarity between detected and expected phonemes"""
182
  detected_norm = normalize_phoneme_string(detected)
@@ -203,25 +255,15 @@ def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
203
 
204
  return segment
205
 
206
- def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_rate: int,
207
- word: str, expansion_level: str = "normal") -> str:
208
- """
209
- Detect phoneme from audio segment with fallback expansion capability.
210
- expansion_level: "normal", "extended", or "maximum"
211
- """
212
- level_info = {
213
- "normal": "standard",
214
- "extended": "double-expanded",
215
- "maximum": "triple-expanded"
216
- }
217
-
218
- log(f"🔍 Starting phoneme detection for '{word}' ({level_info[expansion_level]} audio)...")
219
 
220
  if audio_segment.shape[-1] == 0:
221
  log(f"⚠️ Empty audio segment for '{word}'")
222
  return ""
223
 
224
- log(f"📏 Audio segment: {audio_segment.shape[-1]} samples ({audio_segment.shape[-1]/sample_rate:.3f}s)")
225
 
226
  # Pad or truncate to standard length for model
227
  target_length = 16000 # 1 second
@@ -263,112 +305,27 @@ def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_
263
  log(f"❌ Error in phoneme detection: {e}")
264
  return ""
265
 
266
- log(f"🎯 Phoneme detection for '{word}' ({expansion_level}): '{detected_phoneme}'")
267
  return detected_phoneme
268
 
269
- def extract_audio_segment_with_expansion_level(waveform: torch.Tensor, sample_rate: int,
270
- start_time: float, end_time: float, word: str,
271
- expansion_level: str = "normal") -> torch.Tensor:
272
  """
273
- Extract audio segment with different expansion levels.
274
- expansion_level: "normal" (±0.125s), "extended" (±0.25s), "maximum" (±0.5s)
275
- """
276
- expansion_configs = {
277
- "normal": 0.125, # ±125ms (original)
278
- "extended": 0.25, # ±250ms (double)
279
- "maximum": 0.5 # ±500ms (quadruple)
280
- }
281
-
282
- expansion_seconds = expansion_configs.get(expansion_level, 0.125)
283
- audio_duration = waveform.shape[-1] / sample_rate
284
-
285
- # Calculate expanded timing with boundary protection
286
- expanded_start = max(0, start_time - expansion_seconds)
287
- expanded_end = min(audio_duration, end_time + expansion_seconds)
288
-
289
- log(f"📏 Audio expansion ({expansion_level}): {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s (±{expansion_seconds}s)")
290
-
291
- return extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word, verbose=False)
292
-
293
- def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_phoneme: str,
294
- word: str, waveform: torch.Tensor, sample_rate: int,
295
- start_time: float, end_time: float) -> Tuple[str, float, int, int, torch.Tensor]:
296
- """
297
- Enhanced sliding window match with automatic fallback expansion when detected is too short.
298
- Returns: (best_match_substring, best_score, start_index, end_index, final_audio_segment)
299
  """
300
  detected_norm = normalize_phoneme_string(detected_phoneme)
301
  expected_norm = normalize_phoneme_string(expected_phoneme)
302
 
303
- log(f"🔍 Enhanced sliding window analysis for '{word}':")
304
  log(f" Expected (norm): '{expected_norm}' (length: {len(expected_norm)})")
305
  log(f" Detected (norm): '{detected_norm}' (length: {len(detected_norm)})")
306
 
307
- # Track the audio segment used for final detection
308
- final_audio_segment = None
309
- final_detected_phoneme = detected_phoneme
310
-
311
- # Check if detected is significantly shorter than expected
312
- if len(detected_norm) < len(expected_norm):
313
- shortage_ratio = len(detected_norm) / len(expected_norm) if len(expected_norm) > 0 else 0
314
- log(f" ⚠️ Detected shorter than expected! Ratio: {shortage_ratio:.2f}")
315
-
316
- # Try progressive expansion if detected is too short
317
- if shortage_ratio < 0.8: # If detected is less than 80% of expected length
318
- log(f" 🔄 Attempting fallback with extended audio expansion...")
319
-
320
- # Try extended expansion (±0.25s instead of ±0.125s)
321
- extended_audio = extract_audio_segment_with_expansion_level(
322
- waveform, sample_rate, start_time, end_time, word, "extended"
323
- )
324
- extended_detected = detect_phoneme_from_audio_with_fallback(
325
- extended_audio, sample_rate, word, "extended"
326
- )
327
- extended_detected_norm = normalize_phoneme_string(extended_detected)
328
-
329
- log(f" 📈 Extended detection: '{extended_detected_norm}' (length: {len(extended_detected_norm)})")
330
-
331
- # If extended is better (longer and closer to expected), use it
332
- if len(extended_detected_norm) > len(detected_norm):
333
- final_detected_phoneme = extended_detected
334
- detected_norm = extended_detected_norm
335
- final_audio_segment = extended_audio
336
- log(f" ✅ Using extended detection (improved length)")
337
-
338
- # If still too short, try maximum expansion
339
- if len(detected_norm) < len(expected_norm) * 0.8:
340
- log(f" 🔄 Still short, trying maximum expansion...")
341
-
342
- maximum_audio = extract_audio_segment_with_expansion_level(
343
- waveform, sample_rate, start_time, end_time, word, "maximum"
344
- )
345
- maximum_detected = detect_phoneme_from_audio_with_fallback(
346
- maximum_audio, sample_rate, word, "maximum"
347
- )
348
- maximum_detected_norm = normalize_phoneme_string(maximum_detected)
349
-
350
- log(f" 📈 Maximum detection: '{maximum_detected_norm}' (length: {len(maximum_detected_norm)})")
351
-
352
- if len(maximum_detected_norm) > len(detected_norm):
353
- final_detected_phoneme = maximum_detected
354
- detected_norm = maximum_detected_norm
355
- final_audio_segment = maximum_audio
356
- log(f" ✅ Using maximum detection (best length)")
357
- else:
358
- log(f" ❌ Extended detection didn't improve, keeping original")
359
-
360
- # If no fallback was used, use the original audio segment
361
- if final_audio_segment is None:
362
- final_audio_segment = extract_audio_segment_with_expansion_level(
363
- waveform, sample_rate, start_time, end_time, word, "normal"
364
- )
365
-
366
- # Now proceed with regular sliding window logic using the final detected phoneme
367
- # If detected is still shorter than or equal to expected, just compare directly
368
  if len(detected_norm) <= len(expected_norm):
369
  score = calculate_similarity(detected_norm, expected_norm)
370
  log(f" Direct comparison (detected ≤ expected): score = {score:.3f}")
371
- return detected_norm, score, 0, len(detected_norm), final_audio_segment
372
 
373
  # Sliding window: detected is longer than expected
374
  expected_len = len(expected_norm)
@@ -418,7 +375,7 @@ def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_p
418
  log(f" πŸ“ Aiming for middle: position {best_start}-{best_end}")
419
 
420
  log(f" 🏆 Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
421
- return best_match, best_score, best_start, best_end, final_audio_segment
422
 
423
  def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
424
  """
@@ -677,7 +634,7 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
677
 
678
  @app.post("/api/transcribe")
679
  async def transcribe(audio: UploadFile = File(...)):
680
- log("=== STARTING WHISPERX PHONEME ANALYSIS ===")
681
 
682
  # Fixed similarity threshold
683
  similarity = 0.3
@@ -700,12 +657,11 @@ async def transcribe(audio: UploadFile = File(...)):
700
 
701
  log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
702
 
703
- # 2. Get transcription with WhisperX - FORCE ENGLISH
704
- log("🌍 Forcing WhisperX to use English language detection...")
705
  result = whisperx_model.transcribe(audio_data, batch_size=16, language="en")
706
 
707
- # 3. Get precise word alignments with WhisperX - ENGLISH ONLY
708
- aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu", return_char_alignments=False)
709
 
710
  # Extract word-level data from WhisperX results
711
  words = []
@@ -752,7 +708,7 @@ async def transcribe(audio: UploadFile = File(...)):
752
  waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
753
  sample_rate = 16000
754
 
755
- # 6. Process each word using enhanced expansion with sliding window matching
756
  results = []
757
  audio_data_list = []
758
 
@@ -761,7 +717,7 @@ async def transcribe(audio: UploadFile = File(...)):
761
  tts_tasks = [generate_tts_audio(word_clean) for word_clean in word_texts_clean]
762
  tts_results = await asyncio.gather(*tts_tasks)
763
 
764
- log("\n=== PROCESSING WORDS WITH ENHANCED EXPANSION + SLIDING WINDOW ===")
765
 
766
  for i, (word_info, word_original, word_clean, (start_time, end_time)) in enumerate(zip(words, word_texts, word_texts_clean, word_timings)):
767
  expected_phoneme = expected_phonemes[i] if i < len(expected_phonemes) else ""
@@ -781,29 +737,41 @@ async def transcribe(audio: UploadFile = File(...)):
781
  else:
782
  log(f"🔗 No gap (continuous)")
783
 
784
- # Start with normal expansion (±0.125s) and detect phonemes
785
- initial_audio_segment = extract_audio_segment_with_expansion_level(
786
- waveform, sample_rate, start_time, end_time, word_clean, "normal"
787
- )
788
 
789
- initial_detected_phoneme = detect_phoneme_from_audio_with_fallback(
790
- initial_audio_segment, sample_rate, word_clean, "normal"
791
- )
 
 
 
 
 
 
 
 
 
 
 
792
 
793
- log(f"🔊 Initial detected phoneme: '{initial_detected_phoneme}'")
 
 
794
 
795
- # Enhanced sliding window matching with automatic fallback expansion
796
- best_match_phoneme, similarity_score, match_start, match_end, final_audio_segment = sliding_window_phoneme_match_with_fallback(
797
- initial_detected_phoneme, expected_phoneme, word_clean, waveform, sample_rate, start_time, end_time
798
  )
799
 
800
- # Trim the final audio segment based on best phoneme match position
801
- trimmed_audio_segment = trim_audio_segment_by_phoneme_position(
802
- final_audio_segment, initial_detected_phoneme, match_start, match_end, word_clean
803
  )
804
 
805
  log(f"📊 Final similarity score: {similarity_score:.3f}")
806
- log(f"🎨 Final audio segment samples: {trimmed_audio_segment.shape[-1]} (duration: {trimmed_audio_segment.shape[-1]/sample_rate:.3f}s)")
807
 
808
  # Store results using the best match phoneme (use ORIGINAL word for display)
809
  results.append({
@@ -817,7 +785,7 @@ async def transcribe(audio: UploadFile = File(...)):
817
  })
818
 
819
  # Prepare audio data using trimmed segment (use ORIGINAL word for display)
820
- user_audio_b64 = audio_to_base64(trimmed_audio_segment, sample_rate)
821
  expected_audio_b64 = tts_results[i]
822
 
823
  audio_data_list.append({
@@ -851,7 +819,7 @@ async def transcribe(audio: UploadFile = File(...)):
851
  # Clean up temporary file
852
  os.remove(temp_audio_path)
853
 
854
- log("=== WHISPERX PHONEME ANALYSIS COMPLETE ===")
855
 
856
  return {
857
  "transcript": full_transcript,
@@ -861,7 +829,7 @@ async def transcribe(audio: UploadFile = File(...)):
861
  "debug_info": {
862
  "total_words": len(words),
863
  "similarity_threshold": similarity,
864
- "alignment_method": "WhisperX + Enhanced Sliding Window with Fallback (English Only)",
865
  "results_summary": [
866
  {
867
  "word": r['word_text'],
@@ -889,7 +857,7 @@ async def transcribe(audio: UploadFile = File(...)):
889
 
890
  @app.get("/")
891
  def root():
892
- return "Clean Fonetik with WhisperX + Enhanced Character-Level Feedback (English Only) running"
893
 
894
  @app.post("/api/clear-cache")
895
  def clear_cache():
 
133
  return cleaned
134
 
135
  def load_whisperx_models():
136
+ """Load WhisperX models lazily with English-only configuration"""
137
  global whisperx_model, whisperx_align_model, whisperx_metadata
138
 
139
  if whisperx_model is None:
140
+ log("Loading WhisperX models for English-only processing...")
141
  try:
142
+ # Load WhisperX model with English-only configuration
143
+ whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
144
+ log("WhisperX base.en model loaded successfully")
145
+
146
+ # Load alignment model for English
147
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
148
+ log("WhisperX English alignment model loaded successfully")
149
+
150
  except Exception as e:
151
  log(f"Error loading WhisperX models: {e}")
152
+ # Fallback: try with smaller English-only model
153
  try:
154
+ log("Trying fallback with tiny.en model...")
155
+ whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
156
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
157
+ log("WhisperX models loaded with fallback (tiny.en model)")
158
  except Exception as fallback_error:
159
  log(f"Fallback also failed: {fallback_error}")
160
  raise
 
181
 
182
  return normalized
183
 
184
+ # TEMPORARILY DISABLED: English letter sounds conversion
185
+ # def phoneme_to_english_sounds(phoneme_string: str) -> str:
186
+ # """Convert IPA phonemes to English letter sounds"""
187
+ # if not phoneme_string:
188
+ # return phoneme_string
189
+ #
190
+ # log(f"Converting phonemes to English sounds: '{phoneme_string}'")
191
+ #
192
+ # # Clean the input
193
+ # phoneme_string = phoneme_string.strip()
194
+ #
195
+ # # Split by spaces first (words/syllables)
196
+ # words = phoneme_string.split(' ')
197
+ # converted_words = []
198
+ #
199
+ # for word in words:
200
+ # if not word:
201
+ # continue
202
+ #
203
+ # english_sounds = []
204
+ # i = 0
205
+ #
206
+ # while i < len(word):
207
+ # found = False
208
+ # # Try longest matches first (like 'tʃ', 'dʒ' before 't', 'd')
209
+ # for length in [3, 2, 1]:
210
+ # if i + length <= len(word):
211
+ # phoneme = word[i:i+length]
212
+ # if phoneme in PHONEME_TO_ENGLISH:
213
+ # english_sounds.append(PHONEME_TO_ENGLISH[phoneme])
214
+ # i += length
215
+ # found = True
216
+ # break
217
+ #
218
+ # if not found:
219
+ # # Keep unknown characters as-is, but clean them up
220
+ # char = word[i]
221
+ # if char.isalpha():
222
+ # english_sounds.append(char.upper())
223
+ # i += 1
224
+ #
225
+ # if english_sounds:
226
+ # converted_words.append('-'.join(english_sounds))
227
+ #
228
+ # result = ' '.join(converted_words)
229
+ # log(f"Converted '{phoneme_string}' -> '{result}'")
230
+ # return result
231
+
232
  def calculate_similarity(detected: str, expected: str) -> float:
233
  """Calculate similarity between detected and expected phonemes"""
234
  detected_norm = normalize_phoneme_string(detected)
 
255
 
256
  return segment
257
 
258
+ def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, word: str) -> str:
259
+ """Detect phoneme from audio segment using phoneme model"""
260
+ log(f"🔍 Starting phoneme detection for '{word}'...")
 
 
 
 
 
 
 
 
 
 
261
 
262
  if audio_segment.shape[-1] == 0:
263
  log(f"⚠️ Empty audio segment for '{word}'")
264
  return ""
265
 
266
+ log(f"📏 Original audio segment: {audio_segment.shape[-1]} samples")
267
 
268
  # Pad or truncate to standard length for model
269
  target_length = 16000 # 1 second
 
305
  log(f"❌ Error in phoneme detection: {e}")
306
  return ""
307
 
308
+ log(f"🎯 Phoneme detection for '{word}': '{detected_phoneme}'")
309
  return detected_phoneme
310
 
311
+ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
 
 
312
  """
313
+ Find the best matching substring in detected phoneme using sliding window.
314
+ For zero scores, intelligently selects which phoneme substring to return.
315
+ Returns: (best_match_substring, best_score, start_index, end_index)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  """
317
  detected_norm = normalize_phoneme_string(detected_phoneme)
318
  expected_norm = normalize_phoneme_string(expected_phoneme)
319
 
320
+ log(f"🔍 Sliding window analysis for '{word}':")
321
  log(f" Expected (norm): '{expected_norm}' (length: {len(expected_norm)})")
322
  log(f" Detected (norm): '{detected_norm}' (length: {len(detected_norm)})")
323
 
324
+ # If detected is shorter than or equal to expected, just compare directly
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  if len(detected_norm) <= len(expected_norm):
326
  score = calculate_similarity(detected_norm, expected_norm)
327
  log(f" Direct comparison (detected ≤ expected): score = {score:.3f}")
328
+ return detected_norm, score, 0, len(detected_norm)
329
 
330
  # Sliding window: detected is longer than expected
331
  expected_len = len(expected_norm)
 
375
  log(f" πŸ“ Aiming for middle: position {best_start}-{best_end}")
376
 
377
  log(f" 🏆 Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
378
+ return best_match, best_score, best_start, best_end
379
 
380
  def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
381
  """
 
634
 
635
  @app.post("/api/transcribe")
636
  async def transcribe(audio: UploadFile = File(...)):
637
+ log("=== STARTING WHISPERX ENGLISH-ONLY PHONEME ANALYSIS ===")
638
 
639
  # Fixed similarity threshold
640
  similarity = 0.3
 
657
 
658
  log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
659
 
660
+ # 2. Get transcription with WhisperX - EXPLICITLY SET TO ENGLISH
 
661
  result = whisperx_model.transcribe(audio_data, batch_size=16, language="en")
662
 
663
+ # 3. Get precise word alignments with WhisperX
664
+ aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu")
665
 
666
  # Extract word-level data from WhisperX results
667
  words = []
 
708
  waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
709
  sample_rate = 16000
710
 
711
+ # 6. Process each word using expanded timing with sliding window matching
712
  results = []
713
  audio_data_list = []
714
 
 
717
  tts_tasks = [generate_tts_audio(word_clean) for word_clean in word_texts_clean]
718
  tts_results = await asyncio.gather(*tts_tasks)
719
 
720
+ log("\n=== PROCESSING WORDS WITH EXPANDED TIMING + SLIDING WINDOW ===")
721
 
722
  for i, (word_info, word_original, word_clean, (start_time, end_time)) in enumerate(zip(words, word_texts, word_texts_clean, word_timings)):
723
  expected_phoneme = expected_phonemes[i] if i < len(expected_phonemes) else ""
 
737
  else:
738
  log(f"🔗 No gap (continuous)")
739
 
740
+ # Calculate expanded timing (±0.125s with boundary protection)
741
+ expansion_seconds = 0.125
742
+ audio_duration = waveform.shape[-1] / sample_rate
 
743
 
744
+ expanded_start = max(0, start_time - expansion_seconds)
745
+ expanded_end = min(audio_duration, end_time + expansion_seconds)
746
+
747
+ log(f"📏 Timing expansion: {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s")
748
+
749
+ # Extract expanded audio segment
750
+ expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
751
+
752
+ # Detect phoneme from expanded audio segment
753
+ detected_phoneme_raw = detect_phoneme_from_audio(expanded_audio_segment, sample_rate, word_clean)
754
+
755
+ # Get expected phoneme and normalize both
756
+ detected_phoneme_norm = normalize_phoneme_string(detected_phoneme_raw)
757
+ expected_phoneme_norm = normalize_phoneme_string(expected_phoneme)
758
 
759
+ log(f"🔊 Raw detected phoneme (expanded): '{detected_phoneme_raw}'")
760
+ log(f"🧹 Normalized detected: '{detected_phoneme_norm}'")
761
+ log(f"🧹 Normalized expected: '{expected_phoneme_norm}'")
762
 
763
+ # Find best matching substring using sliding window
764
+ best_match_phoneme, similarity_score, match_start, match_end = sliding_window_phoneme_match(
765
+ detected_phoneme_raw, expected_phoneme, word_clean
766
  )
767
 
768
+ # Trim audio segment based on best phoneme match position
769
+ final_audio_segment = trim_audio_segment_by_phoneme_position(
770
+ expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean
771
  )
772
 
773
  log(f"📊 Final similarity score: {similarity_score:.3f}")
774
+ log(f"🎨 Final audio segment samples: {final_audio_segment.shape[-1]} (duration: {final_audio_segment.shape[-1]/sample_rate:.3f}s)")
775
 
776
  # Store results using the best match phoneme (use ORIGINAL word for display)
777
  results.append({
 
785
  })
786
 
787
  # Prepare audio data using trimmed segment (use ORIGINAL word for display)
788
+ user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate)
789
  expected_audio_b64 = tts_results[i]
790
 
791
  audio_data_list.append({
 
819
  # Clean up temporary file
820
  os.remove(temp_audio_path)
821
 
822
+ log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
823
 
824
  return {
825
  "transcript": full_transcript,
 
829
  "debug_info": {
830
  "total_words": len(words),
831
  "similarity_threshold": similarity,
832
+ "alignment_method": "WhisperX English-only + Sliding Window",
833
  "results_summary": [
834
  {
835
  "word": r['word_text'],
 
857
 
858
  @app.get("/")
859
  def root():
860
+ return "Clean Fonetik with WhisperX English-only + Character-Level Feedback running"
861
 
862
  @app.post("/api/clear-cache")
863
  def clear_cache():