greg0rs committed on
Commit
779a747
·
verified ·
1 Parent(s): c40ce88

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +131 -67
app.py CHANGED
@@ -569,76 +569,123 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
569
  def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
570
  """
571
  Create dynamic mapping from phoneme positions to original word letters.
572
- Uses dynamic programming to find optimal alignment.
573
  Returns: {phoneme_position: word_letters}
574
  """
575
  word_normalized = word.lower()
576
  phoneme_normalized = normalize_phoneme_string(expected_phoneme)
577
 
578
  log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
 
579
 
580
  if not phoneme_normalized:
581
  return {}
582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  # Simple case: if lengths are equal, try direct mapping
584
  if len(word_normalized) == len(phoneme_normalized):
585
  mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
586
  log(f" Direct mapping (equal lengths): {mapping}")
587
  return mapping
588
 
589
- # Dynamic programming alignment for different lengths
590
- word_len = len(word_normalized)
591
- phoneme_len = len(phoneme_normalized)
592
-
593
- # Initialize DP matrix
594
- dp = [[float('inf')] * (phoneme_len + 1) for _ in range(word_len + 1)]
595
- dp[0][0] = 0
596
-
597
- # Fill DP matrix
598
- for i in range(word_len + 1):
599
- for j in range(phoneme_len + 1):
600
- if i < word_len and j < phoneme_len:
601
- # Match/substitute
602
- cost = 0 if word_normalized[i] == phoneme_normalized[j] else 1
603
- dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j] + cost)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
604
 
605
- if i < word_len:
606
  # Delete word character (silent letter)
607
- dp[i+1][j] = min(dp[i+1][j], dp[i][j] + 1)
608
-
609
- if j < phoneme_len:
610
- # Insert phoneme (multiple letters for one phoneme)
611
- dp[i][j+1] = min(dp[i][j+1], dp[i][j] + 1)
612
-
613
- # Backtrack to find alignment
614
- mapping = {}
615
- i, j = word_len, phoneme_len
616
-
617
- while i > 0 or j > 0:
618
- if i > 0 and j > 0:
619
- cost = 0 if word_normalized[i-1] == phoneme_normalized[j-1] else 1
620
- if dp[i][j] == dp[i-1][j-1] + cost:
621
- # Match/substitute
622
- if j-1 not in mapping:
623
- mapping[j-1] = word_normalized[i-1]
624
- else:
625
- mapping[j-1] = word_normalized[i-1] + mapping[j-1]
626
- i, j = i-1, j-1
627
- continue
628
-
629
- if i > 0 and dp[i][j] == dp[i-1][j] + 1:
630
- # Delete word character (silent letter) - add to previous phoneme if exists
631
- if j < phoneme_len and j in mapping:
632
- mapping[j] = word_normalized[i-1] + mapping[j]
633
- elif j > 0 and j-1 in mapping:
634
- mapping[j-1] = mapping[j-1] + word_normalized[i-1]
635
- i -= 1
636
- elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
637
- # Insert phoneme (skip phoneme, should not happen in our case)
638
- j -= 1
639
-
640
- log(f" Dynamic mapping: {mapping}")
641
- return mapping
642
 
643
  def create_character_level_feedback(word: str, expected_normalized: str,
644
  detected_normalized: str,
@@ -652,10 +699,18 @@ def create_character_level_feedback(word: str, expected_normalized: str,
652
  result = []
653
 
654
  log(f"🔍 Character-level feedback for '{word}':")
 
655
  log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
656
  log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
657
  log(f" Mapping: {phoneme_mapping}")
658
 
 
 
 
 
 
 
 
659
  # Handle length mismatches by using the expected length and padding detected if needed
660
  max_length = len(expected_normalized)
661
  detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
@@ -703,10 +758,12 @@ def create_character_level_feedback(word: str, expected_normalized: str,
703
  # MATCH: Use original word letters from mapping in natural case
704
  if i in phoneme_mapping:
705
  word_letters = phoneme_mapping[i] # Keep natural case
706
- test_result.append(word_letters)
707
- tooltip_result.append(word_letters) # Same for tooltip
708
- real_chars += len(word_letters)
709
- matches += 1
 
 
710
  else:
711
  test_result.append(expected_char)
712
  tooltip_result.append(expected_char)
@@ -716,17 +773,19 @@ def create_character_level_feedback(word: str, expected_normalized: str,
716
  # MISMATCH: Use original word letters with bold+underline formatting
717
  if i in phoneme_mapping:
718
  word_letters = phoneme_mapping[i] # Keep natural case
719
- # Format with bold and underline for display
720
- formatted_letters = f'<strong><u>{word_letters}</u></strong>'
721
- test_result.append(formatted_letters)
722
-
723
- # For tooltip, convert detected phoneme to English equivalent
724
- detected_phoneme = detected_char
725
- if detected_phoneme in PHONEME_TO_ENGLISH:
726
- english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
727
- tooltip_result.append(english_equiv)
728
- else:
729
- tooltip_result.append('?')
 
 
730
  else:
731
  # Fallback for cases without mapping
732
  formatted_char = f'<strong><u>{expected_char}</u></strong>'
@@ -1144,6 +1203,11 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
1144
  log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
1145
  log(f"🎯 Expected phoneme: '{expected_phoneme}'")
1146
 
 
 
 
 
 
1147
  # For very short words, expand the WhisperX timing itself before processing
1148
  original_duration = end_time - start_time
1149
  if original_duration < 0.1:
 
569
  def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
570
  """
571
  Create dynamic mapping from phoneme positions to original word letters.
572
+ Handles cases where phonemes are longer than the word (e.g., 'go' → 'ɡoʊ').
573
  Returns: {phoneme_position: word_letters}
574
  """
575
  word_normalized = word.lower()
576
  phoneme_normalized = normalize_phoneme_string(expected_phoneme)
577
 
578
  log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
579
+ log(f" Word length: {len(word_normalized)}, Phoneme length: {len(phoneme_normalized)}")
580
 
581
  if not phoneme_normalized:
582
  return {}
583
 
584
+ # CRITICAL FIX: Handle diphthongs and multi-phoneme characters
585
+ # Common diphthongs that represent single vowel sounds
586
+ diphthongs = ['aɪ', 'aʊ', 'eɪ', 'oʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə']
587
+
588
+ # Check if this is likely a diphthong case (phonemes longer than word)
589
+ if len(phoneme_normalized) > len(word_normalized):
590
+ log(f" ⚠️ Phonemes longer than word - checking for diphthongs")
591
+
592
+ # Try to identify diphthongs in the phoneme string
593
+ for diphthong in diphthongs:
594
+ if diphthong in phoneme_normalized:
595
+ log(f" Found diphthong: {diphthong}")
596
+
597
+ # Special handling for common patterns
598
+ if word_normalized == 'go' and phoneme_normalized in ['ɡoʊ', 'goʊ']:
599
+ mapping = {0: 'g', 1: 'o', 2: ''} # Map 'oʊ' diphthong to 'o'
600
+ log(f" Special case mapping for 'go': {mapping}")
601
+ return mapping
602
+
603
+ # General approach: distribute word letters proportionally
604
+ # But avoid duplicating letters
605
+ mapping = {}
606
+ word_idx = 0
607
+ phonemes_per_letter = len(phoneme_normalized) / len(word_normalized)
608
+
609
+ for phoneme_idx in range(len(phoneme_normalized)):
610
+ # Calculate which word letter this phoneme should map to
611
+ target_word_idx = min(int(phoneme_idx / phonemes_per_letter), len(word_normalized) - 1)
612
+
613
+ # Only assign the letter once (first phoneme that maps to it)
614
+ if target_word_idx not in [mapping.get(i, None) for i in range(phoneme_idx)]:
615
+ mapping[phoneme_idx] = word_normalized[target_word_idx]
616
+ else:
617
+ mapping[phoneme_idx] = '' # Empty string for extra phonemes
618
+
619
+ log(f" Proportional mapping (no duplication): {mapping}")
620
+ return mapping
621
+
622
  # Simple case: if lengths are equal, try direct mapping
623
  if len(word_normalized) == len(phoneme_normalized):
624
  mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
625
  log(f" Direct mapping (equal lengths): {mapping}")
626
  return mapping
627
 
628
+ # Case: word is longer than phonemes (silent letters)
629
+ if len(word_normalized) > len(phoneme_normalized):
630
+ log(f" Word longer than phonemes - likely has silent letters")
631
+
632
+ # Use DP alignment for this case (existing code works well)
633
+ word_len = len(word_normalized)
634
+ phoneme_len = len(phoneme_normalized)
635
+
636
+ # Initialize DP matrix
637
+ dp = [[float('inf')] * (phoneme_len + 1) for _ in range(word_len + 1)]
638
+ dp[0][0] = 0
639
+
640
+ # Fill DP matrix
641
+ for i in range(word_len + 1):
642
+ for j in range(phoneme_len + 1):
643
+ if i < word_len and j < phoneme_len:
644
+ # Match/substitute
645
+ cost = 0 if word_normalized[i] == phoneme_normalized[j] else 1
646
+ dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j] + cost)
647
+
648
+ if i < word_len:
649
+ # Delete word character (silent letter)
650
+ dp[i+1][j] = min(dp[i+1][j], dp[i][j] + 1)
651
+
652
+ if j < phoneme_len:
653
+ # Insert phoneme
654
+ dp[i][j+1] = min(dp[i][j+1], dp[i][j] + 1)
655
+
656
+ # Backtrack to find alignment
657
+ mapping = {}
658
+ i, j = word_len, phoneme_len
659
+
660
+ while i > 0 or j > 0:
661
+ if i > 0 and j > 0:
662
+ cost = 0 if word_normalized[i-1] == phoneme_normalized[j-1] else 1
663
+ if dp[i][j] == dp[i-1][j-1] + cost:
664
+ # Match/substitute
665
+ if j-1 not in mapping:
666
+ mapping[j-1] = word_normalized[i-1]
667
+ else:
668
+ mapping[j-1] = word_normalized[i-1] + mapping[j-1]
669
+ i, j = i-1, j-1
670
+ continue
671
 
672
+ if i > 0 and dp[i][j] == dp[i-1][j] + 1:
673
  # Delete word character (silent letter)
674
+ if j < phoneme_len and j in mapping:
675
+ mapping[j] = word_normalized[i-1] + mapping[j]
676
+ elif j > 0 and j-1 in mapping:
677
+ mapping[j-1] = mapping[j-1] + word_normalized[i-1]
678
+ i -= 1
679
+ elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
680
+ # Insert phoneme
681
+ j -= 1
682
+
683
+ log(f" DP alignment mapping: {mapping}")
684
+ return mapping
685
+
686
+ # Shouldn't reach here, but fallback to empty mapping
687
+ log(f" ⚠️ Unexpected case - returning empty mapping")
688
+ return {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
689
 
690
  def create_character_level_feedback(word: str, expected_normalized: str,
691
  detected_normalized: str,
 
699
  result = []
700
 
701
  log(f"🔍 Character-level feedback for '{word}':")
702
+ log(f" Original word: '{word}' (length: {len(word)})")
703
  log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
704
  log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
705
  log(f" Mapping: {phoneme_mapping}")
706
 
707
+ # CRITICAL: Count actual word letters in mapping to detect duplication issues
708
+ mapped_letters = ''.join(phoneme_mapping.values())
709
+ log(f" Mapped letters: '{mapped_letters}' (length: {len(mapped_letters)})")
710
+ if len(mapped_letters) > len(word):
711
+ log(f" ⚠️ WARNING: Mapped letters ({len(mapped_letters)}) longer than word ({len(word)})!")
712
+ log(f" This will cause incorrect display. Checking for duplicates...")
713
+
714
  # Handle length mismatches by using the expected length and padding detected if needed
715
  max_length = len(expected_normalized)
716
  detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
 
758
  # MATCH: Use original word letters from mapping in natural case
759
  if i in phoneme_mapping:
760
  word_letters = phoneme_mapping[i] # Keep natural case
761
+ # CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
762
+ if word_letters: # Only add non-empty letters
763
+ test_result.append(word_letters)
764
+ tooltip_result.append(word_letters) # Same for tooltip
765
+ real_chars += len(word_letters)
766
+ matches += 1
767
  else:
768
  test_result.append(expected_char)
769
  tooltip_result.append(expected_char)
 
773
  # MISMATCH: Use original word letters with bold+underline formatting
774
  if i in phoneme_mapping:
775
  word_letters = phoneme_mapping[i] # Keep natural case
776
+ # CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
777
+ if word_letters: # Only add non-empty letters
778
+ # Format with bold and underline for display
779
+ formatted_letters = f'<strong><u>{word_letters}</u></strong>'
780
+ test_result.append(formatted_letters)
781
+
782
+ # For tooltip, convert detected phoneme to English equivalent
783
+ detected_phoneme = detected_char
784
+ if detected_phoneme in PHONEME_TO_ENGLISH:
785
+ english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
786
+ tooltip_result.append(english_equiv)
787
+ else:
788
+ tooltip_result.append('?')
789
  else:
790
  # Fallback for cases without mapping
791
  formatted_char = f'<strong><u>{expected_char}</u></strong>'
 
1203
  log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
1204
  log(f"🎯 Expected phoneme: '{expected_phoneme}'")
1205
 
1206
+ # DEBUGGING: Special attention to problematic words
1207
+ if word_clean.lower() in ['go', 'no', 'so', 'to', 'do'] or len(expected_phoneme) > len(word_clean):
1208
+ log(f"⚠️ SPECIAL CASE: Word '{word_clean}' has {len(word_clean)} letters but {len(expected_phoneme)} phonemes")
1209
+ log(f" This may be a diphthong case requiring special handling")
1210
+
1211
  # For very short words, expand the WhisperX timing itself before processing
1212
  original_duration = end_time - start_time
1213
  if original_duration < 0.1: