Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -569,76 +569,123 @@ def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, w
|
|
569 |
def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
|
570 |
"""
|
571 |
Create dynamic mapping from phoneme positions to original word letters.
|
572 |
-
|
573 |
Returns: {phoneme_position: word_letters}
|
574 |
"""
|
575 |
word_normalized = word.lower()
|
576 |
phoneme_normalized = normalize_phoneme_string(expected_phoneme)
|
577 |
|
578 |
log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
|
|
|
579 |
|
580 |
if not phoneme_normalized:
|
581 |
return {}
|
582 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
583 |
# Simple case: if lengths are equal, try direct mapping
|
584 |
if len(word_normalized) == len(phoneme_normalized):
|
585 |
mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
|
586 |
log(f" Direct mapping (equal lengths): {mapping}")
|
587 |
return mapping
|
588 |
|
589 |
-
#
|
590 |
-
|
591 |
-
|
592 |
-
|
593 |
-
|
594 |
-
|
595 |
-
|
596 |
-
|
597 |
-
|
598 |
-
|
599 |
-
|
600 |
-
|
601 |
-
|
602 |
-
|
603 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
604 |
|
605 |
-
if i
|
606 |
# Delete word character (silent letter)
|
607 |
-
|
608 |
-
|
609 |
-
|
610 |
-
|
611 |
-
|
612 |
-
|
613 |
-
|
614 |
-
|
615 |
-
|
616 |
-
|
617 |
-
|
618 |
-
|
619 |
-
|
620 |
-
|
621 |
-
|
622 |
-
if j-1 not in mapping:
|
623 |
-
mapping[j-1] = word_normalized[i-1]
|
624 |
-
else:
|
625 |
-
mapping[j-1] = word_normalized[i-1] + mapping[j-1]
|
626 |
-
i, j = i-1, j-1
|
627 |
-
continue
|
628 |
-
|
629 |
-
if i > 0 and dp[i][j] == dp[i-1][j] + 1:
|
630 |
-
# Delete word character (silent letter) - add to previous phoneme if exists
|
631 |
-
if j < phoneme_len and j in mapping:
|
632 |
-
mapping[j] = word_normalized[i-1] + mapping[j]
|
633 |
-
elif j > 0 and j-1 in mapping:
|
634 |
-
mapping[j-1] = mapping[j-1] + word_normalized[i-1]
|
635 |
-
i -= 1
|
636 |
-
elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
|
637 |
-
# Insert phoneme (skip phoneme, should not happen in our case)
|
638 |
-
j -= 1
|
639 |
-
|
640 |
-
log(f" Dynamic mapping: {mapping}")
|
641 |
-
return mapping
|
642 |
|
643 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
644 |
detected_normalized: str,
|
@@ -652,10 +699,18 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
652 |
result = []
|
653 |
|
654 |
log(f"🔍 Character-level feedback for '{word}':")
|
|
|
655 |
log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
|
656 |
log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
|
657 |
log(f" Mapping: {phoneme_mapping}")
|
658 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
659 |
# Handle length mismatches by using the expected length and padding detected if needed
|
660 |
max_length = len(expected_normalized)
|
661 |
detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
|
@@ -703,10 +758,12 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
703 |
# MATCH: Use original word letters from mapping in natural case
|
704 |
if i in phoneme_mapping:
|
705 |
word_letters = phoneme_mapping[i] # Keep natural case
|
706 |
-
|
707 |
-
|
708 |
-
|
709 |
-
|
|
|
|
|
710 |
else:
|
711 |
test_result.append(expected_char)
|
712 |
tooltip_result.append(expected_char)
|
@@ -716,17 +773,19 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
716 |
# MISMATCH: Use original word letters with bold+underline formatting
|
717 |
if i in phoneme_mapping:
|
718 |
word_letters = phoneme_mapping[i] # Keep natural case
|
719 |
-
#
|
720 |
-
|
721 |
-
|
722 |
-
|
723 |
-
|
724 |
-
|
725 |
-
|
726 |
-
|
727 |
-
|
728 |
-
|
729 |
-
|
|
|
|
|
730 |
else:
|
731 |
# Fallback for cases without mapping
|
732 |
formatted_char = f'<strong><u>{expected_char}</u></strong>'
|
@@ -1144,6 +1203,11 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
|
|
1144 |
log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
|
1145 |
log(f"🎯 Expected phoneme: '{expected_phoneme}'")
|
1146 |
|
|
|
|
|
|
|
|
|
|
|
1147 |
# For very short words, expand the WhisperX timing itself before processing
|
1148 |
original_duration = end_time - start_time
|
1149 |
if original_duration < 0.1:
|
|
|
569 |
def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
    """
    Create dynamic mapping from phoneme positions to original word letters.
    Handles cases where phonemes are longer than the word (e.g., 'go' → 'ɡoʊ').
    Returns: {phoneme_position: word_letters}

    Empty-string values mark "extra" phoneme positions (e.g. the second half of
    a diphthong) so downstream rendering can skip them without duplicating letters.
    """
    word_normalized = word.lower()
    phoneme_normalized = normalize_phoneme_string(expected_phoneme)

    log(f"🗺️ Creating mapping for '{word}' → '{phoneme_normalized}'")
    log(f"   Word length: {len(word_normalized)}, Phoneme length: {len(phoneme_normalized)}")

    if not phoneme_normalized:
        return {}

    # CRITICAL FIX: Handle diphthongs and multi-phoneme characters
    # Common diphthongs that represent single vowel sounds
    diphthongs = ['aɪ', 'aʊ', 'eɪ', 'oʊ', 'ɔɪ', 'ɪə', 'eə', 'ʊə']

    # Check if this is likely a diphthong case (phonemes longer than word)
    if len(phoneme_normalized) > len(word_normalized):
        log(f"   ⚠️ Phonemes longer than word - checking for diphthongs")

        # Try to identify diphthongs in the phoneme string (diagnostic only)
        for diphthong in diphthongs:
            if diphthong in phoneme_normalized:
                log(f"   Found diphthong: {diphthong}")

        # Special handling for common patterns
        if word_normalized == 'go' and phoneme_normalized in ['ɡoʊ', 'goʊ']:
            mapping = {0: 'g', 1: 'o', 2: ''}  # Map 'oʊ' diphthong to 'o'
            log(f"   Special case mapping for 'go': {mapping}")
            return mapping

        # General approach: distribute word letters proportionally
        # But avoid duplicating letters
        mapping = {}
        phonemes_per_letter = len(phoneme_normalized) / len(word_normalized)
        # BUG FIX: the previous guard tested the *int* word index for membership
        # in a list of already-mapped *letter strings*, which was always True,
        # so every phoneme position received a letter and letters duplicated
        # (e.g. 'go' → 'goo'). Track assigned word indices explicitly instead.
        assigned_word_indices = set()

        for phoneme_idx in range(len(phoneme_normalized)):
            # Calculate which word letter this phoneme should map to
            target_word_idx = min(int(phoneme_idx / phonemes_per_letter), len(word_normalized) - 1)

            # Only assign the letter once (first phoneme that maps to it)
            if target_word_idx not in assigned_word_indices:
                mapping[phoneme_idx] = word_normalized[target_word_idx]
                assigned_word_indices.add(target_word_idx)
            else:
                mapping[phoneme_idx] = ''  # Empty string for extra phonemes

        log(f"   Proportional mapping (no duplication): {mapping}")
        return mapping

    # Simple case: if lengths are equal, try direct mapping
    if len(word_normalized) == len(phoneme_normalized):
        mapping = {i: word_normalized[i] for i in range(len(phoneme_normalized))}
        log(f"   Direct mapping (equal lengths): {mapping}")
        return mapping

    # Case: word is longer than phonemes (silent letters)
    if len(word_normalized) > len(phoneme_normalized):
        log(f"   Word longer than phonemes - likely has silent letters")

        # Use DP (edit-distance) alignment for this case
        word_len = len(word_normalized)
        phoneme_len = len(phoneme_normalized)

        # Initialize DP matrix: dp[i][j] = min cost aligning word[:i] to phoneme[:j]
        dp = [[float('inf')] * (phoneme_len + 1) for _ in range(word_len + 1)]
        dp[0][0] = 0

        # Fill DP matrix
        for i in range(word_len + 1):
            for j in range(phoneme_len + 1):
                if i < word_len and j < phoneme_len:
                    # Match/substitute
                    cost = 0 if word_normalized[i] == phoneme_normalized[j] else 1
                    dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j] + cost)

                if i < word_len:
                    # Delete word character (silent letter)
                    dp[i+1][j] = min(dp[i+1][j], dp[i][j] + 1)

                if j < phoneme_len:
                    # Insert phoneme
                    dp[i][j+1] = min(dp[i][j+1], dp[i][j] + 1)

        # Backtrack to find alignment; deleted (silent) word letters are
        # attached to a neighbouring phoneme position so no letter is lost
        mapping = {}
        i, j = word_len, phoneme_len

        while i > 0 or j > 0:
            if i > 0 and j > 0:
                cost = 0 if word_normalized[i-1] == phoneme_normalized[j-1] else 1
                if dp[i][j] == dp[i-1][j-1] + cost:
                    # Match/substitute
                    if j-1 not in mapping:
                        mapping[j-1] = word_normalized[i-1]
                    else:
                        mapping[j-1] = word_normalized[i-1] + mapping[j-1]
                    i, j = i-1, j-1
                    continue

            if i > 0 and dp[i][j] == dp[i-1][j] + 1:
                # Delete word character (silent letter) - add to adjacent phoneme if exists
                if j < phoneme_len and j in mapping:
                    mapping[j] = word_normalized[i-1] + mapping[j]
                elif j > 0 and j-1 in mapping:
                    mapping[j-1] = mapping[j-1] + word_normalized[i-1]
                i -= 1
            elif j > 0 and dp[i][j] == dp[i][j-1] + 1:
                # Insert phoneme
                j -= 1

        log(f"   DP alignment mapping: {mapping}")
        return mapping

    # Shouldn't reach here, but fallback to empty mapping
    log(f"   ⚠️ Unexpected case - returning empty mapping")
    return {}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
689 |
|
690 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
691 |
detected_normalized: str,
|
|
|
699 |
result = []
|
700 |
|
701 |
log(f"🔍 Character-level feedback for '{word}':")
|
702 |
+
log(f" Original word: '{word}' (length: {len(word)})")
|
703 |
log(f" Expected: '{expected_normalized}' (length: {len(expected_normalized)})")
|
704 |
log(f" Detected: '{detected_normalized}' (length: {len(detected_normalized)})")
|
705 |
log(f" Mapping: {phoneme_mapping}")
|
706 |
|
707 |
+
# CRITICAL: Count actual word letters in mapping to detect duplication issues
|
708 |
+
mapped_letters = ''.join(phoneme_mapping.values())
|
709 |
+
log(f" Mapped letters: '{mapped_letters}' (length: {len(mapped_letters)})")
|
710 |
+
if len(mapped_letters) > len(word):
|
711 |
+
log(f" ⚠️ WARNING: Mapped letters ({len(mapped_letters)}) longer than word ({len(word)})!")
|
712 |
+
log(f" This will cause incorrect display. Checking for duplicates...")
|
713 |
+
|
714 |
# Handle length mismatches by using the expected length and padding detected if needed
|
715 |
max_length = len(expected_normalized)
|
716 |
detected_padded = detected_normalized + ' ' * max(0, max_length - len(detected_normalized))
|
|
|
758 |
# MATCH: Use original word letters from mapping in natural case
|
759 |
if i in phoneme_mapping:
|
760 |
word_letters = phoneme_mapping[i] # Keep natural case
|
761 |
+
# CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
|
762 |
+
if word_letters: # Only add non-empty letters
|
763 |
+
test_result.append(word_letters)
|
764 |
+
tooltip_result.append(word_letters) # Same for tooltip
|
765 |
+
real_chars += len(word_letters)
|
766 |
+
matches += 1
|
767 |
else:
|
768 |
test_result.append(expected_char)
|
769 |
tooltip_result.append(expected_char)
|
|
|
773 |
# MISMATCH: Use original word letters with bold+underline formatting
|
774 |
if i in phoneme_mapping:
|
775 |
word_letters = phoneme_mapping[i] # Keep natural case
|
776 |
+
# CRITICAL: Skip empty mappings (used for extra phonemes in diphthongs)
|
777 |
+
if word_letters: # Only add non-empty letters
|
778 |
+
# Format with bold and underline for display
|
779 |
+
formatted_letters = f'<strong><u>{word_letters}</u></strong>'
|
780 |
+
test_result.append(formatted_letters)
|
781 |
+
|
782 |
+
# For tooltip, convert detected phoneme to English equivalent
|
783 |
+
detected_phoneme = detected_char
|
784 |
+
if detected_phoneme in PHONEME_TO_ENGLISH:
|
785 |
+
english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
|
786 |
+
tooltip_result.append(english_equiv)
|
787 |
+
else:
|
788 |
+
tooltip_result.append('?')
|
789 |
else:
|
790 |
# Fallback for cases without mapping
|
791 |
formatted_char = f'<strong><u>{expected_char}</u></strong>'
|
|
|
1203 |
log(f"📍 WhisperX timing: {start_time:.3f}s - {end_time:.3f}s (duration: {end_time - start_time:.3f}s)")
|
1204 |
log(f"🎯 Expected phoneme: '{expected_phoneme}'")
|
1205 |
|
1206 |
+
# DEBUGGING: Special attention to problematic words
|
1207 |
+
if word_clean.lower() in ['go', 'no', 'so', 'to', 'do'] or len(expected_phoneme) > len(word_clean):
|
1208 |
+
log(f"⚠️ SPECIAL CASE: Word '{word_clean}' has {len(word_clean)} letters but {len(expected_phoneme)} phonemes")
|
1209 |
+
log(f" This may be a diphthong case requiring special handling")
|
1210 |
+
|
1211 |
# For very short words, expand the WhisperX timing itself before processing
|
1212 |
original_duration = end_time - start_time
|
1213 |
if original_duration < 0.1:
|