Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -642,11 +642,12 @@ def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, s
|
|
642 |
|
643 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
644 |
detected_normalized: str,
|
645 |
-
phoneme_mapping: Dict[int, str]) -> str:
|
646 |
"""
|
647 |
Create character-level feedback using phoneme mapping.
|
648 |
Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
|
649 |
-
Returns mixed string: real word letters where phonemes match, π where they don't
|
|
|
650 |
"""
|
651 |
result = []
|
652 |
|
@@ -684,6 +685,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
684 |
best_alignment = None
|
685 |
best_score = -1
|
686 |
best_result = []
|
|
|
687 |
|
688 |
log(f" Testing {len(alignments)} alignments:")
|
689 |
|
@@ -691,6 +693,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
691 |
matches = 0
|
692 |
real_chars = 0
|
693 |
test_result = []
|
|
|
694 |
|
695 |
for i in range(max_length):
|
696 |
expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
|
@@ -701,15 +704,26 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
701 |
if i in phoneme_mapping:
|
702 |
word_letters = phoneme_mapping[i].upper() # Use uppercase for matches
|
703 |
test_result.append(word_letters)
|
|
|
704 |
real_chars += len(word_letters)
|
705 |
matches += 1
|
706 |
else:
|
707 |
test_result.append(expected_char.upper())
|
|
|
708 |
real_chars += 1
|
709 |
matches += 1
|
710 |
else:
|
711 |
-
# MISMATCH: Use emoji
|
712 |
test_result.append('π')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
713 |
|
714 |
log(f" {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
|
715 |
|
@@ -720,12 +734,17 @@ def create_character_level_feedback(word: str, expected_normalized: str,
|
|
720 |
best_score = score
|
721 |
best_alignment = alignment_name
|
722 |
best_result = test_result
|
|
|
723 |
|
724 |
log(f" π Best alignment: {best_alignment} (score: {best_score})")
|
725 |
|
726 |
feedback = ''.join(best_result)
|
|
|
|
|
727 |
log(f" Final feedback: '{feedback}'")
|
728 |
-
|
|
|
|
|
729 |
|
730 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
731 |
detected_phoneme_full: str,
|
@@ -922,8 +941,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
|
|
922 |
expected_normalized = normalize_phoneme_string(expected_phoneme)
|
923 |
detected_normalized = normalize_phoneme_string(detected_phoneme)
|
924 |
|
925 |
-
# Create character-level feedback using CLEANED word
|
926 |
-
feedback = create_character_level_feedback(
|
927 |
word_clean, expected_normalized, detected_normalized, phoneme_mapping
|
928 |
)
|
929 |
|
@@ -937,13 +956,16 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
|
|
937 |
else:
|
938 |
break
|
939 |
output_text = feedback + suffix
|
|
|
940 |
else:
|
941 |
output_text = feedback
|
|
|
942 |
|
943 |
color = "red"
|
944 |
else:
|
945 |
# Above threshold - show original word (with punctuation)
|
946 |
output_text = word_text
|
|
|
947 |
# Color based on quality within acceptable range
|
948 |
if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
|
949 |
color = "green" # Good
|
@@ -953,8 +975,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
|
|
953 |
# Convert score to percentage for display
|
954 |
score_percentage = int(similarity_score * 100)
|
955 |
|
956 |
-
# Create colored text with score data embedded
|
957 |
-
colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}">{output_text}</span>'
|
958 |
return output_text, colored_text
|
959 |
|
960 |
@app.post("/api/transcribe")
|
@@ -1091,6 +1113,7 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
|
|
1091 |
expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
|
1092 |
|
1093 |
# Check for word boundary overlap and trim if needed
|
|
|
1094 |
boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
|
1095 |
if boundary_offset > 0:
|
1096 |
log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
|
|
|
642 |
|
643 |
def create_character_level_feedback(word: str, expected_normalized: str,
|
644 |
detected_normalized: str,
|
645 |
+
phoneme_mapping: Dict[int, str]) -> Tuple[str, str]:
|
646 |
"""
|
647 |
Create character-level feedback using phoneme mapping.
|
648 |
Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
|
649 |
+
Returns: (mixed string: real word letters where phonemes match, π where they don't,
|
650 |
+
tooltip version with English equivalents for detected phonemes)
|
651 |
"""
|
652 |
result = []
|
653 |
|
|
|
685 |
best_alignment = None
|
686 |
best_score = -1
|
687 |
best_result = []
|
688 |
+
best_tooltip_result = []
|
689 |
|
690 |
log(f" Testing {len(alignments)} alignments:")
|
691 |
|
|
|
693 |
matches = 0
|
694 |
real_chars = 0
|
695 |
test_result = []
|
696 |
+
tooltip_result = []
|
697 |
|
698 |
for i in range(max_length):
|
699 |
expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
|
|
|
704 |
if i in phoneme_mapping:
|
705 |
word_letters = phoneme_mapping[i].upper() # Use uppercase for matches
|
706 |
test_result.append(word_letters)
|
707 |
+
tooltip_result.append(word_letters) # Same for tooltip
|
708 |
real_chars += len(word_letters)
|
709 |
matches += 1
|
710 |
else:
|
711 |
test_result.append(expected_char.upper())
|
712 |
+
tooltip_result.append(expected_char.upper())
|
713 |
real_chars += 1
|
714 |
matches += 1
|
715 |
else:
|
716 |
+
# MISMATCH: Use emoji for display, English equivalent for tooltip
|
717 |
test_result.append('π')
|
718 |
+
|
719 |
+
# For tooltip, convert detected phoneme to English equivalent
|
720 |
+
detected_phoneme = detected_char
|
721 |
+
if detected_phoneme in PHONEME_TO_ENGLISH:
|
722 |
+
english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
|
723 |
+
tooltip_result.append(english_equiv)
|
724 |
+
else:
|
725 |
+
# Fallback if phoneme not found in mapping
|
726 |
+
tooltip_result.append('?')
|
727 |
|
728 |
log(f" {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
|
729 |
|
|
|
734 |
best_score = score
|
735 |
best_alignment = alignment_name
|
736 |
best_result = test_result
|
737 |
+
best_tooltip_result = tooltip_result
|
738 |
|
739 |
log(f" π Best alignment: {best_alignment} (score: {best_score})")
|
740 |
|
741 |
feedback = ''.join(best_result)
|
742 |
+
tooltip_feedback = ''.join(best_tooltip_result)
|
743 |
+
|
744 |
log(f" Final feedback: '{feedback}'")
|
745 |
+
log(f" Tooltip feedback: '{tooltip_feedback}'")
|
746 |
+
|
747 |
+
return feedback, tooltip_feedback
|
748 |
|
749 |
def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
750 |
detected_phoneme_full: str,
|
|
|
941 |
expected_normalized = normalize_phoneme_string(expected_phoneme)
|
942 |
detected_normalized = normalize_phoneme_string(detected_phoneme)
|
943 |
|
944 |
+
# Create character-level feedback using CLEANED word - now returns tuple
|
945 |
+
feedback, tooltip_feedback = create_character_level_feedback(
|
946 |
word_clean, expected_normalized, detected_normalized, phoneme_mapping
|
947 |
)
|
948 |
|
|
|
956 |
else:
|
957 |
break
|
958 |
output_text = feedback + suffix
|
959 |
+
tooltip_text = tooltip_feedback + suffix
|
960 |
else:
|
961 |
output_text = feedback
|
962 |
+
tooltip_text = tooltip_feedback
|
963 |
|
964 |
color = "red"
|
965 |
else:
|
966 |
# Above threshold - show original word (with punctuation)
|
967 |
output_text = word_text
|
968 |
+
tooltip_text = word_text # No change needed for good pronunciations
|
969 |
# Color based on quality within acceptable range
|
970 |
if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
|
971 |
color = "green" # Good
|
|
|
975 |
# Convert score to percentage for display
|
976 |
score_percentage = int(similarity_score * 100)
|
977 |
|
978 |
+
# Create colored text with score data embedded, including tooltip text
|
979 |
+
colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}" data-tooltip="{tooltip_text}">{output_text}</span>'
|
980 |
return output_text, colored_text
|
981 |
|
982 |
@app.post("/api/transcribe")
|
|
|
1113 |
expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
|
1114 |
|
1115 |
# Check for word boundary overlap and trim if needed
|
1116 |
+
log(f"π Checking word boundary overlap for '{word_clean}'...")
|
1117 |
boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
|
1118 |
if boundary_offset > 0:
|
1119 |
log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
|