Spaces:

greg0rs
/

fonetik-fast

Running

App Files Community

greg0rs committed on Jul 27

Commit

5aee394

verified ·

1 Parent(s): ced276b

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -8

app.py CHANGED Viewed

@@ -642,11 +642,12 @@ def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, s
 def create_character_level_feedback(word: str, expected_normalized: str,
                                   detected_normalized: str,
-                                  phoneme_mapping: Dict[int, str]) -> str:
     """
     Create character-level feedback using phoneme mapping.
     Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
-    Returns mixed string: real word letters where phonemes match, 😕 where they don't.
     """
     result = []
@@ -684,6 +685,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
     best_alignment = None
     best_score = -1
     best_result = []
     log(f"  Testing {len(alignments)} alignments:")
@@ -691,6 +693,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
         matches = 0
         real_chars = 0
         test_result = []
         for i in range(max_length):
             expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
@@ -701,15 +704,26 @@ def create_character_level_feedback(word: str, expected_normalized: str,
                 if i in phoneme_mapping:
                     word_letters = phoneme_mapping[i].upper()  # Use uppercase for matches
                     test_result.append(word_letters)
                     real_chars += len(word_letters)
                     matches += 1
                 else:
                     test_result.append(expected_char.upper())
                     real_chars += 1
                     matches += 1
             else:
-                # MISMATCH: Use emoji
                 test_result.append('😕')
         log(f"    {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
@@ -720,12 +734,17 @@ def create_character_level_feedback(word: str, expected_normalized: str,
             best_score = score
             best_alignment = alignment_name
             best_result = test_result
     log(f"  🏆 Best alignment: {best_alignment} (score: {best_score})")
     feedback = ''.join(best_result)
     log(f"  Final feedback: '{feedback}'")
-    return feedback
 def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
                                          detected_phoneme_full: str,
@@ -922,8 +941,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
         expected_normalized = normalize_phoneme_string(expected_phoneme)
         detected_normalized = normalize_phoneme_string(detected_phoneme)
-        # Create character-level feedback using CLEANED word
-        feedback = create_character_level_feedback(
             word_clean, expected_normalized, detected_normalized, phoneme_mapping
         )
@@ -937,13 +956,16 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
                 else:
                     break
             output_text = feedback + suffix
         else:
             output_text = feedback
         color = "red"
     else:
         # Above threshold - show original word (with punctuation)
         output_text = word_text
         # Color based on quality within acceptable range
         if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
             color = "green"  # Good
@@ -953,8 +975,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
     # Convert score to percentage for display
     score_percentage = int(similarity_score * 100)
-    # Create colored text with score data embedded
-    colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}">{output_text}</span>'
     return output_text, colored_text
 @app.post("/api/transcribe")
@@ -1091,6 +1113,7 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
             expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
             # Check for word boundary overlap and trim if needed
             boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
             if boundary_offset > 0:
                 log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")

 def create_character_level_feedback(word: str, expected_normalized: str,
                                   detected_normalized: str,
+                                  phoneme_mapping: Dict[int, str]) -> Tuple[str, str]:
     """
     Create character-level feedback using phoneme mapping.
     Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
+    Returns: (mixed string: real word letters where phonemes match, 😕 where they don't,
+              tooltip version with English equivalents for detected phonemes)
     """
     result = []
     best_alignment = None
     best_score = -1
     best_result = []
+    best_tooltip_result = []
     log(f"  Testing {len(alignments)} alignments:")
         matches = 0
         real_chars = 0
         test_result = []
+        tooltip_result = []
         for i in range(max_length):
             expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
                 if i in phoneme_mapping:
                     word_letters = phoneme_mapping[i].upper()  # Use uppercase for matches
                     test_result.append(word_letters)
+                    tooltip_result.append(word_letters)  # Same for tooltip
                     real_chars += len(word_letters)
                     matches += 1
                 else:
                     test_result.append(expected_char.upper())
+                    tooltip_result.append(expected_char.upper())
                     real_chars += 1
                     matches += 1
             else:
+                # MISMATCH: Use emoji for display, English equivalent for tooltip
                 test_result.append('😕')
+                # For tooltip, convert detected phoneme to English equivalent
+                detected_phoneme = detected_char
+                if detected_phoneme in PHONEME_TO_ENGLISH:
+                    english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
+                    tooltip_result.append(english_equiv)
+                else:
+                    # Fallback if phoneme not found in mapping
+                    tooltip_result.append('?')
         log(f"    {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
             best_score = score
             best_alignment = alignment_name
             best_result = test_result
+            best_tooltip_result = tooltip_result
     log(f"  🏆 Best alignment: {best_alignment} (score: {best_score})")
     feedback = ''.join(best_result)
+    tooltip_feedback = ''.join(best_tooltip_result)
     log(f"  Final feedback: '{feedback}'")
+    log(f"  Tooltip feedback: '{tooltip_feedback}'")
+    return feedback, tooltip_feedback
 def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
                                          detected_phoneme_full: str,
         expected_normalized = normalize_phoneme_string(expected_phoneme)
         detected_normalized = normalize_phoneme_string(detected_phoneme)
+        # Create character-level feedback using CLEANED word - now returns tuple
+        feedback, tooltip_feedback = create_character_level_feedback(
             word_clean, expected_normalized, detected_normalized, phoneme_mapping
         )
                 else:
                     break
             output_text = feedback + suffix
+            tooltip_text = tooltip_feedback + suffix
         else:
             output_text = feedback
+            tooltip_text = tooltip_feedback
         color = "red"
     else:
         # Above threshold - show original word (with punctuation)
         output_text = word_text
+        tooltip_text = word_text  # No change needed for good pronunciations
         # Color based on quality within acceptable range
         if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
             color = "green"  # Good
     # Convert score to percentage for display
     score_percentage = int(similarity_score * 100)
+    # Create colored text with score data embedded, including tooltip text
+    colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}" data-tooltip="{tooltip_text}">{output_text}</span>'
     return output_text, colored_text
 @app.post("/api/transcribe")
             expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
             # Check for word boundary overlap and trim if needed
+            log(f"🔍 Checking word boundary overlap for '{word_clean}'...")
             boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
             if boundary_offset > 0:
                 log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")