greg0rs committed on
Commit
5aee394
·
verified ·
1 Parent(s): ced276b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -8
app.py CHANGED
@@ -642,11 +642,12 @@ def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, s
642
 
643
  def create_character_level_feedback(word: str, expected_normalized: str,
644
  detected_normalized: str,
645
- phoneme_mapping: Dict[int, str]) -> str:
646
  """
647
  Create character-level feedback using phoneme mapping.
648
  Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
649
- Returns mixed string: real word letters where phonemes match, 😕 where they don't.
 
650
  """
651
  result = []
652
 
@@ -684,6 +685,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
684
  best_alignment = None
685
  best_score = -1
686
  best_result = []
 
687
 
688
  log(f" Testing {len(alignments)} alignments:")
689
 
@@ -691,6 +693,7 @@ def create_character_level_feedback(word: str, expected_normalized: str,
691
  matches = 0
692
  real_chars = 0
693
  test_result = []
 
694
 
695
  for i in range(max_length):
696
  expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
@@ -701,15 +704,26 @@ def create_character_level_feedback(word: str, expected_normalized: str,
701
  if i in phoneme_mapping:
702
  word_letters = phoneme_mapping[i].upper() # Use uppercase for matches
703
  test_result.append(word_letters)
 
704
  real_chars += len(word_letters)
705
  matches += 1
706
  else:
707
  test_result.append(expected_char.upper())
 
708
  real_chars += 1
709
  matches += 1
710
  else:
711
- # MISMATCH: Use emoji
712
  test_result.append('😕')
 
 
 
 
 
 
 
 
 
713
 
714
  log(f" {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
715
 
@@ -720,12 +734,17 @@ def create_character_level_feedback(word: str, expected_normalized: str,
720
  best_score = score
721
  best_alignment = alignment_name
722
  best_result = test_result
 
723
 
724
  log(f" πŸ† Best alignment: {best_alignment} (score: {best_score})")
725
 
726
  feedback = ''.join(best_result)
 
 
727
  log(f" Final feedback: '{feedback}'")
728
- return feedback
 
 
729
 
730
  def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
731
  detected_phoneme_full: str,
@@ -922,8 +941,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
922
  expected_normalized = normalize_phoneme_string(expected_phoneme)
923
  detected_normalized = normalize_phoneme_string(detected_phoneme)
924
 
925
- # Create character-level feedback using CLEANED word
926
- feedback = create_character_level_feedback(
927
  word_clean, expected_normalized, detected_normalized, phoneme_mapping
928
  )
929
 
@@ -937,13 +956,16 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
937
  else:
938
  break
939
  output_text = feedback + suffix
 
940
  else:
941
  output_text = feedback
 
942
 
943
  color = "red"
944
  else:
945
  # Above threshold - show original word (with punctuation)
946
  output_text = word_text
 
947
  # Color based on quality within acceptable range
948
  if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
949
  color = "green" # Good
@@ -953,8 +975,8 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
953
  # Convert score to percentage for display
954
  score_percentage = int(similarity_score * 100)
955
 
956
- # Create colored text with score data embedded
957
- colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}">{output_text}</span>'
958
  return output_text, colored_text
959
 
960
  @app.post("/api/transcribe")
@@ -1091,6 +1113,7 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
1091
  expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
1092
 
1093
  # Check for word boundary overlap and trim if needed
 
1094
  boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
1095
  if boundary_offset > 0:
1096
  log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")
 
642
 
643
  def create_character_level_feedback(word: str, expected_normalized: str,
644
  detected_normalized: str,
645
+ phoneme_mapping: Dict[int, str]) -> Tuple[str, str]:
646
  """
647
  Create character-level feedback using phoneme mapping.
648
  Tries three alignments (normal, left-shifted, right-shifted) and picks the best one.
649
+ Returns: (mixed string: real word letters where phonemes match, 😕 where they don't,
650
+ tooltip version with English equivalents for detected phonemes)
651
  """
652
  result = []
653
 
 
685
  best_alignment = None
686
  best_score = -1
687
  best_result = []
688
+ best_tooltip_result = []
689
 
690
  log(f" Testing {len(alignments)} alignments:")
691
 
 
693
  matches = 0
694
  real_chars = 0
695
  test_result = []
696
+ tooltip_result = []
697
 
698
  for i in range(max_length):
699
  expected_char = expected_normalized[i] if i < len(expected_normalized) else ' '
 
704
  if i in phoneme_mapping:
705
  word_letters = phoneme_mapping[i].upper() # Use uppercase for matches
706
  test_result.append(word_letters)
707
+ tooltip_result.append(word_letters) # Same for tooltip
708
  real_chars += len(word_letters)
709
  matches += 1
710
  else:
711
  test_result.append(expected_char.upper())
712
+ tooltip_result.append(expected_char.upper())
713
  real_chars += 1
714
  matches += 1
715
  else:
716
+ # MISMATCH: Use emoji for display, English equivalent for tooltip
717
  test_result.append('😕')
718
+
719
+ # For tooltip, convert detected phoneme to English equivalent
720
+ detected_phoneme = detected_char
721
+ if detected_phoneme in PHONEME_TO_ENGLISH:
722
+ english_equiv = PHONEME_TO_ENGLISH[detected_phoneme].lower()
723
+ tooltip_result.append(english_equiv)
724
+ else:
725
+ # Fallback if phoneme not found in mapping
726
+ tooltip_result.append('?')
727
 
728
  log(f" {alignment_name}: {matches} matches, {real_chars} real chars, result: '{''.join(test_result)}'")
729
 
 
734
  best_score = score
735
  best_alignment = alignment_name
736
  best_result = test_result
737
+ best_tooltip_result = tooltip_result
738
 
739
  log(f" πŸ† Best alignment: {best_alignment} (score: {best_score})")
740
 
741
  feedback = ''.join(best_result)
742
+ tooltip_feedback = ''.join(best_tooltip_result)
743
+
744
  log(f" Final feedback: '{feedback}'")
745
+ log(f" Tooltip feedback: '{tooltip_feedback}'")
746
+
747
+ return feedback, tooltip_feedback
748
 
749
  def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
750
  detected_phoneme_full: str,
 
941
  expected_normalized = normalize_phoneme_string(expected_phoneme)
942
  detected_normalized = normalize_phoneme_string(detected_phoneme)
943
 
944
+ # Create character-level feedback using CLEANED word - now returns tuple
945
+ feedback, tooltip_feedback = create_character_level_feedback(
946
  word_clean, expected_normalized, detected_normalized, phoneme_mapping
947
  )
948
 
 
956
  else:
957
  break
958
  output_text = feedback + suffix
959
+ tooltip_text = tooltip_feedback + suffix
960
  else:
961
  output_text = feedback
962
+ tooltip_text = tooltip_feedback
963
 
964
  color = "red"
965
  else:
966
  # Above threshold - show original word (with punctuation)
967
  output_text = word_text
968
+ tooltip_text = word_text # No change needed for good pronunciations
969
  # Color based on quality within acceptable range
970
  if similarity_score >= similarity_threshold + (1.0 - similarity_threshold) * 0.3:
971
  color = "green" # Good
 
975
  # Convert score to percentage for display
976
  score_percentage = int(similarity_score * 100)
977
 
978
+ # Create colored text with score data embedded, including tooltip text
979
+ colored_text = f'<span style="color:{color}" data-score="{score_percentage}" data-word="{word_text}" data-tooltip="{tooltip_text}">{output_text}</span>'
980
  return output_text, colored_text
981
 
982
  @app.post("/api/transcribe")
 
1113
  expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
1114
 
1115
  # Check for word boundary overlap and trim if needed
1116
+ log(f"πŸ” Checking word boundary overlap for '{word_clean}'...")
1117
  boundary_offset = detect_word_boundary_overlap(expanded_audio_segment, sample_rate, word_clean)
1118
  if boundary_offset > 0:
1119
  log(f"🔧 Detected word overlap, trimming {boundary_offset:.3f}s from start")