Update app.py
app.py
CHANGED
@@ -11,6 +11,7 @@ import string
 import re
 import urllib.request
 import gzip
+import tempfile
 
 # Set cache environment
 os.environ['HF_HOME'] = '/tmp/hf'
@@ -45,8 +46,6 @@ def normalize_phoneme_string(s: str) -> str:
     if not s:
         return s
 
-    original = s
-
     # Convert to lowercase and remove spaces, stress marks, and length markers
     normalized = s.lower().strip()
     normalized = normalized.replace(' ', '')  # Remove spaces between phonemes
@@ -83,101 +82,12 @@ def normalize_phoneme_string(s: str) -> str:
     for variant_char, standard_char in ipa_variants.items():
         normalized = normalized.replace(variant_char, standard_char)
 
-    # Debug specific phoneme strings for normalization
-    if any(word in original.lower() for word in ['hello', 'red']) or any(pattern in original for pattern in ['həloʊ', 'hɛloʊ', 'ɹɛd']):
-        log(f"🔍 Normalization debug: '{original}' → '{normalized}'")
-
     return normalized
 
-# Phoneme reverse lookup
-phoneme_to_words_cache = {}
-
-def build_phoneme_reverse_lookup():
-    """Build reverse lookup dictionary from CMUdict (ARPABET to words)"""
-    global phoneme_to_words_cache
-
-    if phoneme_to_words_cache:
-        return  # Already built
-
-    try:
-        # Download CMUdict if not exists
-        cmudict_path = "/tmp/cmudict.dict"
-        if not os.path.exists(cmudict_path):
-            urllib.request.urlretrieve("https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict", cmudict_path)
-
-        # ARPABET to IPA conversion mapping
-        arpabet_to_ipa = {
-            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',  # Fixed AH: ʌ→ə
-            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
-            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
-            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
-            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
-            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
-            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
-            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'  # Fixed: T→DX for flap
-        }
-
-        # Parse CMUdict and build reverse lookup
-        word_count = 0
-        with open(cmudict_path, 'r', encoding='latin-1') as f:
-            for line in f:
-                line = line.strip()
-                if not line or line.startswith(';;;'):
-                    continue
-
-                # Parse line: WORD P H O N E M E S
-                parts = line.split()
-                if len(parts) < 2:
-                    continue
-
-                word = parts[0].lower()
-                # Remove variant indicators like (2), (3)
-                if '(' in word:
-                    word = word.split('(')[0]
-
-                # Convert ARPABET to IPA
-                arpabet_phones = parts[1:]
-                ipa_phones = []
-                for phone in arpabet_phones:
-                    # Remove stress markers (0,1,2)
-                    clean_phone = ''.join(c for c in phone if not c.isdigit())
-                    if clean_phone in arpabet_to_ipa:
-                        ipa_phones.append(arpabet_to_ipa[clean_phone])
-
-                if ipa_phones:
-                    # Create phoneme string and normalize it
-                    ipa_string = ''.join(ipa_phones)
-                    normalized_ipa = normalize_phoneme_string(ipa_string)
-
-                    # Add to reverse lookup
-                    if normalized_ipa not in phoneme_to_words_cache:
-                        phoneme_to_words_cache[normalized_ipa] = []
-                    if word not in phoneme_to_words_cache[normalized_ipa]:
-                        phoneme_to_words_cache[normalized_ipa].append(word)
-
-                word_count += 1
-
-        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")
-
-    except Exception as e:
-        log(f"❌ Error building phoneme reverse lookup: {e}")
-        phoneme_to_words_cache = {}
-
-def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
-    """Look up possible words for a given phoneme string"""
-    if not phoneme_to_words_cache:
-        return []
-
-    normalized = normalize_phoneme_string(phoneme_string)
-    return phoneme_to_words_cache.get(normalized, [])
-
 # Load models once at startup
 phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 
-# Build phoneme reverse lookup dictionary
-build_phoneme_reverse_lookup()
-
 # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
 log("✅ Phoneme models loaded - using ASCII/IPA normalization")
 
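For reference, the removed reverse lookup converted each CMUdict entry from ARPABET to IPA with the arpabet_to_ipa table above and then keyed the word by its normalized IPA string. A minimal sketch of that conversion step, using a subset of the removed mapping and a standard CMUdict line (illustrative only, not part of the commit):

# Illustrative sketch: ARPABET -> IPA for one CMUdict entry, mirroring the removed code path.
arpabet_to_ipa = {'HH': 'h', 'AH': 'ə', 'L': 'l', 'OW': 'oʊ'}   # subset of the removed mapping
entry = "hello HH AH0 L OW1"                                     # standard CMUdict line format
word, *phones = entry.split()
ipa = ''.join(arpabet_to_ipa[''.join(c for c in p if not c.isdigit())] for p in phones)
print(word, ipa)  # hello həloʊ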
@@ -261,6 +171,67 @@ PHONEME_TO_ENGLISH = {
     'ˌ': '',  # secondary stress (remove)
 }
 
+# Phoneme example words - showing the sound in context
+PHONEME_EXAMPLES = {
+    # Vowels (monophthongs)
+    'ɪ': 'bit',      # IH sound
+    'ɛ': 'bed',      # EH sound
+    'æ': 'cat',      # AE sound
+    'ʌ': 'but',      # UH sound (stressed)
+    'ɑ': 'father',   # AH sound
+    'ɔ': 'law',      # AW sound
+    'ʊ': 'book',     # UU sound
+    'u': 'boot',     # OO sound
+    'i': 'beat',     # EE sound
+    'ə': 'about',    # schwa (unstressed)
+    'ɝ': 'bird',     # ER sound (stressed)
+    'ɚ': 'letter',   # ER sound (unstressed)
+
+    # Diphthongs
+    'eɪ': 'day',     # AY sound
+    'aɪ': 'my',      # EYE sound
+    'ɔɪ': 'boy',     # OY sound
+    'aʊ': 'now',     # OW sound
+    'oʊ': 'go',      # OH sound
+
+    # R-colored vowels
+    'ɪr': 'near',    # EER sound
+    'ɛr': 'care',    # AIR sound
+    'ɑr': 'car',     # AR sound
+    'ɔr': 'for',     # OR sound
+    'ʊr': 'tour',    # OOR sound
+    'ər': 'letter',  # ER sound
+
+    # Consonants
+    'p': 'pat',      # P sound
+    'b': 'bat',      # B sound
+    't': 'tap',      # T sound
+    'd': 'dap',      # D sound
+    'k': 'cat',      # K sound
+    'g': 'gap',      # G sound (ASCII)
+    'ɡ': 'gap',      # G sound (IPA)
+    'f': 'fat',      # F sound
+    'v': 'vat',      # V sound
+    'θ': 'think',    # TH sound (voiceless)
+    'ð': 'this',     # TH sound (voiced)
+    's': 'sap',      # S sound
+    'z': 'zap',      # Z sound
+    'ʃ': 'ship',     # SH sound
+    'ʒ': 'measure',  # ZH sound
+    'h': 'hat',      # H sound
+    'm': 'mat',      # M sound
+    'n': 'nat',      # N sound
+    'ŋ': 'sing',     # NG sound
+    'l': 'lap',      # L sound
+    'r': 'rap',      # R sound
+    'j': 'yes',      # Y sound
+    'w': 'wet',      # W sound
+
+    # Affricates
+    'tʃ': 'chip',    # CH sound
+    'dʒ': 'jump',    # J sound
+}
+
 def clean_word_for_phonemes(word: str) -> str:
     """
     Clean word by removing punctuation and extra spaces for phoneme processing.
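The new PHONEME_EXAMPLES table pairs each phoneme with an English word containing that sound, which is the kind of example word the tooltip code later in this diff interpolates. A small sketch of the lookup it supports; example_for is a hypothetical helper, not something added by the commit:

# Hypothetical helper around the PHONEME_EXAMPLES dict added above (abbreviated copy here).
PHONEME_EXAMPLES = {'ɪ': 'bit', 'tʃ': 'chip', 'ŋ': 'sing'}

def example_for(phoneme: str) -> str:
    word = PHONEME_EXAMPLES.get(phoneme, "")
    return f"'{phoneme}' as in '{word}'" if word else f"'{phoneme}'"

print(example_for('tʃ'))  # 'tʃ' as in 'chip'
print(example_for('x'))   # 'x' (no example available)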
@@ -334,6 +305,10 @@ def load_whisperx_models():
 
     if whisperx_model is None:
         log("Loading WhisperX models for English-only processing...")
+
+        # First, try to set environment variable to disable executable stack
+        os.environ['LD_BIND_NOW'] = '1'
+
         try:
             # Try loading with base.en first
             whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
@@ -345,42 +320,65 @@
 
         except ImportError as ie:
             log(f"Import error loading WhisperX models: {ie}")
+
+            # Try to use regular Whisper as fallback
             try:
-                log("
-                #
+                log("Attempting to use standard Whisper instead of WhisperX...")
+                import whisper
+
+                # Load standard whisper model
+                whisper_model = whisper.load_model("base.en", device="cpu")
+
+                # Create a wrapper to make it compatible with WhisperX interface
+                class WhisperWrapper:
+                    def __init__(self, model):
+                        self.model = model
+
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        result = self.model.transcribe(audio, language=language)
+                        # Convert to WhisperX format
+                        return {
+                            "segments": [{
+                                "text": result["text"],
+                                "start": 0.0,
+                                "end": len(audio) / 16000.0,  # Approximate based on sample rate
+                                "words": []  # Will need to handle word-level timing differently
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = WhisperWrapper(whisper_model)
+                log("Using standard Whisper as fallback (limited word-level timing)")
+
+                # For alignment, we'll need to handle this differently
+                whisperx_align_model = None
+                whisperx_metadata = None
+
+            except Exception as whisper_error:
+                log(f"Standard Whisper fallback failed: {whisper_error}")
+
+                # Last resort: Create a minimal mock that at least returns something
+                class MinimalWhisperMock:
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        # Return a minimal valid structure
+                        return {
+                            "segments": [{
+                                "text": "[Audio processing unavailable - WhisperX loading failed]",
+                                "start": 0.0,
+                                "end": 1.0,
+                                "words": []
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = MinimalWhisperMock()
+                whisperx_align_model = None
+                whisperx_metadata = None
+                log("WARNING: Using minimal mock - transcription will be limited")
+
         except Exception as e:
             log(f"Error loading WhisperX models: {e}")
-
-            try:
-                log("Trying fallback with tiny.en model...")
-                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
-                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with fallback (tiny.en model)")
-            except Exception as fallback_error:
-                log(f"Fallback also failed: {fallback_error}")
-                # Final attempt without compute_type specification
-                try:
-                    log("Final attempt with default settings...")
-                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
-                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                    log("WhisperX models loaded with default settings")
-                except Exception as final_error:
-                    log(f"All attempts failed: {final_error}")
-                    raise RuntimeError("Unable to load WhisperX models in this environment")
+            raise RuntimeError(f"Unable to load speech recognition models: {e}")
 
 def convert_webm_to_wav(bts):
     p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
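The fallback keeps the same result shape WhisperX produces (a dict with "segments" and "language"), so callers that only read segment-level text keep working; word-level alignment is unavailable because whisperx_align_model is set to None. A sketch of a consumer that tolerates both paths; this downstream code is assumed for illustration, not shown in the diff:

# Assumed downstream usage: works with a real WhisperX result or with the WhisperWrapper /
# MinimalWhisperMock output above, as long as only segment-level text is read.
result = {
    "segments": [{"text": "hello world", "start": 0.0, "end": 1.2, "words": []}],
    "language": "en",
}
transcript = " ".join(seg["text"].strip() for seg in result["segments"])
has_word_timing = any(seg["words"] for seg in result["segments"])
print(transcript, has_word_timing)  # hello world False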
@@ -822,16 +820,16 @@ def create_character_level_feedback(word: str, expected_normalized: str,
         detected_english = "?"
         detected_example = ""
 
-        # Create tooltip text with example words
+        # Create tooltip text with example words (two lines)
         if expected_example and detected_example:
-            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'
+            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}' as in '{detected_example}'"
         elif expected_example:
-            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'
+            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}'"
         else:
-            tooltip_text = f"Expected '{expected_english}'
+            tooltip_text = f"Expected '{expected_english}'<br>You said '{detected_english}'"
 
         # Create span with inline tooltip for each mispronounced letter/group
-        formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}"
+        formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}" data-tooltip-html="{tooltip_text}"><strong><u>{word_letters}</u></strong></span>'
         test_result.append(formatted_letters)
 
         # For the simplified tooltip feedback
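With concrete values substituted, the new f-string renders one underlined letter group that carries both data attributes and the two-line tooltip text. A standalone illustration with made-up values (the real values come from PHONEME_TO_ENGLISH and PHONEME_EXAMPLES):

# Illustration only: the HTML produced by the new formatted_letters f-string for one error.
expected_english, detected_english, word_letters = "th", "d", "th"
tooltip_text = f"Expected '{expected_english}' as in 'think'<br>You said '{detected_english}'"
formatted_letters = (
    f'<span class="phoneme-error" data-expected="{expected_english}" '
    f'data-detected="{detected_english}" data-tooltip-html="{tooltip_text}">'
    f'<strong><u>{word_letters}</u></strong></span>'
)
print(formatted_letters)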
@@ -1009,7 +1007,7 @@ def get_expected_phonemes(words: List[str]) -> List[str]:
     return empty_results
 
 async def generate_tts_audio(word: str) -> str:
-    """Generate TTS audio for a word"""
+    """Generate TTS audio for a word with silence padding"""
     if word in tts_cache:
         return tts_cache[word]
 
@@ -1021,17 +1019,81 @@ async def generate_tts_audio(word: str) -> str:
             audio_data += chunk["data"]
 
         if audio_data:
-
-
-
+            # Add silence padding to TTS audio as well
+            # First decode the MP3 to get raw audio
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
+                tmp_mp3.write(audio_data)
+                tmp_mp3_path = tmp_mp3.name
+
+            try:
+                # Load the TTS audio
+                tts_waveform, tts_sample_rate = torchaudio.load(tmp_mp3_path)
+
+                # Resample if needed to match our standard rate
+                if tts_sample_rate != 16000:
+                    tts_waveform = torchaudio.transforms.Resample(tts_sample_rate, 16000)(tts_waveform)
+                    tts_sample_rate = 16000
+
+                # Add 0.25s silence padding on each end
+                padding_samples = int(0.25 * tts_sample_rate)
+                silence_shape = list(tts_waveform.shape)
+                silence_shape[-1] = padding_samples
+                silence_padding = torch.zeros(silence_shape)
+
+                # Concatenate: silence + audio + silence
+                padded_waveform = torch.cat([silence_padding, tts_waveform, silence_padding], dim=-1)
+
+                # Convert back to base64
+                buffer = io.BytesIO()
+                torchaudio.save(buffer, padded_waveform, tts_sample_rate, format="wav")
+                buffer.seek(0)
+                audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
+
+                tts_cache[word] = audio_b64
+                log(f"🔇 TTS for '{word}': Added 0.25s silence padding on each end")
+                return audio_b64
+
+            finally:
+                # Clean up temp file
+                if os.path.exists(tmp_mp3_path):
+                    os.remove(tmp_mp3_path)
+
     except Exception as e:
         log(f"TTS failed for '{word}': {e}")
 
     return ""
 
-def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int) -> str:
-    """
+def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int, add_padding: bool = True) -> str:
+    """
+    Convert audio tensor to base64 string.
+
+    Args:
+        audio_segment: The audio tensor to convert
+        sample_rate: Sample rate of the audio
+        add_padding: If True, adds 0.25s of silence on each end to prevent audio processor lag
+
+    Returns:
+        Base64 encoded audio string
+    """
     try:
+        if add_padding:
+            # Add 0.25 seconds of silence on each end
+            padding_samples = int(0.25 * sample_rate)  # 0.25 seconds worth of samples
+
+            # Create silence padding (zeros with same shape as audio segment)
+            silence_shape = list(audio_segment.shape)
+            silence_shape[-1] = padding_samples
+            silence_padding = torch.zeros(silence_shape)
+
+            # Concatenate: silence + audio + silence
+            padded_segment = torch.cat([silence_padding, audio_segment, silence_padding], dim=-1)
+
+            log(f"🔇 Added silence padding: {padding_samples} samples (0.25s) on each end")
+            log(f"   Original: {audio_segment.shape[-1]} samples → Padded: {padded_segment.shape[-1]} samples")
+
+            audio_segment = padded_segment
+
         buffer = io.BytesIO()
         torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
         buffer.seek(0)
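The padding added in both places is 0.25 s of zeros on each side, so at the 16 kHz rate used throughout the file that is 4000 samples per side. A quick standalone check of the arithmetic, written as a sketch rather than a copy of the app code:

# Standalone sketch of the padding math used above: 0.25 s at 16 kHz = 4000 samples per side.
import torch

sample_rate = 16000
audio = torch.zeros(1, sample_rate)              # stand-in for a 1-second mono clip
pad = torch.zeros(1, int(0.25 * sample_rate))    # 4000 samples of silence
padded = torch.cat([pad, audio, pad], dim=-1)
print(pad.shape[-1], padded.shape)               # 4000 torch.Size([1, 24000])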
@@ -1420,7 +1482,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
            })
 
        # 7. Format output
-        full_transcript = " ".join(word_texts)
        resolved_output = []
        resolved_colored = []
 
@@ -1441,7 +1502,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
        log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
 
        return {
-            "transcript": full_transcript,
            "resolved": " ".join(resolved_output),
            "resolved_colored": " ".join(resolved_colored),
            "audio_data": audio_data_list,
@@ -1467,7 +1527,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
        import traceback
        log(f"Traceback: {traceback.format_exc()}")
        return {
-            "transcript": "Error occurred",
            "resolved": "Error occurred",
            "resolved_colored": "Error occurred",
            "audio_data": [],