Update app.py
app.py
CHANGED
@@ -133,24 +133,28 @@ def clean_word_for_phonemes(word: str) -> str:
     return cleaned
 
 def load_whisperx_models():
-    """Load WhisperX models lazily ..."""
+    """Load WhisperX models lazily with English-only configuration"""
    global whisperx_model, whisperx_align_model, whisperx_metadata
 
    if whisperx_model is None:
-        log("Loading WhisperX models ...")
+        log("Loading WhisperX models for English-only processing...")
        try:
-            # ...
-            whisperx_model = whisperx.load_model("base", device="cpu", compute_type="float32", language="en")
+            # Load WhisperX model with English-only configuration
+            whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
+            log("WhisperX base.en model loaded successfully")
+
+            # Load alignment model for English
            whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-            log("WhisperX ...")
+            log("WhisperX English alignment model loaded successfully")
+
        except Exception as e:
            log(f"Error loading WhisperX models: {e}")
-            # Fallback: try with smaller model
+            # Fallback: try with smaller English-only model
            try:
-                log("Trying fallback with tiny model ...")
-                whisperx_model = whisperx.load_model("tiny", device="cpu", compute_type="float32", language="en")
+                log("Trying fallback with tiny.en model...")
+                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with fallback (tiny model)")
+                log("WhisperX models loaded with fallback (tiny.en model)")
            except Exception as fallback_error:
                log(f"Fallback also failed: {fallback_error}")
                raise
@@ -177,6 +181,54 @@ def normalize_phoneme_string(s: str) -> str:
 
     return normalized
 
+# TEMPORARILY DISABLED: English letter sounds conversion
+# def phoneme_to_english_sounds(phoneme_string: str) -> str:
+#     """Convert IPA phonemes to English letter sounds"""
+#     if not phoneme_string:
+#         return phoneme_string
+#
+#     log(f"Converting phonemes to English sounds: '{phoneme_string}'")
+#
+#     # Clean the input
+#     phoneme_string = phoneme_string.strip()
+#
+#     # Split by spaces first (words/syllables)
+#     words = phoneme_string.split(' ')
+#     converted_words = []
+#
+#     for word in words:
+#         if not word:
+#             continue
+#
+#         english_sounds = []
+#         i = 0
+#
+#         while i < len(word):
+#             found = False
+#             # Try longest matches first (like 'tʃ', 'dʒ' before 't', 'd')
+#             for length in [3, 2, 1]:
+#                 if i + length <= len(word):
+#                     phoneme = word[i:i+length]
+#                     if phoneme in PHONEME_TO_ENGLISH:
+#                         english_sounds.append(PHONEME_TO_ENGLISH[phoneme])
+#                         i += length
+#                         found = True
+#                         break
+#
+#             if not found:
+#                 # Keep unknown characters as-is, but clean them up
+#                 char = word[i]
+#                 if char.isalpha():
+#                     english_sounds.append(char.upper())
+#                 i += 1
+#
+#         if english_sounds:
+#             converted_words.append('-'.join(english_sounds))
+#
+#     result = ' '.join(converted_words)
+#     log(f"Converted '{phoneme_string}' -> '{result}'")
+#     return result
+
 def calculate_similarity(detected: str, expected: str) -> float:
     """Calculate similarity between detected and expected phonemes"""
     detected_norm = normalize_phoneme_string(detected)
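Reviewer note: the hunk above shows only the signature and first line of calculate_similarity(); its body is unchanged by this commit. For orientation, a minimal sketch of a ratio-style phoneme similarity over the normalized strings (an assumption, not the app's confirmed metric; similarity_ratio is a hypothetical name), using only the Python stdlib:

    from difflib import SequenceMatcher

    def similarity_ratio(detected_norm: str, expected_norm: str) -> float:
        """Edit-distance-style ratio: 1.0 for identical strings, 0.0 for disjoint."""
        if not detected_norm or not expected_norm:
            return 0.0
        return SequenceMatcher(None, detected_norm, expected_norm).ratio()

    # e.g. similarity_ratio("foʊnɛtɪk", "fənɛtɪk") -> roughly 0.8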
@@ -203,25 +255,15 @@ def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
 
     return segment
 
-def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_rate: int,
-                                            word: str, expansion_level: str = "normal") -> str:
-    """
-    Detect phoneme from audio segment with fallback expansion capability.
-    expansion_level: "normal", "extended", or "maximum"
-    """
-    level_info = {
-        "normal": "standard",
-        "extended": "double-expanded",
-        "maximum": "triple-expanded"
-    }
-
-    log(f"Starting phoneme detection for '{word}' ({level_info[expansion_level]} audio)...")
+def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, word: str) -> str:
+    """Detect phoneme from audio segment using phoneme model"""
+    log(f"Starting phoneme detection for '{word}'...")
 
    if audio_segment.shape[-1] == 0:
        log(f"Empty audio segment for '{word}'")
        return ""
 
-    log(f"...")
+    log(f"Original audio segment: {audio_segment.shape[-1]} samples")
 
     # Pad or truncate to standard length for model
     target_length = 16000  # 1 second
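Reviewer note: the "pad or truncate" step that target_length feeds into lies outside the changed lines. A minimal sketch of that pattern, assuming a (channels, samples) tensor as torchaudio produces; fit_to_length is a hypothetical name:

    import torch
    import torch.nn.functional as F

    def fit_to_length(segment: torch.Tensor, target_length: int = 16000) -> torch.Tensor:
        n = segment.shape[-1]
        if n < target_length:
            return F.pad(segment, (0, target_length - n))  # right-pad with silence
        return segment[..., :target_length]                # keep the first second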
@@ -263,112 +305,27 @@ def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_rate: int,
        log(f"Error in phoneme detection: {e}")
        return ""
 
-    log(f"Phoneme detection for '{word}' ...")
+    log(f"Phoneme detection for '{word}': '{detected_phoneme}'")
    return detected_phoneme
 
-def extract_audio_segment_with_expansion_level(waveform: torch.Tensor, sample_rate: int,
-                                               start_time: float, end_time: float, word: str,
-                                               expansion_level: str = "normal") -> torch.Tensor:
+def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
    """
-    ...
-    """
-
-    expansion_configs = {
-        "normal": 0.125,   # ±125ms (original)
-        "extended": 0.25,  # ±250ms (double)
-        "maximum": 0.5     # ±500ms (quadruple)
-    }
-
-    expansion_seconds = expansion_configs.get(expansion_level, 0.125)
-    audio_duration = waveform.shape[-1] / sample_rate
-
-    # Calculate expanded timing with boundary protection
-    expanded_start = max(0, start_time - expansion_seconds)
-    expanded_end = min(audio_duration, end_time + expansion_seconds)
-
-    log(f"Audio expansion ({expansion_level}): {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s (±{expansion_seconds}s)")
-
-    return extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word, verbose=False)
-
-def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_phoneme: str,
-                                               word: str, waveform: torch.Tensor, sample_rate: int,
-                                               start_time: float, end_time: float) -> Tuple[str, float, int, int, torch.Tensor]:
-    """
-    Enhanced sliding window match with automatic fallback expansion when detected is too short.
-    Returns: (best_match_substring, best_score, start_index, end_index, final_audio_segment)
+    Find the best matching substring in detected phoneme using sliding window.
+    For zero scores, intelligently selects which phoneme substring to return.
+    Returns: (best_match_substring, best_score, start_index, end_index)
    """
    detected_norm = normalize_phoneme_string(detected_phoneme)
    expected_norm = normalize_phoneme_string(expected_phoneme)
 
-    log(f"...")
+    log(f"Sliding window analysis for '{word}':")
    log(f"  Expected (norm): '{expected_norm}' (length: {len(expected_norm)})")
    log(f"  Detected (norm): '{detected_norm}' (length: {len(detected_norm)})")
 
-    # ...
-    final_audio_segment = None
-    final_detected_phoneme = detected_phoneme
-
-    # Check if detected is significantly shorter than expected
-    if len(detected_norm) < len(expected_norm):
-        shortage_ratio = len(detected_norm) / len(expected_norm) if len(expected_norm) > 0 else 0
-        log(f"  Detected shorter than expected! Ratio: {shortage_ratio:.2f}")
-
-        # Try progressive expansion if detected is too short
-        if shortage_ratio < 0.8:  # If detected is less than 80% of expected length
-            log(f"  Attempting fallback with extended audio expansion...")
-
-            # Try extended expansion (±0.25s instead of ±0.125s)
-            extended_audio = extract_audio_segment_with_expansion_level(
-                waveform, sample_rate, start_time, end_time, word, "extended"
-            )
-            extended_detected = detect_phoneme_from_audio_with_fallback(
-                extended_audio, sample_rate, word, "extended"
-            )
-            extended_detected_norm = normalize_phoneme_string(extended_detected)
-
-            log(f"  Extended detection: '{extended_detected_norm}' (length: {len(extended_detected_norm)})")
-
-            # If extended is better (longer and closer to expected), use it
-            if len(extended_detected_norm) > len(detected_norm):
-                final_detected_phoneme = extended_detected
-                detected_norm = extended_detected_norm
-                final_audio_segment = extended_audio
-                log(f"  Using extended detection (improved length)")
-
-                # If still too short, try maximum expansion
-                if len(detected_norm) < len(expected_norm) * 0.8:
-                    log(f"  Still short, trying maximum expansion...")
-
-                    maximum_audio = extract_audio_segment_with_expansion_level(
-                        waveform, sample_rate, start_time, end_time, word, "maximum"
-                    )
-                    maximum_detected = detect_phoneme_from_audio_with_fallback(
-                        maximum_audio, sample_rate, word, "maximum"
-                    )
-                    maximum_detected_norm = normalize_phoneme_string(maximum_detected)
-
-                    log(f"  Maximum detection: '{maximum_detected_norm}' (length: {len(maximum_detected_norm)})")
-
-                    if len(maximum_detected_norm) > len(detected_norm):
-                        final_detected_phoneme = maximum_detected
-                        detected_norm = maximum_detected_norm
-                        final_audio_segment = maximum_audio
-                        log(f"  Using maximum detection (best length)")
-            else:
-                log(f"  Extended detection didn't improve, keeping original")
-
-    # If no fallback was used, use the original audio segment
-    if final_audio_segment is None:
-        final_audio_segment = extract_audio_segment_with_expansion_level(
-            waveform, sample_rate, start_time, end_time, word, "normal"
-        )
-
-    # Now proceed with regular sliding window logic using the final detected phoneme
-    # If detected is still shorter than or equal to expected, just compare directly
+    # If detected is shorter than or equal to expected, just compare directly
    if len(detected_norm) <= len(expected_norm):
        score = calculate_similarity(detected_norm, expected_norm)
        log(f"  Direct comparison (detected ≤ expected): score = {score:.3f}")
-        return detected_norm, score, 0, len(detected_norm), final_audio_segment
+        return detected_norm, score, 0, len(detected_norm)
 
    # Sliding window: detected is longer than expected
    expected_len = len(expected_norm)
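Reviewer note: the sliding-window loop body between these two hunks is unchanged and not shown. A minimal sketch of the technique the function name describes — a fixed window of len(expected) slid across the longer detected string, scored with the app's calculate_similarity (best_window is a hypothetical name; the middle-biased tie-breaking seen in the next hunk is omitted):

    def best_window(detected_norm: str, expected_norm: str):
        # assumes len(detected_norm) > len(expected_norm), per the branch above
        expected_len = len(expected_norm)
        best_match, best_score, best_start = "", -1.0, 0
        for start in range(len(detected_norm) - expected_len + 1):
            candidate = detected_norm[start:start + expected_len]
            score = calculate_similarity(candidate, expected_norm)  # app.py's scorer
            if score > best_score:
                best_match, best_score, best_start = candidate, score, start
        return best_match, best_score, best_start, best_start + expected_len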
@@ -418,7 +375,7 @@ def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_phoneme: str,
    log(f"  Aiming for middle: position {best_start}-{best_end}")
 
    log(f"  Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
-    return best_match, best_score, best_start, best_end, final_audio_segment
+    return best_match, best_score, best_start, best_end
 
 def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
    """
@@ -677,7 +634,7 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
 
 @app.post("/api/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
-    log("=== STARTING WHISPERX PHONEME ANALYSIS ===")
+    log("=== STARTING WHISPERX ENGLISH-ONLY PHONEME ANALYSIS ===")
 
    # Fixed similarity threshold
    similarity = 0.3
@@ -700,12 +657,11 @@ async def transcribe(audio: UploadFile = File(...)):
 
    log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
 
-    # 2. Get transcription with WhisperX - ...
-    log("Forcing WhisperX to use English language detection...")
+    # 2. Get transcription with WhisperX - EXPLICITLY SET TO ENGLISH
    result = whisperx_model.transcribe(audio_data, batch_size=16, language="en")
 
-    # 3. Get precise word alignments with WhisperX
-    aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu" ...
+    # 3. Get precise word alignments with WhisperX
+    aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu")
 
    # Extract word-level data from WhisperX results
    words = []
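Reviewer note: the word-extraction loop that follows is unchanged and not shown. A sketch of pulling word-level timings out of WhisperX's aligned output; in current WhisperX versions each aligned segment carries a "words" list with "word"/"start"/"end" keys, and words that fail to align can lack timestamps, hence the guard. This is an assumption about the surrounding code, not part of the commit:

    words = []
    for segment in aligned_result["segments"]:
        for w in segment.get("words", []):
            if "start" in w and "end" in w:
                words.append({"word": w["word"], "start": w["start"], "end": w["end"]})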
@@ -752,7 +708,7 @@ async def transcribe(audio: UploadFile = File(...)):
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
        sample_rate = 16000
 
-    # 6. Process each word using ...
+    # 6. Process each word using expanded timing with sliding window matching
    results = []
    audio_data_list = []
 
@@ -761,7 +717,7 @@ async def transcribe(audio: UploadFile = File(...)):
    tts_tasks = [generate_tts_audio(word_clean) for word_clean in word_texts_clean]
    tts_results = await asyncio.gather(*tts_tasks)
 
-    log("\n=== PROCESSING WORDS WITH ... ===")
+    log("\n=== PROCESSING WORDS WITH EXPANDED TIMING + SLIDING WINDOW ===")
 
    for i, (word_info, word_original, word_clean, (start_time, end_time)) in enumerate(zip(words, word_texts, word_texts_clean, word_timings)):
        expected_phoneme = expected_phonemes[i] if i < len(expected_phonemes) else ""
@@ -781,29 +737,41 @@ async def transcribe(audio: UploadFile = File(...)):
        else:
            log(f"No gap (continuous)")
 
-        # ...
-        ...
-        ...
-        )
+        # Calculate expanded timing (±0.125s with boundary protection)
+        expansion_seconds = 0.125
+        audio_duration = waveform.shape[-1] / sample_rate
 
-        ...
-        ...
-        ...
+        expanded_start = max(0, start_time - expansion_seconds)
+        expanded_end = min(audio_duration, end_time + expansion_seconds)
+
+        log(f"Timing expansion: {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s")
+
+        # Extract expanded audio segment
+        expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
+
+        # Detect phoneme from expanded audio segment
+        detected_phoneme_raw = detect_phoneme_from_audio(expanded_audio_segment, sample_rate, word_clean)
+
+        # Get expected phoneme and normalize both
+        detected_phoneme_norm = normalize_phoneme_string(detected_phoneme_raw)
+        expected_phoneme_norm = normalize_phoneme_string(expected_phoneme)
 
-        log(f"...")
+        log(f"Raw detected phoneme (expanded): '{detected_phoneme_raw}'")
+        log(f"Normalized detected: '{detected_phoneme_norm}'")
+        log(f"Normalized expected: '{expected_phoneme_norm}'")
 
-        # ...
-        best_match_phoneme, similarity_score, match_start, match_end, final_audio_segment = sliding_window_phoneme_match_with_fallback(
-            detected_phoneme_raw, expected_phoneme, word_clean, waveform, sample_rate, start_time, end_time
+        # Find best matching substring using sliding window
+        best_match_phoneme, similarity_score, match_start, match_end = sliding_window_phoneme_match(
+            detected_phoneme_raw, expected_phoneme, word_clean
        )
 
-        # Trim ...
-        final_audio_segment = trim_audio_segment_by_phoneme_position(
-            ...
+        # Trim audio segment based on best phoneme match position
+        final_audio_segment = trim_audio_segment_by_phoneme_position(
+            expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean
        )
 
        log(f"Final similarity score: {similarity_score:.3f}")
-        log(f"Final audio segment samples: {...}")
+        log(f"Final audio segment samples: {final_audio_segment.shape[-1]} (duration: {final_audio_segment.shape[-1]/sample_rate:.3f}s)")
 
        # Store results using the best match phoneme (use ORIGINAL word for display)
        results.append({
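Reviewer note: trim_audio_segment_by_phoneme_position() is defined outside this diff. A minimal sketch of the idea its name and arguments suggest — mapping the matched character span back onto the audio proportionally by position (an assumption about its implementation; trim_by_phoneme_span is a hypothetical name):

    import torch

    def trim_by_phoneme_span(segment: torch.Tensor, detected: str,
                             match_start: int, match_end: int) -> torch.Tensor:
        # proportional char-position -> sample-position mapping
        if not detected:
            return segment
        n = segment.shape[-1]
        a = int(n * match_start / len(detected))
        b = int(n * match_end / len(detected))
        return segment[..., a:max(b, a + 1)]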
@@ -817,7 +785,7 @@ async def transcribe(audio: UploadFile = File(...)):
        })
 
        # Prepare audio data using trimmed segment (use ORIGINAL word for display)
-        user_audio_b64 = audio_to_base64(...)
+        user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate)
        expected_audio_b64 = tts_results[i]
 
        audio_data_list.append({
@@ -851,7 +819,7 @@ async def transcribe(audio: UploadFile = File(...)):
    # Clean up temporary file
    os.remove(temp_audio_path)
 
-    log("=== WHISPERX PHONEME ANALYSIS COMPLETE ===")
+    log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
 
    return {
        "transcript": full_transcript,
@@ -861,7 +829,7 @@ async def transcribe(audio: UploadFile = File(...)):
        "debug_info": {
            "total_words": len(words),
            "similarity_threshold": similarity,
-            "alignment_method": "WhisperX + Sliding Window",
+            "alignment_method": "WhisperX English-only + Sliding Window",
            "results_summary": [
                {
                    "word": r['word_text'],
@@ -889,7 +857,7 @@ async def transcribe(audio: UploadFile = File(...)):
 
 @app.get("/")
 def root():
-    return "Clean Fonetik with WhisperX + Character-Level Feedback running"
+    return "Clean Fonetik with WhisperX English-only + Character-Level Feedback running"
 
 @app.post("/api/clear-cache")
 def clear_cache():