Update app.py
app.py
CHANGED
@@ -133,24 +133,28 @@ def clean_word_for_phonemes(word: str) -> str:
     return cleaned
 
 def load_whisperx_models():
-    """Load WhisperX models lazily ..."""
+    """Load WhisperX models lazily with English-only configuration"""
    global whisperx_model, whisperx_align_model, whisperx_metadata
 
    if whisperx_model is None:
-        log("Loading WhisperX models ...")
+        log("Loading WhisperX models for English-only processing...")
        try:
-            # ...
-            whisperx_model = whisperx.load_model("base", device="cpu", compute_type="float32", language="en")
+            # Load WhisperX model with English-only configuration
+            whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
+            log("WhisperX base.en model loaded successfully")
+
+            # Load alignment model for English
            whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-            log("WhisperX ...")
+            log("WhisperX English alignment model loaded successfully")
+
        except Exception as e:
            log(f"Error loading WhisperX models: {e}")
-            # Fallback: try with smaller model
+            # Fallback: try with smaller English-only model
            try:
-                log("Trying fallback with tiny model ...")
-                whisperx_model = whisperx.load_model("tiny", device="cpu", compute_type="float32", language="en")
+                log("Trying fallback with tiny.en model...")
+                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with fallback (tiny model)")
+                log("WhisperX models loaded with fallback (tiny.en model)")
            except Exception as fallback_error:
                log(f"Fallback also failed: {fallback_error}")
                raise
@@ -177,6 +181,54 @@ def normalize_phoneme_string(s: str) -> str:
 
     return normalized
 
+# TEMPORARILY DISABLED: English letter sounds conversion
+# def phoneme_to_english_sounds(phoneme_string: str) -> str:
+#     """Convert IPA phonemes to English letter sounds"""
+#     if not phoneme_string:
+#         return phoneme_string
+#
+#     log(f"Converting phonemes to English sounds: '{phoneme_string}'")
+#
+#     # Clean the input
+#     phoneme_string = phoneme_string.strip()
+#
+#     # Split by spaces first (words/syllables)
+#     words = phoneme_string.split(' ')
+#     converted_words = []
+#
+#     for word in words:
+#         if not word:
+#             continue
+#
+#         english_sounds = []
+#         i = 0
+#
+#         while i < len(word):
+#             found = False
+#             # Try longest matches first (like 'tʃ', 'dʒ' before 't', 'd')
+#             for length in [3, 2, 1]:
+#                 if i + length <= len(word):
+#                     phoneme = word[i:i+length]
+#                     if phoneme in PHONEME_TO_ENGLISH:
+#                         english_sounds.append(PHONEME_TO_ENGLISH[phoneme])
+#                         i += length
+#                         found = True
+#                         break
+#
+#             if not found:
+#                 # Keep unknown characters as-is, but clean them up
+#                 char = word[i]
+#                 if char.isalpha():
+#                     english_sounds.append(char.upper())
+#                 i += 1
+#
+#         if english_sounds:
+#             converted_words.append('-'.join(english_sounds))
+#
+#     result = ' '.join(converted_words)
+#     log(f"Converted '{phoneme_string}' -> '{result}'")
+#     return result
+
 def calculate_similarity(detected: str, expected: str) -> float:
     """Calculate similarity between detected and expected phonemes"""
     detected_norm = normalize_phoneme_string(detected)
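Reviewer note: the hunk above shows only the signature and first line of calculate_similarity(); its body is unchanged by this commit. For orientation, a minimal sketch of a ratio-style phoneme similarity over the normalized strings (an assumption, not the app's confirmed metric; similarity_ratio is a hypothetical name), using only the Python stdlib:

    from difflib import SequenceMatcher

    def similarity_ratio(detected_norm: str, expected_norm: str) -> float:
        """Edit-distance-style ratio: 1.0 for identical strings, 0.0 for disjoint."""
        if not detected_norm or not expected_norm:
            return 0.0
        return SequenceMatcher(None, detected_norm, expected_norm).ratio()

    # e.g. similarity_ratio("foʊnɛtɪk", "fənɛtɪk") -> roughly 0.8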
@@ -203,25 +255,15 @@ def extract_audio_segment(waveform: torch.Tensor, sample_rate: int,
 
     return segment
 
-def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_rate: int,
-                                            word: str, expansion_level: str = "normal") -> str:
-    """
-    Detect phoneme from audio segment with fallback expansion capability.
-    expansion_level: "normal", "extended", or "maximum"
-    """
-    level_info = {
-        "normal": "standard",
-        "extended": "double-expanded",
-        "maximum": "triple-expanded"
-    }
-
-    log(f"Starting phoneme detection for '{word}' ({level_info[expansion_level]} audio)...")
+def detect_phoneme_from_audio(audio_segment: torch.Tensor, sample_rate: int, word: str) -> str:
+    """Detect phoneme from audio segment using phoneme model"""
+    log(f"Starting phoneme detection for '{word}'...")
 
    if audio_segment.shape[-1] == 0:
        log(f"Empty audio segment for '{word}'")
        return ""
 
-    log(f"...")
+    log(f"Original audio segment: {audio_segment.shape[-1]} samples")
 
     # Pad or truncate to standard length for model
     target_length = 16000  # 1 second
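Reviewer note: the "pad or truncate" step that target_length feeds into lies outside the changed lines. A minimal sketch of that pattern, assuming a (channels, samples) tensor as torchaudio produces; fit_to_length is a hypothetical name:

    import torch
    import torch.nn.functional as F

    def fit_to_length(segment: torch.Tensor, target_length: int = 16000) -> torch.Tensor:
        n = segment.shape[-1]
        if n < target_length:
            return F.pad(segment, (0, target_length - n))  # right-pad with silence
        return segment[..., :target_length]                # keep the first second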
@@ -263,112 +305,27 @@ def detect_phoneme_from_audio_with_fallback(audio_segment: torch.Tensor, sample_rate: int,
        log(f"Error in phoneme detection: {e}")
        return ""
 
-    log(f"Phoneme detection for '{word}' ...")
+    log(f"Phoneme detection for '{word}': '{detected_phoneme}'")
    return detected_phoneme
 
-def extract_audio_segment_with_expansion_level(waveform: torch.Tensor, sample_rate: int,
-                                               start_time: float, end_time: float, word: str,
-                                               expansion_level: str = "normal") -> torch.Tensor:
+def sliding_window_phoneme_match(detected_phoneme: str, expected_phoneme: str, word: str) -> Tuple[str, float, int, int]:
    """
-    ...
-    """
-
-    expansion_configs = {
-        "normal": 0.125,   # ±125ms (original)
-        "extended": 0.25,  # ±250ms (double)
-        "maximum": 0.5     # ±500ms (quadruple)
-    }
-
-    expansion_seconds = expansion_configs.get(expansion_level, 0.125)
-    audio_duration = waveform.shape[-1] / sample_rate
-
-    # Calculate expanded timing with boundary protection
-    expanded_start = max(0, start_time - expansion_seconds)
-    expanded_end = min(audio_duration, end_time + expansion_seconds)
-
-    log(f"Audio expansion ({expansion_level}): {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s (±{expansion_seconds}s)")
-
-    return extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word, verbose=False)
-
-def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_phoneme: str,
-                                               word: str, waveform: torch.Tensor, sample_rate: int,
-                                               start_time: float, end_time: float) -> Tuple[str, float, int, int, torch.Tensor]:
-    """
-    Enhanced sliding window match with automatic fallback expansion when detected is too short.
-    Returns: (best_match_substring, best_score, start_index, end_index, final_audio_segment)
+    Find the best matching substring in detected phoneme using sliding window.
+    For zero scores, intelligently selects which phoneme substring to return.
+    Returns: (best_match_substring, best_score, start_index, end_index)
    """
    detected_norm = normalize_phoneme_string(detected_phoneme)
    expected_norm = normalize_phoneme_string(expected_phoneme)
 
-    log(f"...")
+    log(f"Sliding window analysis for '{word}':")
    log(f"  Expected (norm): '{expected_norm}' (length: {len(expected_norm)})")
    log(f"  Detected (norm): '{detected_norm}' (length: {len(detected_norm)})")
 
-    # ...
-    final_audio_segment = None
-    final_detected_phoneme = detected_phoneme
-
-    # Check if detected is significantly shorter than expected
-    if len(detected_norm) < len(expected_norm):
-        shortage_ratio = len(detected_norm) / len(expected_norm) if len(expected_norm) > 0 else 0
-        log(f"  Detected shorter than expected! Ratio: {shortage_ratio:.2f}")
-
-        # Try progressive expansion if detected is too short
-        if shortage_ratio < 0.8:  # If detected is less than 80% of expected length
-            log(f"  Attempting fallback with extended audio expansion...")
-
-            # Try extended expansion (±0.25s instead of ±0.125s)
-            extended_audio = extract_audio_segment_with_expansion_level(
-                waveform, sample_rate, start_time, end_time, word, "extended"
-            )
-            extended_detected = detect_phoneme_from_audio_with_fallback(
-                extended_audio, sample_rate, word, "extended"
-            )
-            extended_detected_norm = normalize_phoneme_string(extended_detected)
-
-            log(f"  Extended detection: '{extended_detected_norm}' (length: {len(extended_detected_norm)})")
-
-            # If extended is better (longer and closer to expected), use it
-            if len(extended_detected_norm) > len(detected_norm):
-                final_detected_phoneme = extended_detected
-                detected_norm = extended_detected_norm
-                final_audio_segment = extended_audio
-                log(f"  Using extended detection (improved length)")
-
-                # If still too short, try maximum expansion
-                if len(detected_norm) < len(expected_norm) * 0.8:
-                    log(f"  Still short, trying maximum expansion...")
-
-                    maximum_audio = extract_audio_segment_with_expansion_level(
-                        waveform, sample_rate, start_time, end_time, word, "maximum"
-                    )
-                    maximum_detected = detect_phoneme_from_audio_with_fallback(
-                        maximum_audio, sample_rate, word, "maximum"
-                    )
-                    maximum_detected_norm = normalize_phoneme_string(maximum_detected)
-
-                    log(f"  Maximum detection: '{maximum_detected_norm}' (length: {len(maximum_detected_norm)})")
-
-                    if len(maximum_detected_norm) > len(detected_norm):
-                        final_detected_phoneme = maximum_detected
-                        detected_norm = maximum_detected_norm
-                        final_audio_segment = maximum_audio
-                        log(f"  Using maximum detection (best length)")
-            else:
-                log(f"  Extended detection didn't improve, keeping original")
-
-    # If no fallback was used, use the original audio segment
-    if final_audio_segment is None:
-        final_audio_segment = extract_audio_segment_with_expansion_level(
-            waveform, sample_rate, start_time, end_time, word, "normal"
-        )
-
-    # Now proceed with regular sliding window logic using the final detected phoneme
-    # If detected is still shorter than or equal to expected, just compare directly
+    # If detected is shorter than or equal to expected, just compare directly
    if len(detected_norm) <= len(expected_norm):
        score = calculate_similarity(detected_norm, expected_norm)
        log(f"  Direct comparison (detected ≤ expected): score = {score:.3f}")
-        return detected_norm, score, 0, len(detected_norm), final_audio_segment
+        return detected_norm, score, 0, len(detected_norm)
 
    # Sliding window: detected is longer than expected
    expected_len = len(expected_norm)
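Reviewer note: the sliding-window loop body between these two hunks is unchanged and not shown. A minimal sketch of the technique the function name describes — a fixed window of len(expected) slid across the longer detected string, scored with the app's calculate_similarity (best_window is a hypothetical name; the middle-biased tie-breaking seen in the next hunk is omitted):

    def best_window(detected_norm: str, expected_norm: str):
        # assumes len(detected_norm) > len(expected_norm), per the branch above
        expected_len = len(expected_norm)
        best_match, best_score, best_start = "", -1.0, 0
        for start in range(len(detected_norm) - expected_len + 1):
            candidate = detected_norm[start:start + expected_len]
            score = calculate_similarity(candidate, expected_norm)  # app.py's scorer
            if score > best_score:
                best_match, best_score, best_start = candidate, score, start
        return best_match, best_score, best_start, best_start + expected_len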
@@ -418,7 +375,7 @@ def sliding_window_phoneme_match_with_fallback(detected_phoneme: str, expected_phoneme: str,
    log(f"  Aiming for middle: position {best_start}-{best_end}")
 
    log(f"  Final selection: '{best_match}' at position {best_start}-{best_end} (score: {best_score:.3f})")
-    return best_match, best_score, best_start, best_end, final_audio_segment
+    return best_match, best_score, best_start, best_end
 
 def create_word_phoneme_mapping(word: str, expected_phoneme: str) -> Dict[int, str]:
    """
@@ -677,7 +634,7 @@ def format_output_word(word_text: str, similarity_score: float, detected_phoneme
 
 @app.post("/api/transcribe")
 async def transcribe(audio: UploadFile = File(...)):
-    log("=== STARTING WHISPERX PHONEME ANALYSIS ===")
+    log("=== STARTING WHISPERX ENGLISH-ONLY PHONEME ANALYSIS ===")
 
    # Fixed similarity threshold
    similarity = 0.3
@@ -700,12 +657,11 @@ async def transcribe(audio: UploadFile = File(...)):
 
    log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
 
-    # 2. Get transcription with WhisperX - ...
-    log("Forcing WhisperX to use English language detection...")
+    # 2. Get transcription with WhisperX - EXPLICITLY SET TO ENGLISH
    result = whisperx_model.transcribe(audio_data, batch_size=16, language="en")
 
-    # 3. Get precise word alignments with WhisperX
-    aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu" ...
+    # 3. Get precise word alignments with WhisperX
+    aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu")
 
    # Extract word-level data from WhisperX results
    words = []
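Reviewer note: the word-extraction loop that follows is unchanged and not shown. A sketch of pulling word-level timings out of WhisperX's aligned output; in current WhisperX versions each aligned segment carries a "words" list with "word"/"start"/"end" keys, and words that fail to align can lack timestamps, hence the guard. This is an assumption about the surrounding code, not part of the commit:

    words = []
    for segment in aligned_result["segments"]:
        for w in segment.get("words", []):
            if "start" in w and "end" in w:
                words.append({"word": w["word"], "start": w["start"], "end": w["end"]})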
@@ -752,7 +708,7 @@ async def transcribe(audio: UploadFile = File(...)):
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
        sample_rate = 16000
 
-    # 6. Process each word using ...
+    # 6. Process each word using expanded timing with sliding window matching
    results = []
    audio_data_list = []
 
@@ -761,7 +717,7 @@ async def transcribe(audio: UploadFile = File(...)):
    tts_tasks = [generate_tts_audio(word_clean) for word_clean in word_texts_clean]
    tts_results = await asyncio.gather(*tts_tasks)
 
-    log("\n=== PROCESSING WORDS WITH ... ===")
+    log("\n=== PROCESSING WORDS WITH EXPANDED TIMING + SLIDING WINDOW ===")
 
    for i, (word_info, word_original, word_clean, (start_time, end_time)) in enumerate(zip(words, word_texts, word_texts_clean, word_timings)):
        expected_phoneme = expected_phonemes[i] if i < len(expected_phonemes) else ""
@@ -781,29 +737,41 @@ async def transcribe(audio: UploadFile = File(...)):
        else:
            log(f"No gap (continuous)")
 
-        # ...
-        ...
-        ...
-        )
+        # Calculate expanded timing (±0.125s with boundary protection)
+        expansion_seconds = 0.125
+        audio_duration = waveform.shape[-1] / sample_rate
 
-        ...
-        ...
-        ...
+        expanded_start = max(0, start_time - expansion_seconds)
+        expanded_end = min(audio_duration, end_time + expansion_seconds)
+
+        log(f"Timing expansion: {start_time:.3f}s-{end_time:.3f}s → {expanded_start:.3f}s-{expanded_end:.3f}s")
+
+        # Extract expanded audio segment
+        expanded_audio_segment = extract_audio_segment(waveform, sample_rate, expanded_start, expanded_end, word_clean, verbose=True)
+
+        # Detect phoneme from expanded audio segment
+        detected_phoneme_raw = detect_phoneme_from_audio(expanded_audio_segment, sample_rate, word_clean)
+
+        # Get expected phoneme and normalize both
+        detected_phoneme_norm = normalize_phoneme_string(detected_phoneme_raw)
+        expected_phoneme_norm = normalize_phoneme_string(expected_phoneme)
 
-        log(f"...")
+        log(f"Raw detected phoneme (expanded): '{detected_phoneme_raw}'")
+        log(f"Normalized detected: '{detected_phoneme_norm}'")
+        log(f"Normalized expected: '{expected_phoneme_norm}'")
 
-        # ...
-        best_match_phoneme, similarity_score, match_start, match_end, final_audio_segment = sliding_window_phoneme_match_with_fallback(
-            detected_phoneme_raw, expected_phoneme, word_clean, waveform, sample_rate, start_time, end_time
+        # Find best matching substring using sliding window
+        best_match_phoneme, similarity_score, match_start, match_end = sliding_window_phoneme_match(
+            detected_phoneme_raw, expected_phoneme, word_clean
        )
 
-        # Trim ...
-        final_audio_segment = trim_audio_segment_by_phoneme_position(
-            ...
+        # Trim audio segment based on best phoneme match position
+        final_audio_segment = trim_audio_segment_by_phoneme_position(
+            expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean
        )
 
        log(f"Final similarity score: {similarity_score:.3f}")
-        log(f"Final audio segment samples: {...}")
+        log(f"Final audio segment samples: {final_audio_segment.shape[-1]} (duration: {final_audio_segment.shape[-1]/sample_rate:.3f}s)")
 
        # Store results using the best match phoneme (use ORIGINAL word for display)
        results.append({
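Reviewer note: trim_audio_segment_by_phoneme_position() is defined outside this diff. A minimal sketch of the idea its name and arguments suggest — mapping the matched character span back onto the audio proportionally by position (an assumption about its implementation; trim_by_phoneme_span is a hypothetical name):

    import torch

    def trim_by_phoneme_span(segment: torch.Tensor, detected: str,
                             match_start: int, match_end: int) -> torch.Tensor:
        # proportional char-position -> sample-position mapping
        if not detected:
            return segment
        n = segment.shape[-1]
        a = int(n * match_start / len(detected))
        b = int(n * match_end / len(detected))
        return segment[..., a:max(b, a + 1)]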
@@ -817,7 +785,7 @@ async def transcribe(audio: UploadFile = File(...)):
        })
 
        # Prepare audio data using trimmed segment (use ORIGINAL word for display)
-        user_audio_b64 = audio_to_base64(...)
+        user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate)
        expected_audio_b64 = tts_results[i]
 
        audio_data_list.append({
@@ -851,7 +819,7 @@ async def transcribe(audio: UploadFile = File(...)):
    # Clean up temporary file
    os.remove(temp_audio_path)
 
-    log("=== WHISPERX PHONEME ANALYSIS COMPLETE ===")
+    log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
 
    return {
        "transcript": full_transcript,
@@ -861,7 +829,7 @@ async def transcribe(audio: UploadFile = File(...)):
        "debug_info": {
            "total_words": len(words),
            "similarity_threshold": similarity,
-            "alignment_method": "WhisperX + Sliding Window",
+            "alignment_method": "WhisperX English-only + Sliding Window",
            "results_summary": [
                {
                    "word": r['word_text'],
@@ -889,7 +857,7 @@ async def transcribe(audio: UploadFile = File(...)):
 
 @app.get("/")
 def root():
-    return "Clean Fonetik with WhisperX + Character-Level Feedback running"
+    return "Clean Fonetik with WhisperX English-only + Character-Level Feedback running"
 
 @app.post("/api/clear-cache")
 def clear_cache():