Update app.py
app.py
CHANGED
@@ -11,6 +11,7 @@ import string
 import re
 import urllib.request
 import gzip
+import tempfile
 
 # Set cache environment
 os.environ['HF_HOME'] = '/tmp/hf'
@@ -45,8 +46,6 @@ def normalize_phoneme_string(s: str) -> str:
     if not s:
         return s
 
-    original = s
-
     # Convert to lowercase and remove spaces, stress marks, and length markers
     normalized = s.lower().strip()
     normalized = normalized.replace(' ', '')  # Remove spaces between phonemes
@@ -83,101 +82,12 @@ def normalize_phoneme_string(s: str) -> str:
     for variant_char, standard_char in ipa_variants.items():
         normalized = normalized.replace(variant_char, standard_char)
 
-    # Debug specific phoneme strings for normalization
-    if any(word in original.lower() for word in ['hello', 'red']) or any(pattern in original for pattern in ['həloʊ', 'hɛloʊ', 'ɹɛd']):
-        log(f"🔍 Normalization debug: '{original}' → '{normalized}'")
-
     return normalized
 
-# Phoneme reverse lookup
-phoneme_to_words_cache = {}
-
-def build_phoneme_reverse_lookup():
-    """Build reverse lookup dictionary from CMUdict (ARPABET to words)"""
-    global phoneme_to_words_cache
-
-    if phoneme_to_words_cache:
-        return  # Already built
-
-    try:
-        # Download CMUdict if not exists
-        cmudict_path = "/tmp/cmudict.dict"
-        if not os.path.exists(cmudict_path):
-            urllib.request.urlretrieve("https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict", cmudict_path)
-
-        # ARPABET to IPA conversion mapping
-        arpabet_to_ipa = {
-            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',  # Fixed AH: ʌ→ə
-            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
-            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
-            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
-            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
-            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
-            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
-            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'  # Fixed: T→DX for flap
-        }
-
-        # Parse CMUdict and build reverse lookup
-        word_count = 0
-        with open(cmudict_path, 'r', encoding='latin-1') as f:
-            for line in f:
-                line = line.strip()
-                if not line or line.startswith(';;;'):
-                    continue
-
-                # Parse line: WORD P H O N E M E S
-                parts = line.split()
-                if len(parts) < 2:
-                    continue
-
-                word = parts[0].lower()
-                # Remove variant indicators like (2), (3)
-                if '(' in word:
-                    word = word.split('(')[0]
-
-                # Convert ARPABET to IPA
-                arpabet_phones = parts[1:]
-                ipa_phones = []
-                for phone in arpabet_phones:
-                    # Remove stress markers (0,1,2)
-                    clean_phone = ''.join(c for c in phone if not c.isdigit())
-                    if clean_phone in arpabet_to_ipa:
-                        ipa_phones.append(arpabet_to_ipa[clean_phone])
-
-                if ipa_phones:
-                    # Create phoneme string and normalize it
-                    ipa_string = ''.join(ipa_phones)
-                    normalized_ipa = normalize_phoneme_string(ipa_string)
-
-                    # Add to reverse lookup
-                    if normalized_ipa not in phoneme_to_words_cache:
-                        phoneme_to_words_cache[normalized_ipa] = []
-                    if word not in phoneme_to_words_cache[normalized_ipa]:
-                        phoneme_to_words_cache[normalized_ipa].append(word)
-
-                word_count += 1
-
-        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")
-
-    except Exception as e:
-        log(f"❌ Error building phoneme reverse lookup: {e}")
-        phoneme_to_words_cache = {}
-
-def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
-    """Look up possible words for a given phoneme string"""
-    if not phoneme_to_words_cache:
-        return []
-
-    normalized = normalize_phoneme_string(phoneme_string)
-    return phoneme_to_words_cache.get(normalized, [])
-
 # Load models once at startup
 phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 
-# Build phoneme reverse lookup dictionary
-build_phoneme_reverse_lookup()
-
 # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
 log("✅ Phoneme models loaded - using ASCII/IPA normalization")
 
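For reference, the removed reverse lookup converted each CMUdict entry from ARPABET to IPA with the arpabet_to_ipa table above and then keyed the word by its normalized IPA string. A minimal sketch of that conversion step, using a subset of the removed mapping and a standard CMUdict line (illustrative only, not part of the commit):

# Illustrative sketch: ARPABET -> IPA for one CMUdict entry, mirroring the removed code path.
arpabet_to_ipa = {'HH': 'h', 'AH': 'ə', 'L': 'l', 'OW': 'oʊ'}   # subset of the removed mapping
entry = "hello HH AH0 L OW1"                                     # standard CMUdict line format
word, *phones = entry.split()
ipa = ''.join(arpabet_to_ipa[''.join(c for c in p if not c.isdigit())] for p in phones)
print(word, ipa)  # hello həloʊ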
@@ -261,6 +171,67 @@ PHONEME_TO_ENGLISH = {
     'ˌ': '',  # secondary stress (remove)
 }
 
+# Phoneme example words - showing the sound in context
+PHONEME_EXAMPLES = {
+    # Vowels (monophthongs)
+    'ɪ': 'bit',      # IH sound
+    'ɛ': 'bed',      # EH sound
+    'æ': 'cat',      # AE sound
+    'ʌ': 'but',      # UH sound (stressed)
+    'ɑ': 'father',   # AH sound
+    'ɔ': 'law',      # AW sound
+    'ʊ': 'book',     # UU sound
+    'u': 'boot',     # OO sound
+    'i': 'beat',     # EE sound
+    'ə': 'about',    # schwa (unstressed)
+    'ɝ': 'bird',     # ER sound (stressed)
+    'ɚ': 'letter',   # ER sound (unstressed)
+
+    # Diphthongs
+    'eɪ': 'day',     # AY sound
+    'aɪ': 'my',      # EYE sound
+    'ɔɪ': 'boy',     # OY sound
+    'aʊ': 'now',     # OW sound
+    'oʊ': 'go',      # OH sound
+
+    # R-colored vowels
+    'ɪr': 'near',    # EER sound
+    'ɛr': 'care',    # AIR sound
+    'ɑr': 'car',     # AR sound
+    'ɔr': 'for',     # OR sound
+    'ʊr': 'tour',    # OOR sound
+    'ər': 'letter',  # ER sound
+
+    # Consonants
+    'p': 'pat',      # P sound
+    'b': 'bat',      # B sound
+    't': 'tap',      # T sound
+    'd': 'dap',      # D sound
+    'k': 'cat',      # K sound
+    'g': 'gap',      # G sound (ASCII)
+    'ɡ': 'gap',      # G sound (IPA)
+    'f': 'fat',      # F sound
+    'v': 'vat',      # V sound
+    'θ': 'think',    # TH sound (voiceless)
+    'ð': 'this',     # TH sound (voiced)
+    's': 'sap',      # S sound
+    'z': 'zap',      # Z sound
+    'ʃ': 'ship',     # SH sound
+    'ʒ': 'measure',  # ZH sound
+    'h': 'hat',      # H sound
+    'm': 'mat',      # M sound
+    'n': 'nat',      # N sound
+    'ŋ': 'sing',     # NG sound
+    'l': 'lap',      # L sound
+    'r': 'rap',      # R sound
+    'j': 'yes',      # Y sound
+    'w': 'wet',      # W sound
+
+    # Affricates
+    'tʃ': 'chip',    # CH sound
+    'dʒ': 'jump',    # J sound
+}
+
 def clean_word_for_phonemes(word: str) -> str:
     """
     Clean word by removing punctuation and extra spaces for phoneme processing.
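The new PHONEME_EXAMPLES table pairs each phoneme with an English word containing that sound, which is the kind of example word the tooltip code later in this diff interpolates. A small sketch of the lookup it supports; example_for is a hypothetical helper, not something added by the commit:

# Hypothetical helper around the PHONEME_EXAMPLES dict added above (abbreviated copy here).
PHONEME_EXAMPLES = {'ɪ': 'bit', 'tʃ': 'chip', 'ŋ': 'sing'}

def example_for(phoneme: str) -> str:
    word = PHONEME_EXAMPLES.get(phoneme, "")
    return f"'{phoneme}' as in '{word}'" if word else f"'{phoneme}'"

print(example_for('tʃ'))  # 'tʃ' as in 'chip'
print(example_for('x'))   # 'x' (no example available)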
@@ -334,6 +305,10 @@ def load_whisperx_models():
 
     if whisperx_model is None:
         log("Loading WhisperX models for English-only processing...")
+
+        # First, try to set environment variable to disable executable stack
+        os.environ['LD_BIND_NOW'] = '1'
+
         try:
             # Try loading with base.en first
             whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
@@ -345,42 +320,65 @@
 
         except ImportError as ie:
             log(f"Import error loading WhisperX models: {ie}")
+
+            # Try to use regular Whisper as fallback
             try:
-                log("
-                #
+                log("Attempting to use standard Whisper instead of WhisperX...")
+                import whisper
+
+                # Load standard whisper model
+                whisper_model = whisper.load_model("base.en", device="cpu")
+
+                # Create a wrapper to make it compatible with WhisperX interface
+                class WhisperWrapper:
+                    def __init__(self, model):
+                        self.model = model
+
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        result = self.model.transcribe(audio, language=language)
+                        # Convert to WhisperX format
+                        return {
+                            "segments": [{
+                                "text": result["text"],
+                                "start": 0.0,
+                                "end": len(audio) / 16000.0,  # Approximate based on sample rate
+                                "words": []  # Will need to handle word-level timing differently
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = WhisperWrapper(whisper_model)
+                log("Using standard Whisper as fallback (limited word-level timing)")
+
+                # For alignment, we'll need to handle this differently
+                whisperx_align_model = None
+                whisperx_metadata = None
+
+            except Exception as whisper_error:
+                log(f"Standard Whisper fallback failed: {whisper_error}")
+
+                # Last resort: Create a minimal mock that at least returns something
+                class MinimalWhisperMock:
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        # Return a minimal valid structure
+                        return {
+                            "segments": [{
+                                "text": "[Audio processing unavailable - WhisperX loading failed]",
+                                "start": 0.0,
+                                "end": 1.0,
+                                "words": []
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = MinimalWhisperMock()
+                whisperx_align_model = None
+                whisperx_metadata = None
+                log("WARNING: Using minimal mock - transcription will be limited")
+
         except Exception as e:
             log(f"Error loading WhisperX models: {e}")
-
-            try:
-                log("Trying fallback with tiny.en model...")
-                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
-                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with fallback (tiny.en model)")
-            except Exception as fallback_error:
-                log(f"Fallback also failed: {fallback_error}")
-                # Final attempt without compute_type specification
-                try:
-                    log("Final attempt with default settings...")
-                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
-                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                    log("WhisperX models loaded with default settings")
-                except Exception as final_error:
-                    log(f"All attempts failed: {final_error}")
-                    raise RuntimeError("Unable to load WhisperX models in this environment")
+            raise RuntimeError(f"Unable to load speech recognition models: {e}")
 
 def convert_webm_to_wav(bts):
     p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
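The fallback keeps the same result shape WhisperX produces (a dict with "segments" and "language"), so callers that only read segment-level text keep working; word-level alignment is unavailable because whisperx_align_model is set to None. A sketch of a consumer that tolerates both paths; this downstream code is assumed for illustration, not shown in the diff:

# Assumed downstream usage: works with a real WhisperX result or with the WhisperWrapper /
# MinimalWhisperMock output above, as long as only segment-level text is read.
result = {
    "segments": [{"text": "hello world", "start": 0.0, "end": 1.2, "words": []}],
    "language": "en",
}
transcript = " ".join(seg["text"].strip() for seg in result["segments"])
has_word_timing = any(seg["words"] for seg in result["segments"])
print(transcript, has_word_timing)  # hello world False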
@@ -822,16 +820,16 @@ def create_character_level_feedback(word: str, expected_normalized: str,
         detected_english = "?"
         detected_example = ""
 
-        # Create tooltip text with example words
+        # Create tooltip text with example words (two lines)
         if expected_example and detected_example:
-            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'
+            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}' as in '{detected_example}'"
         elif expected_example:
-            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'
+            tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}'"
         else:
-            tooltip_text = f"Expected '{expected_english}'
+            tooltip_text = f"Expected '{expected_english}'<br>You said '{detected_english}'"
 
         # Create span with inline tooltip for each mispronounced letter/group
-        formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}"
+        formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}" data-tooltip-html="{tooltip_text}"><strong><u>{word_letters}</u></strong></span>'
         test_result.append(formatted_letters)
 
         # For the simplified tooltip feedback
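With concrete values substituted, the new f-string renders one underlined letter group that carries both data attributes and the two-line tooltip text. A standalone illustration with made-up values (the real values come from PHONEME_TO_ENGLISH and PHONEME_EXAMPLES):

# Illustration only: the HTML produced by the new formatted_letters f-string for one error.
expected_english, detected_english, word_letters = "th", "d", "th"
tooltip_text = f"Expected '{expected_english}' as in 'think'<br>You said '{detected_english}'"
formatted_letters = (
    f'<span class="phoneme-error" data-expected="{expected_english}" '
    f'data-detected="{detected_english}" data-tooltip-html="{tooltip_text}">'
    f'<strong><u>{word_letters}</u></strong></span>'
)
print(formatted_letters)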
@@ -1009,7 +1007,7 @@ def get_expected_phonemes(words: List[str]) -> List[str]:
     return empty_results
 
 async def generate_tts_audio(word: str) -> str:
-    """Generate TTS audio for a word"""
+    """Generate TTS audio for a word with silence padding"""
     if word in tts_cache:
         return tts_cache[word]
 
@@ -1021,17 +1019,81 @@ async def generate_tts_audio(word: str) -> str:
             audio_data += chunk["data"]
 
         if audio_data:
-
-
-
+            # Add silence padding to TTS audio as well
+            # First decode the MP3 to get raw audio
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
+                tmp_mp3.write(audio_data)
+                tmp_mp3_path = tmp_mp3.name
+
+            try:
+                # Load the TTS audio
+                tts_waveform, tts_sample_rate = torchaudio.load(tmp_mp3_path)
+
+                # Resample if needed to match our standard rate
+                if tts_sample_rate != 16000:
+                    tts_waveform = torchaudio.transforms.Resample(tts_sample_rate, 16000)(tts_waveform)
+                    tts_sample_rate = 16000
+
+                # Add 0.25s silence padding on each end
+                padding_samples = int(0.25 * tts_sample_rate)
+                silence_shape = list(tts_waveform.shape)
+                silence_shape[-1] = padding_samples
+                silence_padding = torch.zeros(silence_shape)
+
+                # Concatenate: silence + audio + silence
+                padded_waveform = torch.cat([silence_padding, tts_waveform, silence_padding], dim=-1)
+
+                # Convert back to base64
+                buffer = io.BytesIO()
+                torchaudio.save(buffer, padded_waveform, tts_sample_rate, format="wav")
+                buffer.seek(0)
+                audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
+
+                tts_cache[word] = audio_b64
+                log(f"🔇 TTS for '{word}': Added 0.25s silence padding on each end")
+                return audio_b64
+
+            finally:
+                # Clean up temp file
+                if os.path.exists(tmp_mp3_path):
+                    os.remove(tmp_mp3_path)
+
     except Exception as e:
         log(f"TTS failed for '{word}': {e}")
 
     return ""
 
-def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int) -> str:
-    """
+def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int, add_padding: bool = True) -> str:
+    """
+    Convert audio tensor to base64 string.
+
+    Args:
+        audio_segment: The audio tensor to convert
+        sample_rate: Sample rate of the audio
+        add_padding: If True, adds 0.25s of silence on each end to prevent audio processor lag
+
+    Returns:
+        Base64 encoded audio string
+    """
     try:
+        if add_padding:
+            # Add 0.25 seconds of silence on each end
+            padding_samples = int(0.25 * sample_rate)  # 0.25 seconds worth of samples
+
+            # Create silence padding (zeros with same shape as audio segment)
+            silence_shape = list(audio_segment.shape)
+            silence_shape[-1] = padding_samples
+            silence_padding = torch.zeros(silence_shape)
+
+            # Concatenate: silence + audio + silence
+            padded_segment = torch.cat([silence_padding, audio_segment, silence_padding], dim=-1)
+
+            log(f"🔇 Added silence padding: {padding_samples} samples (0.25s) on each end")
+            log(f"   Original: {audio_segment.shape[-1]} samples → Padded: {padded_segment.shape[-1]} samples")
+
+            audio_segment = padded_segment
+
         buffer = io.BytesIO()
         torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
         buffer.seek(0)
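The padding added in both places is 0.25 s of zeros on each side, so at the 16 kHz rate used throughout the file that is 4000 samples per side. A quick standalone check of the arithmetic, written as a sketch rather than a copy of the app code:

# Standalone sketch of the padding math used above: 0.25 s at 16 kHz = 4000 samples per side.
import torch

sample_rate = 16000
audio = torch.zeros(1, sample_rate)              # stand-in for a 1-second mono clip
pad = torch.zeros(1, int(0.25 * sample_rate))    # 4000 samples of silence
padded = torch.cat([pad, audio, pad], dim=-1)
print(pad.shape[-1], padded.shape)               # 4000 torch.Size([1, 24000])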
@@ -1420,7 +1482,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
            })
 
        # 7. Format output
-        full_transcript = " ".join(word_texts)
        resolved_output = []
        resolved_colored = []
 
@@ -1441,7 +1502,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
        log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
 
        return {
-            "transcript": full_transcript,
            "resolved": " ".join(resolved_output),
            "resolved_colored": " ".join(resolved_colored),
            "audio_data": audio_data_list,
@@ -1467,7 +1527,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
        import traceback
        log(f"Traceback: {traceback.format_exc()}")
        return {
-            "transcript": "Error occurred",
            "resolved": "Error occurred",
            "resolved_colored": "Error occurred",
            "audio_data": [],