Update app.py
app.py CHANGED
@@ -9,6 +9,8 @@ import asyncio
 import base64
 import string
 import re
+import urllib.request
+import gzip
 
 # Set cache environment
 os.environ['HF_HOME'] = '/tmp/hf'
@@ -32,6 +34,101 @@ import whisperx  # New: WhisperX for precise alignment
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import edge_tts
 
+# Phoneme reverse lookup
+phoneme_to_words_cache = {}
+
+def build_phoneme_reverse_lookup():
+    """Build reverse lookup dictionary from CMUdict (ARPABET to words)"""
+    global phoneme_to_words_cache
+
+    if phoneme_to_words_cache:
+        return  # Already built
+
+    log("📚 Building phoneme reverse lookup from CMUdict...")
+
+    try:
+        # Download CMUdict if it doesn't exist yet
+        cmudict_path = "/tmp/cmudict.dict"
+        if not os.path.exists(cmudict_path):
+            log("⬇️ Downloading CMUdict...")
+            url = "https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict"
+            urllib.request.urlretrieve(url, cmudict_path)
+            log("✅ CMUdict downloaded")
+
+        # ARPABET to IPA conversion mapping
+        arpabet_to_ipa = {
+            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ',
+            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
+            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
+            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
+            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
+            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
+            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
+            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'  # alveolar flap
+        }
+
+        # Parse CMUdict and build reverse lookup
+        word_count = 0
+        with open(cmudict_path, 'r', encoding='latin-1') as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith(';;;'):
+                    continue
+
+                # Parse line: WORD P H O N E M E S
+                parts = line.split()
+                if len(parts) < 2:
+                    continue
+
+                word = parts[0].lower()
+                # Remove variant indicators like (2), (3)
+                if '(' in word:
+                    word = word.split('(')[0]
+
+                # Convert ARPABET to IPA
+                arpabet_phones = parts[1:]
+                ipa_phones = []
+                for phone in arpabet_phones:
+                    # Remove stress markers (0, 1, 2)
+                    clean_phone = ''.join(c for c in phone if not c.isdigit())
+                    if clean_phone in arpabet_to_ipa:
+                        ipa_phones.append(arpabet_to_ipa[clean_phone])
+                    else:
+                        # Skip unknown phones
+                        continue
+
+                if ipa_phones:
+                    # Create phoneme string and normalize it
+                    ipa_string = ''.join(ipa_phones)
+                    normalized_ipa = normalize_phoneme_string(ipa_string)
+
+                    # Add to reverse lookup
+                    if normalized_ipa not in phoneme_to_words_cache:
+                        phoneme_to_words_cache[normalized_ipa] = []
+                    if word not in phoneme_to_words_cache[normalized_ipa]:
+                        phoneme_to_words_cache[normalized_ipa].append(word)
+
+                    word_count += 1
+
+        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")
+
+        # Show some examples
+        sample_items = list(phoneme_to_words_cache.items())[:5]
+        for phonemes, words in sample_items:
+            log(f"  Example: '{phonemes}' → {words[:3]}{'...' if len(words) > 3 else ''}")
+
+    except Exception as e:
+        log(f"❌ Error building phoneme reverse lookup: {e}")
+        phoneme_to_words_cache = {}
+
+def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
+    """Look up possible words for a given phoneme string"""
+    if not phoneme_to_words_cache:
+        return []
+
+    normalized = normalize_phoneme_string(phoneme_string)
+    return phoneme_to_words_cache.get(normalized, [])
+
 def log(msg):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
 
@@ -81,6 +178,9 @@ def inspect_phoneme_model_vocab():
 phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 
+# Build phoneme reverse lookup dictionary
+build_phoneme_reverse_lookup()
+
 # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
 log("✅ Phoneme models loaded - using ASCII/IPA normalization")
 
@@ -904,6 +1004,22 @@ async def transcribe(audio: UploadFile = File(...)):
             detected_phoneme_raw, expected_phoneme, word_clean
         )
 
+        # Look up possible words for detected phonemes (for validation/debugging)
+        detected_words = lookup_words_from_phonemes(detected_phoneme_raw)
+        expected_words = lookup_words_from_phonemes(expected_phoneme)
+
+        log("🔍 Phoneme word lookup:")
+        log(f"  Detected '{detected_phoneme_raw}' could be: {detected_words[:5] if detected_words else ['<no matches>']}")
+        log(f"  Expected '{expected_phoneme}' could be: {expected_words[:5] if expected_words else ['<no matches>']}")
+
+        # Check if the target word appears in the detected phoneme lookup
+        if detected_words and word_clean.lower() in detected_words:
+            log(f"  ✅ Target word '{word_clean}' found in detected phoneme matches!")
+        elif detected_words:
+            log(f"  ❓ Target word '{word_clean}' not in detected matches (closest: {detected_words[0]})")
+        else:
+            log("  ❌ No dictionary words match detected phonemes")
+
         # Trim audio segment based on best phoneme match position
         trimmed_audio_segment = trim_audio_segment_by_phoneme_position(
             expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean