greg0rs committed on
Commit
33b4e31
·
verified ·
1 Parent(s): 2ddd203

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +116 -0
app.py CHANGED
@@ -9,6 +9,8 @@ import asyncio
9
  import base64
10
  import string
11
  import re
 
 
12
 
13
  # Set cache environment
14
  os.environ['HF_HOME'] = '/tmp/hf'
@@ -32,6 +34,101 @@ import whisperx # New: WhisperX for precise alignment
32
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
33
  import edge_tts
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def log(msg):
36
  print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
37
 
@@ -81,6 +178,9 @@ def inspect_phoneme_model_vocab():
81
  phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
82
  phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
83
 
 
 
 
84
  # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
85
  log("✅ Phoneme models loaded - using ASCII/IPA normalization")
86
 
@@ -904,6 +1004,22 @@ async def transcribe(audio: UploadFile = File(...)):
904
  detected_phoneme_raw, expected_phoneme, word_clean
905
  )
906
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
907
  # Trim audio segment based on best phoneme match position
908
  trimmed_audio_segment = trim_audio_segment_by_phoneme_position(
909
  expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean
 
9
  import base64
10
  import string
11
  import re
12
+ import urllib.request
13
+ import gzip
14
 
15
  # Set cache environment
16
  os.environ['HF_HOME'] = '/tmp/hf'
 
34
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
35
  import edge_tts
36
 
37
# Phoneme reverse lookup: normalized IPA string -> list of CMUdict words
phoneme_to_words_cache = {}


def build_phoneme_reverse_lookup():
    """Build the reverse lookup dictionary from CMUdict (ARPABET to words).

    Populates the module-level ``phoneme_to_words_cache`` in place, mapping
    each normalized IPA pronunciation string to the list of lowercase words
    that CMUdict lists with that pronunciation. The call is a no-op when the
    cache is already populated.

    The dictionary file is cached at ``/tmp/cmudict.dict`` and downloaded on
    first use. Any failure (network, file, parsing) is logged and leaves the
    cache empty rather than raising, so callers can treat the lookup as a
    best-effort feature.

    Returns:
        None. Results are exposed through ``phoneme_to_words_cache``.
    """
    global phoneme_to_words_cache

    if phoneme_to_words_cache:
        return  # Already built

    log("📚 Building phoneme reverse lookup from CMUdict...")

    try:
        # Download CMUdict if not exists
        cmudict_path = "/tmp/cmudict.dict"
        if not os.path.exists(cmudict_path):
            log("⬇️ Downloading CMUdict...")
            url = "https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict"
            # Download to a side file and rename once complete, so an
            # interrupted transfer never leaves a truncated dictionary at
            # the path that later runs treat as "already downloaded".
            partial_path = cmudict_path + ".part"
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, cmudict_path)
            log("✅ CMUdict downloaded")

        # ARPABET to IPA conversion mapping.
        # NOTE: the previous literal listed 'T' twice ('t' and then 'ɾ'); the
        # later entry silently won, turning every CMUdict T into a flap.
        # CMUdict has a single T phone, so it maps to plain 't' here.
        arpabet_to_ipa = {
            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ',
            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ',
        }

        # Parse CMUdict and build reverse lookup
        word_count = 0
        with open(cmudict_path, 'r', encoding='latin-1') as f:
            for raw_line in f:
                # Entries may end with a " # ..." remark; drop it explicitly
                # instead of relying on the unknown-phone skip below.
                line = raw_line.split(' #', 1)[0].strip()
                if not line or line.startswith(';;;'):
                    continue

                # Parse line: WORD P H O N E M E S
                parts = line.split()
                if len(parts) < 2:
                    continue

                # Remove variant indicators like (2), (3)
                word = parts[0].lower().split('(', 1)[0]

                # Convert ARPABET to IPA, skipping unknown phones
                ipa_phones = []
                for phone in parts[1:]:
                    # Remove stress markers (0,1,2)
                    clean_phone = ''.join(c for c in phone if not c.isdigit())
                    ipa = arpabet_to_ipa.get(clean_phone)
                    if ipa is not None:
                        ipa_phones.append(ipa)

                if not ipa_phones:
                    continue

                # Create phoneme string and normalize it so that keys match
                # what lookup_words_from_phonemes() computes for its input.
                normalized_ipa = normalize_phoneme_string(''.join(ipa_phones))

                # Add to reverse lookup (each word once per pattern)
                words = phoneme_to_words_cache.setdefault(normalized_ipa, [])
                if word not in words:
                    words.append(word)

                # Counts every usable dictionary entry, variants included.
                word_count += 1

        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")

        # Show some examples
        sample_items = list(phoneme_to_words_cache.items())[:5]
        for phonemes, words in sample_items:
            log(f" Example: '{phonemes}' → {words[:3]}{'...' if len(words) > 3 else ''}")

    except Exception as e:
        # Best-effort feature: report the problem and discard any partially
        # built table so lookups consistently return no matches.
        log(f"❌ Error building phoneme reverse lookup: {e}")
        phoneme_to_words_cache = {}
123
+
124
def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
    """Return the dictionary words stored for a phoneme string.

    The input is passed through ``normalize_phoneme_string`` (defined
    elsewhere in this file) and the result is used as the key into
    ``phoneme_to_words_cache``.

    Args:
        phoneme_string: Phoneme sequence to look up.

    Returns:
        The list of words stored under the normalized key, or an empty list
        when the cache is empty or holds no entry for that key.
    """
    if phoneme_to_words_cache:
        cache_key = normalize_phoneme_string(phoneme_string)
        return phoneme_to_words_cache.get(cache_key, [])

    # Reverse lookup was never built (or failed to build).
    return []
131
+
132
  def log(msg):
133
  print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
134
 
 
178
  phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
179
  phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
180
 
181
+ # Build phoneme reverse lookup dictionary
182
+ build_phoneme_reverse_lookup()
183
+
184
  # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
185
  log("✅ Phoneme models loaded - using ASCII/IPA normalization")
186
 
 
1004
  detected_phoneme_raw, expected_phoneme, word_clean
1005
  )
1006
 
1007
+ # Look up possible words for detected phonemes (for validation/debugging)
1008
+ detected_words = lookup_words_from_phonemes(detected_phoneme_raw)
1009
+ expected_words = lookup_words_from_phonemes(expected_phoneme)
1010
+
1011
+ log(f"🔍 Phoneme word lookup:")
1012
+ log(f" Detected '{detected_phoneme_raw}' could be: {detected_words[:5] if detected_words else ['<no matches>']}")
1013
+ log(f" Expected '{expected_phoneme}' could be: {expected_words[:5] if expected_words else ['<no matches>']}")
1014
+
1015
+ # Check if the target word appears in detected phoneme lookup
1016
+ if detected_words and word_clean.lower() in detected_words:
1017
+ log(f" ✅ Target word '{word_clean}' found in detected phoneme matches!")
1018
+ elif detected_words:
1019
+ log(f" ❓ Target word '{word_clean}' not in detected matches (closest: {detected_words[0] if detected_words else 'none'})")
1020
+ else:
1021
+ log(f" ❌ No dictionary words match detected phonemes")
1022
+
1023
  # Trim audio segment based on best phoneme match position
1024
  trimmed_audio_segment = trim_audio_segment_by_phoneme_position(
1025
  expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean