Update app.py
app.py CHANGED
@@ -9,6 +9,8 @@ import asyncio
 import base64
 import string
 import re
+import urllib.request
+import gzip
 
 # Set cache environment
 os.environ['HF_HOME'] = '/tmp/hf'
@@ -32,6 +34,101 @@ import whisperx  # New: WhisperX for precise alignment
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import edge_tts
 
+# Phoneme reverse lookup
+phoneme_to_words_cache = {}
+
+def build_phoneme_reverse_lookup():
+    """Build reverse lookup dictionary from CMUdict (ARPABET to words)"""
+    global phoneme_to_words_cache
+
+    if phoneme_to_words_cache:
+        return  # Already built
+
+    log("📚 Building phoneme reverse lookup from CMUdict...")
+
+    try:
+        # Download CMUdict if it doesn't exist yet
+        cmudict_path = "/tmp/cmudict.dict"
+        if not os.path.exists(cmudict_path):
+            log("⬇️ Downloading CMUdict...")
+            url = "https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict"
+            urllib.request.urlretrieve(url, cmudict_path)
+            log("✅ CMUdict downloaded")
+
+        # ARPABET to IPA conversion mapping
+        arpabet_to_ipa = {
+            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ʌ', 'AO': 'ɔ', 'AW': 'aʊ',
+            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
+            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
+            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
+            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
+            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
+            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
+            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'  # alveolar flap
+        }
+
+        # Parse CMUdict and build reverse lookup
+        word_count = 0
+        with open(cmudict_path, 'r', encoding='latin-1') as f:
+            for line in f:
+                line = line.strip()
+                if not line or line.startswith(';;;'):
+                    continue
+
+                # Parse line: WORD P H O N E M E S
+                parts = line.split()
+                if len(parts) < 2:
+                    continue
+
+                word = parts[0].lower()
+                # Remove variant indicators like (2), (3)
+                if '(' in word:
+                    word = word.split('(')[0]
+
+                # Convert ARPABET to IPA
+                arpabet_phones = parts[1:]
+                ipa_phones = []
+                for phone in arpabet_phones:
+                    # Remove stress markers (0, 1, 2)
+                    clean_phone = ''.join(c for c in phone if not c.isdigit())
+                    if clean_phone in arpabet_to_ipa:
+                        ipa_phones.append(arpabet_to_ipa[clean_phone])
+                    else:
+                        # Skip unknown phones
+                        continue
+
+                if ipa_phones:
+                    # Create phoneme string and normalize it
+                    ipa_string = ''.join(ipa_phones)
+                    normalized_ipa = normalize_phoneme_string(ipa_string)
+
+                    # Add to reverse lookup
+                    if normalized_ipa not in phoneme_to_words_cache:
+                        phoneme_to_words_cache[normalized_ipa] = []
+                    if word not in phoneme_to_words_cache[normalized_ipa]:
+                        phoneme_to_words_cache[normalized_ipa].append(word)
+
+                    word_count += 1
+
+        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")
+
+        # Show some examples
+        sample_items = list(phoneme_to_words_cache.items())[:5]
+        for phonemes, words in sample_items:
+            log(f"  Example: '{phonemes}' → {words[:3]}{'...' if len(words) > 3 else ''}")
+
+    except Exception as e:
+        log(f"❌ Error building phoneme reverse lookup: {e}")
+        phoneme_to_words_cache = {}
+
+def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
+    """Look up possible words for a given phoneme string"""
+    if not phoneme_to_words_cache:
+        return []
+
+    normalized = normalize_phoneme_string(phoneme_string)
+    return phoneme_to_words_cache.get(normalized, [])
+
 def log(msg):
     print(f"[{datetime.now().strftime('%H:%M:%S')}] {msg}")
 
@@ -81,6 +178,9 @@ def inspect_phoneme_model_vocab():
 phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 
+# Build phoneme reverse lookup dictionary
+build_phoneme_reverse_lookup()
+
 # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
 log("✅ Phoneme models loaded - using ASCII/IPA normalization")
 
@@ -904,6 +1004,22 @@ async def transcribe(audio: UploadFile = File(...)):
             detected_phoneme_raw, expected_phoneme, word_clean
         )
 
+        # Look up possible words for detected phonemes (for validation/debugging)
+        detected_words = lookup_words_from_phonemes(detected_phoneme_raw)
+        expected_words = lookup_words_from_phonemes(expected_phoneme)
+
+        log("🔍 Phoneme word lookup:")
+        log(f"  Detected '{detected_phoneme_raw}' could be: {detected_words[:5] if detected_words else ['<no matches>']}")
+        log(f"  Expected '{expected_phoneme}' could be: {expected_words[:5] if expected_words else ['<no matches>']}")
+
+        # Check if the target word appears in the detected phoneme lookup
+        if detected_words and word_clean.lower() in detected_words:
+            log(f"  ✅ Target word '{word_clean}' found in detected phoneme matches!")
+        elif detected_words:
+            log(f"  ❓ Target word '{word_clean}' not in detected matches (closest: {detected_words[0]})")
+        else:
+            log("  ❌ No dictionary words match detected phonemes")
+
         # Trim audio segment based on best phoneme match position
         trimmed_audio_segment = trim_audio_segment_by_phoneme_position(
             expanded_audio_segment, detected_phoneme_raw, match_start, match_end, word_clean