Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -306,7 +306,7 @@ def load_whisperx_models():
|
|
306 |
if whisperx_model is None:
|
307 |
log("Loading WhisperX models for English-only processing...")
|
308 |
try:
|
309 |
-
#
|
310 |
whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
|
311 |
log("WhisperX base.en model loaded successfully")
|
312 |
|
@@ -314,44 +314,17 @@ def load_whisperx_models():
|
|
314 |
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
315 |
log("WhisperX English alignment model loaded successfully")
|
316 |
|
317 |
-
except ImportError as ie:
|
318 |
-
log(f"Import error loading WhisperX models: {ie}")
|
319 |
-
# Try without ctranslate2 by using int8 compute type
|
320 |
-
try:
|
321 |
-
log("Trying fallback with int8 compute type...")
|
322 |
-
whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="int8", language="en")
|
323 |
-
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
324 |
-
log("WhisperX models loaded with int8 compute type")
|
325 |
-
except Exception as fallback_error:
|
326 |
-
log(f"Int8 fallback also failed: {fallback_error}")
|
327 |
-
# Last resort: try tiny model with default compute
|
328 |
-
try:
|
329 |
-
log("Trying final fallback with tiny.en model and default compute...")
|
330 |
-
whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
|
331 |
-
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
332 |
-
log("WhisperX models loaded with tiny.en and default compute")
|
333 |
-
except Exception as final_error:
|
334 |
-
log(f"All WhisperX loading attempts failed: {final_error}")
|
335 |
-
raise RuntimeError("Unable to load WhisperX models. Please check environment setup.")
|
336 |
except Exception as e:
|
337 |
log(f"Error loading WhisperX models: {e}")
|
338 |
# Fallback: try with smaller English-only model
|
339 |
try:
|
340 |
log("Trying fallback with tiny.en model...")
|
341 |
-
whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="
|
342 |
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
343 |
log("WhisperX models loaded with fallback (tiny.en model)")
|
344 |
except Exception as fallback_error:
|
345 |
log(f"Fallback also failed: {fallback_error}")
|
346 |
-
|
347 |
-
try:
|
348 |
-
log("Final attempt with default settings...")
|
349 |
-
whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
|
350 |
-
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
351 |
-
log("WhisperX models loaded with default settings")
|
352 |
-
except Exception as final_error:
|
353 |
-
log(f"All attempts failed: {final_error}")
|
354 |
-
raise RuntimeError("Unable to load WhisperX models in this environment")
|
355 |
|
356 |
def convert_webm_to_wav(bts):
|
357 |
p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
|
@@ -947,37 +920,103 @@ def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
|
|
947 |
return trimmed_segment
|
948 |
|
949 |
def get_expected_phonemes(words: List[str]) -> List[str]:
|
950 |
-
"""Get expected phonemes using espeak phonemizer"""
|
951 |
cache_key = tuple(words)
|
952 |
if cache_key in phoneme_cache:
|
953 |
-
|
954 |
-
|
955 |
-
|
956 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
957 |
|
958 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
959 |
|
960 |
-
|
961 |
-
|
962 |
-
|
963 |
-
|
964 |
-
# Cache the results
|
965 |
-
phoneme_cache[cache_key] = phonemes
|
966 |
-
|
967 |
-
# Log the phoneme results
|
968 |
-
log(f"✅ Phonemizer results:")
|
969 |
-
for word, phoneme in zip(words, phonemes):
|
970 |
-
log(f" '{word}' → '{phoneme}'")
|
971 |
|
972 |
-
|
973 |
-
|
974 |
-
|
975 |
-
|
976 |
-
|
977 |
-
|
978 |
-
|
979 |
-
|
980 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
981 |
|
982 |
async def generate_tts_audio(word: str) -> str:
|
983 |
"""Generate TTS audio for a word with silence padding"""
|
|
|
306 |
if whisperx_model is None:
|
307 |
log("Loading WhisperX models for English-only processing...")
|
308 |
try:
|
309 |
+
# Load WhisperX model with English-only configuration
|
310 |
whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
|
311 |
log("WhisperX base.en model loaded successfully")
|
312 |
|
|
|
314 |
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
315 |
log("WhisperX English alignment model loaded successfully")
|
316 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
317 |
except Exception as e:
|
318 |
log(f"Error loading WhisperX models: {e}")
|
319 |
# Fallback: try with smaller English-only model
|
320 |
try:
|
321 |
log("Trying fallback with tiny.en model...")
|
322 |
+
whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
|
323 |
whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
|
324 |
log("WhisperX models loaded with fallback (tiny.en model)")
|
325 |
except Exception as fallback_error:
|
326 |
log(f"Fallback also failed: {fallback_error}")
|
327 |
+
raise
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
328 |
|
329 |
def convert_webm_to_wav(bts):
|
330 |
p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
|
|
|
920 |
return trimmed_segment
|
921 |
|
922 |
def get_expected_phonemes(words: List[str]) -> List[str]:
|
923 |
+
"""Get expected phonemes using CMUdict instead of espeak phonemizer"""
|
924 |
cache_key = tuple(words)
|
925 |
if cache_key in phoneme_cache:
|
926 |
+
return phoneme_cache[cache_key]
|
927 |
+
|
928 |
+
log(f"Getting expected phonemes from CMUdict for: {words}")
|
929 |
+
|
930 |
+
# ARPABET to IPA conversion mapping (same as in build_phoneme_reverse_lookup)
|
931 |
+
arpabet_to_ipa = {
|
932 |
+
'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',
|
933 |
+
'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
|
934 |
+
'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
|
935 |
+
'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
|
936 |
+
'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
|
937 |
+
'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
|
938 |
+
'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
|
939 |
+
'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'
|
940 |
+
}
|
941 |
|
942 |
+
# Load CMUdict on first use
|
943 |
+
cmudict_lookup = {}
|
944 |
+
if not hasattr(get_expected_phonemes, '_cmudict_loaded'):
|
945 |
+
log("Loading CMUdict for expected phonemes...")
|
946 |
+
try:
|
947 |
+
cmudict_path = "/tmp/cmudict.dict"
|
948 |
+
if os.path.exists(cmudict_path):
|
949 |
+
with open(cmudict_path, 'r', encoding='latin-1') as f:
|
950 |
+
for line in f:
|
951 |
+
line = line.strip()
|
952 |
+
if not line or line.startswith(';;;'):
|
953 |
+
continue
|
954 |
+
|
955 |
+
parts = line.split()
|
956 |
+
if len(parts) < 2:
|
957 |
+
continue
|
958 |
+
|
959 |
+
word = parts[0].lower()
|
960 |
+
# Remove variant indicators like (2), (3)
|
961 |
+
if '(' in word:
|
962 |
+
word = word.split('(')[0]
|
963 |
+
|
964 |
+
# Store ARPABET phones (we'll convert to IPA as needed)
|
965 |
+
arpabet_phones = parts[1:]
|
966 |
+
cmudict_lookup[word] = arpabet_phones
|
967 |
+
|
968 |
+
get_expected_phonemes._cmudict_loaded = True
|
969 |
+
get_expected_phonemes._cmudict_lookup = cmudict_lookup
|
970 |
+
log(f"Loaded {len(cmudict_lookup)} words from CMUdict")
|
971 |
+
else:
|
972 |
+
log("⚠️ CMUdict not found, falling back to phonemizer")
|
973 |
+
# Fallback to original phonemizer
|
974 |
+
phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
|
975 |
+
phoneme_cache[cache_key] = phonemes
|
976 |
+
return phonemes
|
977 |
+
except Exception as e:
|
978 |
+
log(f"❌ Error loading CMUdict: {e}, falling back to phonemizer")
|
979 |
+
phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
|
980 |
+
phoneme_cache[cache_key] = phonemes
|
981 |
+
return phonemes
|
982 |
+
else:
|
983 |
+
cmudict_lookup = get_expected_phonemes._cmudict_lookup
|
984 |
|
985 |
+
# Convert words to phonemes using CMUdict
|
986 |
+
results = []
|
987 |
+
for word in words:
|
988 |
+
word_lower = word.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
989 |
|
990 |
+
if word_lower in cmudict_lookup:
|
991 |
+
# Convert ARPABET to IPA
|
992 |
+
arpabet_phones = cmudict_lookup[word_lower]
|
993 |
+
ipa_phones = []
|
994 |
+
|
995 |
+
for phone in arpabet_phones:
|
996 |
+
# Remove stress markers (0,1,2)
|
997 |
+
clean_phone = ''.join(c for c in phone if not c.isdigit())
|
998 |
+
if clean_phone in arpabet_to_ipa:
|
999 |
+
ipa_phones.append(arpabet_to_ipa[clean_phone])
|
1000 |
+
else:
|
1001 |
+
log(f"⚠️ Unknown ARPABET phone '{clean_phone}' in word '{word}'")
|
1002 |
+
|
1003 |
+
ipa_string = ''.join(ipa_phones)
|
1004 |
+
results.append(ipa_string)
|
1005 |
+
log(f"CMUdict: '{word}' → ARPABET {arpabet_phones} → IPA '{ipa_string}'")
|
1006 |
+
|
1007 |
+
else:
|
1008 |
+
# Fallback to phonemizer for out-of-vocabulary words
|
1009 |
+
log(f"⚠️ '{word}' not in CMUdict, using phonemizer fallback")
|
1010 |
+
try:
|
1011 |
+
fallback_phoneme = phonemize([word], language='en-us', backend='espeak', strip=True)[0]
|
1012 |
+
results.append(fallback_phoneme)
|
1013 |
+
except Exception as e:
|
1014 |
+
log(f"❌ Phonemizer fallback failed for '{word}': {e}")
|
1015 |
+
results.append("") # Empty string as last resort
|
1016 |
+
|
1017 |
+
phoneme_cache[cache_key] = results
|
1018 |
+
log(f"Final expected phonemes: {list(zip(words, results))}")
|
1019 |
+
return results
|
1020 |
|
1021 |
async def generate_tts_audio(word: str) -> str:
|
1022 |
"""Generate TTS audio for a word with silence padding"""
|