Update app.py
app.py CHANGED
@@ -306,7 +306,7 @@ def load_whisperx_models():
     if whisperx_model is None:
         log("Loading WhisperX models for English-only processing...")
         try:
-            #
+            # Try loading with base.en first
            whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
             log("WhisperX base.en model loaded successfully")
 
@@ -314,17 +314,44 @@ def load_whisperx_models():
             whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
             log("WhisperX English alignment model loaded successfully")
 
+        except ImportError as ie:
+            log(f"Import error loading WhisperX models: {ie}")
+            # Try without ctranslate2 by using int8 compute type
+            try:
+                log("Trying fallback with int8 compute type...")
+                whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="int8", language="en")
+                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
+                log("WhisperX models loaded with int8 compute type")
+            except Exception as fallback_error:
+                log(f"Int8 fallback also failed: {fallback_error}")
+                # Last resort: try tiny model with default compute
+                try:
+                    log("Trying final fallback with tiny.en model and default compute...")
+                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
+                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
+                    log("WhisperX models loaded with tiny.en and default compute")
+                except Exception as final_error:
+                    log(f"All WhisperX loading attempts failed: {final_error}")
+                    raise RuntimeError("Unable to load WhisperX models. Please check environment setup.")
         except Exception as e:
             log(f"Error loading WhisperX models: {e}")
             # Fallback: try with smaller English-only model
             try:
                 log("Trying fallback with tiny.en model...")
-                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="
+                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
                 whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
                 log("WhisperX models loaded with fallback (tiny.en model)")
             except Exception as fallback_error:
                 log(f"Fallback also failed: {fallback_error}")
-
+                # Final attempt without compute_type specification
+                try:
+                    log("Final attempt with default settings...")
+                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
+                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
+                    log("WhisperX models loaded with default settings")
+                except Exception as final_error:
+                    log(f"All attempts failed: {final_error}")
+                    raise RuntimeError("Unable to load WhisperX models in this environment")
 
 def convert_webm_to_wav(bts):
     p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
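Note on the change above: the loader now walks a fixed chain of fallbacks (base.en/float32, then base.en/int8 for ctranslate2 builds without float32 support, then tiny.en with and without an explicit compute type). A minimal sketch of the same idea written as a data-driven loop, using only the whisperx calls already present in the diff; the function name, the candidates list, and the log=print default are illustrative, not code from app.py:

import whisperx

def load_whisperx_with_fallbacks(log=print):
    """Try (model_name, compute_type) candidates until one loads (sketch only)."""
    candidates = [
        ("base.en", "float32"),   # preferred configuration
        ("base.en", "int8"),      # for ctranslate2 builds without float32 support
        ("tiny.en", "int8"),      # smaller model, quantized
        ("tiny.en", None),        # let whisperx pick its default compute type
    ]
    last_error = None
    for model_name, compute_type in candidates:
        try:
            kwargs = {"device": "cpu", "language": "en"}
            if compute_type is not None:
                kwargs["compute_type"] = compute_type
            model = whisperx.load_model(model_name, **kwargs)
            align_model, metadata = whisperx.load_align_model(language_code="en", device="cpu")
            log(f"Loaded WhisperX {model_name} (compute_type={compute_type})")
            return model, align_model, metadata
        except Exception as e:
            last_error = e
            log(f"Could not load {model_name} with compute_type={compute_type}: {e}")
    raise RuntimeError(f"Unable to load WhisperX models: {last_error}")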
@@ -920,103 +947,37 @@ def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
     return trimmed_segment
 
 def get_expected_phonemes(words: List[str]) -> List[str]:
-    """Get expected phonemes using
+    """Get expected phonemes using espeak phonemizer"""
     cache_key = tuple(words)
     if cache_key in phoneme_cache:
-
-
-
-
-    # ARPABET to IPA conversion mapping (same as in build_phoneme_reverse_lookup)
-    arpabet_to_ipa = {
-        'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',
-        'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
-        'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
-        'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
-        'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
-        'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
-        'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
-        'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'
-    }
+        log(f"Using cached phonemes for: {words}")
+        cached_result = phoneme_cache[cache_key]
+        log(f" Cached phonemes: {list(zip(words, cached_result))}")
+        return cached_result
 
-
-    cmudict_lookup = {}
-    if not hasattr(get_expected_phonemes, '_cmudict_loaded'):
-        log("Loading CMUdict for expected phonemes...")
-        try:
-            cmudict_path = "/tmp/cmudict.dict"
-            if os.path.exists(cmudict_path):
-                with open(cmudict_path, 'r', encoding='latin-1') as f:
-                    for line in f:
-                        line = line.strip()
-                        if not line or line.startswith(';;;'):
-                            continue
-
-                        parts = line.split()
-                        if len(parts) < 2:
-                            continue
-
-                        word = parts[0].lower()
-                        # Remove variant indicators like (2), (3)
-                        if '(' in word:
-                            word = word.split('(')[0]
-
-                        # Store ARPABET phones (we'll convert to IPA as needed)
-                        arpabet_phones = parts[1:]
-                        cmudict_lookup[word] = arpabet_phones
-
-                get_expected_phonemes._cmudict_loaded = True
-                get_expected_phonemes._cmudict_lookup = cmudict_lookup
-                log(f"Loaded {len(cmudict_lookup)} words from CMUdict")
-            else:
-                log("⚠️ CMUdict not found, falling back to phonemizer")
-                # Fallback to original phonemizer
-                phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
-                phoneme_cache[cache_key] = phonemes
-                return phonemes
-        except Exception as e:
-            log(f"❌ Error loading CMUdict: {e}, falling back to phonemizer")
-            phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
-            phoneme_cache[cache_key] = phonemes
-            return phonemes
-    else:
-        cmudict_lookup = get_expected_phonemes._cmudict_lookup
+    log(f"Getting expected phonemes using phonemizer for: {words}")
 
-
-
-
-        word_lower = word.lower()
+    try:
+        # Use espeak phonemizer to get IPA phonemes
+        phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        else:
-            # Fallback to phonemizer for out-of-vocabulary words
-            log(f"⚠️ '{word}' not in CMUdict, using phonemizer fallback")
-            try:
-                fallback_phoneme = phonemize([word], language='en-us', backend='espeak', strip=True)[0]
-                results.append(fallback_phoneme)
-            except Exception as e:
-                log(f"❌ Phonemizer fallback failed for '{word}': {e}")
-                results.append("") # Empty string as last resort
-
-    phoneme_cache[cache_key] = results
-    log(f"Final expected phonemes: {list(zip(words, results))}")
-    return results
+        # Cache the results
+        phoneme_cache[cache_key] = phonemes
+
+        # Log the phoneme results
+        log(f"✅ Phonemizer results:")
+        for word, phoneme in zip(words, phonemes):
+            log(f" '{word}' → '{phoneme}'")
+
+        return phonemes
+
+    except Exception as e:
+        log(f"❌ Error in phonemizer: {e}")
+        log(f" Returning empty phonemes for all words")
+        # Return empty strings as fallback
+        empty_results = [""] * len(words)
+        phoneme_cache[cache_key] = empty_results
+        return empty_results
 
 async def generate_tts_audio(word: str) -> str:
     """Generate TTS audio for a word with silence padding"""
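For context on the removed branch above: it cached raw ARPABET entries from /tmp/cmudict.dict and converted them to IPA via the arpabet_to_ipa table, but the conversion lines themselves are not captured in this diff view. A hypothetical sketch of how such a conversion usually looks (CMUdict phones carry stress digits such as AH0 or EY1, which are stripped before the table lookup); the function name and the abbreviated table are illustrative only:

import re

# Abbreviated copy of the mapping removed above; see arpabet_to_ipa for the full table.
ARPABET_TO_IPA = {'HH': 'h', 'AH': 'ə', 'L': 'l', 'OW': 'oʊ'}

def arpabet_to_ipa_string(arpabet_phones, table=ARPABET_TO_IPA):
    """Convert a CMUdict pronunciation (list of ARPABET phones) to an IPA string.

    Hypothetical helper: strips stress digits ('EY1' -> 'EY') and maps each
    phone through the table; unknown phones are skipped.
    """
    ipa = []
    for phone in arpabet_phones:
        base = re.sub(r"\d", "", phone)
        ipa.append(table.get(base, ""))
    return "".join(ipa)

# CMUdict lists "hello" as HH AH0 L OW1, which maps to 'həloʊ' here.
print(arpabet_to_ipa_string(["HH", "AH0", "L", "OW1"]))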
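The replacement path is now a single phonemizer call with a cache in front of it. A self-contained usage sketch of that pattern, assuming phonemizer with its espeak backend is installed; the local phoneme_cache and expected_phonemes names are illustrative, and the exact IPA output depends on the installed espeak-ng version:

from phonemizer import phonemize

# Standalone illustration of the same call pattern used by get_expected_phonemes().
phoneme_cache = {}

def expected_phonemes(words, log=print):
    key = tuple(words)
    if key in phoneme_cache:  # a repeat call with the same words is a cache hit
        return phoneme_cache[key]
    phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
    phoneme_cache[key] = phonemes
    log(list(zip(words, phonemes)))
    return phonemes

# espeak typically returns something close to [('hello', 'həloʊ'), ('world', 'wɜːld')],
# though the exact symbols vary between espeak-ng versions.
expected_phonemes(["hello", "world"])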