greg0rs committed on
Commit
05d30e4
·
verified ·
1 Parent(s): 91ff8f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +56 -95
app.py CHANGED
@@ -306,7 +306,7 @@ def load_whisperx_models():
306
  if whisperx_model is None:
307
  log("Loading WhisperX models for English-only processing...")
308
  try:
309
- # Load WhisperX model with English-only configuration
310
  whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
311
  log("WhisperX base.en model loaded successfully")
312
 
@@ -314,17 +314,44 @@ def load_whisperx_models():
314
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
315
  log("WhisperX English alignment model loaded successfully")
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  except Exception as e:
318
  log(f"Error loading WhisperX models: {e}")
319
  # Fallback: try with smaller English-only model
320
  try:
321
  log("Trying fallback with tiny.en model...")
322
- whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
323
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
324
  log("WhisperX models loaded with fallback (tiny.en model)")
325
  except Exception as fallback_error:
326
  log(f"Fallback also failed: {fallback_error}")
327
- raise
 
 
 
 
 
 
 
 
328
 
329
  def convert_webm_to_wav(bts):
330
  p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
@@ -920,103 +947,37 @@ def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
920
  return trimmed_segment
921
 
922
  def get_expected_phonemes(words: List[str]) -> List[str]:
923
- """Get expected phonemes using CMUdict instead of espeak phonemizer"""
924
  cache_key = tuple(words)
925
  if cache_key in phoneme_cache:
926
- return phoneme_cache[cache_key]
927
-
928
- log(f"Getting expected phonemes from CMUdict for: {words}")
929
-
930
- # ARPABET to IPA conversion mapping (same as in build_phoneme_reverse_lookup)
931
- arpabet_to_ipa = {
932
- 'AA': 'Ι‘', 'AE': 'Γ¦', 'AH': 'Ι™', 'AO': 'Ι”', 'AW': 'aʊ',
933
- 'AY': 'aΙͺ', 'B': 'b', 'CH': 'tΚƒ', 'D': 'd', 'DH': 'Γ°',
934
- 'EH': 'Ι›', 'ER': 'ɝ', 'EY': 'eΙͺ', 'F': 'f', 'G': 'Ι‘',
935
- 'HH': 'h', 'IH': 'Ιͺ', 'IY': 'i', 'JH': 'dΚ’', 'K': 'k',
936
- 'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'Ε‹', 'OW': 'oʊ',
937
- 'OY': 'Ι”Ιͺ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'Κƒ',
938
- 'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
939
- 'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'Κ’', 'DX': 'ΙΎ'
940
- }
941
 
942
- # Load CMUdict on first use
943
- cmudict_lookup = {}
944
- if not hasattr(get_expected_phonemes, '_cmudict_loaded'):
945
- log("Loading CMUdict for expected phonemes...")
946
- try:
947
- cmudict_path = "/tmp/cmudict.dict"
948
- if os.path.exists(cmudict_path):
949
- with open(cmudict_path, 'r', encoding='latin-1') as f:
950
- for line in f:
951
- line = line.strip()
952
- if not line or line.startswith(';;;'):
953
- continue
954
-
955
- parts = line.split()
956
- if len(parts) < 2:
957
- continue
958
-
959
- word = parts[0].lower()
960
- # Remove variant indicators like (2), (3)
961
- if '(' in word:
962
- word = word.split('(')[0]
963
-
964
- # Store ARPABET phones (we'll convert to IPA as needed)
965
- arpabet_phones = parts[1:]
966
- cmudict_lookup[word] = arpabet_phones
967
-
968
- get_expected_phonemes._cmudict_loaded = True
969
- get_expected_phonemes._cmudict_lookup = cmudict_lookup
970
- log(f"Loaded {len(cmudict_lookup)} words from CMUdict")
971
- else:
972
- log("⚠️ CMUdict not found, falling back to phonemizer")
973
- # Fallback to original phonemizer
974
- phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
975
- phoneme_cache[cache_key] = phonemes
976
- return phonemes
977
- except Exception as e:
978
- log(f"❌ Error loading CMUdict: {e}, falling back to phonemizer")
979
- phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
980
- phoneme_cache[cache_key] = phonemes
981
- return phonemes
982
- else:
983
- cmudict_lookup = get_expected_phonemes._cmudict_lookup
984
 
985
- # Convert words to phonemes using CMUdict
986
- results = []
987
- for word in words:
988
- word_lower = word.lower()
989
 
990
- if word_lower in cmudict_lookup:
991
- # Convert ARPABET to IPA
992
- arpabet_phones = cmudict_lookup[word_lower]
993
- ipa_phones = []
994
-
995
- for phone in arpabet_phones:
996
- # Remove stress markers (0,1,2)
997
- clean_phone = ''.join(c for c in phone if not c.isdigit())
998
- if clean_phone in arpabet_to_ipa:
999
- ipa_phones.append(arpabet_to_ipa[clean_phone])
1000
- else:
1001
- log(f"⚠️ Unknown ARPABET phone '{clean_phone}' in word '{word}'")
1002
-
1003
- ipa_string = ''.join(ipa_phones)
1004
- results.append(ipa_string)
1005
- log(f"CMUdict: '{word}' → ARPABET {arpabet_phones} → IPA '{ipa_string}'")
1006
-
1007
- else:
1008
- # Fallback to phonemizer for out-of-vocabulary words
1009
- log(f"⚠️ '{word}' not in CMUdict, using phonemizer fallback")
1010
- try:
1011
- fallback_phoneme = phonemize([word], language='en-us', backend='espeak', strip=True)[0]
1012
- results.append(fallback_phoneme)
1013
- except Exception as e:
1014
- log(f"❌ Phonemizer fallback failed for '{word}': {e}")
1015
- results.append("") # Empty string as last resort
1016
-
1017
- phoneme_cache[cache_key] = results
1018
- log(f"Final expected phonemes: {list(zip(words, results))}")
1019
- return results
1020
 
1021
  async def generate_tts_audio(word: str) -> str:
1022
  """Generate TTS audio for a word with silence padding"""
 
306
  if whisperx_model is None:
307
  log("Loading WhisperX models for English-only processing...")
308
  try:
309
+ # Try loading with base.en first
310
  whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
311
  log("WhisperX base.en model loaded successfully")
312
 
 
314
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
315
  log("WhisperX English alignment model loaded successfully")
316
 
317
+ except ImportError as ie:
318
+ log(f"Import error loading WhisperX models: {ie}")
319
+ # Try without ctranslate2 by using int8 compute type
320
+ try:
321
+ log("Trying fallback with int8 compute type...")
322
+ whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="int8", language="en")
323
+ whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
324
+ log("WhisperX models loaded with int8 compute type")
325
+ except Exception as fallback_error:
326
+ log(f"Int8 fallback also failed: {fallback_error}")
327
+ # Last resort: try tiny model with default compute
328
+ try:
329
+ log("Trying final fallback with tiny.en model and default compute...")
330
+ whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
331
+ whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
332
+ log("WhisperX models loaded with tiny.en and default compute")
333
+ except Exception as final_error:
334
+ log(f"All WhisperX loading attempts failed: {final_error}")
335
+ raise RuntimeError("Unable to load WhisperX models. Please check environment setup.")
336
  except Exception as e:
337
  log(f"Error loading WhisperX models: {e}")
338
  # Fallback: try with smaller English-only model
339
  try:
340
  log("Trying fallback with tiny.en model...")
341
+ whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
342
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
343
  log("WhisperX models loaded with fallback (tiny.en model)")
344
  except Exception as fallback_error:
345
  log(f"Fallback also failed: {fallback_error}")
346
+ # Final attempt without compute_type specification
347
+ try:
348
+ log("Final attempt with default settings...")
349
+ whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
350
+ whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
351
+ log("WhisperX models loaded with default settings")
352
+ except Exception as final_error:
353
+ log(f"All attempts failed: {final_error}")
354
+ raise RuntimeError("Unable to load WhisperX models in this environment")
355
 
356
  def convert_webm_to_wav(bts):
357
  p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
 
947
  return trimmed_segment
948
 
949
  def get_expected_phonemes(words: List[str]) -> List[str]:
950
+ """Get expected phonemes using espeak phonemizer"""
951
  cache_key = tuple(words)
952
  if cache_key in phoneme_cache:
953
+ log(f"📚 Using cached phonemes for: {words}")
954
+ cached_result = phoneme_cache[cache_key]
955
+ log(f" Cached phonemes: {list(zip(words, cached_result))}")
956
+ return cached_result
 
 
 
 
 
 
 
 
 
 
 
957
 
958
+ log(f"πŸ”€ Getting expected phonemes using phonemizer for: {words}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
 
960
+ try:
961
+ # Use espeak phonemizer to get IPA phonemes
962
+ phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
 
963
 
964
+ # Cache the results
965
+ phoneme_cache[cache_key] = phonemes
966
+
967
+ # Log the phoneme results
968
+ log(f"✅ Phonemizer results:")
969
+ for word, phoneme in zip(words, phonemes):
970
+ log(f" '{word}' → '{phoneme}'")
971
+
972
+ return phonemes
973
+
974
+ except Exception as e:
975
+ log(f"❌ Error in phonemizer: {e}")
976
+ log(f" Returning empty phonemes for all words")
977
+ # Return empty strings as fallback
978
+ empty_results = [""] * len(words)
979
+ phoneme_cache[cache_key] = empty_results
980
+ return empty_results
 
 
 
 
 
 
 
 
 
 
 
 
 
981
 
982
  async def generate_tts_audio(word: str) -> str:
983
  """Generate TTS audio for a word with silence padding"""