greg0rs committed on
Commit
91ff8f4
·
verified ·
1 Parent(s): a02bc29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -56
app.py CHANGED
@@ -306,7 +306,7 @@ def load_whisperx_models():
306
  if whisperx_model is None:
307
  log("Loading WhisperX models for English-only processing...")
308
  try:
309
- # Try loading with base.en first
310
  whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
311
  log("WhisperX base.en model loaded successfully")
312
 
@@ -314,44 +314,17 @@ def load_whisperx_models():
314
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
315
  log("WhisperX English alignment model loaded successfully")
316
 
317
- except ImportError as ie:
318
- log(f"Import error loading WhisperX models: {ie}")
319
- # Try without ctranslate2 by using int8 compute type
320
- try:
321
- log("Trying fallback with int8 compute type...")
322
- whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="int8", language="en")
323
- whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
324
- log("WhisperX models loaded with int8 compute type")
325
- except Exception as fallback_error:
326
- log(f"Int8 fallback also failed: {fallback_error}")
327
- # Last resort: try tiny model with default compute
328
- try:
329
- log("Trying final fallback with tiny.en model and default compute...")
330
- whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
331
- whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
332
- log("WhisperX models loaded with tiny.en and default compute")
333
- except Exception as final_error:
334
- log(f"All WhisperX loading attempts failed: {final_error}")
335
- raise RuntimeError("Unable to load WhisperX models. Please check environment setup.")
336
  except Exception as e:
337
  log(f"Error loading WhisperX models: {e}")
338
  # Fallback: try with smaller English-only model
339
  try:
340
  log("Trying fallback with tiny.en model...")
341
- whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
342
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
343
  log("WhisperX models loaded with fallback (tiny.en model)")
344
  except Exception as fallback_error:
345
  log(f"Fallback also failed: {fallback_error}")
346
- # Final attempt without compute_type specification
347
- try:
348
- log("Final attempt with default settings...")
349
- whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
350
- whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
351
- log("WhisperX models loaded with default settings")
352
- except Exception as final_error:
353
- log(f"All attempts failed: {final_error}")
354
- raise RuntimeError("Unable to load WhisperX models in this environment")
355
 
356
  def convert_webm_to_wav(bts):
357
  p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
@@ -947,37 +920,103 @@ def trim_audio_segment_by_phoneme_position(audio_segment: torch.Tensor,
947
  return trimmed_segment
948
 
949
  def get_expected_phonemes(words: List[str]) -> List[str]:
950
- """Get expected phonemes using espeak phonemizer"""
951
  cache_key = tuple(words)
952
  if cache_key in phoneme_cache:
953
- log(f"📚 Using cached phonemes for: {words}")
954
- cached_result = phoneme_cache[cache_key]
955
- log(f" Cached phonemes: {list(zip(words, cached_result))}")
956
- return cached_result
 
 
 
 
 
 
 
 
 
 
 
957
 
958
- log(f"🔤 Getting expected phonemes using phonemizer for: {words}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
959
 
960
- try:
961
- # Use espeak phonemizer to get IPA phonemes
962
- phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
963
-
964
- # Cache the results
965
- phoneme_cache[cache_key] = phonemes
966
-
967
- # Log the phoneme results
968
- log(f"✅ Phonemizer results:")
969
- for word, phoneme in zip(words, phonemes):
970
- log(f" '{word}' → '{phoneme}'")
971
 
972
- return phonemes
973
-
974
- except Exception as e:
975
- log(f"❌ Error in phonemizer: {e}")
976
- log(f" Returning empty phonemes for all words")
977
- # Return empty strings as fallback
978
- empty_results = [""] * len(words)
979
- phoneme_cache[cache_key] = empty_results
980
- return empty_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
981
 
982
  async def generate_tts_audio(word: str) -> str:
983
  """Generate TTS audio for a word with silence padding"""
 
306
  if whisperx_model is None:
307
  log("Loading WhisperX models for English-only processing...")
308
  try:
309
+ # Load WhisperX model with English-only configuration
310
  whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
311
  log("WhisperX base.en model loaded successfully")
312
 
 
314
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
315
  log("WhisperX English alignment model loaded successfully")
316
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
317
  except Exception as e:
318
  log(f"Error loading WhisperX models: {e}")
319
  # Fallback: try with smaller English-only model
320
  try:
321
  log("Trying fallback with tiny.en model...")
322
+ whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="float32", language="en")
323
  whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
324
  log("WhisperX models loaded with fallback (tiny.en model)")
325
  except Exception as fallback_error:
326
  log(f"Fallback also failed: {fallback_error}")
327
+ raise
 
 
 
 
 
 
 
 
328
 
329
  def convert_webm_to_wav(bts):
330
  p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
 
920
  return trimmed_segment
921
 
922
  def get_expected_phonemes(words: List[str]) -> List[str]:
923
+ """Get expected phonemes using CMUdict instead of espeak phonemizer"""
924
  cache_key = tuple(words)
925
  if cache_key in phoneme_cache:
926
+ return phoneme_cache[cache_key]
927
+
928
+ log(f"Getting expected phonemes from CMUdict for: {words}")
929
+
930
+ # ARPABET to IPA conversion mapping (same as in build_phoneme_reverse_lookup)
931
+ arpabet_to_ipa = {
932
+ 'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',
933
+ 'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
934
+ 'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
935
+ 'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
936
+ 'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
937
+ 'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
938
+ 'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
939
+ 'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'
940
+ }
941
 
942
+ # Load CMUdict on first use
943
+ cmudict_lookup = {}
944
+ if not hasattr(get_expected_phonemes, '_cmudict_loaded'):
945
+ log("Loading CMUdict for expected phonemes...")
946
+ try:
947
+ cmudict_path = "/tmp/cmudict.dict"
948
+ if os.path.exists(cmudict_path):
949
+ with open(cmudict_path, 'r', encoding='latin-1') as f:
950
+ for line in f:
951
+ line = line.strip()
952
+ if not line or line.startswith(';;;'):
953
+ continue
954
+
955
+ parts = line.split()
956
+ if len(parts) < 2:
957
+ continue
958
+
959
+ word = parts[0].lower()
960
+ # Remove variant indicators like (2), (3)
961
+ if '(' in word:
962
+ word = word.split('(')[0]
963
+
964
+ # Store ARPABET phones (we'll convert to IPA as needed)
965
+ arpabet_phones = parts[1:]
966
+ cmudict_lookup[word] = arpabet_phones
967
+
968
+ get_expected_phonemes._cmudict_loaded = True
969
+ get_expected_phonemes._cmudict_lookup = cmudict_lookup
970
+ log(f"Loaded {len(cmudict_lookup)} words from CMUdict")
971
+ else:
972
+ log("⚠️ CMUdict not found, falling back to phonemizer")
973
+ # Fallback to original phonemizer
974
+ phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
975
+ phoneme_cache[cache_key] = phonemes
976
+ return phonemes
977
+ except Exception as e:
978
+ log(f"❌ Error loading CMUdict: {e}, falling back to phonemizer")
979
+ phonemes = phonemize(words, language='en-us', backend='espeak', strip=True)
980
+ phoneme_cache[cache_key] = phonemes
981
+ return phonemes
982
+ else:
983
+ cmudict_lookup = get_expected_phonemes._cmudict_lookup
984
 
985
+ # Convert words to phonemes using CMUdict
986
+ results = []
987
+ for word in words:
988
+ word_lower = word.lower()
 
 
 
 
 
 
 
989
 
990
+ if word_lower in cmudict_lookup:
991
+ # Convert ARPABET to IPA
992
+ arpabet_phones = cmudict_lookup[word_lower]
993
+ ipa_phones = []
994
+
995
+ for phone in arpabet_phones:
996
+ # Remove stress markers (0,1,2)
997
+ clean_phone = ''.join(c for c in phone if not c.isdigit())
998
+ if clean_phone in arpabet_to_ipa:
999
+ ipa_phones.append(arpabet_to_ipa[clean_phone])
1000
+ else:
1001
+ log(f"⚠️ Unknown ARPABET phone '{clean_phone}' in word '{word}'")
1002
+
1003
+ ipa_string = ''.join(ipa_phones)
1004
+ results.append(ipa_string)
1005
+ log(f"CMUdict: '{word}' → ARPABET {arpabet_phones} → IPA '{ipa_string}'")
1006
+
1007
+ else:
1008
+ # Fallback to phonemizer for out-of-vocabulary words
1009
+ log(f"⚠️ '{word}' not in CMUdict, using phonemizer fallback")
1010
+ try:
1011
+ fallback_phoneme = phonemize([word], language='en-us', backend='espeak', strip=True)[0]
1012
+ results.append(fallback_phoneme)
1013
+ except Exception as e:
1014
+ log(f"❌ Phonemizer fallback failed for '{word}': {e}")
1015
+ results.append("") # Empty string as last resort
1016
+
1017
+ phoneme_cache[cache_key] = results
1018
+ log(f"Final expected phonemes: {list(zip(words, results))}")
1019
+ return results
1020
 
1021
  async def generate_tts_audio(word: str) -> str:
1022
  """Generate TTS audio for a word with silence padding"""