Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -133,24 +133,24 @@ def clean_word_for_phonemes(word: str) -> str:
|
|
133 |
return cleaned
|
134 |
|
135 |
def load_whisperx_models():
    """Load the WhisperX transcription and alignment models on first use.

    Tries the "base" model first and falls back to "tiny" if anything in
    the first attempt raises. Everything is loaded on CPU, and the alignment
    model is requested for language code "en".

    The three module-level globals (``whisperx_model``,
    ``whisperx_align_model``, ``whisperx_metadata``) are assigned together,
    and only after a complete successful load. A failure part-way through
    therefore never leaves ``whisperx_model`` set while the alignment model
    is missing, which would make the ``is None`` guard skip every later
    retry.

    Raises:
        Exception: re-raises whatever the fallback attempt raised when both
            the "base" and the "tiny" attempts fail.
    """
    global whisperx_model, whisperx_align_model, whisperx_metadata

    if whisperx_model is not None:
        return

    def _load(model_name):
        # Load the transcription model plus the alignment model/metadata.
        # float32 compute type is used for CPU compatibility.
        asr_model = whisperx.load_model(model_name, device="cpu", compute_type="float32")
        align_model, metadata = whisperx.load_align_model(language_code="en", device="cpu")
        return asr_model, align_model, metadata

    log("Loading WhisperX models...")
    try:
        loaded = _load("base")
        log("WhisperX models loaded successfully")
    except Exception as e:
        log(f"Error loading WhisperX models: {e}")
        # Fallback: try with smaller model
        try:
            log("Trying fallback with tiny model...")
            loaded = _load("tiny")
            log("WhisperX models loaded with fallback (tiny model)")
        except Exception as fallback_error:
            log(f"Fallback also failed: {fallback_error}")
            raise

    # Publish all three globals at once so the state is never half-initialised.
    whisperx_model, whisperx_align_model, whisperx_metadata = loaded
|
@@ -700,11 +700,12 @@ async def transcribe(audio: UploadFile = File(...)):
|
|
700 |
|
701 |
log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
|
702 |
|
703 |
-
# 2. Get transcription with WhisperX
|
704 |
-
|
|
|
705 |
|
706 |
-
# 3. Get precise word alignments with WhisperX
|
707 |
-
aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu")
|
708 |
|
709 |
# Extract word-level data from WhisperX results
|
710 |
words = []
|
@@ -860,7 +861,7 @@ async def transcribe(audio: UploadFile = File(...)):
|
|
860 |
"debug_info": {
|
861 |
"total_words": len(words),
|
862 |
"similarity_threshold": similarity,
|
863 |
-
"alignment_method": "WhisperX + Enhanced Sliding Window with Fallback",
|
864 |
"results_summary": [
|
865 |
{
|
866 |
"word": r['word_text'],
|
@@ -888,7 +889,7 @@ async def transcribe(audio: UploadFile = File(...)):
|
|
888 |
|
889 |
@app.get("/")
def root():
    """Return the service status message for GET requests to "/"."""
    status_message = "Clean Fonetik with WhisperX + Enhanced Character-Level Feedback running"
    return status_message
|
892 |
|
893 |
@app.post("/api/clear-cache")
|
894 |
def clear_cache():
|
|
|
133 |
return cleaned
|
134 |
|
135 |
def load_whisperx_models():
    """Load the WhisperX transcription and alignment models on first use - ENGLISH ONLY.

    Tries the "base" model first and falls back to "tiny" if anything in
    the first attempt raises. Everything is loaded on CPU; the transcription
    model is created with ``language="en"`` and the alignment model is
    requested for language code "en".

    The three module-level globals (``whisperx_model``,
    ``whisperx_align_model``, ``whisperx_metadata``) are assigned together,
    and only after a complete successful load. A failure part-way through
    therefore never leaves ``whisperx_model`` set while the alignment model
    is missing, which would make the ``is None`` guard skip every later
    retry.

    Raises:
        Exception: re-raises whatever the fallback attempt raised when both
            the "base" and the "tiny" attempts fail.
    """
    global whisperx_model, whisperx_align_model, whisperx_metadata

    if whisperx_model is not None:
        return

    def _load(model_name):
        # Load the transcription model plus the alignment model/metadata.
        # float32 compute type is used for CPU compatibility.
        asr_model = whisperx.load_model(model_name, device="cpu", compute_type="float32", language="en")
        align_model, metadata = whisperx.load_align_model(language_code="en", device="cpu")
        return asr_model, align_model, metadata

    log("Loading WhisperX models (English only)...")
    try:
        loaded = _load("base")
        log("WhisperX models loaded successfully (English forced)")
    except Exception as e:
        log(f"Error loading WhisperX models: {e}")
        # Fallback: try with smaller model
        try:
            log("Trying fallback with tiny model (English only)...")
            loaded = _load("tiny")
            log("WhisperX models loaded with fallback (tiny model, English forced)")
        except Exception as fallback_error:
            log(f"Fallback also failed: {fallback_error}")
            raise

    # Publish all three globals at once so the state is never half-initialised.
    whisperx_model, whisperx_align_model, whisperx_metadata = loaded
|
|
|
700 |
|
701 |
log(f"Audio loaded for WhisperX: {len(audio_data)} samples")
|
702 |
|
703 |
+
# 2. Get transcription with WhisperX - FORCE ENGLISH
|
704 |
+
log("🌍 Forcing WhisperX to use English language detection...")
|
705 |
+
result = whisperx_model.transcribe(audio_data, batch_size=16, language="en")
|
706 |
|
707 |
+
# 3. Get precise word alignments with WhisperX - ENGLISH ONLY
|
708 |
+
aligned_result = whisperx.align(result["segments"], whisperx_align_model, whisperx_metadata, audio_data, device="cpu", return_char_alignments=False)
|
709 |
|
710 |
# Extract word-level data from WhisperX results
|
711 |
words = []
|
|
|
861 |
"debug_info": {
|
862 |
"total_words": len(words),
|
863 |
"similarity_threshold": similarity,
|
864 |
+
"alignment_method": "WhisperX + Enhanced Sliding Window with Fallback (English Only)",
|
865 |
"results_summary": [
|
866 |
{
|
867 |
"word": r['word_text'],
|
|
|
889 |
|
890 |
@app.get("/")
def root():
    """Return the service status message for GET requests to "/"."""
    status_message = "Clean Fonetik with WhisperX + Enhanced Character-Level Feedback (English Only) running"
    return status_message
|
893 |
|
894 |
@app.post("/api/clear-cache")
|
895 |
def clear_cache():
|