greg0rs committed
Commit 4b0e9d9 · verified · Parent(s): 1bf5bd5

Update app.py

Files changed (1): app.py (+197 −138)

app.py CHANGED
@@ -11,6 +11,7 @@ import string
 import re
 import urllib.request
 import gzip
+import tempfile
 
 # Set cache environment
 os.environ['HF_HOME'] = '/tmp/hf'
@@ -45,8 +46,6 @@ def normalize_phoneme_string(s: str) -> str:
     if not s:
         return s
 
-    original = s
-
     # Convert to lowercase and remove spaces, stress marks, and length markers
     normalized = s.lower().strip()
     normalized = normalized.replace(' ', '')  # Remove spaces between phonemes
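
With the unused 'original' snapshot removed, normalize_phoneme_string depends only on its input, so its contract is easy to spot-check. A minimal sketch of that contract, assuming, per the comment above, that spaces, the stress marks 'ˈ' and 'ˌ', and the length marker 'ː' are what gets stripped (the exact marker set is not shown in this hunk):

def normalize_sketch(s: str) -> str:
    # Mirrors the steps shown in this hunk: lowercase, trim, drop separators and marks
    normalized = s.lower().strip()
    for ch in (' ', 'ˈ', 'ˌ', 'ː'):  # assumed marker set, not taken from the commit
        normalized = normalized.replace(ch, '')
    return normalized

assert normalize_sketch("ˈhɛ loʊː") == "hɛloʊ"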
@@ -83,101 +82,12 @@ def normalize_phoneme_string(s: str) -> str:
     for variant_char, standard_char in ipa_variants.items():
         normalized = normalized.replace(variant_char, standard_char)
 
-    # Debug specific phoneme strings for normalization
-    if any(word in original.lower() for word in ['hello', 'red']) or any(pattern in original for pattern in ['həloʊ', 'hɛloʊ', 'ɹɛd']):
-        log(f"🔍 Normalization debug: '{original}' → '{normalized}'")
-
     return normalized
 
-# Phoneme reverse lookup
-phoneme_to_words_cache = {}
-
-def build_phoneme_reverse_lookup():
-    """Build reverse lookup dictionary from CMUdict (ARPABET to words)"""
-    global phoneme_to_words_cache
-
-    if phoneme_to_words_cache:
-        return  # Already built
-
-    try:
-        # Download CMUdict if it doesn't exist yet
-        cmudict_path = "/tmp/cmudict.dict"
-        if not os.path.exists(cmudict_path):
-            urllib.request.urlretrieve("https://raw.githubusercontent.com/cmusphinx/cmudict/master/cmudict.dict", cmudict_path)
-
-        # ARPABET to IPA conversion mapping
-        arpabet_to_ipa = {
-            'AA': 'ɑ', 'AE': 'æ', 'AH': 'ə', 'AO': 'ɔ', 'AW': 'aʊ',  # Fixed AH: ʌ→ə
-            'AY': 'aɪ', 'B': 'b', 'CH': 'tʃ', 'D': 'd', 'DH': 'ð',
-            'EH': 'ɛ', 'ER': 'ɝ', 'EY': 'eɪ', 'F': 'f', 'G': 'ɡ',
-            'HH': 'h', 'IH': 'ɪ', 'IY': 'i', 'JH': 'dʒ', 'K': 'k',
-            'L': 'l', 'M': 'm', 'N': 'n', 'NG': 'ŋ', 'OW': 'oʊ',
-            'OY': 'ɔɪ', 'P': 'p', 'R': 'r', 'S': 's', 'SH': 'ʃ',
-            'T': 't', 'TH': 'θ', 'UH': 'ʊ', 'UW': 'u', 'V': 'v',
-            'W': 'w', 'Y': 'j', 'Z': 'z', 'ZH': 'ʒ', 'DX': 'ɾ'  # Fixed: T→DX for flap
-        }
-
-        # Parse CMUdict and build the reverse lookup
-        word_count = 0
-        with open(cmudict_path, 'r', encoding='latin-1') as f:
-            for line in f:
-                line = line.strip()
-                if not line or line.startswith(';;;'):
-                    continue
-
-                # Parse line: WORD P H O N E M E S
-                parts = line.split()
-                if len(parts) < 2:
-                    continue
-
-                word = parts[0].lower()
-                # Remove variant indicators like (2), (3)
-                if '(' in word:
-                    word = word.split('(')[0]
-
-                # Convert ARPABET to IPA
-                arpabet_phones = parts[1:]
-                ipa_phones = []
-                for phone in arpabet_phones:
-                    # Remove stress markers (0, 1, 2)
-                    clean_phone = ''.join(c for c in phone if not c.isdigit())
-                    if clean_phone in arpabet_to_ipa:
-                        ipa_phones.append(arpabet_to_ipa[clean_phone])
-
-                if ipa_phones:
-                    # Create the phoneme string and normalize it
-                    ipa_string = ''.join(ipa_phones)
-                    normalized_ipa = normalize_phoneme_string(ipa_string)
-
-                    # Add to the reverse lookup
-                    if normalized_ipa not in phoneme_to_words_cache:
-                        phoneme_to_words_cache[normalized_ipa] = []
-                    if word not in phoneme_to_words_cache[normalized_ipa]:
-                        phoneme_to_words_cache[normalized_ipa].append(word)
-
-                    word_count += 1
-
-        log(f"✅ Built reverse lookup: {word_count} words, {len(phoneme_to_words_cache)} unique phoneme patterns")
-
-    except Exception as e:
-        log(f"❌ Error building phoneme reverse lookup: {e}")
-        phoneme_to_words_cache = {}
-
-def lookup_words_from_phonemes(phoneme_string: str) -> List[str]:
-    """Look up possible words for a given phoneme string"""
-    if not phoneme_to_words_cache:
-        return []
-
-    normalized = normalize_phoneme_string(phoneme_string)
-    return phoneme_to_words_cache.get(normalized, [])
-
 # Load models once at startup
 phoneme_processor = Wav2Vec2Processor.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 phoneme_model = Wav2Vec2ForCTC.from_pretrained("vitouphy/wav2vec2-xls-r-300m-timit-phoneme")
 
-# Build phoneme reverse lookup dictionary
-build_phoneme_reverse_lookup()
-
 # Model inspection complete - wav2vec2 uses ASCII 'g' (token 15), not IPA 'ɡ'
 log("✅ Phoneme models loaded - using ASCII/IPA normalization")
 
@@ -261,6 +171,67 @@ PHONEME_TO_ENGLISH = {
     'ˌ': '',  # secondary stress (remove)
 }
 
+# Phoneme example words - showing the sound in context
+PHONEME_EXAMPLES = {
+    # Vowels (monophthongs)
+    'ɪ': 'bit',      # IH sound
+    'ɛ': 'bed',      # EH sound
+    'æ': 'cat',      # AE sound
+    'ʌ': 'but',      # UH sound (stressed)
+    'ɑ': 'father',   # AH sound
+    'ɔ': 'law',      # AW sound
+    'ʊ': 'book',     # UU sound
+    'u': 'boot',     # OO sound
+    'i': 'beat',     # EE sound
+    'ə': 'about',    # schwa (unstressed)
+    'ɝ': 'bird',     # ER sound (stressed)
+    'ɚ': 'letter',   # ER sound (unstressed)
+
+    # Diphthongs
+    'eɪ': 'day',     # AY sound
+    'aɪ': 'my',      # EYE sound
+    'ɔɪ': 'boy',     # OY sound
+    'aʊ': 'now',     # OW sound
+    'oʊ': 'go',      # OH sound
+
+    # R-colored vowels
+    'ɪr': 'near',    # EER sound
+    'ɛr': 'care',    # AIR sound
+    'ɑr': 'car',     # AR sound
+    'ɔr': 'for',     # OR sound
+    'ʊr': 'tour',    # OOR sound
+    'ər': 'letter',  # ER sound
+
+    # Consonants
+    'p': 'pat',      # P sound
+    'b': 'bat',      # B sound
+    't': 'tap',      # T sound
+    'd': 'dog',      # D sound
+    'k': 'cat',      # K sound
+    'g': 'gap',      # G sound (ASCII)
+    'ɡ': 'gap',      # G sound (IPA)
+    'f': 'fat',      # F sound
+    'v': 'vat',      # V sound
+    'θ': 'think',    # TH sound (voiceless)
+    'ð': 'this',     # TH sound (voiced)
+    's': 'sap',      # S sound
+    'z': 'zap',      # Z sound
+    'ʃ': 'ship',     # SH sound
+    'ʒ': 'measure',  # ZH sound
+    'h': 'hat',      # H sound
+    'm': 'mat',      # M sound
+    'n': 'nap',      # N sound
+    'ŋ': 'sing',     # NG sound
+    'l': 'lap',      # L sound
+    'r': 'rap',      # R sound
+    'j': 'yes',      # Y sound
+    'w': 'wet',      # W sound
+
+    # Affricates
+    'tʃ': 'chip',    # CH sound
+    'dʒ': 'jump',    # J sound
+}
+
 def clean_word_for_phonemes(word: str) -> str:
     """
     Clean word by removing punctuation and extra spaces for phoneme processing.
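
Several keys in PHONEME_EXAMPLES are two characters ('tʃ', 'eɪ', 'ər'), so any code that walks a detected phoneme string against this table has to try the longer match first. The commit does not show the consuming side; here is a hedged sketch of such a longest-match-first walk, with a hypothetical helper name:

def examples_for(phoneme_string: str, examples: dict) -> list:
    """Hypothetical helper: greedy longest-match against the examples table."""
    words, i = [], 0
    while i < len(phoneme_string):
        for length in (2, 1):  # try two-character phonemes like 'tʃ' before 't'
            chunk = phoneme_string[i:i + length]
            if chunk in examples:
                words.append(examples[chunk])
                i += len(chunk)
                break
        else:
            i += 1  # unknown symbol, skip it
    return words

subset = {'tʃ': 'chip', 't': 'tap', 'ɪ': 'bit', 'p': 'pat'}
assert examples_for('tʃɪp', subset) == ['chip', 'bit', 'pat']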
@@ -334,6 +305,10 @@ def load_whisperx_models():
 
     if whisperx_model is None:
         log("Loading WhisperX models for English-only processing...")
+
+        # Workaround for shared-library loading issues: force immediate symbol binding
+        os.environ['LD_BIND_NOW'] = '1'
+
         try:
             # Try loading with base.en first
             whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="float32", language="en")
@@ -345,42 +320,65 @@ def load_whisperx_models():
 
         except ImportError as ie:
             log(f"Import error loading WhisperX models: {ie}")
-            # Try without ctranslate2 by using int8 compute type
+
+            # Try to use regular Whisper as a fallback
             try:
-                log("Trying fallback with int8 compute type...")
-                whisperx_model = whisperx.load_model("base.en", device="cpu", compute_type="int8", language="en")
-                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with int8 compute type")
-            except Exception as fallback_error:
-                log(f"Int8 fallback also failed: {fallback_error}")
-                # Last resort: try tiny model with default compute
-                try:
-                    log("Trying final fallback with tiny.en model and default compute...")
-                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
-                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                    log("WhisperX models loaded with tiny.en and default compute")
-                except Exception as final_error:
-                    log(f"All WhisperX loading attempts failed: {final_error}")
-                    raise RuntimeError("Unable to load WhisperX models. Please check environment setup.")
+                log("Attempting to use standard Whisper instead of WhisperX...")
+                import whisper
+
+                # Load the standard whisper model
+                whisper_model = whisper.load_model("base.en", device="cpu")
+
+                # Create a wrapper to make it compatible with the WhisperX interface
+                class WhisperWrapper:
+                    def __init__(self, model):
+                        self.model = model
+
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        result = self.model.transcribe(audio, language=language)
+                        # Convert to WhisperX format
+                        return {
+                            "segments": [{
+                                "text": result["text"],
+                                "start": 0.0,
+                                "end": len(audio) / 16000.0,  # Approximate, based on the 16 kHz sample rate
+                                "words": []  # Word-level timing has to be handled differently
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = WhisperWrapper(whisper_model)
+                log("Using standard Whisper as fallback (limited word-level timing)")
+
+                # Alignment also has to be handled differently in this mode
+                whisperx_align_model = None
+                whisperx_metadata = None
+
+            except Exception as whisper_error:
+                log(f"Standard Whisper fallback failed: {whisper_error}")
+
+                # Last resort: create a minimal mock that at least returns something
+                class MinimalWhisperMock:
+                    def transcribe(self, audio, batch_size=16, language="en"):
+                        # Return a minimal valid structure
+                        return {
+                            "segments": [{
+                                "text": "[Audio processing unavailable - WhisperX loading failed]",
+                                "start": 0.0,
+                                "end": 1.0,
+                                "words": []
+                            }],
+                            "language": language
+                        }
+
+                whisperx_model = MinimalWhisperMock()
+                whisperx_align_model = None
+                whisperx_metadata = None
+                log("WARNING: Using minimal mock - transcription will be limited")
+
         except Exception as e:
             log(f"Error loading WhisperX models: {e}")
-            # Fallback: try with smaller English-only model
-            try:
-                log("Trying fallback with tiny.en model...")
-                whisperx_model = whisperx.load_model("tiny.en", device="cpu", compute_type="int8", language="en")
-                whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                log("WhisperX models loaded with fallback (tiny.en model)")
-            except Exception as fallback_error:
-                log(f"Fallback also failed: {fallback_error}")
-                # Final attempt without compute_type specification
-                try:
-                    log("Final attempt with default settings...")
-                    whisperx_model = whisperx.load_model("tiny.en", device="cpu", language="en")
-                    whisperx_align_model, whisperx_metadata = whisperx.load_align_model(language_code="en", device="cpu")
-                    log("WhisperX models loaded with default settings")
-                except Exception as final_error:
-                    log(f"All attempts failed: {final_error}")
-                    raise RuntimeError("Unable to load WhisperX models in this environment")
+            raise RuntimeError(f"Unable to load speech recognition models: {e}")
 
 def convert_webm_to_wav(bts):
     p = subprocess.run(["ffmpeg", "-i", "pipe:0", "-f", "wav", "-ar", "16000", "-ac", "1", "pipe:1"],
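
The wrapper only needs to satisfy the slice of the WhisperX result the rest of app.py reads: a "segments" list with text/start/end/words plus a "language" field. A quick shape check under that assumption, with a stub in place of the real model so nothing is downloaded:

import numpy as np

class _StubModel:
    def transcribe(self, audio, language="en"):
        return {"text": "hello world"}

def wrap_transcribe(model, audio, language="en"):
    # Same conversion as WhisperWrapper.transcribe above
    result = model.transcribe(audio, language=language)
    return {
        "segments": [{
            "text": result["text"],
            "start": 0.0,
            "end": len(audio) / 16000.0,  # assumes 16 kHz mono samples
            "words": [],
        }],
        "language": language,
    }

out = wrap_transcribe(_StubModel(), np.zeros(16000))  # one second of silence
assert out["segments"][0]["end"] == 1.0 and out["language"] == "en"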
@@ -822,16 +820,16 @@ def create_character_level_feedback(word: str, expected_normalized: str,
                 detected_english = "?"
                 detected_example = ""
 
-            # Create tooltip text with example words
+            # Create tooltip text with example words (two lines)
             if expected_example and detected_example:
-                tooltip_text = f"Expected '{expected_english}' as in '{expected_example}', You said '{detected_english}' as in '{detected_example}'"
+                tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}' as in '{detected_example}'"
             elif expected_example:
-                tooltip_text = f"Expected '{expected_english}' as in '{expected_example}', You said '{detected_english}'"
+                tooltip_text = f"Expected '{expected_english}' as in '{expected_example}'<br>You said '{detected_english}'"
             else:
-                tooltip_text = f"Expected '{expected_english}', You said '{detected_english}'"
+                tooltip_text = f"Expected '{expected_english}'<br>You said '{detected_english}'"
 
             # Create span with inline tooltip for each mispronounced letter/group
-            formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}" title="{tooltip_text}"><strong><u>{word_letters}</u></strong></span>'
+            formatted_letters = f'<span class="phoneme-error" data-expected="{expected_english}" data-detected="{detected_english}" data-tooltip-html="{tooltip_text}"><strong><u>{word_letters}</u></strong></span>'
             test_result.append(formatted_letters)
 
             # For the simplified tooltip feedback
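
One caveat with the switch from title to data-tooltip-html: tooltip_text is interpolated into a double-quoted attribute unescaped, so a phoneme description containing a double quote or '<' would break the span's markup. A defensive sketch, assuming escaping is acceptable to whatever CSS/JS tooltip reads the attribute:

import html

expected_english, detected_english = 'th (as in "think")', 's'
tooltip_text = f"Expected '{expected_english}'<br>You said '{detected_english}'"

# quote=True also escapes the double quotes that would otherwise end the attribute early
safe = html.escape(tooltip_text, quote=True)
formatted = f'<span class="phoneme-error" data-tooltip-html="{safe}"><strong><u>th</u></strong></span>'
assert '&quot;' in formatted

Escaping does not lose the line break: reading the attribute back with getAttribute decodes the entities, so the tooltip script still receives a literal <br> to render.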
@@ -1009,7 +1007,7 @@ def get_expected_phonemes(words: List[str]) -> List[str]:
         return empty_results
 
 async def generate_tts_audio(word: str) -> str:
-    """Generate TTS audio for a word"""
+    """Generate TTS audio for a word with silence padding"""
     if word in tts_cache:
         return tts_cache[word]
 
@@ -1021,17 +1019,81 @@ async def generate_tts_audio(word: str) -> str:
                 audio_data += chunk["data"]
 
         if audio_data:
-            audio_b64 = base64.b64encode(audio_data).decode('utf-8')
-            tts_cache[word] = audio_b64
-            return audio_b64
+            # Add silence padding to the TTS audio as well:
+            # first decode the MP3 to get raw audio
+            import tempfile
+            with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
+                tmp_mp3.write(audio_data)
+                tmp_mp3_path = tmp_mp3.name
+
+            try:
+                # Load the TTS audio
+                tts_waveform, tts_sample_rate = torchaudio.load(tmp_mp3_path)
+
+                # Resample if needed to match our standard rate
+                if tts_sample_rate != 16000:
+                    tts_waveform = torchaudio.transforms.Resample(tts_sample_rate, 16000)(tts_waveform)
+                    tts_sample_rate = 16000
+
+                # Add 0.25s silence padding on each end
+                padding_samples = int(0.25 * tts_sample_rate)
+                silence_shape = list(tts_waveform.shape)
+                silence_shape[-1] = padding_samples
+                silence_padding = torch.zeros(silence_shape)
+
+                # Concatenate: silence + audio + silence
+                padded_waveform = torch.cat([silence_padding, tts_waveform, silence_padding], dim=-1)
+
+                # Convert back to base64
+                buffer = io.BytesIO()
+                torchaudio.save(buffer, padded_waveform, tts_sample_rate, format="wav")
+                buffer.seek(0)
+                audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
+
+                tts_cache[word] = audio_b64
+                log(f"🔇 TTS for '{word}': Added 0.25s silence padding on each end")
+                return audio_b64
+
+            finally:
+                # Clean up the temp file
+                if os.path.exists(tmp_mp3_path):
+                    os.remove(tmp_mp3_path)
+
     except Exception as e:
         log(f"TTS failed for '{word}': {e}")
 
     return ""
 
-def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int) -> str:
-    """Convert audio tensor to base64 string"""
+def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int, add_padding: bool = True) -> str:
+    """
+    Convert audio tensor to base64 string.
+
+    Args:
+        audio_segment: The audio tensor to convert
+        sample_rate: Sample rate of the audio
+        add_padding: If True, adds 0.25s of silence on each end to prevent audio processor lag
+
+    Returns:
+        Base64 encoded audio string
+    """
     try:
+        if add_padding:
+            # Add 0.25 seconds of silence on each end
+            padding_samples = int(0.25 * sample_rate)  # 0.25 seconds worth of samples
+
+            # Create silence padding (zeros with the same shape as the audio segment)
+            silence_shape = list(audio_segment.shape)
+            silence_shape[-1] = padding_samples
+            silence_padding = torch.zeros(silence_shape)
+
+            # Concatenate: silence + audio + silence
+            padded_segment = torch.cat([silence_padding, audio_segment, silence_padding], dim=-1)
+
+            log(f"🔇 Added silence padding: {padding_samples} samples (0.25s) on each end")
+            log(f"   Original: {audio_segment.shape[-1]} samples → Padded: {padded_segment.shape[-1]} samples")
+
+            audio_segment = padded_segment
+
         buffer = io.BytesIO()
         torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
         buffer.seek(0)
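
The padding arithmetic is identical in both functions above: at the app's 16 kHz rate, 0.25 s is 4,000 zero samples per side, so every padded clip grows by 8,000 samples regardless of channel count. A standalone check of just that arithmetic:

import torch

sample_rate = 16000
padding_samples = int(0.25 * sample_rate)  # 4000 samples per side
audio = torch.randn(1, 16000)              # one channel, one second

silence_shape = list(audio.shape)
silence_shape[-1] = padding_samples
silence = torch.zeros(silence_shape)

padded = torch.cat([silence, audio, silence], dim=-1)
assert padded.shape == (1, 16000 + 2 * padding_samples)  # (1, 24000)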
@@ -1420,7 +1482,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
             })
 
         # 7. Format output
-        full_transcript = " ".join(word_texts)
         resolved_output = []
         resolved_colored = []
 
@@ -1441,7 +1502,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         log("=== WHISPERX ENGLISH-ONLY PHONEME ANALYSIS COMPLETE ===")
 
         return {
-            "transcript": full_transcript,
             "resolved": " ".join(resolved_output),
             "resolved_colored": " ".join(resolved_colored),
            "audio_data": audio_data_list,
@@ -1467,7 +1527,6 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
         import traceback
         log(f"Traceback: {traceback.format_exc()}")
         return {
-            "transcript": "Error occurred",
             "resolved": "Error occurred",
             "resolved_colored": "Error occurred",
             "audio_data": [],
 