greg0rs committed on
Commit
b41d79a
·
verified ·
1 Parent(s): 779a747

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -12
app.py CHANGED
@@ -11,6 +11,7 @@ import string
11
  import re
12
  import urllib.request
13
  import gzip
 
14
 
15
  # Set cache environment
16
  os.environ['HF_HOME'] = '/tmp/hf'
@@ -1020,7 +1021,7 @@ def get_expected_phonemes(words: List[str]) -> List[str]:
1020
  return results
1021
 
1022
  async def generate_tts_audio(word: str) -> str:
1023
- """Generate TTS audio for a word"""
1024
  if word in tts_cache:
1025
  return tts_cache[word]
1026
 
@@ -1032,17 +1033,81 @@ async def generate_tts_audio(word: str) -> str:
1032
  audio_data += chunk["data"]
1033
 
1034
  if audio_data:
1035
- audio_b64 = base64.b64encode(audio_data).decode('utf-8')
1036
- tts_cache[word] = audio_b64
1037
- return audio_b64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1038
  except Exception as e:
1039
  log(f"TTS failed for '{word}': {e}")
1040
 
1041
  return ""
1042
 
1043
- def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int) -> str:
1044
- """Convert audio tensor to base64 string"""
 
 
 
 
 
 
 
 
 
 
1045
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1046
  buffer = io.BytesIO()
1047
  torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
1048
  buffer.seek(0)
@@ -1399,15 +1464,16 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
1399
  })
1400
 
1401
  # Prepare audio data with all three segments (use ORIGINAL word for display)
1402
- user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate)
1403
- whisperx_audio_b64 = audio_to_base64(whisperx_audio_segment, sample_rate)
1404
- expected_audio_b64 = tts_results[i]
 
1405
 
1406
  audio_data_list.append({
1407
  "word": word_original, # Original with punctuation for display
1408
- "expected_audio": expected_audio_b64,
1409
- "user_audio": user_audio_b64,
1410
- "whisperx_audio": whisperx_audio_b64, # NEW: Original WhisperX timing
1411
  "start_time": float(start_time),
1412
  "end_time": float(end_time),
1413
  "similarity_score": float(similarity_score),
 
11
  import re
12
  import urllib.request
13
  import gzip
14
+ import tempfile
15
 
16
  # Set cache environment
17
  os.environ['HF_HOME'] = '/tmp/hf'
 
1021
  return results
1022
 
1023
  async def generate_tts_audio(word: str) -> str:
1024
+ """Generate TTS audio for a word with silence padding"""
1025
  if word in tts_cache:
1026
  return tts_cache[word]
1027
 
 
1033
  audio_data += chunk["data"]
1034
 
1035
  if audio_data:
1036
+ # Add silence padding to TTS audio as well
1037
+ # First decode the MP3 to get raw audio
1038
+ import tempfile
1039
+ with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
1040
+ tmp_mp3.write(audio_data)
1041
+ tmp_mp3_path = tmp_mp3.name
1042
+
1043
+ try:
1044
+ # Load the TTS audio
1045
+ tts_waveform, tts_sample_rate = torchaudio.load(tmp_mp3_path)
1046
+
1047
+ # Resample if needed to match our standard rate
1048
+ if tts_sample_rate != 16000:
1049
+ tts_waveform = torchaudio.transforms.Resample(tts_sample_rate, 16000)(tts_waveform)
1050
+ tts_sample_rate = 16000
1051
+
1052
+ # Add 0.25s silence padding on each end
1053
+ padding_samples = int(0.25 * tts_sample_rate)
1054
+ silence_shape = list(tts_waveform.shape)
1055
+ silence_shape[-1] = padding_samples
1056
+ silence_padding = torch.zeros(silence_shape)
1057
+
1058
+ # Concatenate: silence + audio + silence
1059
+ padded_waveform = torch.cat([silence_padding, tts_waveform, silence_padding], dim=-1)
1060
+
1061
+ # Convert back to base64
1062
+ buffer = io.BytesIO()
1063
+ torchaudio.save(buffer, padded_waveform, tts_sample_rate, format="wav")
1064
+ buffer.seek(0)
1065
+ audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
1066
+
1067
+ tts_cache[word] = audio_b64
1068
+ log(f"πŸ”‡ TTS for '{word}': Added 0.25s silence padding on each end")
1069
+ return audio_b64
1070
+
1071
+ finally:
1072
+ # Clean up temp file
1073
+ if os.path.exists(tmp_mp3_path):
1074
+ os.remove(tmp_mp3_path)
1075
+
1076
  except Exception as e:
1077
  log(f"TTS failed for '{word}': {e}")
1078
 
1079
  return ""
1080
 
1081
+ def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int, add_padding: bool = True) -> str:
1082
+ """
1083
+ Convert audio tensor to base64 string.
1084
+
1085
+ Args:
1086
+ audio_segment: The audio tensor to convert
1087
+ sample_rate: Sample rate of the audio
1088
+ add_padding: If True, adds 0.25s of silence on each end to prevent audio processor lag
1089
+
1090
+ Returns:
1091
+ Base64 encoded audio string
1092
+ """
1093
  try:
1094
+ if add_padding:
1095
+ # Add 0.25 seconds of silence on each end
1096
+ padding_samples = int(0.25 * sample_rate) # 0.25 seconds worth of samples
1097
+
1098
+ # Create silence padding (zeros with same shape as audio segment)
1099
+ silence_shape = list(audio_segment.shape)
1100
+ silence_shape[-1] = padding_samples
1101
+ silence_padding = torch.zeros(silence_shape)
1102
+
1103
+ # Concatenate: silence + audio + silence
1104
+ padded_segment = torch.cat([silence_padding, audio_segment, silence_padding], dim=-1)
1105
+
1106
+ log(f"πŸ”‡ Added silence padding: {padding_samples} samples (0.25s) on each end")
1107
+ log(f" Original: {audio_segment.shape[-1]} samples β†’ Padded: {padded_segment.shape[-1]} samples")
1108
+
1109
+ audio_segment = padded_segment
1110
+
1111
  buffer = io.BytesIO()
1112
  torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
1113
  buffer.seek(0)
 
1464
  })
1465
 
1466
  # Prepare audio data with all three segments (use ORIGINAL word for display)
1467
+ # All three audio segments will have 0.25s silence padding added automatically
1468
+ user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate) # Padded
1469
+ whisperx_audio_b64 = audio_to_base64(whisperx_audio_segment, sample_rate) # Padded
1470
+ expected_audio_b64 = tts_results[i] # Already padded in generate_tts_audio
1471
 
1472
  audio_data_list.append({
1473
  "word": word_original, # Original with punctuation for display
1474
+ "expected_audio": expected_audio_b64, # TTS with padding
1475
+ "user_audio": user_audio_b64, # User's pronunciation with padding
1476
+ "whisperx_audio": whisperx_audio_b64, # WhisperX original with padding
1477
  "start_time": float(start_time),
1478
  "end_time": float(end_time),
1479
  "similarity_score": float(similarity_score),