Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -11,6 +11,7 @@ import string
|
|
11 |
import re
|
12 |
import urllib.request
|
13 |
import gzip
|
|
|
14 |
|
15 |
# Set cache environment
|
16 |
os.environ['HF_HOME'] = '/tmp/hf'
|
@@ -1020,7 +1021,7 @@ def get_expected_phonemes(words: List[str]) -> List[str]:
|
|
1020 |
return results
|
1021 |
|
1022 |
async def generate_tts_audio(word: str) -> str:
|
1023 |
-
"""Generate TTS audio for a word"""
|
1024 |
if word in tts_cache:
|
1025 |
return tts_cache[word]
|
1026 |
|
@@ -1032,17 +1033,81 @@ async def generate_tts_audio(word: str) -> str:
|
|
1032 |
audio_data += chunk["data"]
|
1033 |
|
1034 |
if audio_data:
|
1035 |
-
|
1036 |
-
|
1037 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1038 |
except Exception as e:
|
1039 |
log(f"TTS failed for '{word}': {e}")
|
1040 |
|
1041 |
return ""
|
1042 |
|
1043 |
-
def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int) -> str:
|
1044 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1045 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1046 |
buffer = io.BytesIO()
|
1047 |
torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
|
1048 |
buffer.seek(0)
|
@@ -1399,15 +1464,16 @@ async def transcribe(audio: UploadFile = File(...), similarity_threshold: float
|
|
1399 |
})
|
1400 |
|
1401 |
# Prepare audio data with all three segments (use ORIGINAL word for display)
|
1402 |
-
|
1403 |
-
|
1404 |
-
|
|
|
1405 |
|
1406 |
audio_data_list.append({
|
1407 |
"word": word_original, # Original with punctuation for display
|
1408 |
-
"expected_audio": expected_audio_b64,
|
1409 |
-
"user_audio": user_audio_b64,
|
1410 |
-
"whisperx_audio": whisperx_audio_b64, #
|
1411 |
"start_time": float(start_time),
|
1412 |
"end_time": float(end_time),
|
1413 |
"similarity_score": float(similarity_score),
|
|
|
11 |
import re
|
12 |
import urllib.request
|
13 |
import gzip
|
14 |
+
import tempfile
|
15 |
|
16 |
# Set cache environment
|
17 |
os.environ['HF_HOME'] = '/tmp/hf'
|
|
|
1021 |
return results
|
1022 |
|
1023 |
async def generate_tts_audio(word: str) -> str:
|
1024 |
+
"""Generate TTS audio for a word with silence padding"""
|
1025 |
if word in tts_cache:
|
1026 |
return tts_cache[word]
|
1027 |
|
|
|
1033 |
audio_data += chunk["data"]
|
1034 |
|
1035 |
if audio_data:
|
1036 |
+
# Add silence padding to TTS audio as well
|
1037 |
+
# First decode the MP3 to get raw audio
|
1038 |
+
import tempfile
|
1039 |
+
with tempfile.NamedTemporaryFile(suffix='.mp3', delete=False) as tmp_mp3:
|
1040 |
+
tmp_mp3.write(audio_data)
|
1041 |
+
tmp_mp3_path = tmp_mp3.name
|
1042 |
+
|
1043 |
+
try:
|
1044 |
+
# Load the TTS audio
|
1045 |
+
tts_waveform, tts_sample_rate = torchaudio.load(tmp_mp3_path)
|
1046 |
+
|
1047 |
+
# Resample if needed to match our standard rate
|
1048 |
+
if tts_sample_rate != 16000:
|
1049 |
+
tts_waveform = torchaudio.transforms.Resample(tts_sample_rate, 16000)(tts_waveform)
|
1050 |
+
tts_sample_rate = 16000
|
1051 |
+
|
1052 |
+
# Add 0.25s silence padding on each end
|
1053 |
+
padding_samples = int(0.25 * tts_sample_rate)
|
1054 |
+
silence_shape = list(tts_waveform.shape)
|
1055 |
+
silence_shape[-1] = padding_samples
|
1056 |
+
silence_padding = torch.zeros(silence_shape)
|
1057 |
+
|
1058 |
+
# Concatenate: silence + audio + silence
|
1059 |
+
padded_waveform = torch.cat([silence_padding, tts_waveform, silence_padding], dim=-1)
|
1060 |
+
|
1061 |
+
# Convert back to base64
|
1062 |
+
buffer = io.BytesIO()
|
1063 |
+
torchaudio.save(buffer, padded_waveform, tts_sample_rate, format="wav")
|
1064 |
+
buffer.seek(0)
|
1065 |
+
audio_b64 = base64.b64encode(buffer.read()).decode('utf-8')
|
1066 |
+
|
1067 |
+
tts_cache[word] = audio_b64
|
1068 |
+
log(f"π TTS for '{word}': Added 0.25s silence padding on each end")
|
1069 |
+
return audio_b64
|
1070 |
+
|
1071 |
+
finally:
|
1072 |
+
# Clean up temp file
|
1073 |
+
if os.path.exists(tmp_mp3_path):
|
1074 |
+
os.remove(tmp_mp3_path)
|
1075 |
+
|
1076 |
except Exception as e:
|
1077 |
log(f"TTS failed for '{word}': {e}")
|
1078 |
|
1079 |
return ""
|
1080 |
|
1081 |
+
def audio_to_base64(audio_segment: torch.Tensor, sample_rate: int, add_padding: bool = True) -> str:
|
1082 |
+
"""
|
1083 |
+
Convert audio tensor to base64 string.
|
1084 |
+
|
1085 |
+
Args:
|
1086 |
+
audio_segment: The audio tensor to convert
|
1087 |
+
sample_rate: Sample rate of the audio
|
1088 |
+
add_padding: If True, adds 0.25s of silence on each end to prevent audio processor lag
|
1089 |
+
|
1090 |
+
Returns:
|
1091 |
+
Base64 encoded audio string
|
1092 |
+
"""
|
1093 |
try:
|
1094 |
+
if add_padding:
|
1095 |
+
# Add 0.25 seconds of silence on each end
|
1096 |
+
padding_samples = int(0.25 * sample_rate) # 0.25 seconds worth of samples
|
1097 |
+
|
1098 |
+
# Create silence padding (zeros with same shape as audio segment)
|
1099 |
+
silence_shape = list(audio_segment.shape)
|
1100 |
+
silence_shape[-1] = padding_samples
|
1101 |
+
silence_padding = torch.zeros(silence_shape)
|
1102 |
+
|
1103 |
+
# Concatenate: silence + audio + silence
|
1104 |
+
padded_segment = torch.cat([silence_padding, audio_segment, silence_padding], dim=-1)
|
1105 |
+
|
1106 |
+
log(f"π Added silence padding: {padding_samples} samples (0.25s) on each end")
|
1107 |
+
log(f" Original: {audio_segment.shape[-1]} samples → Padded: {padded_segment.shape[-1]} samples")
|
1108 |
+
|
1109 |
+
audio_segment = padded_segment
|
1110 |
+
|
1111 |
buffer = io.BytesIO()
|
1112 |
torchaudio.save(buffer, audio_segment, sample_rate, format="wav")
|
1113 |
buffer.seek(0)
|
|
|
1464 |
})
|
1465 |
|
1466 |
# Prepare audio data with all three segments (use ORIGINAL word for display)
|
1467 |
+
# All three audio segments will have 0.25s silence padding added automatically
|
1468 |
+
user_audio_b64 = audio_to_base64(final_audio_segment, sample_rate) # Padded
|
1469 |
+
whisperx_audio_b64 = audio_to_base64(whisperx_audio_segment, sample_rate) # Padded
|
1470 |
+
expected_audio_b64 = tts_results[i] # Already padded in generate_tts_audio
|
1471 |
|
1472 |
audio_data_list.append({
|
1473 |
"word": word_original, # Original with punctuation for display
|
1474 |
+
"expected_audio": expected_audio_b64, # TTS with padding
|
1475 |
+
"user_audio": user_audio_b64, # User's pronunciation with padding
|
1476 |
+
"whisperx_audio": whisperx_audio_b64, # WhisperX original with padding
|
1477 |
"start_time": float(start_time),
|
1478 |
"end_time": float(end_time),
|
1479 |
"similarity_score": float(similarity_score),
|