Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -373,11 +373,6 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
|
|
373 |
Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
|
374 |
Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
|
375 |
"""
|
376 |
-
# Add this right after the function starts
|
377 |
-
log(f"π AUDIO VERIFICATION for '{word}':")
|
378 |
-
log(f" Full segment shape: {audio_segment.shape}")
|
379 |
-
log(f" Full segment first 3 samples: {audio_segment.squeeze()[:3].tolist()}")
|
380 |
-
log(f" Full segment last 3 samples: {audio_segment.squeeze()[-3:].tolist()}")
|
381 |
log(f"π detect_word_boundary_overlap called for '{word}':")
|
382 |
log(f" Input audio_segment shape: {audio_segment.shape}")
|
383 |
log(f" Sample rate: {sample_rate}")
|
@@ -388,30 +383,27 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
|
|
388 |
return 0.0
|
389 |
|
390 |
# Analyze only first 1/3 of segment
|
391 |
-
first_third_samples = audio_segment.shape[1] // 3
|
392 |
log(f" Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
|
393 |
log(f" First third duration: {first_third_samples / sample_rate:.3f}s")
|
394 |
|
395 |
if first_third_samples < sample_rate * 0.1: # Less than 100ms total
|
396 |
log(f" First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
|
397 |
return 0.0
|
398 |
-
|
399 |
-
# Add this right before: first_third = audio_segment[:, :first_third_samples]
|
400 |
-
log(f" PRE-EXTRACTION CHECK:")
|
401 |
-
log(f" audio_segment[:3]: {audio_segment.squeeze()[:3].tolist()}")
|
402 |
-
log(f" About to extract first_third_samples: {first_third_samples}")
|
403 |
-
|
404 |
first_third = audio_segment[:, :first_third_samples]
|
405 |
-
|
406 |
-
log(f" POST-EXTRACTION CHECK:")
|
407 |
-
log(f" first_third[:3]: {first_third.squeeze()[:3].tolist()}")
|
408 |
-
log(f" Are these the same? {torch.equal(audio_segment.squeeze()[:3], first_third.squeeze()[:3])}")
|
409 |
log(f" Extracted first_third shape: {first_third.shape}")
|
410 |
-
|
411 |
-
|
412 |
-
log(f"
|
413 |
-
log(f"
|
414 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
415 |
# Calculate energy in small windows (50ms chunks)
|
416 |
window_size = int(0.05 * sample_rate) # 50ms windows
|
417 |
log(f" Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")
|
|
|
373 |
Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
|
374 |
Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
|
375 |
"""
|
|
|
|
|
|
|
|
|
|
|
376 |
log(f"π detect_word_boundary_overlap called for '{word}':")
|
377 |
log(f" Input audio_segment shape: {audio_segment.shape}")
|
378 |
log(f" Sample rate: {sample_rate}")
|
|
|
383 |
return 0.0
|
384 |
|
385 |
# Analyze only first 1/3 of segment
|
386 |
+
first_third_samples = audio_segment.shape[-1] // 3
|
387 |
log(f" Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
|
388 |
log(f" First third duration: {first_third_samples / sample_rate:.3f}s")
|
389 |
|
390 |
if first_third_samples < sample_rate * 0.1: # Less than 100ms total
|
391 |
log(f" First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
|
392 |
return 0.0
|
393 |
+
|
|
|
|
|
|
|
|
|
|
|
394 |
first_third = audio_segment[:, :first_third_samples]
|
|
|
|
|
|
|
|
|
395 |
log(f" Extracted first_third shape: {first_third.shape}")
|
396 |
+
|
397 |
+
# BRUTE FORCE TEST: Flip the first third and see if it fixes the problem
|
398 |
+
log(f" BEFORE FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
|
399 |
+
log(f" BEFORE FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
|
400 |
+
|
401 |
+
first_third = torch.flip(first_third, [-1]) # Flip along the sample dimension
|
402 |
+
|
403 |
+
log(f" AFTER FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
|
404 |
+
log(f" AFTER FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
|
405 |
+
log(f" π FLIPPED the first third audio segment!")
|
406 |
+
|
407 |
# Calculate energy in small windows (50ms chunks)
|
408 |
window_size = int(0.05 * sample_rate) # 50ms windows
|
409 |
log(f" Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")
|