greg0rs committed on
Commit
c72beff
·
verified ·
1 Parent(s): 40cc16d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -21
app.py CHANGED
@@ -373,11 +373,6 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
373
  Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
374
  Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
375
  """
376
- # Add this right after the function starts
377
- log(f"πŸ” AUDIO VERIFICATION for '{word}':")
378
- log(f" Full segment shape: {audio_segment.shape}")
379
- log(f" Full segment first 3 samples: {audio_segment.squeeze()[:3].tolist()}")
380
- log(f" Full segment last 3 samples: {audio_segment.squeeze()[-3:].tolist()}")
381
  log(f"πŸ” detect_word_boundary_overlap called for '{word}':")
382
  log(f" Input audio_segment shape: {audio_segment.shape}")
383
  log(f" Sample rate: {sample_rate}")
@@ -388,30 +383,27 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
388
  return 0.0
389
 
390
  # Analyze only first 1/3 of segment
391
- first_third_samples = audio_segment.shape[1] // 3
392
  log(f" Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
393
  log(f" First third duration: {first_third_samples / sample_rate:.3f}s")
394
 
395
  if first_third_samples < sample_rate * 0.1: # Less than 100ms total
396
  log(f" First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
397
  return 0.0
398
-
399
- # Add this right before: first_third = audio_segment[:, :first_third_samples]
400
- log(f" PRE-EXTRACTION CHECK:")
401
- log(f" audio_segment[:3]: {audio_segment.squeeze()[:3].tolist()}")
402
- log(f" About to extract first_third_samples: {first_third_samples}")
403
-
404
  first_third = audio_segment[:, :first_third_samples]
405
-
406
- log(f" POST-EXTRACTION CHECK:")
407
- log(f" first_third[:3]: {first_third.squeeze()[:3].tolist()}")
408
- log(f" Are these the same? {torch.equal(audio_segment.squeeze()[:3], first_third.squeeze()[:3])}")
409
  log(f" Extracted first_third shape: {first_third.shape}")
410
- log(f" CONSISTENCY CHECK:")
411
- log(f" Full segment first 3: {audio_segment.squeeze()[:3].tolist()}")
412
- log(f" First third first 3: {first_third.squeeze()[:3].tolist()}")
413
- log(f" These should be identical!")
414
-
 
 
 
 
 
 
415
  # Calculate energy in small windows (50ms chunks)
416
  window_size = int(0.05 * sample_rate) # 50ms windows
417
  log(f" Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")
 
373
  Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
374
  Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
375
  """
 
 
 
 
 
376
  log(f"πŸ” detect_word_boundary_overlap called for '{word}':")
377
  log(f" Input audio_segment shape: {audio_segment.shape}")
378
  log(f" Sample rate: {sample_rate}")
 
383
  return 0.0
384
 
385
  # Analyze only first 1/3 of segment
386
+ first_third_samples = audio_segment.shape[-1] // 3
387
  log(f" Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
388
  log(f" First third duration: {first_third_samples / sample_rate:.3f}s")
389
 
390
  if first_third_samples < sample_rate * 0.1: # Less than 100ms total
391
  log(f" First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
392
  return 0.0
393
+
 
 
 
 
 
394
  first_third = audio_segment[:, :first_third_samples]
 
 
 
 
395
  log(f" Extracted first_third shape: {first_third.shape}")
396
+
397
+ # BRUTE FORCE TEST: Flip the first third and see if it fixes the problem
398
+ log(f" BEFORE FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
399
+ log(f" BEFORE FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
400
+
401
+ first_third = torch.flip(first_third, [-1]) # Flip along the sample dimension
402
+
403
+ log(f" AFTER FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
404
+ log(f" AFTER FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
405
+ log(f" πŸ”„ FLIPPED the first third audio segment!")
406
+
407
  # Calculate energy in small windows (50ms chunks)
408
  window_size = int(0.05 * sample_rate) # 50ms windows
409
  log(f" Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")