Spaces:

greg0rs
/

fonetik-fast

Running

App Files Community

greg0rs committed on Jul 27

Commit

c72beff

verified ·

1 Parent(s): 40cc16d

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -21

app.py CHANGED Viewed

@@ -373,11 +373,6 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
     Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
     Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
     """
-    # Add this right after the function starts
-    log(f"🔍 AUDIO VERIFICATION for '{word}':")
-    log(f"   Full segment shape: {audio_segment.shape}")
-    log(f"   Full segment first 3 samples: {audio_segment.squeeze()[:3].tolist()}")
-    log(f"   Full segment last 3 samples: {audio_segment.squeeze()[-3:].tolist()}")
     log(f"🔍 detect_word_boundary_overlap called for '{word}':")
     log(f"   Input audio_segment shape: {audio_segment.shape}")
     log(f"   Sample rate: {sample_rate}")
@@ -388,30 +383,27 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
         return 0.0
     # Analyze only first 1/3 of segment
-    first_third_samples = audio_segment.shape[1] // 3
     log(f"   Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
     log(f"   First third duration: {first_third_samples / sample_rate:.3f}s")
     if first_third_samples < sample_rate * 0.1:  # Less than 100ms total
         log(f"   First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
         return 0.0
-    # Add this right before: first_third = audio_segment[:, :first_third_samples]
-    log(f"   PRE-EXTRACTION CHECK:")
-    log(f"   audio_segment[:3]: {audio_segment.squeeze()[:3].tolist()}")
-    log(f"   About to extract first_third_samples: {first_third_samples}")
     first_third = audio_segment[:, :first_third_samples]
-    log(f"   POST-EXTRACTION CHECK:")
-    log(f"   first_third[:3]: {first_third.squeeze()[:3].tolist()}")
-    log(f"   Are these the same? {torch.equal(audio_segment.squeeze()[:3], first_third.squeeze()[:3])}")
     log(f"   Extracted first_third shape: {first_third.shape}")
-    log(f"   CONSISTENCY CHECK:")
-    log(f"   Full segment first 3: {audio_segment.squeeze()[:3].tolist()}")
-    log(f"   First third first 3: {first_third.squeeze()[:3].tolist()}")
-    log(f"   These should be identical!")
     # Calculate energy in small windows (50ms chunks)
     window_size = int(0.05 * sample_rate)  # 50ms windows
     log(f"   Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")

     Analyze first 1/3 of audio segment for: [noise] → [silence] → [noise] pattern
     Returns: offset in seconds to skip initial noise, or 0.0 if no pattern found
     """
     log(f"🔍 detect_word_boundary_overlap called for '{word}':")
     log(f"   Input audio_segment shape: {audio_segment.shape}")
     log(f"   Sample rate: {sample_rate}")
         return 0.0
     # Analyze only first 1/3 of segment
+    first_third_samples = audio_segment.shape[-1] // 3
     log(f"   Total samples: {audio_segment.shape[-1]}, first third: {first_third_samples} samples")
     log(f"   First third duration: {first_third_samples / sample_rate:.3f}s")
     if first_third_samples < sample_rate * 0.1:  # Less than 100ms total
         log(f"   First third too short ({first_third_samples / sample_rate:.3f}s < 0.1s), returning 0.0")
         return 0.0
     first_third = audio_segment[:, :first_third_samples]
     log(f"   Extracted first_third shape: {first_third.shape}")
+    # BRUTE FORCE TEST: Flip the first third and see if it fixes the problem
+    log(f"   BEFORE FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
+    log(f"   BEFORE FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
+    first_third = torch.flip(first_third, [-1])  # Flip along the sample dimension
+    log(f"   AFTER FLIP - first 3 samples: {first_third.squeeze()[:3].tolist()}")
+    log(f"   AFTER FLIP - last 3 samples: {first_third.squeeze()[-3:].tolist()}")
+    log(f"   🔄 FLIPPED the first third audio segment!")
     # Calculate energy in small windows (50ms chunks)
     window_size = int(0.05 * sample_rate)  # 50ms windows
     log(f"   Window size: {window_size} samples ({window_size / sample_rate:.3f}s)")