greg0rs committed on
Commit
6717298
·
verified ·
1 Parent(s): 3b421b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +36 -6
app.py CHANGED
@@ -398,15 +398,45 @@ def detect_word_boundary_overlap(audio_segment: torch.Tensor, sample_rate: int,
398
  if len(energy_levels) < 3:
399
  return 0.0
400
 
401
- # Look for pattern: [high energy] → [low energy] → [high energy]
402
- silence_threshold = np.percentile(energy_levels, 20) # Bottom 20%
403
- noise_threshold = silence_threshold * 3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
404
 
405
- log(f"πŸ” Boundary analysis for '{word}': {len(energy_levels)} windows, silence_thresh={silence_threshold:.6f}, noise_thresh={noise_threshold:.6f}")
 
 
406
 
407
- # Log each energy window for detailed analysis
 
 
 
 
 
408
  for i, energy in enumerate(energy_levels):
409
- window_type = "NOISE" if energy > noise_threshold else ("SILENCE" if energy < silence_threshold else "MID")
 
 
 
 
 
410
  log(f" Window {i}: energy={energy:.6f} [{window_type}]")
411
 
412
  # Find sustained silence (2+ consecutive low-energy windows)
 
398
  if len(energy_levels) < 3:
399
  return 0.0
400
 
401
+ # FIXED: Use absolute thresholds that work with real audio levels
402
+ # Convert to logarithmic scale to better handle wide dynamic range
403
+ log_energies = []
404
+ for energy in energy_levels:
405
+ # Convert to dB-like scale, with floor to prevent log(0)
406
+ log_energy = 10 * np.log10(max(energy, 1e-10))
407
+ log_energies.append(log_energy)
408
+
409
+ # Calculate thresholds in log domain
410
+ min_log = min(log_energies)
411
+ max_log = max(log_energies)
412
+ dynamic_range = max_log - min_log
413
+
414
+ # If dynamic range is too small, everything is similar energy - no clear pattern
415
+ if dynamic_range < 6: # Less than 6dB difference
416
+ log(f"πŸ” Boundary analysis for '{word}': insufficient dynamic range ({dynamic_range:.1f}dB)")
417
+ return 0.0
418
+
419
+ # Set thresholds: silence is bottom 25% of range, noise is top 50%
420
+ silence_threshold_log = min_log + dynamic_range * 0.25
421
+ noise_threshold_log = min_log + dynamic_range * 0.5
422
 
423
+ # Convert back to linear for comparison
424
+ silence_threshold = 10 ** (silence_threshold_log / 10)
425
+ noise_threshold = 10 ** (noise_threshold_log / 10)
426
 
427
+ log(f"πŸ” Boundary analysis for '{word}': {len(energy_levels)} windows")
428
+ log(f" Energy range: {min(energy_levels):.6f} to {max(energy_levels):.6f}")
429
+ log(f" Log range: {min_log:.1f}dB to {max_log:.1f}dB (dynamic range: {dynamic_range:.1f}dB)")
430
+ log(f" Thresholds: silence={silence_threshold:.6f}, noise={noise_threshold:.6f}")
431
+
432
+ # Classify windows with new thresholds
433
  for i, energy in enumerate(energy_levels):
434
+ if energy > noise_threshold:
435
+ window_type = "NOISE"
436
+ elif energy < silence_threshold:
437
+ window_type = "SILENCE"
438
+ else:
439
+ window_type = "MID"
440
  log(f" Window {i}: energy={energy:.6f} [{window_type}]")
441
 
442
  # Find sustained silence (2+ consecutive low-energy windows)