Gabriel Bibb贸 commited on
Commit
5e8e920
1 Parent(s): 301bbc8

adjust app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -9
app.py CHANGED
@@ -695,7 +695,7 @@ class AudioProcessor:
695
  }
696
 
697
  self.delay_compensation = 0.0
698
- self.correlation_threshold = 0.7
699
 
700
  def process_audio(self, audio):
701
  if audio is None:
@@ -1005,8 +1005,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
1005
  common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
1006
 
1007
  if len(model_a_data['times']) > 1:
1008
- # Interpolate to common grid for smooth visualization
1009
- interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'], left=0, right=0)
 
 
1010
  fig.add_trace(
1011
  go.Scatter(
1012
  x=common_times,
@@ -1035,8 +1037,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
1035
  )
1036
 
1037
  if len(model_b_data['times']) > 1:
1038
- # Interpolate to common grid for smooth visualization
1039
- interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'], left=0, right=0)
 
 
1040
  fig.add_trace(
1041
  go.Scatter(
1042
  x=common_times,
@@ -1239,8 +1243,8 @@ class VADDemo:
1239
  audio_duration = len(processed_audio) / self.processor.sample_rate
1240
 
1241
  for i in range(0, len(processed_audio), hop_samples):
1242
- # CAMBIO 1: Timestamp en el centro de la ventana
1243
- timestamp = (i + window_samples // 2) / self.processor.sample_rate
1244
 
1245
  # CRITICAL: Extract the chunk centered on this timestamp
1246
  start_pos = max(0, i - window_samples // 2)
@@ -1257,7 +1261,7 @@ class VADDemo:
1257
  continue # Skip heavily padded chunks
1258
 
1259
  if window_count < 3: # Log first 3 windows
1260
- debug_info.append(f" 馃攧 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
1261
 
1262
  # Call predict with the chunk
1263
  result = self.models[model_name].predict(chunk, timestamp)
@@ -1462,7 +1466,7 @@ def create_interface():
1462
  ---
1463
  **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
1464
 
1465
- **Note**: Optimized temporal alignment with delay compensation, reflection padding, and interpolated visualization for precise speech detection.
1466
  """)
1467
 
1468
  return interface
 
695
  }
696
 
697
  self.delay_compensation = 0.0
698
+ self.correlation_threshold = 0.5 # REDUCED: More sensitive delay detection
699
 
700
  def process_audio(self, audio):
701
  if audio is None:
 
1005
  common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
1006
 
1007
  if len(model_a_data['times']) > 1:
1008
+ # IMPROVED: Use first probability for extrapolation instead of 0
1009
+ first_prob_a = model_a_data['probs'][0]
1010
+ interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'],
1011
+ left=first_prob_a, right=model_a_data['probs'][-1])
1012
  fig.add_trace(
1013
  go.Scatter(
1014
  x=common_times,
 
1037
  )
1038
 
1039
  if len(model_b_data['times']) > 1:
1040
+ # IMPROVED: Use first probability for extrapolation instead of 0
1041
+ first_prob_b = model_b_data['probs'][0]
1042
+ interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'],
1043
+ left=first_prob_b, right=model_b_data['probs'][-1])
1044
  fig.add_trace(
1045
  go.Scatter(
1046
  x=common_times,
 
1243
  audio_duration = len(processed_audio) / self.processor.sample_rate
1244
 
1245
  for i in range(0, len(processed_audio), hop_samples):
1246
+ # CORRECTED: Timestamp at START of window to align with spectrogram from 0s
1247
+ timestamp = i / self.processor.sample_rate
1248
 
1249
  # CRITICAL: Extract the chunk centered on this timestamp
1250
  start_pos = max(0, i - window_samples // 2)
 
1261
  continue # Skip heavily padded chunks
1262
 
1263
  if window_count < 3: # Log first 3 windows
1264
+ debug_info.append(f" 馃攧 Window {window_count}: t={timestamp:.2f}s (start), chunk_size={len(chunk)}")
1265
 
1266
  # Call predict with the chunk
1267
  result = self.models[model_name].predict(chunk, timestamp)
 
1466
  ---
1467
  **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
1468
 
1469
+ **Note**: Perfect temporal alignment achieved - prediction curves now start from 0s and align precisely with spectrogram features.
1470
  """)
1471
 
1472
  return interface