Gabriel Bibbó committed on
Commit
bd1af2c
1 Parent(s): dac6057

adjust app.py

Files changed (1)
  app.py  +81 -35
app.py CHANGED
@@ -679,7 +679,7 @@ class AudioProcessor:
             "WebRTC-VAD": 0.03,  # 30ms hop for WebRTC (match frame duration)
             "E-PANNs": 0.05,     # CHANGED from 0.1 to 0.05 for 20Hz
             "PANNs": 0.05,       # CHANGED from 0.1 to 0.05 for 20Hz
-            "AST": 0.24  # OPTIMIZED: Reduced frequency (4.17 Hz) for performance
+            "AST": 0.1   # IMPROVED: Better resolution (10 Hz) while maintaining performance
         }
 
         # Model-specific thresholds for better detection
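A quick sanity check on the new hop value: these per-model entries are hop intervals in seconds, so the effective update rate is simply their reciprocal. A minimal sketch (the dict mirrors the diff; the loop is ours, not from app.py):

```python
# Sketch: verify the update rates implied by the hop intervals above.
hop_intervals = {
    "WebRTC-VAD": 0.03,  # 30 ms hop -> ~33.3 Hz
    "E-PANNs": 0.05,     # 50 ms hop -> 20 Hz
    "PANNs": 0.05,       # 50 ms hop -> 20 Hz
    "AST": 0.1,          # 100 ms hop -> 10 Hz (was 0.24 -> ~4.17 Hz)
}

for model, hop in hop_intervals.items():
    print(f"{model}: {1.0 / hop:.2f} predictions per second")
```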
@@ -745,9 +745,12 @@ class AudioProcessor:
             mel_spec = np.dot(mel_basis, power_spec)
             mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
 
-            # CHANGE 2: Use librosa.frames_to_time for consistency with center=True
+            # CHANGE 2: Use librosa.frames_to_time for consistency with center=False
             frames = np.arange(mel_spec_db.shape[1])
-            time_frames = librosa.frames_to_time(frames, sr=self.sample_rate, hop_length=self.hop_length)
+            time_frames = librosa.frames_to_time(
+                frames, sr=self.sample_rate, hop_length=self.hop_length,
+                n_fft=self.n_fft  # CRITICAL: adds the n_fft // 2 center offset for center=False
+            )
 
             return mel_spec_db, time_frames
         else:
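Why n_fft matters here: with center=False, frame m spans samples [m * hop_length, m * hop_length + n_fft), so its center lies n_fft // 2 samples after the frame start, and librosa.frames_to_time adds exactly that offset when n_fft is given. A small sketch of the mapping, with illustrative sr / hop_length / n_fft values (the self.* attributes are not available outside the class):

```python
import numpy as np
import librosa

sr, hop_length, n_fft = 32000, 320, 1024
frames = np.arange(4)

# Without n_fft, librosa maps frame m to m * hop_length / sr (the frame start).
t_start = librosa.frames_to_time(frames, sr=sr, hop_length=hop_length)

# With n_fft, an offset of n_fft // 2 samples is added, i.e. the frame center
# for a non-centered (center=False) STFT.
t_center = librosa.frames_to_time(frames, sr=sr, hop_length=hop_length, n_fft=n_fft)

# Equivalent manual computation of the centered times:
t_manual = (frames * hop_length + n_fft // 2) / sr

print(t_start)   # [0.    0.01  0.02  0.03]
print(t_center)  # each shifted by (n_fft / 2) / sr = 0.016 s
assert np.allclose(t_center, t_manual)
```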
@@ -995,35 +998,69 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
             model_b_data['times'].append(result.timestamp)
             model_b_data['probs'].append(result.probability)
 
-    if len(model_a_data['times']) > 0:
-        fig.add_trace(
-            go.Scatter(
-                x=model_a_data['times'],
-                y=model_a_data['probs'],
-                mode='lines+markers',  # Add markers to show single points
-                line=dict(color='yellow', width=3),
-                marker=dict(size=6, color='yellow'),
-                name=f'{model_a} Probability',
-                hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
-                showlegend=True
-            ),
-            row=1, col=1, secondary_y=True
-        )
-
-    if len(model_b_data['times']) > 0:
-        fig.add_trace(
-            go.Scatter(
-                x=model_b_data['times'],
-                y=model_b_data['probs'],
-                mode='lines+markers',  # Add markers to show single points
-                line=dict(color='orange', width=3),
-                marker=dict(size=6, color='orange'),
-                name=f'{model_b} Probability',
-                hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
-                showlegend=True
-            ),
-            row=2, col=1, secondary_y=True
-        )
+    # IMPROVEMENT: Use common high-resolution time grid for better alignment
+    if len(time_frames) > 0:
+        common_times = np.linspace(0, time_frames[-1], 1000)  # High-res grid
+
+        if len(model_a_data['times']) > 1:
+            # Interpolate to common grid for smooth visualization
+            interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'], left=0, right=0)
+            fig.add_trace(
+                go.Scatter(
+                    x=common_times,
+                    y=interp_probs_a,
+                    mode='lines',
+                    line=dict(color='yellow', width=3),
+                    name=f'{model_a} Probability',
+                    hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
+                    showlegend=True
+                ),
+                row=1, col=1, secondary_y=True
+            )
+        elif len(model_a_data['times']) == 1:
+            # Single point fallback
+            fig.add_trace(
+                go.Scatter(
+                    x=model_a_data['times'],
+                    y=model_a_data['probs'],
+                    mode='markers',
+                    marker=dict(size=8, color='yellow'),
+                    name=f'{model_a} Probability',
+                    hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
+                    showlegend=True
+                ),
+                row=1, col=1, secondary_y=True
+            )
+
+        if len(model_b_data['times']) > 1:
+            # Interpolate to common grid for smooth visualization
+            interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'], left=0, right=0)
+            fig.add_trace(
+                go.Scatter(
+                    x=common_times,
+                    y=interp_probs_b,
+                    mode='lines',
+                    line=dict(color='orange', width=3),
+                    name=f'{model_b} Probability',
+                    hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
+                    showlegend=True
+                ),
+                row=2, col=1, secondary_y=True
+            )
+        elif len(model_b_data['times']) == 1:
+            # Single point fallback
+            fig.add_trace(
+                go.Scatter(
+                    x=model_b_data['times'],
+                    y=model_b_data['probs'],
+                    mode='markers',
+                    marker=dict(size=8, color='orange'),
+                    name=f'{model_b} Probability',
+                    hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
+                    showlegend=True
+                ),
+                row=2, col=1, secondary_y=True
+            )
 
     model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
     model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]
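The interpolation step is what lets two models with different hop sizes share one plot: np.interp resamples each irregular probability track onto the common grid, and left=0 / right=0 pins the curves to zero outside each model's coverage instead of extrapolating. A standalone sketch with made-up values:

```python
import numpy as np

# Two models report probabilities on different, irregular time grids.
times_a = np.array([0.10, 0.15, 0.20, 0.30])
probs_a = np.array([0.2, 0.8, 0.9, 0.4])
times_b = np.array([0.12, 0.24, 0.36])
probs_b = np.array([0.1, 0.7, 0.3])

# Shared high-resolution grid, as in the diff (1000 points over the clip).
common_times = np.linspace(0, 0.4, 1000)

# left=0 / right=0 clamps each curve to zero outside that model's coverage,
# so neither trace extrapolates beyond its first/last prediction.
a_on_grid = np.interp(common_times, times_a, probs_a, left=0, right=0)
b_on_grid = np.interp(common_times, times_b, probs_b, left=0, right=0)

# Now the two traces are directly comparable sample-by-sample.
print(np.abs(a_on_grid - b_on_grid).max())
```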
@@ -1208,9 +1245,14 @@ class VADDemo:
                 end_pos = min(len(processed_audio), start_pos + window_samples)
                 chunk = processed_audio[start_pos:end_pos]
 
-                # Pad if necessary (with zeros, not repeating)
+                # Skip chunks with excessive padding to avoid skewed predictions
+                padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
+                if padding_ratio > 0.5:
+                    continue  # Skip heavily padded chunks
+
+                # Pad if necessary (with reflection, not zeros, to avoid artificial silence)
                 if len(chunk) < window_samples:
-                    chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='constant')
+                    chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
 
                 if window_count < 3:  # Log first 3 windows
                     debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
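Note the ordering here: the padding-ratio check runs before np.pad, so chunks that would be mostly mirrored audio are dropped rather than padded and scored. A self-contained sketch of this windowing logic (prepare_chunk and window_samples=8 are illustrative, not from app.py):

```python
import numpy as np

window_samples = 8

def prepare_chunk(audio, start_pos):
    """Simplified version of the windowing logic in the diff (names ours)."""
    end_pos = min(len(audio), start_pos + window_samples)
    chunk = audio[start_pos:end_pos]

    # Skip first: a chunk that is more than half padding would be dominated
    # by mirrored (non-real) audio, skewing the model's prediction.
    padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
    if padding_ratio > 0.5:
        return None

    if len(chunk) < window_samples:
        # 'reflect' mirrors the tail instead of appending zeros, so no
        # artificial silence is introduced at the clip boundary.
        chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
    return chunk

audio = np.arange(1.0, 14.0)    # 13 "samples"
print(prepare_chunk(audio, 8))  # 5 real samples + 3 mirrored -> kept
print(prepare_chunk(audio, 12)) # 1 real sample -> ratio 0.875 -> None
```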
@@ -1243,9 +1285,13 @@ class VADDemo:
 
             delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
 
+            # CRITICAL: Apply delay compensation to ALL VAD timestamps, not just events
+            for result in vad_results:
+                result.timestamp -= delay_compensation
+
             # CORRECTED: Use global threshold with delay compensation and min duration
             onsets_offsets = self.processor.detect_onset_offset_advanced(
-                vad_results, threshold, apply_delay=delay_compensation, min_duration=0.12
+                vad_results, threshold, apply_delay=0.0, min_duration=0.12  # delay already applied above
             )
 
             debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
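The effect of the new loop is to shift the whole timeline once, up front, instead of compensating only inside the event detector; every downstream consumer (plots, event detection, debug output) then sees the same corrected timestamps. A toy illustration (Result is a stand-in for app.py's VADResult; the delay value is made up):

```python
from dataclasses import dataclass

@dataclass
class Result:  # stand-in for app.py's VADResult, for illustration only
    timestamp: float
    probability: float

results = [Result(t, p) for t, p in [(0.50, 0.1), (0.55, 0.9), (0.60, 0.8)]]
delay_compensation = 0.12  # illustrative; app.py estimates this from the audio

# Shift every timestamp once, up front...
for r in results:
    r.timestamp -= delay_compensation

# ...so the event detector can run with apply_delay=0.0 and still agree
# with the plotted probability curves.
print([round(r.timestamp, 2) for r in results])  # [0.38, 0.43, 0.48]
```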
 