Gabriel Bibbó
committed on
Commit
·
bd1af2c
1
Parent(s):
dac6057
adjust app.py
Browse files
app.py
CHANGED
|
@@ -679,7 +679,7 @@ class AudioProcessor:
|
|
| 679 |
"WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
|
| 680 |
"E-PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 681 |
"PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 682 |
-
"AST": 0.
|
| 683 |
}
|
| 684 |
|
| 685 |
# Model-specific thresholds for better detection
|
|
@@ -745,9 +745,12 @@ class AudioProcessor:
|
|
| 745 |
mel_spec = np.dot(mel_basis, power_spec)
|
| 746 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 747 |
|
| 748 |
-
# CAMBIO 2: Usar librosa.frames_to_time para consistencia con center=
|
| 749 |
frames = np.arange(mel_spec_db.shape[1])
|
| 750 |
-
time_frames = librosa.frames_to_time(
|
|
|
|
|
|
|
|
|
|
| 751 |
|
| 752 |
return mel_spec_db, time_frames
|
| 753 |
else:
|
|
@@ -995,35 +998,69 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 995 |
model_b_data['times'].append(result.timestamp)
|
| 996 |
model_b_data['probs'].append(result.probability)
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
|
| 1028 |
model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
|
| 1029 |
model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]
|
|
@@ -1208,9 +1245,14 @@ class VADDemo:
|
|
| 1208 |
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1209 |
chunk = processed_audio[start_pos:end_pos]
|
| 1210 |
|
| 1211 |
-
# Pad if necessary (with
|
| 1212 |
if len(chunk) < window_samples:
|
| 1213 |
-
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1214 |
|
| 1215 |
if window_count < 3: # Log first 3 windows
|
| 1216 |
debug_info.append(f"  🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
|
@@ -1243,9 +1285,13 @@ class VADDemo:
|
|
| 1243 |
|
| 1244 |
delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
|
| 1245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1246 |
# CORRECTED: Use global threshold with delay compensation and min duration
|
| 1247 |
onsets_offsets = self.processor.detect_onset_offset_advanced(
|
| 1248 |
-
vad_results, threshold, apply_delay=
|
| 1249 |
)
|
| 1250 |
|
| 1251 |
debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
|
|
|
|
| 679 |
"WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
|
| 680 |
"E-PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 681 |
"PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 682 |
+
"AST": 0.1 # IMPROVED: Better resolution (10 Hz) while maintaining performance
|
| 683 |
}
|
| 684 |
|
| 685 |
# Model-specific thresholds for better detection
|
|
|
|
| 745 |
mel_spec = np.dot(mel_basis, power_spec)
|
| 746 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 747 |
|
| 748 |
+
# CAMBIO 2: Usar librosa.frames_to_time para consistencia con center=False
|
| 749 |
frames = np.arange(mel_spec_db.shape[1])
|
| 750 |
+
time_frames = librosa.frames_to_time(
|
| 751 |
+
frames, sr=self.sample_rate, hop_length=self.hop_length,
|
| 752 |
+
n_fft=self.n_fft, initial_time=0.0 # CRITICAL: Fix offset for center=False
|
| 753 |
+
)
|
| 754 |
|
| 755 |
return mel_spec_db, time_frames
|
| 756 |
else:
|
|
|
|
| 998 |
model_b_data['times'].append(result.timestamp)
|
| 999 |
model_b_data['probs'].append(result.probability)
|
| 1000 |
|
| 1001 |
+
# IMPROVEMENT: Use common high-resolution time grid for better alignment
|
| 1002 |
+
if len(time_frames) > 0:
|
| 1003 |
+
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1004 |
+
|
| 1005 |
+
if len(model_a_data['times']) > 1:
|
| 1006 |
+
# Interpolate to common grid for smooth visualization
|
| 1007 |
+
interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'], left=0, right=0)
|
| 1008 |
+
fig.add_trace(
|
| 1009 |
+
go.Scatter(
|
| 1010 |
+
x=common_times,
|
| 1011 |
+
y=interp_probs_a,
|
| 1012 |
+
mode='lines',
|
| 1013 |
+
line=dict(color='yellow', width=3),
|
| 1014 |
+
name=f'{model_a} Probability',
|
| 1015 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1016 |
+
showlegend=True
|
| 1017 |
+
),
|
| 1018 |
+
row=1, col=1, secondary_y=True
|
| 1019 |
+
)
|
| 1020 |
+
elif len(model_a_data['times']) == 1:
|
| 1021 |
+
# Single point fallback
|
| 1022 |
+
fig.add_trace(
|
| 1023 |
+
go.Scatter(
|
| 1024 |
+
x=model_a_data['times'],
|
| 1025 |
+
y=model_a_data['probs'],
|
| 1026 |
+
mode='markers',
|
| 1027 |
+
marker=dict(size=8, color='yellow'),
|
| 1028 |
+
name=f'{model_a} Probability',
|
| 1029 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1030 |
+
showlegend=True
|
| 1031 |
+
),
|
| 1032 |
+
row=1, col=1, secondary_y=True
|
| 1033 |
+
)
|
| 1034 |
+
|
| 1035 |
+
if len(model_b_data['times']) > 1:
|
| 1036 |
+
# Interpolate to common grid for smooth visualization
|
| 1037 |
+
interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'], left=0, right=0)
|
| 1038 |
+
fig.add_trace(
|
| 1039 |
+
go.Scatter(
|
| 1040 |
+
x=common_times,
|
| 1041 |
+
y=interp_probs_b,
|
| 1042 |
+
mode='lines',
|
| 1043 |
+
line=dict(color='orange', width=3),
|
| 1044 |
+
name=f'{model_b} Probability',
|
| 1045 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1046 |
+
showlegend=True
|
| 1047 |
+
),
|
| 1048 |
+
row=2, col=1, secondary_y=True
|
| 1049 |
+
)
|
| 1050 |
+
elif len(model_b_data['times']) == 1:
|
| 1051 |
+
# Single point fallback
|
| 1052 |
+
fig.add_trace(
|
| 1053 |
+
go.Scatter(
|
| 1054 |
+
x=model_b_data['times'],
|
| 1055 |
+
y=model_b_data['probs'],
|
| 1056 |
+
mode='markers',
|
| 1057 |
+
marker=dict(size=8, color='orange'),
|
| 1058 |
+
name=f'{model_b} Probability',
|
| 1059 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1060 |
+
showlegend=True
|
| 1061 |
+
),
|
| 1062 |
+
row=2, col=1, secondary_y=True
|
| 1063 |
+
)
|
| 1064 |
|
| 1065 |
model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
|
| 1066 |
model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]
|
|
|
|
| 1245 |
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1246 |
chunk = processed_audio[start_pos:end_pos]
|
| 1247 |
|
| 1248 |
+
# Pad if necessary (with reflection, not zeros to avoid artificial silence)
|
| 1249 |
if len(chunk) < window_samples:
|
| 1250 |
+
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
|
| 1251 |
+
|
| 1252 |
+
# Skip chunks with excessive padding to avoid skewed predictions
|
| 1253 |
+
padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
|
| 1254 |
+
if padding_ratio > 0.5:
|
| 1255 |
+
continue # Skip heavily padded chunks
|
| 1256 |
|
| 1257 |
if window_count < 3: # Log first 3 windows
|
| 1258 |
debug_info.append(f"  🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
|
|
|
| 1285 |
|
| 1286 |
delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
|
| 1287 |
|
| 1288 |
+
# CRITICAL: Apply delay compensation to ALL VAD timestamps, not just events
|
| 1289 |
+
for result in vad_results:
|
| 1290 |
+
result.timestamp -= delay_compensation
|
| 1291 |
+
|
| 1292 |
# CORRECTED: Use global threshold with delay compensation and min duration
|
| 1293 |
onsets_offsets = self.processor.detect_onset_offset_advanced(
|
| 1294 |
+
vad_results, threshold, apply_delay=0.0, min_duration=0.12 # delay already applied above
|
| 1295 |
)
|
| 1296 |
|
| 1297 |
debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
|