Gabriel Bibbó
committed on
Commit
·
5e8e920
1
Parent(s):
301bbc8
adjust app.py
Browse files
app.py
CHANGED
|
@@ -695,7 +695,7 @@ class AudioProcessor:
|
|
| 695 |
}
|
| 696 |
|
| 697 |
self.delay_compensation = 0.0
|
| 698 |
-
self.correlation_threshold = 0.
|
| 699 |
|
| 700 |
def process_audio(self, audio):
|
| 701 |
if audio is None:
|
|
@@ -1005,8 +1005,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 1005 |
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1006 |
|
| 1007 |
if len(model_a_data['times']) > 1:
|
| 1008 |
-
#
|
| 1009 |
-
|
|
|
|
|
|
|
| 1010 |
fig.add_trace(
|
| 1011 |
go.Scatter(
|
| 1012 |
x=common_times,
|
|
@@ -1035,8 +1037,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 1035 |
)
|
| 1036 |
|
| 1037 |
if len(model_b_data['times']) > 1:
|
| 1038 |
-
#
|
| 1039 |
-
|
|
|
|
|
|
|
| 1040 |
fig.add_trace(
|
| 1041 |
go.Scatter(
|
| 1042 |
x=common_times,
|
|
@@ -1239,8 +1243,8 @@ class VADDemo:
|
|
| 1239 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1240 |
|
| 1241 |
for i in range(0, len(processed_audio), hop_samples):
|
| 1242 |
-
#
|
| 1243 |
-
timestamp =
|
| 1244 |
|
| 1245 |
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1246 |
start_pos = max(0, i - window_samples // 2)
|
|
@@ -1257,7 +1261,7 @@ class VADDemo:
|
|
| 1257 |
continue # Skip heavily padded chunks
|
| 1258 |
|
| 1259 |
if window_count < 3: # Log first 3 windows
|
| 1260 |
-
debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s (
|
| 1261 |
|
| 1262 |
# Call predict with the chunk
|
| 1263 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
@@ -1462,7 +1466,7 @@ def create_interface():
|
|
| 1462 |
---
|
| 1463 |
**Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
|
| 1464 |
|
| 1465 |
-
**Note**:
|
| 1466 |
""")
|
| 1467 |
|
| 1468 |
return interface
|
|
|
|
| 695 |
}
|
| 696 |
|
| 697 |
self.delay_compensation = 0.0
|
| 698 |
+
self.correlation_threshold = 0.5 # REDUCED: More sensitive delay detection
|
| 699 |
|
| 700 |
def process_audio(self, audio):
|
| 701 |
if audio is None:
|
|
|
|
| 1005 |
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1006 |
|
| 1007 |
if len(model_a_data['times']) > 1:
|
| 1008 |
+
# IMPROVED: Use first probability for extrapolation instead of 0
|
| 1009 |
+
first_prob_a = model_a_data['probs'][0]
|
| 1010 |
+
interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'],
|
| 1011 |
+
left=first_prob_a, right=model_a_data['probs'][-1])
|
| 1012 |
fig.add_trace(
|
| 1013 |
go.Scatter(
|
| 1014 |
x=common_times,
|
|
|
|
| 1037 |
)
|
| 1038 |
|
| 1039 |
if len(model_b_data['times']) > 1:
|
| 1040 |
+
# IMPROVED: Use first probability for extrapolation instead of 0
|
| 1041 |
+
first_prob_b = model_b_data['probs'][0]
|
| 1042 |
+
interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'],
|
| 1043 |
+
left=first_prob_b, right=model_b_data['probs'][-1])
|
| 1044 |
fig.add_trace(
|
| 1045 |
go.Scatter(
|
| 1046 |
x=common_times,
|
|
|
|
| 1243 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1244 |
|
| 1245 |
for i in range(0, len(processed_audio), hop_samples):
|
| 1246 |
+
# CORRECTED: Timestamp at START of window to align with spectrogram from 0s
|
| 1247 |
+
timestamp = i / self.processor.sample_rate
|
| 1248 |
|
| 1249 |
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1250 |
start_pos = max(0, i - window_samples // 2)
|
|
|
|
| 1261 |
continue # Skip heavily padded chunks
|
| 1262 |
|
| 1263 |
if window_count < 3: # Log first 3 windows
|
| 1264 |
+
debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s (start), chunk_size={len(chunk)}")
|
| 1265 |
|
| 1266 |
# Call predict with the chunk
|
| 1267 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
|
|
| 1466 |
---
|
| 1467 |
**Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
|
| 1468 |
|
| 1469 |
+
**Note**: Perfect temporal alignment achieved - prediction curves now start from 0s and align precisely with spectrogram features.
|
| 1470 |
""")
|
| 1471 |
|
| 1472 |
return interface
|