Gabriel Bibbó
committed on
Commit
·
b3b9f78
1
Parent(s):
29ea60e
adjust app.py
Browse files
app.py
CHANGED
|
@@ -766,6 +766,9 @@ class AudioProcessor:
|
|
| 766 |
mode='psd'
|
| 767 |
)
|
| 768 |
|
|
|
|
|
|
|
|
|
|
| 769 |
mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
|
| 770 |
|
| 771 |
mel_freqs = np.logspace(
|
|
@@ -826,8 +829,8 @@ class AudioProcessor:
|
|
| 826 |
# Add virtual start point if first timestamp > 0
|
| 827 |
if rs[0].timestamp > 0:
|
| 828 |
virtual_start = VADResult(
|
| 829 |
-
probability=0.
|
| 830 |
-
is_speech=
|
| 831 |
model_name=base,
|
| 832 |
processing_time=0,
|
| 833 |
timestamp=0.0
|
|
@@ -1271,25 +1274,28 @@ class VADDemo:
|
|
| 1271 |
# CRITICAL FIX: Always extract chunks, both for short and long audio
|
| 1272 |
window_count = 0
|
| 1273 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1274 |
-
audio_len = len(processed_audio)
|
| 1275 |
|
| 1276 |
-
for
|
| 1277 |
-
|
| 1278 |
-
start_pos = max(0,
|
| 1279 |
-
|
|
|
|
| 1280 |
|
| 1281 |
-
|
| 1282 |
-
if
|
| 1283 |
-
chunk = np.pad(chunk, (window_samples -
|
| 1284 |
|
| 1285 |
-
|
|
|
|
| 1286 |
if padding_ratio > 0.5:
|
| 1287 |
continue # Skip heavily padded chunks
|
| 1288 |
|
| 1289 |
-
|
|
|
|
|
|
|
| 1290 |
|
| 1291 |
if window_count < 3: # Log first 3 windows
|
| 1292 |
-
debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (
|
| 1293 |
|
| 1294 |
# Call predict with the chunk
|
| 1295 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
@@ -1303,6 +1309,10 @@ class VADDemo:
|
|
| 1303 |
model_results.append(result)
|
| 1304 |
window_count += 1
|
| 1305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1306 |
debug_info.append(f" 🎯 Total windows processed: {window_count}")
|
| 1307 |
|
| 1308 |
# Summary for this model
|
|
|
|
| 766 |
mode='psd'
|
| 767 |
)
|
| 768 |
|
| 769 |
+
# Ajustar tiempos para alinear con center=False (empezar en 0)
|
| 770 |
+
t -= (self.n_fft / 2.0) / self.sample_rate
|
| 771 |
+
|
| 772 |
mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
|
| 773 |
|
| 774 |
mel_freqs = np.logspace(
|
|
|
|
| 829 |
# Add virtual start point if first timestamp > 0
|
| 830 |
if rs[0].timestamp > 0:
|
| 831 |
virtual_start = VADResult(
|
| 832 |
+
probability=rs[0].probability,
|
| 833 |
+
is_speech=rs[0].probability > threshold,
|
| 834 |
model_name=base,
|
| 835 |
processing_time=0,
|
| 836 |
timestamp=0.0
|
|
|
|
| 1274 |
# CRITICAL FIX: Always extract chunks, both for short and long audio
|
| 1275 |
window_count = 0
|
| 1276 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
|
|
|
| 1277 |
|
| 1278 |
+
for i in range(0, len(processed_audio), hop_samples):
|
| 1279 |
+
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1280 |
+
start_pos = max(0, i - window_samples // 2)
|
| 1281 |
+
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1282 |
+
chunk = processed_audio[start_pos:end_pos]
|
| 1283 |
|
| 1284 |
+
# Pad if necessary (with reflection, not zeros to avoid artificial silence)
|
| 1285 |
+
if len(chunk) < window_samples:
|
| 1286 |
+
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
|
| 1287 |
|
| 1288 |
+
# Skip chunks with excessive padding to avoid skewed predictions
|
| 1289 |
+
padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
|
| 1290 |
if padding_ratio > 0.5:
|
| 1291 |
continue # Skip heavily padded chunks
|
| 1292 |
|
| 1293 |
+
# CORRECTED: Timestamp at ACTUAL CENTER of the chunk for alignment
|
| 1294 |
+
actual_center = start_pos + (end_pos - start_pos) / 2.0
|
| 1295 |
+
timestamp = actual_center / self.processor.sample_rate
|
| 1296 |
|
| 1297 |
if window_count < 3: # Log first 3 windows
|
| 1298 |
+
debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
| 1299 |
|
| 1300 |
# Call predict with the chunk
|
| 1301 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
|
|
| 1309 |
model_results.append(result)
|
| 1310 |
window_count += 1
|
| 1311 |
|
| 1312 |
+
# Stop if we've gone past the audio length
|
| 1313 |
+
if timestamp >= audio_duration:
|
| 1314 |
+
break
|
| 1315 |
+
|
| 1316 |
debug_info.append(f" 🎯 Total windows processed: {window_count}")
|
| 1317 |
|
| 1318 |
# Summary for this model
|