Gabriel Bibbó committed
Commit · 4fd21cb
1 Parent(s): b647def

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

app.py CHANGED
@@ -276,9 +276,14 @@ class OptimizedEPANNs:
 
             print(f"📊 E-PANNs: energy={energy:.2f}, centroid={spectral_centroid:.1f}, mfcc_var={mfcc_var:.4f}")
 
-            # Combine features for better speech detection
-
-
+            # Combine features for better speech detection with more conservative scaling
+            energy_score = np.clip((energy + 80) / 60, 0, 1)  # More conservative energy scaling
+            centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # More conservative centroid scaling
+            mfcc_score = np.clip(mfcc_var / 200, 0, 1)  # More conservative MFCC scaling
+
+            speech_score = energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2
+            print(f"📈 E-PANNs: energy_score={energy_score:.3f}, centroid_score={centroid_score:.3f}, mfcc_score={mfcc_score:.3f}")
+            print(f"📈 E-PANNs: final_speech_score={speech_score:.4f}")
         else:
             print("⚠️ E-PANNs: Using scipy fallback")
             from scipy import signal
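Note on the new scoring: each feature is clipped into [0, 1] before the weighted sum, so no single feature can saturate the result. A minimal standalone sketch of the same arithmetic (the helper name epanns_speech_score is hypothetical, and the constants suggest energy is on a dB-like scale around -80 to -20):

import numpy as np

# Hypothetical helper mirroring the added lines above; not part of app.py.
def epanns_speech_score(energy, spectral_centroid, mfcc_var):
    energy_score = np.clip((energy + 80) / 60, 0, 1)          # -80 -> 0.0, -20 -> 1.0
    centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # Hz, saturates at 8 kHz
    mfcc_score = np.clip(mfcc_var / 200, 0, 1)                # saturates at 200
    return energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2

print(epanns_speech_score(-35.0, 2400.0, 90.0))
# 0.5*0.75 + 0.3*0.30 + 0.2*0.45 = 0.555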
@@ -333,9 +338,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-
+            # More conservative energy scaling for fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
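The fallback probability is now energy / (threshold * 100) capped at 1.0, so a clip needs roughly 100x the speech threshold in energy before the probability saturates; the same scaling is reused in the error fallback further down. Illustrative values:

energy, threshold = 0.05, 0.01
probability = min(energy / (threshold * 100), 1.0)  # 0.05 / 1.0 = 0.05
# A plain energy / threshold ratio would already be capped at 1.0 here.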
@@ -372,8 +378,8 @@ class OptimizedPANNs:
        print(f"✅ PANNs: Padded, final_len={len(audio_resampled)}")
 
        print(f"🚀 PANNs: Running inference...")
-
-
+        # Fix: PANNs inference doesn't take input_sr parameter
+        clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
        print(f"✅ PANNs: Inference complete, output_shape={clip_probs.shape}")
 
        # Find speech-related indices
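This matches the panns_inference API: AudioTagging.inference takes a single (batch, samples) float array of 32 kHz audio and returns (clipwise_output, embedding), with no sample-rate argument. A minimal sketch of the same call outside the class (assuming the library's default Cnn14 checkpoint):

import numpy as np
from panns_inference import AudioTagging

model = AudioTagging(checkpoint_path=None, device='cpu')  # loads the default Cnn14 checkpoint
audio_resampled = np.zeros(32000, dtype=np.float32)       # 1 s of 32 kHz audio
clip_probs, _ = model.inference(audio_resampled[np.newaxis, :])
print(clip_probs.shape)  # (1, 527) — one probability per AudioSet class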
@@ -406,9 +412,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-
+            # More conservative energy scaling for error fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs error fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs error fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
@@ -1159,25 +1166,32 @@ class VADDemo:
 
        # Critical fix: Always process at least once, even if audio is shorter than window
        if len(processed_audio) < window_samples:
-            debug_info.append(f" ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing
-            # Audio is shorter than required window - process once with available audio
-            chunk = processed_audio
-            timestamp = 0.0
-
-            debug_info.append(f" 🔄 Processing chunk at t={timestamp:.2f}s, size={len(chunk)}")
-
-            # Special handling for different models
-            if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-            else:
-                result = self.models[model_name].predict(chunk, timestamp)
+            debug_info.append(f" ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing multiple times with overlap")
 
-
+            # Generate multiple timestamps for visualization even with short audio
+            num_points = max(3, int(len(processed_audio) / self.processor.sample_rate))  # At least 3 points
 
-
-
-
-
+            for point_idx in range(num_points):
+                timestamp = (point_idx / (num_points - 1)) * (len(processed_audio) / self.processor.sample_rate) if num_points > 1 else 0.0
+                chunk = processed_audio  # Use full audio for each point
+
+                debug_info.append(f" 🔄 Processing point {point_idx} at t={timestamp:.2f}s, size={len(chunk)}")
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Update timestamp to spread points
+                result.timestamp = timestamp
+
+                debug_info.append(f" 📈 Point {point_idx}: prob={result.probability:.4f}, speech={result.is_speech}")
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
+                model_results.append(result)
        else:
            # Audio is long enough - process in sliding windows
            debug_info.append(f" ✅ Audio long enough, processing in windows")
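The short-audio path now runs the same full-clip prediction at several evenly spaced timestamps, so the plot gets a visible curve instead of a single point. The spacing works out as follows (illustrative numbers):

sample_rate = 16000
audio_len = 12800                     # a 0.8 s clip, shorter than the analysis window
duration = audio_len / sample_rate    # 0.8 s
num_points = max(3, int(duration))    # int(0.8) = 0, so the floor of 3 points applies
timestamps = [(i / (num_points - 1)) * duration for i in range(num_points)]
print(timestamps)                     # [0.0, 0.4, 0.8]

Each point carries the same probability (the chunk is always the full audio), so the curve is flat but spans the whole clip.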
@@ -1333,13 +1347,13 @@ def create_interface():
 
        model_a = gr.Dropdown(
            choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="
+            value="E-PANNs",
            label="Model A (Top Panel)"
        )
 
        model_b = gr.Dropdown(
            choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="
+            value="PANNs",
            label="Model B (Bottom Panel)"
        )
 
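Both dropdowns now open with a model pre-selected; in Gradio, value= should be one of the entries in choices. Minimal usage sketch:

import gradio as gr

with gr.Blocks() as demo:
    model_a = gr.Dropdown(
        choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
        value="E-PANNs",  # default selection; should match an entry in choices
        label="Model A (Top Panel)",
    )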