Gabriel Bibbó committed
Commit 4fd21cb · 1 Parent(s): b647def

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Files changed (1):
  app.py +42 -28
app.py CHANGED
@@ -276,9 +276,14 @@ class OptimizedEPANNs:
 
             print(f"📊 E-PANNs: energy={energy:.2f}, centroid={spectral_centroid:.1f}, mfcc_var={mfcc_var:.4f}")
 
-            # Combine features for better speech detection
-            speech_score = ((energy + 80) / 40) * 0.4 + (spectral_centroid / 5000) * 0.3 + (mfcc_var / 100) * 0.3
-            print(f"📈 E-PANNs: speech_score={speech_score:.4f}")
+            # Combine features for better speech detection with more conservative scaling
+            energy_score = np.clip((energy + 80) / 60, 0, 1)  # More conservative energy scaling
+            centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # More conservative centroid scaling
+            mfcc_score = np.clip(mfcc_var / 200, 0, 1)  # More conservative MFCC scaling
+
+            speech_score = energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2
+            print(f"📈 E-PANNs: energy_score={energy_score:.3f}, centroid_score={centroid_score:.3f}, mfcc_score={mfcc_score:.3f}")
+            print(f"📈 E-PANNs: final_speech_score={speech_score:.4f}")
         else:
             print("⚠️ E-PANNs: Using scipy fallback")
             from scipy import signal
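Why this hunk matters: the old formula `((energy + 80) / 40) * 0.4 + ...` was unbounded, so loud or bright clips could push `speech_score` well past 1.0 and saturate the detector. The fix clips each feature into [0, 1] before the weighted sum. A minimal standalone sketch of the fused score (the function name and the dB-style energy range are assumptions, not part of app.py):

```python
import numpy as np

def epanns_speech_score(energy: float, spectral_centroid: float, mfcc_var: float) -> float:
    """Hypothetical sketch of the hotfix's conservative score fusion."""
    energy_score = np.clip((energy + 80) / 60, 0, 1)          # assumed dB-like energy, ~[-80, -20] maps to [0, 1]
    centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # centroid in Hz, saturates at 8 kHz
    mfcc_score = np.clip(mfcc_var / 200, 0, 1)                # MFCC variance, saturates at 200
    # Weights sum to 1.0, so the result is guaranteed to stay in [0, 1]
    return energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2
```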
@@ -333,9 +338,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-            probability = min(energy / threshold, 1.0)
+            # More conservative energy scaling for fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
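This change decouples the reported probability from the speech decision: the probability now ramps to 1.0 only at 100x the threshold energy, while `is_speech` still fires at the threshold itself (where the reported probability is just 0.01). A sketch under those assumptions (the helper name is hypothetical):

```python
import numpy as np

def energy_fallback(audio: np.ndarray, threshold: float = 0.01) -> tuple[float, bool]:
    """Hypothetical sketch of the rescaled energy fallback."""
    if len(audio) == 0:
        return 0.0, False
    energy = float(np.sum(audio ** 2))
    probability = min(energy / (threshold * 100), 1.0)  # ramp is 100x wider than the decision threshold
    is_speech = energy > threshold                      # decision point unchanged
    return probability, is_speech
```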
@@ -372,8 +378,8 @@ class OptimizedPANNs:
             print(f"✅ PANNs: Padded, final_len={len(audio_resampled)}")
 
             print(f"🚀 PANNs: Running inference...")
-            clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
-                                                 input_sr=self.sample_rate)
+            # Fix: PANNs inference doesn't take input_sr parameter
+            clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
             print(f"✅ PANNs: Inference complete, output_shape={clip_probs.shape}")
 
             # Find speech-related indices
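Dropping `input_sr` means the model no longer resamples internally, so the caller must hand `inference()` audio that is already at the model's rate (the surrounding code resamples and pads before this call). A sketch of that contract, assuming a panns_inference-style `AudioTagging` model and its usual 32 kHz rate:

```python
import numpy as np
import librosa

def run_panns_inference(model, audio: np.ndarray, orig_sr: int, model_sr: int = 32000) -> np.ndarray:
    """Hypothetical sketch: resample first, then call inference() with the waveform alone."""
    if orig_sr != model_sr:
        audio = librosa.resample(audio, orig_sr=orig_sr, target_sr=model_sr)
    # inference() is assumed to take only a batched (1, n_samples) waveform
    clip_probs, _embedding = model.inference(audio[np.newaxis, :])
    return clip_probs
```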
@@ -406,9 +412,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-            probability = min(energy / threshold, 1.0)
+            # More conservative energy scaling for error fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs error fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs error fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
@@ -1159,25 +1166,32 @@ class VADDemo:
 
         # Critical fix: Always process at least once, even if audio is shorter than window
         if len(processed_audio) < window_samples:
-            debug_info.append(f"   ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing once")
-            # Audio is shorter than required window - process once with available audio
-            chunk = processed_audio
-            timestamp = 0.0
-
-            debug_info.append(f"   🔄 Processing chunk at t={timestamp:.2f}s, size={len(chunk)}")
-
-            # Special handling for different models
-            if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-            else:
-                result = self.models[model_name].predict(chunk, timestamp)
-
-            debug_info.append(f"   📈 Result: prob={result.probability:.4f}, speech={result.is_speech}, time={result.processing_time:.3f}s")
-
-            # Use model-specific threshold
-            result.is_speech = result.probability > model_threshold
-            vad_results.append(result)
-            model_results.append(result)
+            debug_info.append(f"   ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing multiple times with overlap")
+
+            # Generate multiple timestamps for visualization even with short audio
+            num_points = max(3, int(len(processed_audio) / self.processor.sample_rate))  # At least 3 points
+
+            for point_idx in range(num_points):
+                timestamp = (point_idx / (num_points - 1)) * (len(processed_audio) / self.processor.sample_rate) if num_points > 1 else 0.0
+                chunk = processed_audio  # Use full audio for each point
+
+                debug_info.append(f"   🔄 Processing point {point_idx} at t={timestamp:.2f}s, size={len(chunk)}")
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Update timestamp to spread points
+                result.timestamp = timestamp
+
+                debug_info.append(f"   📈 Point {point_idx}: prob={result.probability:.4f}, speech={result.is_speech}")
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
+                model_results.append(result)
         else:
             # Audio is long enough - process in sliding windows
             debug_info.append(f"   ✅ Audio long enough, processing in windows")
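For clips shorter than the analysis window, the old path produced a single result at t=0.0, which plots as one dot. The new path runs the same full clip at several evenly spaced timestamps purely so the visualization has a line to draw; every point carries the same audio, so (before thresholding) they share one probability. The timestamp spacing as a standalone sketch (the helper name is hypothetical):

```python
def spread_timestamps(num_samples: int, sample_rate: int, min_points: int = 3) -> list[float]:
    """Hypothetical sketch of the hotfix's point spreading for short audio."""
    duration = num_samples / sample_rate
    num_points = max(min_points, int(duration))  # roughly one point per second, at least min_points
    if num_points == 1:
        return [0.0]  # degenerate case; the diff guards this with an inline conditional
    return [(i / (num_points - 1)) * duration for i in range(num_points)]
```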
@@ -1333,13 +1347,13 @@ def create_interface():
 
         model_a = gr.Dropdown(
             choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="Silero-VAD",
+            value="E-PANNs",
             label="Model A (Top Panel)"
         )
 
         model_b = gr.Dropdown(
             choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="WebRTC-VAD",
+            value="PANNs",
             label="Model B (Bottom Panel)"
         )
 
 