Gabriel Bibbó committed
Commit · 4fd21cb
1 Parent(s): b647def

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

app.py CHANGED
@@ -276,9 +276,14 @@ class OptimizedEPANNs:
 
             print(f"📊 E-PANNs: energy={energy:.2f}, centroid={spectral_centroid:.1f}, mfcc_var={mfcc_var:.4f}")
 
-            # Combine features for better speech detection
-
-
+            # Combine features for better speech detection with more conservative scaling
+            energy_score = np.clip((energy + 80) / 60, 0, 1)  # More conservative energy scaling
+            centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # More conservative centroid scaling
+            mfcc_score = np.clip(mfcc_var / 200, 0, 1)  # More conservative MFCC scaling
+
+            speech_score = energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2
+            print(f"📈 E-PANNs: energy_score={energy_score:.3f}, centroid_score={centroid_score:.3f}, mfcc_score={mfcc_score:.3f}")
+            print(f"📈 E-PANNs: final_speech_score={speech_score:.4f}")
         else:
             print("⚠️ E-PANNs: Using scipy fallback")
             from scipy import signal
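Note on the new scoring: each feature is clipped into [0, 1] before the weighted sum, so no single feature can saturate the result. A minimal standalone sketch of the same arithmetic (the helper name epanns_speech_score is hypothetical, and the constants suggest energy is on a dB-like scale around -80 to -20):

import numpy as np

# Hypothetical helper mirroring the added lines above; not part of app.py.
def epanns_speech_score(energy, spectral_centroid, mfcc_var):
    energy_score = np.clip((energy + 80) / 60, 0, 1)          # -80 -> 0.0, -20 -> 1.0
    centroid_score = np.clip(spectral_centroid / 8000, 0, 1)  # Hz, saturates at 8 kHz
    mfcc_score = np.clip(mfcc_var / 200, 0, 1)                # saturates at 200
    return energy_score * 0.5 + centroid_score * 0.3 + mfcc_score * 0.2

print(epanns_speech_score(-35.0, 2400.0, 90.0))
# 0.5*0.75 + 0.3*0.30 + 0.2*0.45 = 0.555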
@@ -333,9 +338,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-
+            # More conservative energy scaling for fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
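The fallback probability is now energy / (threshold * 100) capped at 1.0, so a clip needs roughly 100x the speech threshold in energy before the probability saturates; the same scaling is reused in the error fallback further down. Illustrative values:

energy, threshold = 0.05, 0.01
probability = min(energy / (threshold * 100), 1.0)  # 0.05 / 1.0 = 0.05
# A plain energy / threshold ratio would already be capped at 1.0 here.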
@@ -372,8 +378,8 @@ class OptimizedPANNs:
        print(f"✅ PANNs: Padded, final_len={len(audio_resampled)}")
 
        print(f"🚀 PANNs: Running inference...")
-
-
+        # Fix: PANNs inference doesn't take input_sr parameter
+        clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
        print(f"✅ PANNs: Inference complete, output_shape={clip_probs.shape}")
 
        # Find speech-related indices
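This matches the panns_inference API: AudioTagging.inference takes a single (batch, samples) float array of 32 kHz audio and returns (clipwise_output, embedding), with no sample-rate argument. A minimal sketch of the same call outside the class (assuming the library's default Cnn14 checkpoint):

import numpy as np
from panns_inference import AudioTagging

model = AudioTagging(checkpoint_path=None, device='cpu')  # loads the default Cnn14 checkpoint
audio_resampled = np.zeros(32000, dtype=np.float32)       # 1 s of 32 kHz audio
clip_probs, _ = model.inference(audio_resampled[np.newaxis, :])
print(clip_probs.shape)  # (1, 527) — one probability per AudioSet class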
@@ -406,9 +412,10 @@ class OptimizedPANNs:
         if len(audio) > 0:
             energy = np.sum(audio ** 2)
             threshold = 0.01
-
+            # More conservative energy scaling for error fallback
+            probability = min(energy / (threshold * 100), 1.0)  # Divide by 100 to reduce sensitivity
             is_speech = energy > threshold
-            print(f"🔄 PANNs error fallback: energy={energy:.6f}, prob={probability:.4f}")
+            print(f"🔄 PANNs error fallback: energy={energy:.6f}, threshold={threshold}, prob={probability:.4f}")
         else:
             probability = 0.0
             is_speech = False
@@ -1159,25 +1166,32 @@ class VADDemo:
 
        # Critical fix: Always process at least once, even if audio is shorter than window
        if len(processed_audio) < window_samples:
-            debug_info.append(f" ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing
-            # Audio is shorter than required window - process once with available audio
-            chunk = processed_audio
-            timestamp = 0.0
-
-            debug_info.append(f" 🔄 Processing chunk at t={timestamp:.2f}s, size={len(chunk)}")
-
-            # Special handling for different models
-            if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-            else:
-                result = self.models[model_name].predict(chunk, timestamp)
+            debug_info.append(f" ⚠️ Audio too short ({len(processed_audio)} < {window_samples}), processing multiple times with overlap")
 
-
+            # Generate multiple timestamps for visualization even with short audio
+            num_points = max(3, int(len(processed_audio) / self.processor.sample_rate))  # At least 3 points
 
-
-
-
-
+            for point_idx in range(num_points):
+                timestamp = (point_idx / (num_points - 1)) * (len(processed_audio) / self.processor.sample_rate) if num_points > 1 else 0.0
+                chunk = processed_audio  # Use full audio for each point
+
+                debug_info.append(f" 🔄 Processing point {point_idx} at t={timestamp:.2f}s, size={len(chunk)}")
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Update timestamp to spread points
+                result.timestamp = timestamp
+
+                debug_info.append(f" 📈 Point {point_idx}: prob={result.probability:.4f}, speech={result.is_speech}")
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
+                model_results.append(result)
        else:
            # Audio is long enough - process in sliding windows
            debug_info.append(f" ✅ Audio long enough, processing in windows")
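The short-audio path now runs the same full-clip prediction at several evenly spaced timestamps, so the plot gets a visible curve instead of a single point. The spacing works out as follows (illustrative numbers):

sample_rate = 16000
audio_len = 12800                     # a 0.8 s clip, shorter than the analysis window
duration = audio_len / sample_rate    # 0.8 s
num_points = max(3, int(duration))    # int(0.8) = 0, so the floor of 3 points applies
timestamps = [(i / (num_points - 1)) * duration for i in range(num_points)]
print(timestamps)                     # [0.0, 0.4, 0.8]

Each point carries the same probability (the chunk is always the full audio), so the curve is flat but spans the whole clip.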
@@ -1333,13 +1347,13 @@ def create_interface():
 
        model_a = gr.Dropdown(
            choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="
+            value="E-PANNs",
            label="Model A (Top Panel)"
        )
 
        model_b = gr.Dropdown(
            choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="
+            value="PANNs",
            label="Model B (Bottom Panel)"
        )
 
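Both dropdowns now open with a model pre-selected; in Gradio, value= should be one of the entries in choices. Minimal usage sketch:

import gradio as gr

with gr.Blocks() as demo:
    model_a = gr.Dropdown(
        choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
        value="E-PANNs",  # default selection; should match an entry in choices
        label="Model A (Top Panel)",
    )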