Gabriel Bibbó
committed
Commit · bcae560 · 1 Parent(s): 9d07682

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Browse files
app.py
CHANGED
@@ -230,8 +230,9 @@ class OptimizedEPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        #
+        # Convert audio to target sample rate for E-PANNs
         if LIBROSA_AVAILABLE:
+            # Resample to E-PANNs sample rate if needed
             audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
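The E-PANNs path relies on librosa.resample to move 16 kHz microphone input to the model's native rate. A minimal standalone sketch of that call, for reference (the noise input and the 32 kHz target are illustrative, not taken from app.py):

```python
import numpy as np
import librosa

# 0.5 s of noise captured at 16 kHz, resampled to an assumed 32 kHz model rate
x = np.random.randn(8000).astype(float)
y = librosa.resample(x, orig_sr=16000, target_sr=32000)
print(y.shape)  # (16000,) -- duration preserved, sample count doubled
```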
@@ -270,7 +271,6 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()

     def load_model(self):
@@ -303,8 +303,19 @@ class OptimizedPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        #
-
+        # Convert audio to PANNs sample rate
+        if LIBROSA_AVAILABLE:
+            audio_resampled = librosa.resample(audio.astype(float),
+                                               orig_sr=16000,
+                                               target_sr=self.sample_rate)
+        else:
+            # Simple resampling fallback
+            resample_factor = self.sample_rate / 16000
+            audio_resampled = np.interp(
+                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                np.arange(len(audio)),
+                audio
+            )

         # Ensure minimum length for PANNs (need at least 1 second)
         min_samples = self.sample_rate  # 1 second
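When librosa is missing, the PANNs path now falls back to plain linear interpolation. A hedged sketch of the same idea as a free function (the name linear_resample is illustrative); unlike librosa.resample it applies no anti-aliasing filter, which is tolerable when upsampling 16 kHz audio but would alias when downsampling:

```python
import numpy as np

def linear_resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a 1-D signal by linear interpolation (crude, filter-free fallback)."""
    factor = target_sr / orig_sr
    new_idx = np.linspace(0, len(audio) - 1, int(len(audio) * factor))
    return np.interp(new_idx, np.arange(len(audio)), audio)

# 1 s of a 440 Hz tone at 16 kHz, stretched to 32 kHz: 16000 -> 32000 samples
tone = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
assert len(linear_resample(tone, 16000, 32000)) == 32000
```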
@@ -373,11 +384,17 @@ class OptimizedAST:
         start_time = time.time()

         if self.model is None or len(audio) == 0:
-            #
+            # Enhanced fallback using spectral features
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-
-
+                if LIBROSA_AVAILABLE:
+                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
+                    # Combine multiple features for better speech detection
+                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
+                else:
+                    probability = min(energy * 50, 1.0)
+                is_speech = probability > 0.3
             else:
                 probability = 0.0
                 is_speech = False
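The enhanced fallback mixes frame energy with the mean spectral centroid when the AST model cannot run. A self-contained sketch of that scoring rule, with the constants copied from the diff (the function name and the 16 kHz rate are assumptions):

```python
import numpy as np
import librosa

def fallback_speech_score(audio: np.ndarray, sr: int = 16000) -> float:
    """Crude speech score from signal energy plus mean spectral centroid, capped at 1.0."""
    energy = np.sum(audio ** 2)
    centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    return float(min((energy * 100 + centroid / 500) / 2, 1.0))

# A decision can then be thresholded as in the diff: is_speech = score > 0.3
```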
@@ -387,16 +404,33 @@ class OptimizedAST:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        # Use
-        if len(
-
+        # Use longer context for AST - take from full audio if available
+        if full_audio is not None and len(full_audio) > self.sample_rate:
+            # Take 3-second window centered around current timestamp
+            center_pos = int(timestamp * self.sample_rate)
+            window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
+
+            start_pos = max(0, center_pos - window_size)
+            end_pos = min(len(full_audio), center_pos + window_size)
+
+            # Ensure we have at least 1 second
+            if end_pos - start_pos < self.sample_rate:
+                end_pos = min(len(full_audio), start_pos + self.sample_rate)
+
+            audio_for_ast = full_audio[start_pos:end_pos]
+        else:
+            audio_for_ast = audio
+
+        # Ensure minimum length for AST
+        if len(audio_for_ast) < self.sample_rate:
+            audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')

-        # Feature extraction
+        # Feature extraction with proper AST parameters
         inputs = self.feature_extractor(
-
+            audio_for_ast,
             sampling_rate=self.sample_rate,
             return_tensors="pt",
-            max_length=1024,
+            max_length=1024,  # Proper AST context
             truncation=True
         )

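This hunk addresses the AST saturation by gathering roughly three seconds of context centred on the current timestamp and padding anything shorter than one second. A standalone sketch of that index arithmetic (the helper name is illustrative; the 16 kHz rate is an assumption):

```python
import numpy as np

def context_window(full_audio: np.ndarray, timestamp: float, sr: int = 16000) -> np.ndarray:
    """Return up to 3 s of audio centred on `timestamp`, zero-padded to at least 1 s."""
    center = int(timestamp * sr)
    half = int(1.5 * sr)                                  # 1.5 s on each side
    start = max(0, center - half)
    end = min(len(full_audio), center + half)
    if end - start < sr:                                  # enforce the 1 s minimum
        end = min(len(full_audio), start + sr)
    window = full_audio[start:end]
    if len(window) < sr:
        window = np.pad(window, (0, sr - len(window)), 'constant')
    return window

# e.g. at t = 2.0 s inside 5 s of audio the window spans samples 8000..56000 (3 s)
audio = np.zeros(5 * 16000)
assert len(context_window(audio, 2.0)) == 3 * 16000
```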
@@ -418,23 +452,23 @@ class OptimizedAST:

         if speech_indices:
             speech_prob = probs[0, speech_indices].mean().item()
+            # Boost the probability if it's too low but there's clear audio content
+            if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
         else:
-            # Fallback to energy
-            energy = np.sum(
-            speech_prob = min(energy *
+            # Fallback to energy-based detection
+            energy = np.sum(audio_for_ast ** 2)
+            speech_prob = min(energy * 20, 1.0)

-
-            speech_prob = np.clip(speech_prob, 0.0, 1.0)
-
-            return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
+        return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)

         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            #
+            # Enhanced fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy *
-                is_speech = energy > 0.
+                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
+                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
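The post-processing also rescues near-zero AST scores when the window clearly contains energy, and the decision threshold drops from 0.5 to 0.4. A small sketch of that adjustment using the diff's constants (the function name is illustrative):

```python
import numpy as np

def adjust_speech_prob(speech_prob: float, chunk: np.ndarray) -> float:
    """Boost very low model scores when the analysed chunk has clear signal energy."""
    if speech_prob < 0.1 and np.sum(chunk ** 2) > 0.001:
        speech_prob = min(speech_prob * 5, 0.8)  # boost, but never report certainty
    return speech_prob

print(adjust_speech_prob(0.05, np.full(16000, 0.1)))  # 0.25
```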
@@ -477,7 +511,6 @@ class AudioProcessor:
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)

-        # Simple peak normalization
         if np.max(np.abs(audio_data)) > 0:
             audio_data = audio_data / np.max(np.abs(audio_data))

@@ -707,11 +740,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )

     if len(time_frames) > 0:
-        # Add threshold lines to both panels
+        # Add threshold lines to both panels
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=1, col=1, secondary_y=True
@@ -719,7 +751,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=2, col=1, secondary_y=True
@@ -809,7 +840,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         height=500,
         title_text="Real-Time Speech Visualizer",
         showlegend=True,
-        uirevision="const",  # Preserve zoom/pan when updating
         legend=dict(
             x=1.02,
             y=1,
@@ -874,10 +904,6 @@ class VADDemo:

         print("🎤 Real-time VAD Demo initialized successfully")
         print(f"📊 Available models: {list(self.models.keys())}")
-
-# Initialize demo globally for callbacks
-print("🎤 Initializing VAD Demo...")
-demo_app = VADDemo()

     def process_audio_with_events(self, audio, model_a, model_b, threshold):
         if audio is None:
@@ -895,7 +921,7 @@ demo_app = VADDemo()

         selected_models = list(set([model_a, model_b]))

-        # Process each window
+        # Process each window individually for all models
         for i in range(0, len(processed_audio) - window_samples, hop_samples):
             timestamp = i / self.processor.sample_rate
             chunk = processed_audio[i:i + window_samples]
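The loop above slides a fixed analysis window with a constant hop over the processed audio; the actual window and hop sizes are defined earlier in app.py and are not shown in this hunk, so the values below are purely illustrative. A sketch of the index arithmetic the loop relies on:

```python
sample_rate = 16000
window_samples = int(0.5 * sample_rate)   # assumed 0.5 s analysis window
hop_samples = int(0.25 * sample_rate)     # assumed 0.25 s hop (50% overlap)
n_samples = 5 * sample_rate               # 5 s of audio

starts = range(0, n_samples - window_samples, hop_samples)
timestamps = [i / sample_rate for i in starts]
print(len(timestamps), timestamps[0], timestamps[-1])  # 18 windows, t = 0.0 .. 4.25 s
```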
@@ -971,7 +997,6 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====

 def create_interface():
-
     # Load logos
     logos = load_logos()

@@ -1080,5 +1105,9 @@ def create_interface():

 # Create and launch interface
 if __name__ == "__main__":
+    # Initialize demo
+    print("🎤 Initializing VAD Demo...")
+    demo_app = VADDemo()
+
     interface = create_interface()
     interface.launch(share=True, debug=False)