Gabriel Bibbó committed
Commit · 9d07682 · 1 Parent(s): e60e716
Hotfix: Restore basic functionality - fix AST saturation and PANNs execution
app.py CHANGED
@@ -230,9 +230,8 @@ class OptimizedEPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        #
+        # Resample to E-PANNs sample rate
         if LIBROSA_AVAILABLE:
-            # Resample to E-PANNs sample rate if needed
             audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
@@ -271,6 +270,7 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()
 
     def load_model(self):
@@ -303,19 +303,8 @@ class OptimizedPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        #
-        if LIBROSA_AVAILABLE:
-            audio_resampled = librosa.resample(audio.astype(float),
-                                               orig_sr=16000,
-                                               target_sr=self.sample_rate)
-        else:
-            # Simple resampling fallback
-            resample_factor = self.sample_rate / 16000
-            audio_resampled = np.interp(
-                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
-                np.arange(len(audio)),
-                audio
-            )
+        # Fast resampling to PANNs sample rate
+        audio_resampled = self.processor.fast_resample(audio, 16000, self.sample_rate)
 
         # Ensure minimum length for PANNs (need at least 1 second)
         min_samples = self.sample_rate  # 1 second
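The replacement line above calls self.processor.fast_resample(audio, 16000, self.sample_rate), but AudioProcessor.fast_resample itself is not shown in this diff. A minimal sketch of what such a helper could look like, assuming plain linear interpolation in the spirit of the removed np.interp fallback; the name and signature are taken from the call site, not from the repository:

    import numpy as np

    def fast_resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Hypothetical AudioProcessor.fast_resample: linear-interpolation resampling."""
        if len(audio) == 0 or orig_sr == target_sr:
            return audio.astype(float)
        n_out = int(round(len(audio) * target_sr / orig_sr))
        # Map each output sample position back onto the input index axis and interpolate
        positions = np.linspace(0, len(audio) - 1, n_out)
        return np.interp(positions, np.arange(len(audio)), audio.astype(float))

On the PANNs path this would be called as fast_resample(audio, 16000, 32000), upsampling 16 kHz input to the model's 32 kHz rate set in the constructor above.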
@@ -384,17 +373,11 @@ class OptimizedAST:
         start_time = time.time()
 
         if self.model is None or len(audio) == 0:
-            #
+            # Simple energy-based fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-
-                if LIBROSA_AVAILABLE:
-                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-                    # Combine multiple features for better speech detection
-                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
-                else:
-                    probability = min(energy * 50, 1.0)
-                is_speech = probability > 0.3
+                probability = min(energy * 20, 1.0)
+                is_speech = probability > 0.2
             else:
                 probability = 0.0
                 is_speech = False
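Pulled out on its own, the energy fallback added above reduces to the following helper (an illustrative sketch; the function name is hypothetical, while the scale of 20 and the 0.2 threshold come straight from the hunk):

    import numpy as np

    def energy_fallback(chunk: np.ndarray, scale: float = 20.0, threshold: float = 0.2):
        """Crude VAD used when the AST model is unavailable: window energy mapped to [0, 1]."""
        energy = float(np.sum(chunk ** 2))
        probability = min(energy * scale, 1.0)
        return probability, probability > threshold

Because 20 is much smaller than the removed energy * 100 blend, the fallback probability no longer pins at 1.0 as readily, which appears to be the saturation the commit title refers to.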
@@ -404,33 +387,16 @@ class OptimizedAST:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
 
-            # Use
-            if
-
-                center_pos = int(timestamp * self.sample_rate)
-                window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
-
-                start_pos = max(0, center_pos - window_size)
-                end_pos = min(len(full_audio), center_pos + window_size)
-
-                # Ensure we have at least 1 second
-                if end_pos - start_pos < self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + self.sample_rate)
-
-                audio_for_ast = full_audio[start_pos:end_pos]
-            else:
-                audio_for_ast = audio
-
-            # Ensure minimum length for AST
-            if len(audio_for_ast) < self.sample_rate:
-                audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
+            # Use 1 second minimum for AST
+            if len(audio) < self.sample_rate:
+                audio = np.pad(audio, (0, self.sample_rate - len(audio)), 'constant')
 
-            # Feature extraction
+            # Feature extraction
             inputs = self.feature_extractor(
-                audio_for_ast,
+                audio,
                 sampling_rate=self.sample_rate,
                 return_tensors="pt",
-                max_length=1024,
+                max_length=1024,
                 truncation=True
             )
 
@@ -452,23 +418,23 @@ class OptimizedAST:
 
                 if speech_indices:
                     speech_prob = probs[0, speech_indices].mean().item()
-                    # Boost the probability if it's too low but there's clear audio content
-                    if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
-                        speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
                 else:
-                    # Fallback to energy
-                    energy = np.sum(
-                    speech_prob = min(energy *
+                    # Fallback to energy
+                    energy = np.sum(audio ** 2)
+                    speech_prob = min(energy * 10, 1.0)
 
-
+                # Ensure reasonable range
+                speech_prob = np.clip(speech_prob, 0.0, 1.0)
+
+                return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
 
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            #
+            # Simple fallback
            if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy *
-                is_speech = energy > 0.
+                probability = min(energy * 15, 1.0)
+                is_speech = energy > 0.01
             else:
                 probability = 0.0
                 is_speech = False
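The added return statement constructs VADResult(probability, is_speech, model_name, processing_time, timestamp); the class itself lies outside this diff. A plausible shape, inferred from that call and from the List[VADResult] annotation on create_realtime_plot below (field names are assumptions):

    from dataclasses import dataclass

    @dataclass
    class VADResult:
        probability: float       # speech probability in [0, 1]
        is_speech: bool          # probability compared against a threshold (0.5 in the return above)
        model_name: str          # e.g. "AST", "PANNs", "E-PANNs"
        processing_time: float   # seconds spent on this analysis window
        timestamp: float         # position of the window within the input audio, in seconds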
@@ -511,6 +477,7 @@ class AudioProcessor:
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)
 
+        # Simple peak normalization
         if np.max(np.abs(audio_data)) > 0:
             audio_data = audio_data / np.max(np.abs(audio_data))
 
@@ -740,10 +707,11 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )
 
     if len(time_frames) > 0:
-        # Add threshold lines to both panels
+        # Add threshold lines to both panels with layer='above' to show over spectrograms
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
+            layer='above',
            annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=1, col=1, secondary_y=True
@@ -751,6 +719,7 @@
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
+            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=2, col=1, secondary_y=True
@@ -840,6 +809,7 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         height=500,
         title_text="Real-Time Speech Visualizer",
         showlegend=True,
+        uirevision="const",  # Preserve zoom/pan when updating
         legend=dict(
             x=1.02,
             y=1,
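Both Plotly additions in the hunks above are standard figure options rather than project code: layer='above' is forwarded by add_hline to the underlying shape so the dashed threshold line is drawn on top of heatmap traces, and uirevision preserves user zoom/pan across figure updates as long as its value stays the same. A small self-contained illustration (the heatmap stands in for the demo's spectrogram panel):

    import numpy as np
    import plotly.graph_objects as go

    fig = go.Figure(go.Heatmap(z=np.random.rand(20, 50)))   # placeholder spectrogram
    fig.add_hline(
        y=10,
        line=dict(color='cyan', width=2, dash='dash'),
        layer='above',                      # draw over the heatmap instead of underneath it
        annotation_text='Threshold: 0.50',
        annotation_position='top right',
    )
    fig.update_layout(uirevision='const')   # zoom/pan state survives later figure updates
    fig.show()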
@@ -904,6 +874,10 @@ class VADDemo:
 
         print("🎤 Real-time VAD Demo initialized successfully")
         print(f"📊 Available models: {list(self.models.keys())}")
+
+    # Initialize demo globally for callbacks
+    print("🎤 Initializing VAD Demo...")
+    demo_app = VADDemo()
 
     def process_audio_with_events(self, audio, model_a, model_b, threshold):
         if audio is None:
@@ -921,7 +895,7 @@ class VADDemo:
 
         selected_models = list(set([model_a, model_b]))
 
-        # Process each window
+        # Process each window - simplified without complex scheduling
         for i in range(0, len(processed_audio) - window_samples, hop_samples):
             timestamp = i / self.processor.sample_rate
             chunk = processed_audio[i:i + window_samples]
@@ -997,6 +971,7 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====
 
 def create_interface():
+
     # Load logos
     logos = load_logos()
 
@@ -1105,9 +1080,5 @@ def create_interface():
 
 # Create and launch interface
 if __name__ == "__main__":
-    # Initialize demo
-    print("🎤 Initializing VAD Demo...")
-    demo_app = VADDemo()
-
     interface = create_interface()
     interface.launch(share=True, debug=False)