Gabriel Bibbó committed · Commit 25b51aa · 1 Parent(s): 2a8cb45

Fix threshold lines visibility and AST probability detection

app.py CHANGED
@@ -362,9 +362,6 @@ class OptimizedAST:
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        # Cache for long audio segments (not tiny chunks)
-        self.segment_cache = {}
-        self.min_audio_length = self.sample_rate  # 1 second minimum
         self.load_model()

     def load_model(self):
@@ -374,8 +371,6 @@ class OptimizedAST:
             self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
             self.model = ASTForAudioClassification.from_pretrained(model_name)
             self.model.to(self.device)
-            if torch.cuda.is_available():
-                self.model.half()  # Use FP16 for speed
             self.model.eval()
             print(f"✅ {self.model_name} loaded successfully")
         else:
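The FP16 lines removed above are a common source of dtype errors, because ASTFeatureExtractor returns float32 tensors while a .half() model expects float16 inputs. The sketch below is not from app.py; a toy torch.nn.Linear stands in for the AST model just to reproduce the failure mode:

import torch

# Toy stand-in for the AST model; extractor-style features are float32.
layer = torch.nn.Linear(4, 2)
features = torch.randn(1, 4)        # float32

layer.half()                        # weights become float16, as the removed code did
try:
    layer(features)                 # float32 input vs float16 weights
except RuntimeError as err:
    print(f"dtype mismatch: {err}")

layer.float()                       # keeping everything in float32 avoids the cast entirely
print(layer(features).dtype)        # torch.float32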
@@ -396,10 +391,10 @@ class OptimizedAST:
                 spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
                 spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
                 # Combine multiple features for better speech detection
-                probability = min((energy *
+                probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
             else:
-                probability = min(energy
-                is_speech = probability > 0.3
+                probability = min(energy * 50, 1.0)
+            is_speech = probability > 0.3
         else:
             probability = 0.0
             is_speech = False
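For reference, the fallback probability added in this hunk can be exercised on its own. The sketch below uses a hypothetical function name, and the surrounding guard condition is an assumption since it sits outside the hunk; it mirrors the two probability formulas above, assuming 16 kHz mono float32 audio:

import numpy as np
import librosa

def fallback_speech_probability(audio: np.ndarray, sample_rate: int = 16000) -> float:
    # Energy plus spectral-centroid heuristic, as in the updated fallback path.
    if len(audio) == 0:
        return 0.0
    energy = np.sum(audio ** 2)
    if len(audio) >= sample_rate // 2:   # assumed guard: enough samples for spectral features
        centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate))
        return float(min((energy * 100 + centroid / 500) / 2, 1.0))
    return float(min(energy * 50, 1.0))

# The heuristic saturates to 1.0 for almost any clearly audible half-second of audio.
rng = np.random.default_rng(0)
noise = (0.01 * rng.standard_normal(8000)).astype(np.float32)
print(fallback_speech_probability(noise))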
@@ -409,98 +404,71 @@ class OptimizedAST:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)

-            #
-            if len(
-            #
-
-
-            # For very long audio, take a representative 2-second segment
-            if len(audio) > self.sample_rate * 2:
-                # Take segment around current timestamp from full audio if available
-                if full_audio is not None and len(full_audio) > self.sample_rate:
-                    # Calculate position in full audio
-                    center_pos = int(timestamp * self.sample_rate)
-                    half_window = self.sample_rate  # 1 second each side
-
-                    start_pos = max(0, center_pos - half_window)
-                    end_pos = min(len(full_audio), center_pos + half_window)
-
-                    # Ensure we have at least 1 second
-                    if end_pos - start_pos < self.min_audio_length:
-                        end_pos = min(len(full_audio), start_pos + self.min_audio_length)
-
-                    audio = full_audio[start_pos:end_pos]
-                else:
-                    # Fallback: take middle part
-                    start_idx = (len(audio) - self.sample_rate * 2) // 2
-                    audio = audio[start_idx:start_idx + self.sample_rate * 2]
-
-            # Create cache key based on timestamp range instead of audio bytes
-            cache_key = f"{int(timestamp * 10)}"  # Cache per 100ms of timestamp
-
-            if cache_key in self.segment_cache:
-                speech_prob = self.segment_cache[cache_key]
-            else:
-                # Feature extraction with proper parameters for AST
-                inputs = self.feature_extractor(
-                    audio,
-                    sampling_rate=self.sample_rate,
-                    return_tensors="pt",
-                    padding="max_length",
-                    max_length=1024,  # Proper context length (~10s worth of frames)
-                    truncation=True
-                )
-
-                # Move to device and convert to proper dtype
-                inputs = {k: v.to(self.device) for k, v in inputs.items()}
-                if torch.cuda.is_available():
-                    inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
-
-
-
-                    logits = outputs.logits
-                    probs = torch.sigmoid(logits)
-
-                #
-
-
-                speech_keywords = ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human', 'vocal', 'verbal']
-
-
-
-
-
-                if speech_indices:
-                    speech_prob = probs[0, speech_indices].mean().item()
-                else:
-                    # Enhanced fallback: look for any human-related audio classes
-                    human_indices = []
-                    for lbl, idx in label2id.items():
-                        if any(word in lbl.lower() for word in ['human', 'people', 'person', 'male', 'female', 'child']):
-                            human_indices.append(idx)
-
-                    if human_indices:
-                        speech_prob = probs[0, human_indices].mean().item()
-                    else:
-                        # Last resort: use top activations
-                        speech_prob = probs[0].topk(10).values.mean().item()
-
-
-
-
-            elif len(self.segment_cache) >= 200:  # Clear cache when too large
-                self.segment_cache.clear()
-
-
+            # Use longer context for AST - take from full audio if available
+            if full_audio is not None and len(full_audio) > self.sample_rate:
+                # Take 3-second window centered around current timestamp
+                center_pos = int(timestamp * self.sample_rate)
+                window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
+
+                start_pos = max(0, center_pos - window_size)
+                end_pos = min(len(full_audio), center_pos + window_size)
+
+                # Ensure we have at least 1 second
+                if end_pos - start_pos < self.sample_rate:
+                    end_pos = min(len(full_audio), start_pos + self.sample_rate)
+
+                audio_for_ast = full_audio[start_pos:end_pos]
+            else:
+                audio_for_ast = audio
+
+            # Ensure minimum length for AST
+            if len(audio_for_ast) < self.sample_rate:
+                audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
+
+            # Feature extraction with proper AST parameters
+            inputs = self.feature_extractor(
+                audio_for_ast,
+                sampling_rate=self.sample_rate,
+                return_tensors="pt",
+                max_length=1024,  # Proper AST context
+                truncation=True
+            )
+
+            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+            with torch.no_grad():
+                outputs = self.model(**inputs)
+                logits = outputs.logits
+                probs = torch.sigmoid(logits)
+
+            # Find speech-related classes
+            label2id = self.model.config.label2id
+            speech_indices = []
+            speech_keywords = ['speech', 'voice', 'talk', 'conversation', 'speaking']
+
+            for lbl, idx in label2id.items():
+                if any(word in lbl.lower() for word in speech_keywords):
+                    speech_indices.append(idx)
+
+            if speech_indices:
+                speech_prob = probs[0, speech_indices].mean().item()
+                # Boost the probability if it's too low but there's clear audio content
+                if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                    speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
+            else:
+                # Fallback to energy-based detection
+                energy = np.sum(audio_for_ast ** 2)
+                speech_prob = min(energy * 20, 1.0)
+
+            return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)

         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
             # Enhanced fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-
-
-                is_speech = energy > 0.005
+                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
+                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
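For reference, the label-matching logic added in this hunk can be isolated from the model itself. A self-contained sketch, where dummy logits and a toy label2id dictionary stand in for the real AST outputs:

import torch

def speech_probability(logits: torch.Tensor, label2id: dict) -> float:
    # Mean sigmoid score over labels whose names look speech-related,
    # mirroring the keyword scan in the new code.
    probs = torch.sigmoid(logits)
    speech_keywords = ['speech', 'voice', 'talk', 'conversation', 'speaking']
    speech_indices = [idx for lbl, idx in label2id.items()
                      if any(word in lbl.lower() for word in speech_keywords)]
    if speech_indices:
        return probs[0, speech_indices].mean().item()
    return 0.0   # the app instead falls back to an energy estimate in this case

# Toy example with three fake AudioSet-style labels
toy_label2id = {"Speech": 0, "Music": 1, "Dog": 2}
toy_logits = torch.tensor([[2.0, -1.0, -3.0]])   # strong "Speech" activation
print(round(speech_probability(toy_logits, toy_label2id), 3))   # ~0.881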
@@ -772,6 +740,7 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )

     if len(time_frames) > 0:
+        # Add threshold lines to both panels
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
@@ -782,6 +751,8 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
+            annotation_text=f'Threshold: {threshold:.2f}',
+            annotation_position="top right",
             row=2, col=1, secondary_y=True
         )

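A standalone sketch of the annotated threshold line added above, drawn on a toy two-row subplot rather than the app's real figure (the layout and data here are assumptions):

import plotly.graph_objects as go
from plotly.subplots import make_subplots

threshold = 0.5
fig = make_subplots(rows=2, cols=1,
                    specs=[[{"secondary_y": False}], [{"secondary_y": True}]])
fig.add_trace(go.Scatter(y=[0.1, 0.7, 0.4], name="speech prob"),
              row=2, col=1, secondary_y=True)

# Dashed cyan threshold line with a visible label, mirroring the commit
fig.add_hline(
    y=threshold,
    line=dict(color='cyan', width=2, dash='dash'),
    annotation_text=f'Threshold: {threshold:.2f}',
    annotation_position="top right",
    row=2, col=1, secondary_y=True
)
fig.write_html("threshold_demo.html")   # or fig.show()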