Spaces:

gbibbo
/

vad_demo

Sleeping

App Files Files Community

Gabriel Bibbó committed on Aug 11

Commit

d7e6fe4

1 Parent(s): d758548

fix: ajustes en app.py

Browse files

Files changed (1) hide show

app.py +222 -291

app.py CHANGED Viewed

@@ -1,177 +1,3 @@
-from __future__ import annotations   # pospone la evaluación de las anotaciones
-import numpy as np                   # hace visible np para el resto del módulo
-def predict(self, audio: np.ndarray, timestamp: float = 0.0, full_audio: np.ndarray = None) -> VADResult:
-        start_time = time.time()
-        if self.model is None or len(audio) == 0:
-            # Enhanced fallback using spectral features
-            if len(audio) > 0:
-                energy = np.sum(audio ** 2)
-                if LIBROSA_AVAILABLE:
-                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
-                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-                    probability = min((energy * 100 + spectral_centroid / 1000) / 2, 1.0)
-                else:
-                    probability = min(energy * 50, 1.0)
-                is_speech = probability > 0.25
-            else:
-                probability = 0.0
-                is_speech = False
-            return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
-        try:
-            # Cache key based on timestamp rounded to cache window
-            cache_key = int(timestamp / self.cache_window)
-            # Check cache first
-            if cache_key in self.prediction_cache:
-                cached_result = self.prediction_cache[cache_key]
-                # Return cached result with updated timestamp
-                return VADResult(
-                    cached_result.probability,
-                    cached_result.is_speech,
-                    cached_result.model_name + " (cached)",
-                    time.time() - start_time,
-                    timestamp
-                )
-            if len(audio.shape) > 1:
-                audio = audio.mean(axis=1)
-            # Use longer context for AST - preferably 6.4 seconds (1024 frames)
-            window_duration = 6.4  # seconds
-            window_samples = int(window_duration * self.sample_rate)
-            # If full_audio is provided, use it for better context
-            if full_audio is not None and len(full_audio) > window_samples:
-                # Take window centered around current timestamp
-                center_pos = int(timestamp * self.sample_rate)
-                half_window = window_samples // 2
-                start_pos = max(0, center_pos - half_window)
-                end_pos = min(len(full_audio), start_pos + window_samples)
-                # Adjust if at the end of audio
-                if end_pos == len(full_audio) and end_pos - start_pos < window_samples:
-                    start_pos = max(0, end_pos - window_samples)
-                audio_for_ast = full_audio[start_pos:end_pos]
-            else:
-                # Extract window from provided audio based on timestamp
-                center_sample = int(timestamp * self.sample_rate)
-                half_window = window_samples // 2
-                start_idx = max(0, center_sample - half_window)
-                end_idx = min(len(audio), start_idx + window_samples)
-                # Adjust if at the end
-                if end_idx == len(audio) and end_idx - start_idx < window_samples:
-                    start_idx = max(0, end_idx - window_samples)
-                audio_for_ast = audio[start_idx:end_idx]
-            # For short audio, use intelligent strategy
-            min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
-            if len(audio_for_ast) < min_samples:
-                # Repeat the audio cyclically to maintain temporal patterns
-                num_repeats = int(np.ceil(min_samples / len(audio_for_ast)))
-                audio_repeated = np.tile(audio_for_ast, num_repeats)[:min_samples]
-                # Apply smooth transitions at repetition boundaries
-                fade_samples = int(0.01 * self.sample_rate)  # 10ms fade
-                for i in range(1, num_repeats):
-                    if i * len(audio_for_ast) < len(audio_repeated):
-                        start_idx = i * len(audio_for_ast) - fade_samples
-                        end_idx = i * len(audio_for_ast) + fade_samples
-                        if start_idx >= 0 and end_idx < len(audio_repeated):
-                            audio_repeated[start_idx:end_idx] *= np.linspace(1, 1, 2 * fade_samples)
-                audio_for_ast = audio_repeated
-            # Truncate if too long
-            max_samples = 8 * self.sample_rate
-            if len(audio_for_ast) > max_samples:
-                audio_for_ast = audio_for_ast[:max_samples]
-            # Feature extraction
-            inputs = self.feature_extractor(
-                audio_for_ast,
-                sampling_rate=self.sample_rate,
-                return_tensors="pt",
-                max_length=1024,
-                padding="max_length",
-                truncation=True
-            )
-            # Move inputs to correct device and dtype
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-            if self.device.type == 'cuda' and hasattr(self.model, 'half'):
-                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-                logits = outputs.logits
-                probs = torch.sigmoid(logits)
-            # Find speech-related classes
-            label2id = self.model.config.label2id
-            speech_indices = []
-            speech_keywords = [
-                'speech', 'voice', 'talk', 'conversation', 'speaking',
-                'male speech', 'female speech', 'child speech',
-                'speech synthesizer', 'narration'
-            ]
-            for lbl, idx in label2id.items():
-                if any(word in lbl.lower() for word in speech_keywords):
-                    speech_indices.append(idx)
-            # Also identify background/noise classes
-            noise_keywords = ['silence', 'white noise', 'background']
-            noise_indices = []
-            for lbl, idx in label2id.items():
-                if any(word in lbl.lower() for word in noise_keywords):
-                    noise_indices.append(idx)
-            if speech_indices:
-                # Use max probability among speech classes
-                speech_probs = probs[0, speech_indices]
-                speech_prob = torch.max(speech_probs).item()
-                # Consider noise/silence probability
-                if noise_indices:
-                    noise_prob = torch.mean(probs[0, noise_indices]).item()
-                    speech_prob = speech_prob * (1 - noise_prob * 0.3)
-                # Adjust confidence for short audio
-                if len(audio) < self.sample_rate * 2:
-                    confidence_factor = len(audio) / (self.sample_rate * 2)
-                speech_prob = speech_prob * (0.6 + 0.4 * confidence_factor)
-            # ── FIN DEL CÁLCULO DENTRO DE try ──────────────────────────
-            is_speech_ast = speech_prob > 0.25
-            return VADResult(
-                float(speech_prob),
-                is_speech_ast,
-                self.model_name,
-                time.time() - start_time,
-                timestamp
-            )
-        except Exception as e:
-            print(f"❌ AST ERROR: {e}")
-            import traceback
-            traceback.print_exc()
-            return VADResult(
-                0.0,
-                False,
-                f"{self.model_name} (error)",
-                time.time() - start_time,
-                timestamp
-            )
 import gradio as gr
 import numpy as np
 import torch
@@ -243,14 +69,22 @@ except ImportError:
     PLOTLY_AVAILABLE = False
     print("⚠️ Plotly not available")
-# PANNs imports
 try:
-    from panns_inference import AudioTagging, labels
     PANNS_AVAILABLE = True
-    print("✅ PANNs available")
 except ImportError:
-    PANNS_AVAILABLE = False
-    print("⚠️ PANNs not available, using fallback")
 # Transformers for AST
 try:
@@ -264,6 +98,25 @@ except ImportError:
 print("🚀 Creating Real-time VAD Demo...")
 # ===== DATA STRUCTURES =====
 @dataclass
@@ -403,10 +256,20 @@ class OptimizedWebRTCVAD:
             return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
 class OptimizedEPANNs:
     def __init__(self):
         self.model_name = "E-PANNs"
         self.sample_rate = 32000
         print(f"✅ {self.model_name} initialized")
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
@@ -436,7 +299,7 @@ class OptimizedEPANNs:
             audio_window = audio[start_idx:end_idx]
-            # Convert audio to target sample rate for E-PANNs
             if LIBROSA_AVAILABLE:
                 # Resample to E-PANNs sample rate
                 audio_resampled = librosa.resample(audio_window.astype(float),
@@ -450,30 +313,54 @@ class OptimizedEPANNs:
                     num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                     audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
-                # Compute features
-                mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
-                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
-                # Use actual non-repeated audio for some features
-                actual_audio_len = min(len(audio_resampled), int(len(audio_window) * self.sample_rate / 16000))
-                actual_audio = audio_resampled[:actual_audio_len]
-                spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
-                mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
-                mfcc_var = np.var(mfcc, axis=1).mean()
-                zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
-                # Adjusted scaling for better speech detection
-                energy_score = np.clip((energy + 80) / 40, 0, 1)
-                centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
-                mfcc_score = np.clip(mfcc_var / 100, 0, 1)
-                zcr_score = np.clip(zcr * 10, 0, 1)
-                # Weighted combination
-                speech_score = (energy_score * 0.4 +
-                              centroid_score * 0.2 +
-                              mfcc_score * 0.3 +
-                              zcr_score * 0.1)
             else:
                 from scipy import signal
                 # Basic fallback without librosa
@@ -493,29 +380,44 @@ class OptimizedEPANNs:
             return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
 class OptimizedPANNs:
     def __init__(self):
         self.model_name = "PANNs"
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.load_model()
     def load_model(self):
         try:
             if PANNS_AVAILABLE:
-                self.model = AudioTagging(checkpoint_path=None, device=self.device)
-                print(f"✅ {self.model_name} loaded successfully")
             else:
                 print(f"⚠️ {self.model_name} not available, using fallback")
                 self.model = None
         except Exception as e:
             print(f"❌ Error loading {self.model_name}: {e}")
             self.model = None
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
-        if self.model is None or len(audio) == 0:
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
@@ -579,48 +481,86 @@ class OptimizedPANNs:
                 audio_resampled = audio_repeated
-            # Run inference
-            clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
-            # Enhanced speech detection using multiple relevant labels
-            speech_keywords = [
-                'speech', 'voice', 'talk', 'conversation', 'speaking',
-                'male speech', 'female speech', 'child speech',
-                'narration', 'monologue'
-            ]
-            speech_indices = []
-            for i, lbl in enumerate(labels):
-                if any(word in lbl.lower() for word in speech_keywords):
-                    speech_indices.append(i)
-            # Also get silence/noise indices for contrast
-            noise_keywords = ['silence', 'white noise', 'pink noise']
-            noise_indices = []
-            for i, lbl in enumerate(labels):
-                if any(word in lbl.lower() for word in noise_keywords):
-                    noise_indices.append(i)
-            if speech_indices:
-                # Get speech probability
-                speech_probs = clip_probs[0, speech_indices]
-                speech_prob = np.max(speech_probs)  # Use max instead of mean for better detection
-                # Get noise probability for contrast
-                if noise_indices:
-                    noise_prob = np.mean(clip_probs[0, noise_indices])
-                    # Adjust speech probability based on noise
-                    speech_prob = speech_prob * (1 - noise_prob * 0.5)
-                # If using repeated audio, scale confidence based on original length
-                if len(audio_window) < 16000 * 2:  # Less than 2 seconds
-                    confidence_scale = len(audio_window) / (16000 * 2)
-                    speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
-            else:
-                # Fallback if no speech indices found
-                top_indices = np.argsort(clip_probs[0])[-10:]
-                speech_prob = np.mean(clip_probs[0, top_indices])
             return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
@@ -639,9 +579,10 @@ class OptimizedPANNs:
             return VADResult(probability, is_speech, f"{self.model_name} (error)", time.time() - start_time, timestamp)
 class OptimizedAST:
     def __init__(self):
         self.model_name = "AST"
-        self.sample_rate = 16000
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -720,59 +661,49 @@ class OptimizedAST:
                 audio = audio.mean(axis=1)
                 print(f"🔄 AST: Converted to mono")
-            # Use longer context for AST - preferably 6.4 seconds (1024 frames)
-            if full_audio is not None and len(full_audio) >= 6.4 * self.sample_rate:
-                print(f"✅ AST: Using full audio context")
-                # Take 6.4-second window centered around current timestamp
-                center_pos = int(timestamp * self.sample_rate)
-                window_size = int(3.2 * self.sample_rate)  # 3.2 seconds each side
-                start_pos = max(0, center_pos - window_size)
-                end_pos = min(len(full_audio), center_pos + window_size)
-                # Ensure we have at least 6.4 seconds
-                if end_pos - start_pos < 6.4 * self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + int(6.4 * self.sample_rate))
-                    if end_pos - start_pos < 6.4 * self.sample_rate:
-                        start_pos = max(0, end_pos - int(6.4 * self.sample_rate))
-                audio_for_ast = full_audio[start_pos:end_pos]
-                print(f"🔄 AST: Extracted window [{start_pos}:{end_pos}], len={len(audio_for_ast)}")
-            else:
-                print(f"⚠️ AST: Using provided audio chunk")
-                audio_for_ast = audio
             # For short audio, use intelligent strategy
-            min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
             if len(audio_for_ast) < min_samples:
-                print(f"⚠️ AST: Audio too short ({len(audio_for_ast)} samples), using cyclic repetition")
-                # Repeat the audio cyclically to maintain temporal patterns
-                num_repeats = int(np.ceil(min_samples / len(audio_for_ast)))
-                audio_repeated = np.tile(audio_for_ast, num_repeats)[:min_samples]
-                # Apply smooth transitions at repetition boundaries
-                fade_samples = int(0.01 * self.sample_rate)  # 10ms fade
-                for i in range(1, num_repeats):
-                    if i * len(audio_for_ast) < len(audio_repeated):
-                        start_idx = i * len(audio_for_ast) - fade_samples
-                        end_idx = i * len(audio_for_ast) + fade_samples
-                        if start_idx >= 0 and end_idx < len(audio_repeated):
-                            audio_repeated[start_idx:end_idx] *= np.linspace(1, 1, 2 * fade_samples)
-                audio_for_ast = audio_repeated
-                print(f"✅ AST: Repeated with smoothing, final_len={len(audio_for_ast)}")
-            # Truncate if too long (AST can handle up to ~10s, but we'll use 8s max for efficiency)
-            max_samples = 8 * self.sample_rate
             if len(audio_for_ast) > max_samples:
                 audio_for_ast = audio_for_ast[:max_samples]
                 print(f"✂️ AST: Truncated to {len(audio_for_ast)} samples")
             print(f"🔄 AST: Feature extraction...")
-            # Feature extraction with proper AST parameters (closer to 1024 frames)
             inputs = self.feature_extractor(
                 audio_for_ast,
-                sampling_rate=self.sample_rate,
                 return_tensors="pt",
                 max_length=1024,  # Proper AST context
                 padding="max_length",  # Ensure consistent length
@@ -896,7 +827,7 @@ class AudioProcessor:
             "WebRTC-VAD": 0.03,   # 30ms frames (480 samples)
             "E-PANNs": 6.0,       # 6 seconds minimum for reliable results
             "PANNs": 10.0,        # 10 seconds for optimal performance
-            "AST": 6.4            # ~6.4 seconds (1024 frames * 6.25ms)
         }
         # Model-specific hop sizes for efficiency

 import gradio as gr
 import numpy as np
 import torch
     PLOTLY_AVAILABLE = False
     print("⚠️ Plotly not available")
+# PANNs imports - UPDATED to include SoundEventDetection
 try:
+    from panns_inference import AudioTagging, SoundEventDetection, labels
     PANNS_AVAILABLE = True
+    PANNS_SED_AVAILABLE = True
+    print("✅ PANNs available with SoundEventDetection")
 except ImportError:
+    try:
+        from panns_inference import AudioTagging, labels
+        PANNS_AVAILABLE = True
+        PANNS_SED_AVAILABLE = False
+        print("✅ PANNs available (AudioTagging only)")
+    except ImportError:
+        PANNS_AVAILABLE = False
+        PANNS_SED_AVAILABLE = False
+        print("⚠️ PANNs not available, using fallback")
 # Transformers for AST
 try:
 print("🚀 Creating Real-time VAD Demo...")
+# ===== HELPER FUNCTIONS FOR CORRECTED MODELS =====
+def safe_resample(x, sr_in, sr_out):
+    """Safely resample audio from sr_in to sr_out"""
+    if sr_in == sr_out:
+        return x.astype(np.float32)
+    try:
+        if LIBROSA_AVAILABLE:
+            return librosa.resample(x.astype(float), orig_sr=sr_in, target_sr=sr_out)
+        else:
+            # Fallback linear interpolation
+            dur = len(x) / sr_in
+            n_out = max(1, int(round(dur * sr_out)))
+            xi = np.linspace(0, len(x)-1, num=len(x))
+            xo = np.linspace(0, len(x)-1, num=n_out)
+            return np.interp(xo, xi, x).astype(np.float32)
+    except Exception as e:
+        print(f"Resample error: {e}")
+        return x.astype(np.float32)
 # ===== DATA STRUCTURES =====
 @dataclass
             return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
 class OptimizedEPANNs:
+    """CORRECTED E-PANNs with proper temporal resolution using sliding windows"""
     def __init__(self):
         self.model_name = "E-PANNs"
         self.sample_rate = 32000
         print(f"✅ {self.model_name} initialized")
+        # Try to load PANNs AudioTagging as backend for E-PANNs
+        self.at_model = None
+        if PANNS_AVAILABLE:
+            try:
+                self.at_model = AudioTagging(checkpoint_path=None, device='cpu')
+                print(f"✅ {self.model_name} using PANNs AT backend")
+            except Exception as e:
+                print(f"⚠️ {self.model_name} PANNs AT unavailable: {e}")
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
             audio_window = audio[start_idx:end_idx]
+            # Convert audio to target sample rate for E-PANNs (32kHz)
             if LIBROSA_AVAILABLE:
                 # Resample to E-PANNs sample rate
                 audio_resampled = librosa.resample(audio_window.astype(float),
                     num_repeats = int(np.ceil(min_samples / len(audio_resampled)))
                     audio_resampled = np.tile(audio_resampled, num_repeats)[:min_samples]
+                # If we have PANNs AT model, use it
+                if self.at_model is not None:
+                    # Run inference
+                    clipwise_output, _ = self.at_model.inference(audio_resampled[np.newaxis, :])
+                    # Get speech-related classes
+                    speech_keywords = [
+                        'speech', 'voice', 'talk', 'conversation', 'speaking',
+                        'male speech', 'female speech', 'child speech',
+                        'narration', 'monologue'
+                    ]
+                    speech_indices = []
+                    for i, lbl in enumerate(labels):
+                        if any(word in lbl.lower() for word in speech_keywords):
+                            speech_indices.append(i)
+                    if speech_indices:
+                        speech_probs = clipwise_output[0, speech_indices]
+                        speech_score = float(np.max(speech_probs))
+                    else:
+                        speech_score = float(np.max(clipwise_output[0]))
+                else:
+                    # Fallback to spectral features
+                    # Compute features
+                    mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
+                    energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
+                    # Use actual non-repeated audio for some features
+                    actual_audio_len = min(len(audio_resampled), int(len(audio_window) * self.sample_rate / 16000))
+                    actual_audio = audio_resampled[:actual_audio_len]
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=actual_audio, sr=self.sample_rate))
+                    mfcc = librosa.feature.mfcc(y=actual_audio, sr=self.sample_rate, n_mfcc=13)
+                    mfcc_var = np.var(mfcc, axis=1).mean()
+                    zcr = np.mean(librosa.feature.zero_crossing_rate(actual_audio))
+                    # Adjusted scaling for better speech detection
+                    energy_score = np.clip((energy + 80) / 40, 0, 1)
+                    centroid_score = np.clip((spectral_centroid - 200) / 3000, 0, 1)
+                    mfcc_score = np.clip(mfcc_var / 100, 0, 1)
+                    zcr_score = np.clip(zcr * 10, 0, 1)
+                    # Weighted combination
+                    speech_score = (energy_score * 0.4 +
+                                  centroid_score * 0.2 +
+                                  mfcc_score * 0.3 +
+                                  zcr_score * 0.1)
             else:
                 from scipy import signal
                 # Basic fallback without librosa
             return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
 class OptimizedPANNs:
+    """CORRECTED PANNs with SoundEventDetection for framewise output when available"""
     def __init__(self):
         self.model_name = "PANNs"
         self.sample_rate = 32000
         self.model = None
+        self.sed_model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         self.load_model()
     def load_model(self):
         try:
             if PANNS_AVAILABLE:
+                # Try to load SED model first for framewise output
+                if PANNS_SED_AVAILABLE:
+                    try:
+                        self.sed_model = SoundEventDetection(checkpoint_path=None, device=self.device)
+                        print(f"✅ {self.model_name} SED loaded successfully (framewise mode)")
+                    except Exception as e:
+                        print(f"⚠️ {self.model_name} SED initialization failed: {e}")
+                        self.sed_model = None
+                # Load AudioTagging as fallback or primary
+                if self.sed_model is None:
+                    self.model = AudioTagging(checkpoint_path=None, device=self.device)
+                    print(f"✅ {self.model_name} AT loaded successfully")
             else:
                 print(f"⚠️ {self.model_name} not available, using fallback")
                 self.model = None
+                self.sed_model = None
         except Exception as e:
             print(f"❌ Error loading {self.model_name}: {e}")
             self.model = None
+            self.sed_model = None
     def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
         start_time = time.time()
+        if (self.model is None and self.sed_model is None) or len(audio) == 0:
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
                 threshold = 0.01
                 audio_resampled = audio_repeated
+            # Use SED for framewise predictions if available
+            if self.sed_model is not None:
+                # SED gives framewise output
+                framewise_output = self.sed_model.inference(audio_resampled[np.newaxis, :])
+                if hasattr(framewise_output, 'cpu'):
+                    framewise_output = framewise_output.cpu().numpy()
+                if framewise_output.ndim == 3:
+                    framewise_output = framewise_output[0]  # Remove batch dimension
+                # Get frame corresponding to timestamp
+                audio_duration = len(audio_resampled) / self.sample_rate
+                if audio_duration > 0:
+                    frame_idx = int((timestamp % audio_duration) / audio_duration * framewise_output.shape[0])
+                    frame_idx = min(frame_idx, framewise_output.shape[0] - 1)
+                else:
+                    frame_idx = 0
+                # Get speech-related classes
+                speech_keywords = [
+                    'speech', 'voice', 'talk', 'conversation', 'speaking',
+                    'male speech', 'female speech', 'child speech',
+                    'narration', 'monologue'
+                ]
+                speech_indices = []
+                for i, lbl in enumerate(labels):
+                    if any(word in lbl.lower() for word in speech_keywords):
+                        speech_indices.append(i)
+                if speech_indices and frame_idx < framewise_output.shape[0]:
+                    speech_probs = framewise_output[frame_idx, speech_indices]
+                    speech_prob = float(np.max(speech_probs))
+                else:
+                    speech_prob = float(np.max(framewise_output[frame_idx])) if frame_idx < framewise_output.shape[0] else 0.0
+            else:
+                # Use AudioTagging model
+                # Run inference
+                clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :])
+                # Enhanced speech detection using multiple relevant labels
+                speech_keywords = [
+                    'speech', 'voice', 'talk', 'conversation', 'speaking',
+                    'male speech', 'female speech', 'child speech',
+                    'narration', 'monologue'
+                ]
+                speech_indices = []
+                for i, lbl in enumerate(labels):
+                    if any(word in lbl.lower() for word in speech_keywords):
+                        speech_indices.append(i)
+                # Also get silence/noise indices for contrast
+                noise_keywords = ['silence', 'white noise', 'pink noise']
+                noise_indices = []
+                for i, lbl in enumerate(labels):
+                    if any(word in lbl.lower() for word in noise_keywords):
+                        noise_indices.append(i)
+                if speech_indices:
+                    # Get speech probability
+                    speech_probs = clip_probs[0, speech_indices]
+                    speech_prob = np.max(speech_probs)  # Use max instead of mean for better detection
+                    # Get noise probability for contrast
+                    if noise_indices:
+                        noise_prob = np.mean(clip_probs[0, noise_indices])
+                        # Adjust speech probability based on noise
+                        speech_prob = speech_prob * (1 - noise_prob * 0.5)
+                    # If using repeated audio, scale confidence based on original length
+                    if len(audio_window) < 16000 * 2:  # Less than 2 seconds
+                        confidence_scale = len(audio_window) / (16000 * 2)
+                        speech_prob = speech_prob * (0.5 + 0.5 * confidence_scale)
+                else:
+                    # Fallback if no speech indices found
+                    top_indices = np.argsort(clip_probs[0])[-10:]
+                    speech_prob = np.mean(clip_probs[0, top_indices])
             return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
             return VADResult(probability, is_speech, f"{self.model_name} (error)", time.time() - start_time, timestamp)
 class OptimizedAST:
+    """CORRECTED AST with proper 16kHz sample rate and sliding windows"""
     def __init__(self):
         self.model_name = "AST"
+        self.sample_rate = 16000  # AST REQUIRES 16kHz
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
                 audio = audio.mean(axis=1)
                 print(f"🔄 AST: Converted to mono")
+            # CRITICAL FIX: AST uses 16kHz, but input is already at 16kHz
+            # So we DON'T need to resample, just ensure it's float32
+            audio = audio.astype(np.float32)
+            # Use sliding window approach for temporal resolution
+            window_duration = 1.0  # 1 second windows
+            window_samples = int(window_duration * self.sample_rate)
+            # Get window for this timestamp
+            center_sample = int(timestamp * self.sample_rate)
+            half_window = window_samples // 2
+            start_idx = max(0, center_sample - half_window)
+            end_idx = min(len(audio), start_idx + window_samples)
+            # Adjust if at the end
+            if end_idx == len(audio) and end_idx - start_idx < window_samples:
+                start_idx = max(0, end_idx - window_samples)
+            audio_for_ast = audio[start_idx:end_idx]
+            print(f"🔄 AST: Extracted window [{start_idx}:{end_idx}], len={len(audio_for_ast)}")
             # For short audio, use intelligent strategy
+            min_samples = int(1.0 * self.sample_rate)  # 1 second minimum
             if len(audio_for_ast) < min_samples:
+                print(f"⚠️ AST: Audio too short ({len(audio_for_ast)} samples), padding")
+                # Pad with zeros
+                audio_padded = np.zeros(min_samples)
+                audio_padded[:len(audio_for_ast)] = audio_for_ast
+                audio_for_ast = audio_padded
+                print(f"✅ AST: Padded to {len(audio_for_ast)} samples")
+            # Truncate if too long (AST can handle up to ~10s, but we use 1s windows)
+            max_samples = int(1.5 * self.sample_rate)
             if len(audio_for_ast) > max_samples:
                 audio_for_ast = audio_for_ast[:max_samples]
                 print(f"✂️ AST: Truncated to {len(audio_for_ast)} samples")
             print(f"🔄 AST: Feature extraction...")
+            # Feature extraction with proper AST parameters
             inputs = self.feature_extractor(
                 audio_for_ast,
+                sampling_rate=self.sample_rate,  # Must be 16kHz
                 return_tensors="pt",
                 max_length=1024,  # Proper AST context
                 padding="max_length",  # Ensure consistent length
             "WebRTC-VAD": 0.03,   # 30ms frames (480 samples)
             "E-PANNs": 6.0,       # 6 seconds minimum for reliable results
             "PANNs": 10.0,        # 10 seconds for optimal performance
+            "AST": 1.0            # Changed to 1 second for better temporal resolution
         }
         # Model-specific hop sizes for efficiency