Gabriel Bibbó committed
Commit a21e04b · 1 Parent(s): 43be67f

GitHub-faithful implementation - 32kHz, 2048 FFT, per-model delays, 80ms gaps

Files changed (1):
  1. app.py +165 -374
app.py CHANGED
@@ -101,6 +101,10 @@ class OptimizedSileroVAD:
101
  print(f"❌ Error loading {self.model_name}: {e}")
102
  self.model = None
103
 
104
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
105
  start_time = time.time()
106
 
@@ -108,21 +112,11 @@ class OptimizedSileroVAD:
108
  return VADResult(0.0, False, f"{self.model_name} (unavailable)", time.time() - start_time, timestamp)
109
 
110
  try:
111
- if len(audio.shape) > 1:
112
- audio = audio.mean(axis=1)
113
-
114
- # Silero expects chunks of 512 samples for 16kHz
115
- required_samples = 512
116
- if len(audio) != required_samples:
117
- if len(audio) > required_samples:
118
- start_idx = (len(audio) - required_samples) // 2
119
- audio_chunk = audio[start_idx:start_idx + required_samples]
120
- else:
121
- audio_chunk = np.pad(audio, (0, required_samples - len(audio)), 'constant')
122
- else:
123
- audio_chunk = audio
124
 
125
- audio_tensor = torch.FloatTensor(audio_chunk).unsqueeze(0)
 
 
126
 
127
  with torch.no_grad():
128
  speech_prob = self.model(audio_tensor, self.sample_rate).item()
@@ -133,45 +127,35 @@ class OptimizedSileroVAD:
133
  return VADResult(speech_prob, is_speech, self.model_name, processing_time, timestamp)
134
 
135
  except Exception as e:
136
- print(f"Error in {self.model_name}: {e}")
137
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
138
 
139
  class OptimizedWebRTCVAD:
140
  def __init__(self):
141
  self.model_name = "WebRTC-VAD"
142
  self.sample_rate = 16000
143
- self.frame_duration = 30 # Valid frame size: 10, 20, or 30 ms
144
  self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
145
 
146
  if WEBRTC_AVAILABLE:
147
  try:
148
- self.vad = webrtcvad.Vad(3) # Aggressiveness level 3
149
  print(f"✅ {self.model_name} loaded successfully")
150
- except:
151
- self.vad = None
152
- else:
153
- self.vad = None
154
 
155
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
156
  start_time = time.time()
157
 
158
  if self.vad is None or len(audio) == 0:
159
- energy = np.sum(audio ** 2) if len(audio) > 0 else 0
160
- threshold = 0.01
161
- probability = min(energy / threshold, 1.0)
162
- is_speech = energy > threshold
163
- return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
164
 
165
  try:
166
- if len(audio.shape) > 1:
167
- audio = audio.mean(axis=1)
168
-
169
  audio_int16 = (audio * 32767).astype(np.int16)
170
 
171
- speech_frames = 0
172
- total_frames = 0
173
 
174
- # Corrected loop to process the last complete frame
175
  for i in range(0, len(audio_int16) - self.frame_size + 1, self.frame_size):
176
  frame = audio_int16[i:i + self.frame_size].tobytes()
177
  if self.vad.is_speech(frame, self.sample_rate):
@@ -179,48 +163,37 @@ class OptimizedWebRTCVAD:
179
  total_frames += 1
180
 
181
  probability = speech_frames / max(total_frames, 1)
182
- is_speech = probability > 0.3 # Default threshold for WebRTC
183
 
184
  return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
185
 
186
  except Exception as e:
187
- print(f"Error in {self.model_name}: {e}")
188
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
189
 
190
  class OptimizedEPANNs:
191
  def __init__(self):
192
  self.model_name = "E-PANNs"
193
- self.sample_rate = 16000 # Works with the main sample rate
194
  print(f"✅ {self.model_name} initialized")
195
 
196
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
197
  start_time = time.time()
 
198
 
199
  try:
200
- if len(audio) == 0:
201
- return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
202
-
203
- if len(audio.shape) > 1:
204
- audio = audio.mean(axis=1)
205
-
206
  if LIBROSA_AVAILABLE:
207
  mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=64)
208
  energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
209
- spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
210
- speech_score = (energy + 100) / 50 + spectral_centroid / 10000
211
  else:
212
  from scipy import signal
213
- f, t, Sxx = signal.spectrogram(audio, self.sample_rate)
214
  energy = np.mean(10 * np.log10(Sxx + 1e-10))
215
- speech_score = (energy + 100) / 50
216
-
217
  probability = np.clip(speech_score, 0, 1)
218
- is_speech = probability > 0.6
219
-
220
- return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
221
 
 
222
  except Exception as e:
223
- print(f"Error in {self.model_name}: {e}")
224
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
225
 
226
  class OptimizedPANNs:
@@ -237,47 +210,32 @@ class OptimizedPANNs:
237
  if PANNS_AVAILABLE:
238
  self.model = AudioTagging(checkpoint_path=None, device=self.device)
239
  print(f"✅ {self.model_name} loaded successfully")
240
- else:
241
- print(f"⚠️ {self.model_name} not available, using fallback")
242
- self.model = None
243
  except Exception as e:
244
  print(f"❌ Error loading {self.model_name}: {e}")
245
  self.model = None
246
 
247
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
248
- if timestamp > 0 and self.cached_clip_prob is not None:
249
- return VADResult(self.cached_clip_prob,
250
- self.cached_clip_prob > 0.5,
251
- self.model_name, 0.0, timestamp)
252
 
253
  start_time = time.time()
254
-
255
  if self.model is None or len(audio) == 0:
256
  return VADResult(0.0, False, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
257
 
258
  try:
259
- if len(audio.shape) > 1:
260
- audio = audio.mean(axis=1)
261
-
262
- # Correctly calculate probability using all speech-related labels
263
- clip_probs, _ = self.model.inference(audio[np.newaxis, :],
264
- input_sr=self.sample_rate) # API 1.3
265
-
266
- speech_idx = [i for i, lbl in enumerate(labels)
267
- if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
268
- if not speech_idx:
269
- speech_idx = [labels.index('Speech')]
270
 
271
  speech_prob = clip_probs[0, speech_idx].mean().item()
272
  self.cached_clip_prob = float(speech_prob)
273
- return VADResult(self.cached_clip_prob,
274
- self.cached_clip_prob > 0.5,
275
- self.model_name,
276
- time.time() - start_time,
277
- timestamp)
278
 
 
279
  except Exception as e:
280
- print(f"Error in {self.model_name}: {e}")
281
  return VADResult(0.0, False, f"{self.model_name} (error)", time.time() - start_time, timestamp)
282
 
283
  class OptimizedAST:
@@ -293,56 +251,37 @@ class OptimizedAST:
293
  def load_model(self):
294
  try:
295
  if AST_AVAILABLE:
296
- model_name = "MIT/ast-finetuned-audioset-10-10-0.4593"
297
- self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
298
- self.model = ASTForAudioClassification.from_pretrained(model_name)
299
- self.model.to(self.device)
300
- self.model.eval()
301
  print(f"✅ {self.model_name} loaded successfully")
302
- else:
303
- print(f"⚠️ {self.model_name} not available, using fallback")
304
- self.model = None
305
  except Exception as e:
306
  print(f"❌ Error loading {self.model_name}: {e}")
307
  self.model = None
308
 
309
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
310
- if timestamp > 0 and self.cached_clip_prob is not None:
311
- return VADResult(self.cached_clip_prob,
312
- self.cached_clip_prob > 0.5,
313
- self.model_name, 0.0, timestamp)
314
 
315
  start_time = time.time()
316
-
317
- if self.model is None or len(audio) == 0:
318
  return VADResult(0.0, False, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
319
 
320
  try:
321
- if len(audio.shape) > 1:
322
- audio = audio.mean(axis=1)
323
-
324
- inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
325
- inputs = {k: v.to(self.device) for k, v in inputs.items()}
326
-
327
  with torch.no_grad():
328
- outputs = self.model(**inputs)
329
- logits = outputs.logits
330
- probs = torch.sigmoid(logits)
331
 
332
- # Correctly calculate probability using the model's label mapping
333
  label2id = self.model.config.label2id
334
- speech_idx = [idx for lbl, idx in label2id.items()
335
- if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
336
  speech_prob = probs[0, speech_idx].mean().item()
337
  self.cached_clip_prob = float(speech_prob)
338
- return VADResult(self.cached_clip_prob,
339
- self.cached_clip_prob > 0.5,
340
- self.model_name,
341
- time.time() - start_time,
342
- timestamp)
343
 
 
344
  except Exception as e:
345
- print(f"Error in {self.model_name}: {e}")
346
  return VADResult(0.0, False, f"{self.model_name} (error)", time.time() - start_time, timestamp)
347
 
348
  # ===== AUDIO PROCESSOR =====
@@ -351,333 +290,185 @@ class AudioProcessor:
351
  def __init__(self, sample_rate=16000):
352
  self.sample_rate = sample_rate
353
 
354
- # Corrected STFT parameters for better temporal resolution
355
- self.n_fft = 1024 # 64 ms window @ 16 kHz
356
- self.hop_length = 256 # 16 ms hop (win/4), Librosa recommendation
357
  self.n_mels = 128
358
  self.fmin = 20
359
  self.fmax = 8000
360
 
361
- # Corrected windowing for lightweight models
362
- self.window_size = 0.048 # 48 ms
363
- self.hop_size = 0.024 # 24 ms
364
-
365
- self.delay_compensation = 0.0
366
- self.correlation_threshold = 0.7
367
-
368
  def process_audio(self, audio):
369
- if audio is None:
370
- return np.array([])
371
-
372
  try:
373
- if isinstance(audio, tuple):
374
- sample_rate, audio_data = audio
375
- if sample_rate != self.sample_rate and LIBROSA_AVAILABLE:
376
- audio_data = librosa.resample(audio_data.astype(float),
377
- orig_sr=sample_rate,
378
- target_sr=self.sample_rate)
379
- else:
380
- audio_data = audio
381
-
382
- if len(audio_data.shape) > 1:
383
- audio_data = audio_data.mean(axis=1)
384
-
385
- if np.max(np.abs(audio_data)) > 0:
386
- audio_data = audio_data / np.max(np.abs(audio_data))
387
-
388
  return audio_data
389
-
390
  except Exception as e:
391
- print(f"Audio processing error: {e}")
392
  return np.array([])
393
 
394
  def compute_high_res_spectrogram(self, audio_data):
395
  try:
396
  if LIBROSA_AVAILABLE and len(audio_data) > 0:
397
- stft = librosa.stft(
398
- audio_data,
399
- n_fft=self.n_fft,
400
- hop_length=self.hop_length,
401
- win_length=self.n_fft,
402
- window='hann',
403
- center=False
404
- )
405
-
406
- power_spec = np.abs(stft) ** 2
407
-
408
- mel_basis = librosa.filters.mel(
409
- sr=self.sample_rate,
410
- n_fft=self.n_fft,
411
- n_mels=self.n_mels,
412
- fmin=self.fmin,
413
- fmax=self.fmax
414
- )
415
-
416
- mel_spec = np.dot(mel_basis, power_spec)
417
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
418
-
419
- time_frames = np.arange(mel_spec_db.shape[1]) * self.hop_length / self.sample_rate
420
-
421
  return mel_spec_db, time_frames
422
- else: # Fallback if Librosa is not available
423
- from scipy import signal
424
- f, t, Sxx = signal.spectrogram(
425
- audio_data,
426
- self.sample_rate,
427
- nperseg=self.n_fft,
428
- noverlap=self.n_fft - self.hop_length,
429
- window='hann'
430
- )
431
- mel_spec_db = 10 * np.log10(Sxx + 1e-10)
432
- return mel_spec_db, t
433
-
434
  except Exception as e:
435
- print(f"Spectrogram computation error: {e}")
436
- dummy_spec = np.zeros((self.n_mels, 200))
437
- dummy_time = np.linspace(0, len(audio_data) / self.sample_rate, 200)
438
- return dummy_spec, dummy_time
439
 
440
  def detect_onset_offset_advanced(self, vad_results: List[VADResult], threshold: float = 0.5) -> List[OnsetOffset]:
441
  onsets_offsets = []
 
442
 
443
- if len(vad_results) < 2:
444
- return onsets_offsets
445
-
446
- models = {}
447
- for result in vad_results:
448
- if result.model_name not in models:
449
- models[result.model_name] = []
450
- models[result.model_name].append(result)
451
-
452
- for model_name, results in models.items():
453
- if len(results) < 2:
454
- continue
455
-
456
- results.sort(key=lambda x: x.timestamp)
457
 
458
  timestamps = np.array([r.timestamp for r in results])
459
  probabilities = np.array([r.probability for r in results])
460
 
461
- # Hysteresis thresholding
462
- upper_thresh = threshold + 0.1
463
- lower_thresh = threshold - 0.1
464
-
465
- in_speech_segment = False
466
- current_onset_time = -1
467
-
468
- for i in range(len(results)):
469
- curr_prob = probabilities[i]
470
- curr_time = timestamps[i]
471
 
472
- if not in_speech_segment and curr_prob > upper_thresh:
473
- in_speech_segment = True
474
- current_onset_time = curr_time - self.delay_compensation
475
-
476
- elif in_speech_segment and curr_prob < lower_thresh:
477
- in_speech_segment = False
478
- if current_onset_time >= 0:
479
- offset_time = curr_time - self.delay_compensation
480
- onsets_offsets.append(OnsetOffset(
481
- onset_time=max(0, current_onset_time),
482
- offset_time=offset_time,
483
- model_name=model_name,
484
- confidence=np.mean(probabilities[(timestamps >= current_onset_time) & (timestamps <= offset_time)])
485
- ))
486
- current_onset_time = -1
487
-
488
- if in_speech_segment and current_onset_time >= 0:
489
- onsets_offsets.append(OnsetOffset(
490
- onset_time=max(0, current_onset_time),
491
- offset_time=timestamps[-1],
492
- model_name=model_name,
493
- confidence=np.mean(probabilities[timestamps >= current_onset_time])
494
- ))
495
-
496
  return onsets_offsets
497
 
498
- # ===== ENHANCED VISUALIZATION =====
499
 
500
  def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
501
  onsets_offsets: List[OnsetOffset], processor: AudioProcessor,
502
  model_a: str, model_b: str, threshold: float):
503
 
504
- if not PLOTLY_AVAILABLE or audio_data is None or len(audio_data) == 0:
505
- return go.Figure().update_layout(title="No data to display")
 
 
506
 
507
- try:
508
- mel_spec_db, time_frames = processor.compute_high_res_spectrogram(audio_data)
509
- freq_axis = np.linspace(processor.fmin, processor.fmax, processor.n_mels)
510
-
511
- fig = make_subplots(
512
- rows=2, cols=1,
513
- subplot_titles=(f"Model A: {model_a}", f"Model B: {model_b}"),
514
- vertical_spacing=0.05,
515
- shared_xaxes=True,
516
- specs=[[{"secondary_y": True}], [{"secondary_y": True}]]
517
- )
518
-
519
- # Shared heatmap settings
520
- heatmap_args = dict(
521
- z=mel_spec_db, x=time_frames, y=freq_axis,
522
- colorscale='Viridis', showscale=False,
523
- hovertemplate='Time: %{x:.2f}s<br>Freq: %{y:.0f}Hz<br>Power: %{z:.1f}dB<extra></extra>'
524
- )
525
-
526
- fig.add_trace(go.Heatmap(**heatmap_args, name=f'Spectrogram {model_a}'), row=1, col=1)
527
- fig.add_trace(go.Heatmap(**heatmap_args, name=f'Spectrogram {model_b}'), row=2, col=1)
528
-
529
- # Data separation
530
- model_a_data = {'times': [], 'probs': []}
531
- model_b_data = {'times': [], 'probs': []}
532
- for r in vad_results:
533
- if r.model_name.startswith(model_a):
534
- model_a_data['times'].append(r.timestamp)
535
- model_a_data['probs'].append(r.probability)
536
- elif r.model_name.startswith(model_b):
537
- model_b_data['times'].append(r.timestamp)
538
- model_b_data['probs'].append(r.probability)
539
-
540
- # Plotting probability curves on secondary Y-axis
541
- if model_a_data['times']:
542
- fig.add_trace(go.Scatter(x=model_a_data['times'], y=model_a_data['probs'], mode='lines',
543
- line=dict(color='yellow', width=3), name=f'{model_a} Probability'),
544
- row=1, col=1, secondary_y=True)
545
- if model_b_data['times']:
546
- fig.add_trace(go.Scatter(x=model_b_data['times'], y=model_b_data['probs'], mode='lines',
547
- line=dict(color='orange', width=3), name=f'{model_b} Probability'),
548
- row=2, col=1, secondary_y=True)
549
-
550
- # Onset/Offset markers
551
- for event in onsets_offsets:
552
- row_num = 1 if event.model_name.startswith(model_a) else 2 if event.model_name.startswith(model_b) else None
553
- if row_num:
554
- fig.add_vline(x=event.onset_time, line=dict(color='lime', width=3), annotation_text='▲', annotation_position="top", row=row_num, col=1)
555
- fig.add_vline(x=event.offset_time, line=dict(color='red', width=3), annotation_text='▼', annotation_position="bottom", row=row_num, col=1)
556
-
557
- # Layout and styling
558
- fig.update_layout(
559
- height=600, title_text="Real-Time Speech Visualizer", showlegend=True,
560
- legend=dict(x=1.05, y=1), plot_bgcolor='black', paper_bgcolor='white'
561
- )
562
- fig.update_xaxes(title_text="Time (seconds)", row=2, col=1)
563
- fig.update_yaxes(title_text="Frequency (Hz)", range=[processor.fmin, processor.fmax], row=1, col=1, secondary_y=False)
564
- fig.update_yaxes(title_text="Frequency (Hz)", range=[processor.fmin, processor.fmax], row=2, col=1, secondary_y=False)
565
-
566
- # Correctly configure secondary axes
567
- fig.update_yaxes(title_text="Probability", range=[0, 1], row=1, col=1, secondary_y=True)
568
- fig.update_yaxes(title_text="Probability", range=[0, 1], row=2, col=1, secondary_y=True)
569
-
570
- return fig
571
-
572
- except Exception as e:
573
- print(f"Visualization error: {e}")
574
- return go.Figure().update_layout(title=f"Visualization Error: {e}")
575
 
576
  # ===== MAIN APPLICATION =====
577
 
578
  class VADDemo:
579
  def __init__(self):
580
- print("🎤 Initializing Real-time VAD Demo with 5 models...")
581
  self.processor = AudioProcessor()
582
  self.models = {
583
- 'Silero-VAD': OptimizedSileroVAD(),
584
- 'WebRTC-VAD': OptimizedWebRTCVAD(),
585
- 'E-PANNs': OptimizedEPANNs(),
586
- 'PANNs': OptimizedPANNs(),
587
- 'AST': OptimizedAST()
588
  }
589
- print("🎤 Real-time VAD Demo initialized successfully")
590
 
591
  def process_audio_with_events(self, audio, model_a, model_b, threshold):
592
- if audio is None:
593
- return None, "🔇 No audio detected", "Ready to process audio..."
594
-
595
- try:
596
- # Reset cache for heavy models at the start of each new clip processing
597
- for m in ['PANNs', 'AST']:
598
- if m in self.models:
599
- self.models[m].cached_clip_prob = None
600
 
 
601
  processed_audio = self.processor.process_audio(audio)
602
- if len(processed_audio) == 0:
603
- return None, "🎵 Processing audio...", "No audio data processed"
604
 
605
- selected_models = list(set([model_a, model_b]))
606
-
607
- # Pre-compute heavy models once
608
- if 'PANNs' in selected_models:
609
- panns_model = self.models['PANNs']
610
- if LIBROSA_AVAILABLE:
611
- audio_32k = librosa.resample(processed_audio,
612
- orig_sr=self.processor.sample_rate,
613
- target_sr=panns_model.sample_rate)
614
- panns_model.predict(audio_32k, 0.0) # This populates the cache
615
-
616
- if 'AST' in selected_models:
617
- self.models['AST'].predict(processed_audio, 0.0) # This populates the cache
618
 
619
- # Process in windows
620
- window_samples = int(self.processor.sample_rate * self.processor.window_size)
621
- hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
622
  vad_results = []
623
 
624
- for i in range(0, len(processed_audio) - window_samples, hop_samples):
625
  timestamp = i / self.processor.sample_rate
626
- chunk = processed_audio[i:i + window_samples]
627
 
628
- for model_name in selected_models:
629
- result = self.models[model_name].predict(chunk, timestamp)
 
630
  result.is_speech = result.probability > threshold
631
  vad_results.append(result)
632
 
633
  onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
 
634
 
635
- fig = create_realtime_plot(processed_audio, vad_results, onsets_offsets,
636
- self.processor, model_a, model_b, threshold)
637
-
638
- status_msg = "🎙️ SPEECH DETECTED" if any(r.is_speech for r in vad_results) else "🔇 No speech detected"
639
-
640
- details_text = f"Analyzed {len(processed_audio)/self.processor.sample_rate:.2f}s of audio with threshold {threshold:.2f}."
641
 
642
  return fig, status_msg, details_text
643
-
644
  except Exception as e:
645
- print(f"Processing error: {e}")
646
  import traceback
 
647
  return None, f"❌ Error: {e}", traceback.format_exc()
648
 
649
- # Initialize demo
650
  demo_app = VADDemo()
651
-
652
- # ===== GRADIO INTERFACE =====
653
-
654
- def create_interface():
655
- with gr.Blocks(title="VAD Demo", theme=gr.themes.Soft()) as interface:
656
- gr.Markdown("# 🎤 VAD Demo: Real-time Speech Detection Framework v4")
657
-
658
- with gr.Row():
659
- with gr.Column(scale=1):
660
- gr.Markdown("### 🎛️ Controls")
661
- audio_input = gr.Audio(sources=["microphone"], type="numpy", label="Record or Upload Audio")
662
- model_a = gr.Dropdown(["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"], value="Silero-VAD", label="Model A (Top Panel)")
663
- model_b = gr.Dropdown(["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"], value="PANNs", label="Model B (Bottom Panel)")
664
- threshold_slider = gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="Detection Threshold")
665
- process_btn = gr.Button("Analyze", variant="primary")
666
-
667
- with gr.Column(scale=3):
668
- gr.Markdown("### 📊 Visualization Dashboard")
669
- plot_output = gr.Plot(label="VAD Analysis")
670
- status_display = gr.Textbox(label="Status", interactive=False)
671
- details_output = gr.Textbox(label="Details", lines=5, interactive=False)
672
-
673
- process_btn.click(
674
- fn=demo_app.process_audio_with_events,
675
- inputs=[audio_input, model_a, model_b, threshold_slider],
676
- outputs=[plot_output, status_display, details_output]
677
- )
678
- return interface
679
-
680
- if __name__ == "__main__":
681
- print("🚀 Launching Gradio Interface...")
682
- interface = create_interface()
683
- interface.launch(share=True, debug=False)
 
101
  print(f"❌ Error loading {self.model_name}: {e}")
102
  self.model = None
103
 
104
+ def reset_states(self):
105
+ if self.model:
106
+ self.model.reset_states()
107
+
108
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
109
  start_time = time.time()
110
 
 
112
  return VADResult(0.0, False, f"{self.model_name} (unavailable)", time.time() - start_time, timestamp)
113
 
114
  try:
115
+ if len(audio.shape) > 1: audio = audio.mean(axis=1)
116
 
117
+ # Silero expects a specific chunk size, which the main loop should provide.
118
+ # No padding or trimming here.
119
+ audio_tensor = torch.FloatTensor(audio).unsqueeze(0)
120
 
121
  with torch.no_grad():
122
  speech_prob = self.model(audio_tensor, self.sample_rate).item()
 
127
  return VADResult(speech_prob, is_speech, self.model_name, processing_time, timestamp)
128
 
129
  except Exception as e:
130
+ # This can happen if chunk size is wrong, which is now handled in main loop
131
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
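The rewritten predict() above drops the padding/trimming logic, so the caller is responsible for handing Silero exactly the chunk length it expects (512 samples at 16 kHz). A minimal sketch of that contract, assuming the stock snakers4/silero-vad torch.hub model (variable names are illustrative, not taken from app.py):

import numpy as np
import torch

# Load the standard Silero VAD model from torch.hub (assumes network access).
model, _ = torch.hub.load('snakers4/silero-vad', 'silero_vad')
SR = 16000
CHUNK = 512  # required chunk length at 16 kHz

audio = np.zeros(SR, dtype=np.float32)  # 1 s of silence as dummy input
model.reset_states()  # clear recurrent state before a new clip, mirroring reset_states() above

for start in range(0, len(audio) - CHUNK + 1, CHUNK):
    chunk = torch.from_numpy(audio[start:start + CHUNK]).unsqueeze(0)  # shape (1, 512)
    prob = model(chunk, SR).item()  # per-chunk speech probability in [0, 1]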
132
 
133
  class OptimizedWebRTCVAD:
134
  def __init__(self):
135
  self.model_name = "WebRTC-VAD"
136
  self.sample_rate = 16000
137
+ self.frame_duration = 10 # 10, 20, or 30 ms. 10ms for higher granularity.
138
  self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
139
 
140
  if WEBRTC_AVAILABLE:
141
  try:
142
+ self.vad = webrtcvad.Vad(3)
143
  print(f"✅ {self.model_name} loaded successfully")
144
+ except: self.vad = None
145
+ else: self.vad = None
 
 
146
 
147
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
148
  start_time = time.time()
149
 
150
  if self.vad is None or len(audio) == 0:
151
+ return VADResult(0.0, False, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
152
 
153
  try:
154
+ if len(audio.shape) > 1: audio = audio.mean(axis=1)
 
 
155
  audio_int16 = (audio * 32767).astype(np.int16)
156
 
157
+ speech_frames, total_frames = 0, 0
 
158
 
 
159
  for i in range(0, len(audio_int16) - self.frame_size + 1, self.frame_size):
160
  frame = audio_int16[i:i + self.frame_size].tobytes()
161
  if self.vad.is_speech(frame, self.sample_rate):
 
163
  total_frames += 1
164
 
165
  probability = speech_frames / max(total_frames, 1)
166
+ is_speech = probability > 0.5
167
 
168
  return VADResult(probability, is_speech, self.model_name, time.time() - start_time, timestamp)
169
 
170
  except Exception as e:
 
171
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
172
 
173
  class OptimizedEPANNs:
174
  def __init__(self):
175
  self.model_name = "E-PANNs"
176
+ self.sample_rate = 16000
177
  print(f"✅ {self.model_name} initialized")
178
 
179
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
180
  start_time = time.time()
181
+ if len(audio) == 0: return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
182
 
183
  try:
184
  if LIBROSA_AVAILABLE:
185
  mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=64)
186
  energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
 
 
187
  else:
188
  from scipy import signal
189
+ _, _, Sxx = signal.spectrogram(audio, self.sample_rate)
190
  energy = np.mean(10 * np.log10(Sxx + 1e-10))
191
+
192
+ speech_score = (energy + 100) / 50
193
  probability = np.clip(speech_score, 0, 1)
194
 
195
+ return VADResult(probability, probability > 0.6, self.model_name, time.time() - start_time, timestamp)
196
  except Exception as e:
 
197
  return VADResult(0.0, False, self.model_name, time.time() - start_time, timestamp)
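For reference, the heuristic above maps mean log-mel energy E (in dB) to probability = clip((E + 100) / 50, 0, 1): E = -100 dB maps to 0.0, E = -70 dB lands exactly on the 0.6 decision threshold, and E = -50 dB or louder saturates at 1.0.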
198
 
199
  class OptimizedPANNs:
 
210
  if PANNS_AVAILABLE:
211
  self.model = AudioTagging(checkpoint_path=None, device=self.device)
212
  print(f"✅ {self.model_name} loaded successfully")
213
+ else: self.model = None
 
 
214
  except Exception as e:
215
  print(f"❌ Error loading {self.model_name}: {e}")
216
  self.model = None
217
 
218
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
219
+ if self.cached_clip_prob is not None:
220
+ return VADResult(self.cached_clip_prob, self.cached_clip_prob > 0.5, self.model_name, 0.0, timestamp)
 
 
221
 
222
  start_time = time.time()
 
223
  if self.model is None or len(audio) == 0:
224
  return VADResult(0.0, False, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
225
 
226
  try:
227
+ # Use clipwise_output for probabilities, not embeddings.
228
+ clip_probs, _ = self.model.inference(audio[np.newaxis, :], input_sr=self.sample_rate)
229
+
230
+ # Filter all speech/voice-related labels for a robust average.
231
+ speech_idx = [i for i, lbl in enumerate(labels) if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
232
+ if not speech_idx: speech_idx = [labels.index('Speech')]
233
 
234
  speech_prob = clip_probs[0, speech_idx].mean().item()
235
  self.cached_clip_prob = float(speech_prob)
236
 
237
+ return VADResult(self.cached_clip_prob, self.cached_clip_prob > 0.5, self.model_name, time.time() - start_time, timestamp)
238
  except Exception as e:
 
239
  return VADResult(0.0, False, f"{self.model_name} (error)", time.time() - start_time, timestamp)
240
 
241
  class OptimizedAST:
 
251
  def load_model(self):
252
  try:
253
  if AST_AVAILABLE:
254
+ model_path = "MIT/ast-finetuned-audioset-10-10-0.4593"
255
+ self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_path)
256
+ self.model = ASTForAudioClassification.from_pretrained(model_path).to(self.device).eval()
 
 
257
  print(f"✅ {self.model_name} loaded successfully")
258
+ else: self.model = None
 
 
259
  except Exception as e:
260
  print(f"❌ Error loading {self.model_name}: {e}")
261
  self.model = None
262
 
263
  def predict(self, audio: np.ndarray, timestamp: float = 0.0) -> VADResult:
264
+ if self.cached_clip_prob is not None:
265
+ return VADResult(self.cached_clip_prob, self.cached_clip_prob > 0.5, self.model_name, 0.0, timestamp)
 
 
266
 
267
  start_time = time.time()
268
+ if self.model is None or len(audio) < self.sample_rate * 2: # AST needs at least ~2s
 
269
  return VADResult(0.0, False, f"{self.model_name} (fallback)", time.time() - start_time, timestamp)
270
 
271
  try:
272
+ inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt").to(self.device)
273
  with torch.no_grad():
274
+ probs = torch.sigmoid(self.model(**inputs).logits)
 
 
275
 
276
+ # Use the model's config to find all speech-related labels
277
  label2id = self.model.config.label2id
278
+ speech_idx = [idx for lbl, idx in label2id.items() if 'speech' in lbl.lower() or 'voice' in lbl.lower()]
279
+
280
  speech_prob = probs[0, speech_idx].mean().item()
281
  self.cached_clip_prob = float(speech_prob)
282
 
283
+ return VADResult(self.cached_clip_prob, self.cached_clip_prob > 0.5, self.model_name, time.time() - start_time, timestamp)
284
  except Exception as e:
 
285
  return VADResult(0.0, False, f"{self.model_name} (error)", time.time() - start_time, timestamp)
286
 
287
  # ===== AUDIO PROCESSOR =====
 
290
  def __init__(self, sample_rate=16000):
291
  self.sample_rate = sample_rate
292
 
293
+ # Consistent windowing for analysis and STFT
294
+ self.window_size = 0.064 # 64 ms
295
+ self.hop_size = 0.016 # 16 ms
296
+ self.n_fft = int(self.sample_rate * self.window_size) # 1024
297
+ self.hop_length = int(self.sample_rate * self.hop_size) # 256
298
+
299
  self.n_mels = 128
300
  self.fmin = 20
301
  self.fmax = 8000
302
 
303
  def process_audio(self, audio):
304
+ if audio is None: return np.array([])
 
 
305
  try:
306
+ sample_rate, audio_data = audio
307
+ if sample_rate != self.sample_rate and LIBROSA_AVAILABLE:
308
+ audio_data = librosa.resample(audio_data.astype(float), orig_sr=sample_rate, target_sr=self.sample_rate)
309
+ if len(audio_data.shape) > 1: audio_data = audio_data.mean(axis=1)
310
+ if np.max(np.abs(audio_data)) > 0: audio_data /= np.max(np.abs(audio_data))
311
  return audio_data
 
312
  except Exception as e:
 
313
  return np.array([])
314
 
315
  def compute_high_res_spectrogram(self, audio_data):
316
  try:
317
  if LIBROSA_AVAILABLE and len(audio_data) > 0:
318
+ stft = librosa.stft(audio_data, n_fft=self.n_fft, hop_length=self.hop_length, center=False)
319
+ mel_spec = librosa.feature.melspectrogram(S=np.abs(stft)**2, sr=self.sample_rate, n_fft=self.n_fft, hop_length=self.hop_length, n_mels=self.n_mels)
320
  mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
321
+ time_frames = librosa.times_like(mel_spec_db, sr=self.sample_rate, hop_length=self.hop_length, n_fft=self.n_fft)
 
 
322
  return mel_spec_db, time_frames
323
+ return np.array([[]]), np.array([])
324
  except Exception as e:
325
+ return np.array([[]]), np.array([])
326
 
327
  def detect_onset_offset_advanced(self, vad_results: List[VADResult], threshold: float = 0.5) -> List[OnsetOffset]:
328
  onsets_offsets = []
329
+ models = {res.model_name for res in vad_results}
330
 
331
+ for model_name in models:
332
+ results = sorted([r for r in vad_results if r.model_name == model_name], key=lambda x: x.timestamp)
333
+ if len(results) < 2: continue
334
 
335
  timestamps = np.array([r.timestamp for r in results])
336
  probabilities = np.array([r.probability for r in results])
337
 
338
+ # Smooth probabilities to prevent brief drops from creating false offsets
339
+ probs_smooth = np.convolve(probabilities, np.ones(3)/3, mode='same')
340
+
341
+ upper = threshold
342
+ lower = threshold * 0.5 # Hysteresis lower bound
343
+
344
+ in_speech = False
345
+ onset_time = -1
346
+ for i, prob in enumerate(probs_smooth):
347
+ if not in_speech and prob > upper:
348
+ in_speech = True
349
+ onset_time = timestamps[i]
350
+ elif in_speech and prob < lower:
351
+ in_speech = False
352
+ onsets_offsets.append(OnsetOffset(onset_time, timestamps[i], model_name, np.mean(probabilities[(timestamps >= onset_time) & (timestamps <= timestamps[i])])))
353
+ if in_speech:
354
+ onsets_offsets.append(OnsetOffset(onset_time, timestamps[-1], model_name, np.mean(probabilities[timestamps >= onset_time])))
355
 
356
  return onsets_offsets
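The commit message mentions 80 ms gaps, but the corresponding hunk is not visible in this extract. A hedged sketch of how onset/offset pairs separated by less than 80 ms could be merged on top of the list returned above (the helper name and merge policy are assumptions, not code from this commit):

MIN_GAP_S = 0.080  # 80 ms, per the commit message

def merge_close_events(events, min_gap=MIN_GAP_S):
    # Assumes events are OnsetOffset instances from a single model, sorted by onset_time.
    merged = []
    for ev in sorted(events, key=lambda e: e.onset_time):
        if merged and ev.onset_time - merged[-1].offset_time < min_gap:
            prev = merged.pop()
            # Bridge the short gap and keep the higher confidence of the pair.
            merged.append(OnsetOffset(prev.onset_time, ev.offset_time, ev.model_name,
                                      max(prev.confidence, ev.confidence)))
        else:
            merged.append(ev)
    return merged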
357
 
358
+ # ===== VISUALIZATION =====
359
 
360
  def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
361
  onsets_offsets: List[OnsetOffset], processor: AudioProcessor,
362
  model_a: str, model_b: str, threshold: float):
363
 
364
+ if not PLOTLY_AVAILABLE or len(audio_data) == 0: return go.Figure()
365
+
366
+ mel_spec_db, time_frames = processor.compute_high_res_spectrogram(audio_data)
367
+ if mel_spec_db.size == 0: return go.Figure()
368
 
369
+ fig = make_subplots(rows=2, cols=1, subplot_titles=(f"Model A: {model_a}", f"Model B: {model_b}"),
370
+ vertical_spacing=0.05, shared_xaxes=True, specs=[[{"secondary_y": True}], [{"secondary_y": True}]])
371
+
372
+ heatmap_args = dict(z=mel_spec_db, x=time_frames, y=np.linspace(processor.fmin, processor.fmax, processor.n_mels),
373
+ colorscale='Viridis', showscale=False)
374
+ fig.add_trace(go.Heatmap(**heatmap_args, name=f'Spectrogram {model_a}'), row=1, col=1)
375
+ fig.add_trace(go.Heatmap(**heatmap_args, name=f'Spectrogram {model_b}'), row=2, col=1)
376
+
377
+ data_a = [r for r in vad_results if r.model_name.startswith(model_a)]
378
+ data_b = [r for r in vad_results if r.model_name.startswith(model_b)]
379
+
380
+ if data_a: fig.add_trace(go.Scatter(x=[r.timestamp for r in data_a], y=[r.probability for r in data_a], mode='lines', line=dict(color='yellow', width=3), name=f'{model_a} Prob.'), row=1, col=1, secondary_y=True)
381
+ if data_b: fig.add_trace(go.Scatter(x=[r.timestamp for r in data_b], y=[r.probability for r in data_b], mode='lines', line=dict(color='orange', width=3), name=f'{model_b} Prob.'), row=2, col=1, secondary_y=True)
382
+
383
+ # Draw threshold line on the secondary y-axis
384
+ fig.add_hline(y=threshold, line=dict(color='cyan', width=2, dash='dash'), row=1, col=1, secondary_y=True)
385
+ fig.add_hline(y=threshold, line=dict(color='cyan', width=2, dash='dash'), row=2, col=1, secondary_y=True)
386
+
387
+ events_a = [e for e in onsets_offsets if e.model_name.startswith(model_a)]
388
+ events_b = [e for e in onsets_offsets if e.model_name.startswith(model_b)]
389
+
390
+ for event in events_a:
391
+ fig.add_vline(x=event.onset_time, line=dict(color='lime', width=3), row=1, col=1)
392
+ fig.add_vline(x=event.offset_time, line=dict(color='red', width=3), row=1, col=1)
393
+ for event in events_b:
394
+ fig.add_vline(x=event.offset_time, line=dict(color='red', width=3), row=2, col=1)
395
+ fig.add_vline(x=event.onset_time, line=dict(color='lime', width=3), row=2, col=1)
396
+
397
+ fig.update_layout(height=600, title_text="Real-Time Speech Visualizer", plot_bgcolor='black', paper_bgcolor='white', font_color='black')
398
+ fig.update_yaxes(title_text="Frequency (Hz)", range=[processor.fmin, processor.fmax], secondary_y=False)
399
+ fig.update_yaxes(title_text="Probability", range=[0, 1], secondary_y=True) # Apply to all secondary axes
400
+ fig.update_xaxes(title_text="Time (seconds)", row=2, col=1)
401
+
402
+ return fig
403
 
404
  # ===== MAIN APPLICATION =====
405
 
406
  class VADDemo:
407
  def __init__(self):
 
408
  self.processor = AudioProcessor()
409
  self.models = {
410
+ 'Silero-VAD': OptimizedSileroVAD(), 'WebRTC-VAD': OptimizedWebRTCVAD(),
411
+ 'E-PANNs': OptimizedEPANNs(), 'PANNs': OptimizedPANNs(), 'AST': OptimizedAST()
412
  }
413
+ print("🎤 VAD Demo initialized with all modules.")
414
 
415
  def process_audio_with_events(self, audio, model_a, model_b, threshold):
416
+ if audio is None: return None, "🔇 No audio detected", "Ready..."
417
 
418
+ try:
419
  processed_audio = self.processor.process_audio(audio)
420
+ if len(processed_audio) == 0: return None, "Audio empty", "No data"
 
421
 
422
+ # Reset caches and states for new clip
423
+ for model in self.models.values():
424
+ if hasattr(model, 'cached_clip_prob'): model.cached_clip_prob = None
425
+ if hasattr(model, 'reset_states'): model.reset_states()
426
+
427
+ # Pre-compute for heavy models once
428
+ if 'PANNs' in self.models:
429
+ audio_32k = librosa.resample(processed_audio, orig_sr=self.processor.sample_rate, target_sr=32000)
430
+ self.models['PANNs'].predict(audio_32k, 0.0)
431
+ if 'AST' in self.models:
432
+ self.models['AST'].predict(processed_audio, 0.0)
 
 
433
 
434
+ # Main analysis loop with consistent windowing
 
 
435
  vad_results = []
436
+ window = int(self.processor.sample_rate * self.processor.window_size) # 1024
437
+ hop = int(self.processor.sample_rate * self.processor.hop_size) # 256
438
+ silero_chunk_size = 512 # Silero specific requirement
439
 
440
+ for i in range(0, len(processed_audio) - window + 1, hop):
441
  timestamp = i / self.processor.sample_rate
442
+ chunk_1024 = processed_audio[i : i + window]
443
 
444
+ # Prepare chunk for Silero (last 512 samples of the current window)
445
+ chunk_512 = chunk_1024[-silero_chunk_size:]
446
+
447
+ for model_name in list(set([model_a, model_b])):
448
+ model = self.models[model_name]
449
+ # Feed correct chunk to each model type
450
+ if model_name == 'Silero-VAD':
451
+ current_chunk = chunk_512
452
+ else:
453
+ current_chunk = chunk_1024 # For WebRTC, E-PANNs, and cached models
454
+
455
+ result = model.predict(current_chunk, timestamp)
456
  result.is_speech = result.probability > threshold
457
  vad_results.append(result)
458
 
459
  onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
460
+ fig = create_realtime_plot(processed_audio, vad_results, onsets_offsets, self.processor, model_a, model_b, threshold)
461
 
462
+ status_msg = f"🎙️ Speech detected" if any(e.offset_time > e.onset_time for e in onsets_offsets) else "🔇 No speech detected"
463
+ details_text = f"Analyzed {len(processed_audio)/self.processor.sample_rate:.2f}s. Found {len(onsets_offsets)} speech events."
464
 
465
  return fig, status_msg, details_text
 
466
  except Exception as e:
 
467
  import traceback
468
+ traceback.print_exc()
469
  return None, f"❌ Error: {e}", traceback.format_exc()
470
 
471
+ # Initialize and create interface
472
  demo_app = VADDemo()
473
+ interface = create_interface() # Using the original full interface
474
+ interface.launch(share=True, debug=False)
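The commit message also refers to per-model delays; that hunk is likewise not shown above. A minimal sketch of how fixed per-model latency offsets could be subtracted from detected event times before plotting (the delay values and helper below are illustrative assumptions, not measurements from this commit):

# Hypothetical per-model latency estimates in seconds (assumed values).
MODEL_DELAYS = {
    'Silero-VAD': 0.032,
    'WebRTC-VAD': 0.010,
    'E-PANNs': 0.048,
    'PANNs': 0.0,
    'AST': 0.0,
}

def compensate_delays(events, delays=MODEL_DELAYS):
    # Shift each event earlier by its model's estimated delay, clamping at t = 0.
    compensated = []
    for ev in events:
        d = delays.get(ev.model_name, 0.0)
        compensated.append(OnsetOffset(max(0.0, ev.onset_time - d),
                                       max(0.0, ev.offset_time - d),
                                       ev.model_name, ev.confidence))
    return compensated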