Gabriel Bibbó committed
Commit · dac6057
1 Parent(s): e78c137
adjust app.py
app.py CHANGED
@@ -522,7 +522,12 @@ class OptimizedAST:
                 self.model = self.model.half()
                 print(f"✅ {self.model_name} loaded with FP16 optimization")
             else:
-
+                # Apply quantization for CPU acceleration
+                import torch.nn as nn
+                self.model = torch.quantization.quantize_dynamic(
+                    self.model, {nn.Linear}, dtype=torch.qint8
+                )
+                print(f"✅ {self.model_name} loaded with CPU quantization")
 
             self.model.eval()
         else:
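For context, dynamic quantization replaces the weights of the selected module types (here `nn.Linear`) with int8 tensors and quantizes activations on the fly, so it needs no calibration pass. A minimal sketch of the same call, assuming only a stock PyTorch install; `TinyNet` is a hypothetical stand-in for the AST model:

```python
import torch
import torch.nn as nn

class TinyNet(nn.Module):
    """Hypothetical stand-in for a model with Linear layers."""
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(128, 2)

    def forward(self, x):
        return self.fc(x)

model = TinyNet().eval()
# Swap nn.Linear weights to int8; activations are quantized dynamically at runtime
qmodel = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
print(qmodel(torch.randn(1, 128)).shape)  # torch.Size([1, 2])
```

Because no calibration data is required, this style of quantization suits a load-time CPU fallback like the branch above.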
@@ -665,16 +670,16 @@ class AudioProcessor:
             "WebRTC-VAD": 0.03,   # 30ms frames (480 samples)
             "E-PANNs": 1.0,       # CHANGED from 6.0 to 1.0 for better temporal resolution
             "PANNs": 1.0,         # CHANGED from 10.0 to 1.0 for better temporal resolution
-            "AST":
+            "AST": 0.96           # OPTIMIZED: Natural window size for AST
         }
 
-        # Model-specific hop sizes for efficiency -
+        # Model-specific hop sizes for efficiency - OPTIMIZED for performance
        self.model_hop_sizes = {
             "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
             "WebRTC-VAD": 0.03,   # 30ms hop for WebRTC (match frame duration)
             "E-PANNs": 0.05,      # CHANGED from 0.1 to 0.05 for 20Hz
             "PANNs": 0.05,        # CHANGED from 0.1 to 0.05 for 20Hz
-            "AST": 0.
+            "AST": 0.24           # OPTIMIZED: Reduced frequency (4.17 Hz) for performance
         }
 
         # Model-specific thresholds for better detection
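As a sanity check on the rates quoted in the comments: the prediction rate is the reciprocal of the hop size, so 0.05 s gives 20 Hz and 0.24 s gives 1/0.24 ≈ 4.17 Hz. A small sketch, assuming a 16 kHz sample rate (the actual `sample_rate` attribute is not shown in this diff):

```python
SR = 16000  # assumed sample rate

hops = {"Silero-VAD": 0.016, "WebRTC-VAD": 0.03,
        "E-PANNs": 0.05, "PANNs": 0.05, "AST": 0.24}

for name, hop_s in hops.items():
    rate_hz = 1.0 / hop_s          # predictions per second
    hop_samples = int(SR * hop_s)  # hop expressed in samples
    print(f"{name:11s}: {rate_hz:6.2f} Hz, hop = {hop_samples} samples")
# E-PANNs/PANNs -> 20.00 Hz; AST -> 4.17 Hz, matching the comments above
```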
@@ -724,7 +729,7 @@ class AudioProcessor:
             hop_length=self.hop_length,
             win_length=self.n_fft,
             window='hann',
-            center=True
+            center=False  # CHANGED: False for real-time without padding
         )
 
         power_spec = np.abs(stft) ** 2
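With `center=False`, librosa stops zero-padding around each frame, so frame k covers exactly samples [k·hop, k·hop + n_fft) and never looks ahead of the signal, which is what a real-time path needs. A minimal sketch, assuming librosa is available; the sizes here are illustrative, not the app's actual `self.n_fft` / `self.hop_length`:

```python
import numpy as np
import librosa

y = np.random.randn(16000).astype(np.float32)  # 1 s of noise at 16 kHz
n_fft, hop = 1024, 512

stft = librosa.stft(y, n_fft=n_fft, hop_length=hop, win_length=n_fft,
                    window='hann', center=False)
power_spec = np.abs(stft) ** 2

# Without padding there are 1 + (len(y) - n_fft) // hop complete frames,
# and frame k starts at sample k * hop (no half-window shift to undo).
assert power_spec.shape[1] == 1 + (len(y) - n_fft) // hop
frame_starts = np.arange(power_spec.shape[1]) * hop / 16000.0  # seconds
```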
@@ -781,80 +786,67 @@ class AudioProcessor:
         return dummy_spec, dummy_time
 
     def detect_onset_offset_advanced(self, vad_results: List[VADResult],
-
+                                     threshold: float,
+                                     apply_delay: float = 0.0,
+                                     min_duration: float = 0.12) -> List[OnsetOffset]:
         """
-
+        Exact global-threshold crossings, with delay compensation and a minimum-duration filter.
         Onset: p[i-1] < thr and p[i] >= thr
         Offset: p[i-1] >= thr and p[i] < thr
         The crossing instant is obtained by linear interpolation between (t[i-1], p[i-1]) and (t[i], p[i]).
         """
-        onsets_offsets
+        onsets_offsets = []
         if len(vad_results) < 2:
             return onsets_offsets
 
-        # group by model
-        grouped
+        # group by model
+        grouped = {}
         for r in vad_results:
             base = r.model_name.split('(')[0].strip()
-
+            # apply the delay when storing
+            grouped.setdefault(base, []).append(
+                VADResult(r.probability, r.is_speech, base, r.processing_time, r.timestamp - apply_delay)
+            )
 
         for base, rs in grouped.items():
             rs.sort(key=lambda r: r.timestamp)
             t = np.array([r.timestamp for r in rs], dtype=float)
             p = np.array([r.probability for r in rs], dtype=float)
-            thr = float(
+            thr = float(threshold)
 
             in_seg = False
             onset_t = None
 
-            # if we start above the threshold
             if p[0] > thr:
                 in_seg = True
                 onset_t = t[0]
 
+            def xcross(t0, p0, t1, p1, thr):
+                if p1 == p0: return t1
+                alpha = (thr - p0) / (p1 - p0)
+                return t0 + alpha * (t1 - t0)
+
             for i in range(1, len(p)):
                 p0, p1 = p[i-1], p[i]
                 t0, t1 = t[i-1], t[i]
-
-                # ONSET: p0 < thr and p1 >= thr
                 if (not in_seg) and (p0 < thr) and (p1 >= thr):
-                    if p1 == p0:
-                        cross = t1
-                    else:
-                        alpha = (thr - p0) / (p1 - p0)
-                        cross = t0 + alpha * (t1 - t0)
-                    onset_t = cross
+                    onset_t = xcross(t0, p0, t1, p1, thr)
                     in_seg = True
-
-                # OFFSET: p0 >= thr and p1 < thr
                 elif in_seg and (p0 >= thr) and (p1 < thr):
-                    if p1 == p0:
-                        cross = t1
-                    else:
-                        alpha = (thr - p0) / (p1 - p0)
-                        cross = t0 + alpha * (t1 - t0)
-                    # confidence as the mean of raw probs within the segment
-                    mask = (t >= onset_t) & (t <= cross)
-                    conf = float(p[mask].mean()) if np.any(mask) else float(max(p0, p1))
-                    onsets_offsets.append(OnsetOffset(
-                        onset_time=max(0.0, float(onset_t)),
-                        offset_time=float(cross),
-                        model_name=base,
-                        confidence=conf
-                    ))
+                    off = xcross(t0, p0, t1, p1, thr)
+                    if off - onset_t >= min_duration:  # debounce
+                        mask = (t >= onset_t) & (t <= off)
+                        conf = float(p[mask].mean()) if np.any(mask) else float(max(p0, p1))
+                        onsets_offsets.append(OnsetOffset(max(0.0, float(onset_t)), float(off), base, conf))
                     in_seg = False
                     onset_t = None
 
-            # if we end above the threshold, close at the last timestamp
             if in_seg and onset_t is not None:
-
-
-
-
-
-                    model_name=base,
-                    confidence=conf
-                ))
+                off = float(t[-1])
+                if off - onset_t >= min_duration:
+                    mask = (t >= onset_t)
+                    conf = float(p[mask].mean()) if np.any(mask) else float(p[-1])
+                    onsets_offsets.append(OnsetOffset(max(0.0, float(onset_t)), off, base, conf))
 
         return onsets_offsets
 
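The helper `xcross` solves p(t) = thr by linear interpolation between two consecutive samples. A toy run of the same arithmetic under assumed values (the 0.24 s spacing mirrors the AST hop above):

```python
import numpy as np

def xcross(t0, p0, t1, p1, thr):
    # Linear interpolation of the exact threshold-crossing instant
    if p1 == p0:
        return t1
    return t0 + (thr - p0) / (p1 - p0) * (t1 - t0)

t = np.arange(6) * 0.24                       # toy timestamps
p = np.array([0.1, 0.2, 0.8, 0.9, 0.3, 0.1])  # toy probabilities
thr = 0.5

onset = xcross(t[1], p[1], t[2], p[2], thr)   # 0.24 + 0.5 * 0.24 = 0.36
offset = xcross(t[3], p[3], t[4], p[4], thr)  # 0.72 + (2/3) * 0.24 = 0.88
print(onset, offset, offset - onset >= 0.12)  # 0.36 0.88 True
```

The `min_duration` check then discards segments shorter than 0.12 s, suppressing single-frame flickers from the faster models.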
@@ -949,9 +941,9 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         row=2, col=1
     )
 
-    # Use
-    thr_a =
-    thr_b =
+    # Use global threshold for both models
+    thr_a = threshold
+    thr_b = threshold
 
     if len(time_frames) > 0:
         # Add threshold lines using model-specific thresholds
@@ -972,10 +964,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
             yref="y4"  # Reference to secondary y-axis of second subplot
         )
 
-        # Add threshold annotations with
+        # Add threshold annotations with global threshold
         fig.add_annotation(
             x=time_frames[-1] * 0.95, y=thr_a,
-            text=f'Threshold: {
+            text=f'Threshold: {threshold:.2f}',
             showarrow=False,
             font=dict(color='cyan', size=10),
             row=1, col=1,
@@ -983,7 +975,7 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         )
         fig.add_annotation(
             x=time_frames[-1] * 0.95, y=thr_b,
-            text=f'Threshold: {
+            text=f'Threshold: {threshold:.2f}',
             showarrow=False,
             font=dict(color='cyan', size=10),
             row=2, col=1,
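The `row`/`col` arguments route each line and label to its subplot. A standalone sketch of the same pattern, assuming Plotly is installed; the traces and coordinates are illustrative:

```python
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(rows=2, cols=1)
fig.add_trace(go.Scatter(y=[0.2, 0.7, 0.4], name="model A"), row=1, col=1)
fig.add_trace(go.Scatter(y=[0.1, 0.6, 0.8], name="model B"), row=2, col=1)

threshold = 0.5
for row in (1, 2):
    # Dashed threshold line plus a label pinned near the right edge
    fig.add_hline(y=threshold, line_dash="dash", line_color="cyan", row=row, col=1)
    fig.add_annotation(x=1.9, y=threshold, text=f"Threshold: {threshold:.2f}",
                       showarrow=False, font=dict(color="cyan", size=10),
                       row=row, col=1)
fig.show()
```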
@@ -1191,7 +1183,7 @@ class VADDemo:
         if model_name in self.models:
             window_size = self.processor.model_windows[model_name]
             hop_size = self.processor.model_hop_sizes[model_name]
-            model_threshold =
+            model_threshold = threshold  # CORRECTED: Use global threshold from slider
 
             window_samples = int(self.processor.sample_rate * window_size)
             hop_samples = int(self.processor.sample_rate * hop_size)
@@ -1251,8 +1243,10 @@ class VADDemo:
 
         delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
 
-        #
-        onsets_offsets = self.processor.detect_onset_offset_advanced(
+        # CORRECTED: Use global threshold with delay compensation and min duration
+        onsets_offsets = self.processor.detect_onset_offset_advanced(
+            vad_results, threshold, apply_delay=delay_compensation, min_duration=0.12
+        )
 
         debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
@@ -1282,18 +1276,15 @@ class VADDemo:
                 if result.is_speech:
                     summary['speech_chunks'] += 1
 
-        # Show
-        thr_a = self.processor.model_thresholds.get(model_a, threshold)
-        thr_b = self.processor.model_thresholds.get(model_b, threshold)
-        details_lines = [f"**Analysis Results** (Thresholds → {model_a}:{thr_a:.2f} | {model_b}:{thr_b:.2f})"]
+        # Show global threshold in analysis results
+        details_lines = [f"**Analysis Results** (Global Threshold: {threshold:.2f})"]
 
         for model_name, summary in model_summaries.items():
             avg_prob = np.mean(summary['probs']) if summary['probs'] else 0
             speech_ratio = (summary['speech_chunks'] / summary['total_chunks']) if summary['total_chunks'] > 0 else 0
-            model_thresh = self.processor.model_thresholds.get(model_name, threshold)
 
             status_icon = "🟢" if speech_ratio > 0.5 else "🟡" if speech_ratio > 0.2 else "🔴"
-            details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech
+            details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech")
 
         if onsets_offsets:
             details_lines.append(f"\n**Speech Events**: {len(onsets_offsets)} detected")
@@ -1383,7 +1374,7 @@ def create_interface():
                 maximum=1.0,
                 value=0.5,
                 step=0.01,
-                label="
+                label="Detection Threshold (Global)"
             )
 
             process_btn = gr.Button("🎤 Analyze", variant="primary", size="lg")
@@ -1421,7 +1412,7 @@ def create_interface():
     ---
     **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
 
-    **Note**:
+    **Note**: Optimized for real-time performance with global threshold control and exact temporal alignment.
     """)
 
     return interface