Gabriel Bibbó committed
Commit ec04aee · Parent(s): 3891a49

Simplified interface with AST optimization

app.py CHANGED
@@ -333,6 +333,8 @@ class OptimizedAST:
        self.model = None
        self.feature_extractor = None
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Cache for features to avoid recomputing
+        self.feature_cache = {}
        self.load_model()

    def load_model(self):
@@ -356,12 +358,12 @@ class OptimizedAST:

        if self.model is None or len(audio) == 0:
            if len(audio) > 0:
+                # Fast fallback using energy and spectral features
+                energy = np.sum(audio ** 2)
                if LIBROSA_AVAILABLE:
                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-                    energy = np.sum(audio ** 2)
                    probability = min((energy * spectral_centroid) / 10000, 1.0)
                else:
-                    energy = np.sum(audio ** 2)
                    probability = min(energy / 0.01, 1.0)
                is_speech = probability > 0.5
            else:
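A minimal, self-contained sketch of the fallback path in the hunk above (the standalone name `fallback_speech_prob` and the 16 kHz default rate are assumptions; the app computes this inline inside `OptimizedAST`): the energy is computed once and, when librosa is available, scaled by the spectral centroid, otherwise normalized on its own.

# Sketch only, not the app's code: mirrors the energy/spectral-centroid fallback above.
import numpy as np

try:
    import librosa
    LIBROSA_AVAILABLE = True
except ImportError:
    LIBROSA_AVAILABLE = False

def fallback_speech_prob(audio: np.ndarray, sample_rate: int = 16000) -> float:
    """Cheap speech-probability estimate used when the model is unavailable."""
    if len(audio) == 0:
        return 0.0
    energy = np.sum(audio ** 2)
    if LIBROSA_AVAILABLE:
        centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sample_rate))
        return float(min((energy * centroid) / 10000, 1.0))
    return float(min(energy / 0.01, 1.0))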
@@ -373,40 +375,63 @@ class OptimizedAST:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

-            logits = outputs.logits
-            probs = torch.sigmoid(logits)
+            # OPTIMIZATION: Use smaller chunks for faster processing
+            # AST can work with shorter sequences than the full required length
+            max_length = self.sample_rate * 2  # Max 2 seconds to keep it fast
+            if len(audio) > max_length:
+                # Take the middle part of the audio for better representation
+                start_idx = (len(audio) - max_length) // 2
+                audio = audio[start_idx:start_idx + max_length]
+            elif len(audio) < self.sample_rate // 2:  # If less than 0.5 seconds
+                # Pad to minimum length
+                audio = np.pad(audio, (0, self.sample_rate // 2 - len(audio)), 'constant')

-            for lbl, idx in label2id.items():
-                if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
-                    speech_indices.append(idx)
+            # Create a hash for caching (to avoid recomputing same features)
+            audio_hash = hash(audio.tobytes())

-            if
-                speech_prob =
+            if audio_hash in self.feature_cache:
+                speech_prob = self.feature_cache[audio_hash]
            else:
+                # Feature extraction with reduced parameters for speed
+                inputs = self.feature_extractor(
+                    audio,
+                    sampling_rate=self.sample_rate,
+                    return_tensors="pt",
+                    max_length=512,  # Reduced from default for speed
+                    truncation=True
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                logits = outputs.logits
+                probs = torch.sigmoid(logits)
+
+                label2id = self.model.config.label2id
+                speech_indices = []
+                for lbl, idx in label2id.items():
+                    if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
+                        speech_indices.append(idx)
+
+                if speech_indices:
+                    speech_prob = probs[0, speech_indices].mean().item()
+                else:
+                    # Fallback: use average of first few probabilities
+                    speech_prob = probs[0, :10].mean().item()
+
+                # Cache the result if audio is not too long (to prevent memory issues)
+                if len(self.feature_cache) < 50:  # Limit cache size
+                    self.feature_cache[audio_hash] = speech_prob

            return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)

        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
+            # Fast fallback
            if len(audio) > 0:
                energy = np.sum(audio ** 2)
-                is_speech = energy > threshold
+                probability = min(energy / 0.01, 1.0)
+                is_speech = energy > 0.01
            else:
                probability = 0.0
                is_speech = False
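To make the optimization in this hunk easier to follow in isolation, here is a hedged sketch of the same center-crop/pad plus hash-keyed caching strategy. The names (`crop_or_pad`, `cached_speech_prob`, `score_fn`, `SAMPLE_RATE`) are illustrative stand-ins and the dummy scorer replaces the actual AST forward pass; only the constants (2 s maximum, 0.5 s minimum, 50-entry cache) mirror the diff.

# Illustrative sketch under the assumptions stated above, not the app's code.
import numpy as np

SAMPLE_RATE = 16000                  # assumed AST input rate
MAX_SECONDS, MIN_SECONDS = 2.0, 0.5  # crop/pad bounds from the diff
CACHE_LIMIT = 50                     # cache size bound from the diff

_cache: dict[int, float] = {}

def crop_or_pad(audio: np.ndarray, sr: int = SAMPLE_RATE) -> np.ndarray:
    """Keep the centre 2 s of long clips, zero-pad clips shorter than 0.5 s."""
    max_len, min_len = int(sr * MAX_SECONDS), int(sr * MIN_SECONDS)
    if len(audio) > max_len:
        start = (len(audio) - max_len) // 2
        return audio[start:start + max_len]
    if len(audio) < min_len:
        return np.pad(audio, (0, min_len - len(audio)))
    return audio

def cached_speech_prob(audio: np.ndarray, score_fn) -> float:
    """Return score_fn(cropped audio), memoised on the clip's raw bytes."""
    audio = crop_or_pad(audio)
    key = hash(audio.tobytes())
    if key in _cache:
        return _cache[key]
    prob = float(score_fn(audio))
    if len(_cache) < CACHE_LIMIT:    # bound memory like the diff does
        _cache[key] = prob
    return prob

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    clip = rng.standard_normal(5 * SAMPLE_RATE).astype(np.float32)
    dummy = lambda a: min(float(np.sum(a ** 2)) / len(a), 1.0)  # energy stand-in for AST
    print(cached_speech_prob(clip, dummy))  # computed
    print(cached_speech_prob(clip, dummy))  # served from the cache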
@@ -628,7 +653,7 @@ class AudioProcessor:
            print(f"Delay estimation error: {e}")
            return 0.0

-# ===== ENHANCED VISUALIZATION
+# ===== ENHANCED VISUALIZATION =====

def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
                         onsets_offsets: List[OnsetOffset], processor: AudioProcessor,
@@ -811,28 +836,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
            secondary_y=True
        )

-        if hasattr(processor, 'delay_compensation') and processor.delay_compensation != 0:
-            fig.add_annotation(
-                text=f"Delay Compensation: {processor.delay_compensation*1000:.1f}ms",
-                xref="paper", yref="paper",
-                x=0.02, y=0.98,
-                showarrow=False,
-                bgcolor="yellow",
-                bordercolor="black",
-                borderwidth=1
-            )
-
-        resolution_text = f"Resolution: {processor.n_fft}-point FFT, {processor.hop_length}-sample hop"
-        fig.add_annotation(
-            text=resolution_text,
-            xref="paper", yref="paper",
-            x=0.02, y=0.02,
-            showarrow=False,
-            bgcolor="lightblue",
-            bordercolor="black",
-            borderwidth=1
-        )
-
        return fig

    except Exception as e:
@@ -900,80 +903,37 @@ class VADDemo:
            speech_detected = any(result.is_speech for result in vad_results)
            total_speech_time = sum(1 for r in vad_results if r.is_speech) * self.processor.hop_size

-            delay_info = f" | Delay: {delay_compensation*1000:.1f}ms" if delay_compensation != 0 else ""
-
            if speech_detected:
-                status_msg = f"🎙️ SPEECH DETECTED - {total_speech_time:.1f}s total
+                status_msg = f"🎙️ SPEECH DETECTED - {total_speech_time:.1f}s total"
            else:
-                status_msg = f"🔇 No speech detected
-
-            details_lines = [
-                f"📊 **Advanced VAD Analysis** (Threshold: {threshold:.2f})",
-                f"📏 **Audio Duration**: {len(processed_audio)/self.processor.sample_rate:.2f} seconds",
-                f"🎯 **Processing Windows**: {len(vad_results)} ({self.processor.window_size*1000:.0f}ms each)",
-                f"⏱️ **Time Resolution**: {self.processor.hop_size*1000:.0f}ms hop size (ultra-smooth)",
-                f"🔧 **Delay Compensation**: {delay_compensation*1000:.1f}ms",
-                ""
-            ]
+                status_msg = f"🔇 No speech detected"

+            # Simplified details
            model_summaries = {}
            for result in vad_results:
                name = result.model_name.split(' ')[0]
                if name not in model_summaries:
-                    model_summaries[name] = {
-                        'probs': [], 'speech_chunks': 0, 'total_chunks': 0,
-                        'avg_time': 0, 'max_prob': 0, 'min_prob': 1, 'full_name': result.model_name
-                    }
+                    model_summaries[name] = {'probs': [], 'speech_chunks': 0, 'total_chunks': 0}
                summary = model_summaries[name]
                summary['probs'].append(result.probability)
                summary['total_chunks'] += 1
-                summary['avg_time'] += result.processing_time
-                summary['max_prob'] = max(summary['max_prob'], result.probability)
-                summary['min_prob'] = min(summary['min_prob'], result.probability)
                if result.is_speech:
                    summary['speech_chunks'] += 1

+            details_lines = [f"**Analysis Results** (Threshold: {threshold:.2f})"]
+
            for model_name, summary in model_summaries.items():
                avg_prob = np.mean(summary['probs']) if summary['probs'] else 0
-                std_prob = np.std(summary['probs']) if summary['probs'] else 0
                speech_ratio = (summary['speech_chunks'] / summary['total_chunks']) if summary['total_chunks'] > 0 else 0
-                avg_time = (summary['avg_time'] / summary['total_chunks']) * 1000 if summary['total_chunks'] > 0 else 0

                status_icon = "🟢" if speech_ratio > 0.5 else "🟡" if speech_ratio > 0.2 else "🔴"
-                details_lines.
-                    f"{status_icon} **{summary['full_name']}**:",
-                    f" • Probability: {avg_prob:.3f} (±{std_prob:.3f}) [{summary['min_prob']:.3f}-{summary['max_prob']:.3f}]",
-                    f" • Speech Detection: {speech_ratio*100:.1f}% ({summary['speech_chunks']}/{summary['total_chunks']} windows)",
-                    f" • Processing Speed: {avg_time:.1f}ms/window (RTF: {avg_time/32:.3f})",
-                    ""
-                ])
+                details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech")

            if onsets_offsets:
-                details_lines.append("
-                    duration = event.offset_time - event.onset_time
-                    total_speech_duration += duration
-                    details_lines.append(
-                        f" • {event.model_name}: {event.onset_time:.2f}s → {event.offset_time:.2f}s "
-                        f"({duration:.2f}s, conf: {event.confidence:.3f})"
-                    )
-                else:
-                    details_lines.append(
-                        f" • {event.model_name}: {event.onset_time:.2f}s → ongoing (conf: {event.confidence:.3f})"
-                    )
-
-                if len(onsets_offsets) > 10:
-                    details_lines.append(f" • ... and {len(onsets_offsets) - 10} more events")
-
-                speech_percentage = (total_speech_duration / (len(processed_audio)/self.processor.sample_rate)) * 100
-                details_lines.extend([
-                    "",
-                    f"📈 **Summary**: {total_speech_duration:.2f}s speech ({speech_percentage:.1f}% of audio)"
-                ])
-            else:
-                details_lines.append("🎯 **Speech Events**: No clear onset/offset boundaries detected")
+                details_lines.append(f"\n**Speech Events**: {len(onsets_offsets)} detected")
+                for i, event in enumerate(onsets_offsets[:5]):  # Show first 5 only
+                    duration = event.offset_time - event.onset_time if event.offset_time > event.onset_time else 0
+                    details_lines.append(f"• {event.model_name}: {event.onset_time:.2f}s - {event.offset_time:.2f}s ({duration:.2f}s)")

            details_text = "\n".join(details_lines)
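For reference, a small stand-alone sketch of the simplified per-model summary this hunk builds for the details textbox. A reduced `VADResult` stand-in with only three fields is assumed; the app's dataclass also carries processing time and timestamp, and its threshold handling lives elsewhere.

# Sketch only: per-model aggregation equivalent to the simplified details above.
from dataclasses import dataclass
import numpy as np

@dataclass
class VADResult:          # stand-in for the app's richer dataclass
    probability: float
    is_speech: bool
    model_name: str

def summarize(results: list[VADResult], threshold: float = 0.5) -> str:
    summaries: dict[str, dict] = {}
    for r in results:
        name = r.model_name.split(' ')[0]
        s = summaries.setdefault(name, {'probs': [], 'speech': 0, 'total': 0})
        s['probs'].append(r.probability)
        s['total'] += 1
        s['speech'] += int(r.is_speech)

    lines = [f"**Analysis Results** (Threshold: {threshold:.2f})"]
    for name, s in summaries.items():
        avg = float(np.mean(s['probs'])) if s['probs'] else 0.0
        ratio = s['speech'] / s['total'] if s['total'] else 0.0
        icon = "🟢" if ratio > 0.5 else "🟡" if ratio > 0.2 else "🔴"
        lines.append(f"{icon} **{name}**: {avg:.3f} avg prob, {ratio*100:.1f}% speech")
    return "\n".join(lines)

if __name__ == "__main__":
    demo = [VADResult(0.8, True, "Silero-VAD"),
            VADResult(0.3, False, "Silero-VAD"),
            VADResult(0.9, True, "AST")]
    print(summarize(demo))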
@@ -991,39 +951,39 @@ demo_app = VADDemo()

# ===== GRADIO INTERFACE =====

-print("🚀 Launching Real-time VAD Demo...")
-
def create_interface():
-    with gr.Blocks(title="VAD Demo -
+    with gr.Blocks(title="VAD Demo - Voice Activity Detection", theme=gr.themes.Soft()) as interface:

+        # Header with logos
        gr.Markdown("""
-        ✨ **Ultra-High Resolution Features**:
-        - 🟢 **Green markers**: Speech onset detection with delay compensation
-        - 🔴 **Red markers**: Speech offset detection
-        - 📊 **Ultra-HD spectrograms**: 2048-point FFT, 256-sample hop (8x temporal resolution)
-        - 💫 **Separated probability curves**: Model A (yellow) in top panel, Model B (orange) in bottom
-        - 🔧 **Auto delay correction**: Cross-correlation-based compensation
-        - 📈 **Threshold visualization**: Cyan threshold line on both panels
-        - 🎨 **Matched color palettes**: Same Viridis colorscale for both spectrograms
-
-        | Model | Type | Description |
-        |-------|------|-------------|
-        | **Silero-VAD** | Neural Network | Production-ready VAD (1.8M params) |
-        | **WebRTC-VAD** | Signal Processing | Google's real-time VAD |
-        | **E-PANNs** | Deep Learning | Efficient audio analysis |
-        | **PANNs** | Deep CNN | Large-scale pretrained audio networks |
-        | **AST** | Transformer | Audio Spectrogram Transformer |
-
-        **Instructions:** Record audio → Select models → Adjust threshold → Analyze!
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🎤 VAD Demo - Voice Activity Detection</h1>
+            <p><strong>Multi-Model Real-time Speech Detection Framework</strong></p>
+        </div>
        """)

+        # Logos section
        with gr.Row():
            with gr.Column():
-                gr.
+                gr.HTML("""
+                <div style="display: flex; justify-content: center; align-items: center; gap: 20px; margin: 20px 0; flex-wrap: wrap;">
+                    <img src="file/ai4s_banner.png" alt="AI4S" style="height: 60px; object-fit: contain;">
+                    <img src="file/surrey_logo.png" alt="University of Surrey" style="height: 60px; object-fit: contain;">
+                    <img src="file/EPSRC_logo.png" alt="EPSRC" style="height: 60px; object-fit: contain;">
+                    <img src="file/CVSSP_logo.png" alt="CVSSP" style="height: 60px; object-fit: contain;">
+                </div>
+                """)
+
+        # Main interface
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 🎛️ Controls")
+
+                audio_input = gr.Audio(
+                    sources=["microphone"],
+                    type="numpy",
+                    label="Record Audio"
+                )

                model_a = gr.Dropdown(
                    choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
@@ -1042,56 +1002,28 @@ def create_interface():
                    maximum=1.0,
                    value=0.5,
                    step=0.01,
-                    label="Detection Threshold
+                    label="Detection Threshold"
                )

-                process_btn = gr.Button("🎤
+                process_btn = gr.Button("🎤 Analyze", variant="primary", size="lg")

-                4. 📈 **Curves**: Yellow (Model A) and orange (Model B) probability curves
-                5. 🔄 **Auto-sync**: Automatic delay compensation
-                6. 👀 **Events**: Model-specific onset/offset detection per panel!
-
-                ### 🎨 **Visualization Elements**
-                - **🟢 Green lines**: Speech onset (▲ markers) - model-specific per panel
-                - **🔴 Red lines**: Speech offset (▼ markers) - model-specific per panel
-                - **🔵 Cyan line**: Detection threshold (same on both panels)
-                - **🟡 Yellow curve**: Model A probability (top panel only)
-                - **🟠 Orange curve**: Model B probability (bottom panel only)
-                - **Ultra-HD spectrograms**: 2048-point FFT, same Viridis colorscale
-                """)
-
-            with gr.Column():
-                gr.Markdown("### 🎙️ **Audio Input**")
-
-                audio_input = gr.Audio(
-                    sources=["microphone"],
-                    type="numpy",
-                    label="Record Audio (3-15 seconds recommended)"
+            with gr.Column(scale=2):
+                status_display = gr.Textbox(
+                    label="Status",
+                    value="🔇 Ready to analyze audio",
+                    interactive=False
                )

+        # Results
+        gr.Markdown("### 📊 Results")

        with gr.Row():
-            plot_output = gr.Plot(label="
+            plot_output = gr.Plot(label="Speech Detection Visualization")

-        with gr.Row():
-            with gr.Column():
-                status_display = gr.Textbox(
-                    label="🎯 Real-time Status",
-                    value="🔇 Ready for advanced speech analysis",
-                    interactive=False
-                )
-
        with gr.Row():
            details_output = gr.Textbox(
-                label="
-                lines=
-                max_lines=30,
+                label="Analysis Details",
+                lines=10,
                interactive=False
            )
@@ -1102,34 +1034,10 @@ def create_interface():
            outputs=[plot_output, status_display, details_output]
        )

+        # Footer
        gr.Markdown("""
        ---
-        This demo implements the complete **speech removal framework** from our WASPAA 2025 paper:
-
-        **🎯 Core Innovations:**
-        - **Advanced Onset/Offset Detection**: Sub-frame precision with delay compensation
-        - **Multi-Model Architecture**: Real-time comparison of 5 VAD approaches
-        - **High-Resolution Analysis**: 2048-point FFT with 256-sample hop (ultra-smooth)
-        - **Adaptive Thresholding**: Hysteresis-based decision boundaries
-        - **Cross-Correlation Sync**: Automatic delay compensation up to ±100ms
-
-        **🏠 Real-World Applications:**
-        - Smart home privacy: Remove conversations, keep environmental sounds
-        - GDPR audio compliance: Privacy-aware dataset processing
-        - Call center automation: Real-time speech/silence detection
-        - Voice assistant optimization: Precise wake-word boundaries
-
-        **📊 Performance Metrics:**
-        - **Precision**: 94.2% on CHiME-Home dataset
-        - **Recall**: 91.8% with optimized thresholds
-        - **Latency**: <50ms processing time (Real-Time Factor: 0.05)
-        - **Resolution**: 16ms time resolution, 128 mel bins (ultra-high definition)
-
-        **Citation:** *Speech Removal Framework for Privacy-Preserving Audio Recordings*, WASPAA 2025
-
-        **⚡ CPU Optimized** | **🆓 Hugging Face Spaces** | **🎯 Production Ready**
+        **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
        """)

    return interface