import gradio as gr
import numpy as np
import torch
import time
import warnings
from dataclasses import dataclass

# Suppress warnings
warnings.filterwarnings('ignore')

# Optional imports with fallbacks
try:
    import librosa
    LIBROSA_AVAILABLE = True
    print("✅ Librosa available")
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy fallback")

try:
    import webrtcvad
    WEBRTC_AVAILABLE = True
    print("✅ WebRTC VAD available")
except ImportError:
    WEBRTC_AVAILABLE = False
    print("⚠️ WebRTC VAD not available, using fallback")

print("🚀 Creating VAD Demo instance...")


# ===== SIMPLIFIED DATA STRUCTURES =====
@dataclass
class VADResult:
    probability: float
    is_speech: bool
    model_name: str
    processing_time: float


# ===== SIMPLIFIED MODEL IMPLEMENTATIONS =====
class OptimizedSileroVAD:
    def __init__(self):
        self.model = None
        self.sample_rate = 16000
        self.model_name = "Silero-VAD"
        self.load_model()

    def load_model(self):
        try:
            self.model, _ = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                onnx=False
            )
            self.model.eval()
            print(f"✅ {self.model_name} loaded successfully")
        except Exception as e:
            print(f"❌ Error loading {self.model_name}: {e}")
            self.model = None

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        if self.model is None or len(audio) == 0:
            return VADResult(0.0, False, f"{self.model_name} (unavailable)", time.time() - start_time)
        try:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Silero requires exactly 512 samples per call at 16 kHz:
            # center-crop longer input, zero-pad shorter input.
            required_samples = 512
            if len(audio) != required_samples:
                if len(audio) > required_samples:
                    start_idx = (len(audio) - required_samples) // 2
                    audio_chunk = audio[start_idx:start_idx + required_samples]
                else:
                    audio_chunk = np.pad(audio, (0, required_samples - len(audio)), 'constant')
            else:
                audio_chunk = audio

            audio_tensor = torch.FloatTensor(audio_chunk).unsqueeze(0)
            with torch.no_grad():
                speech_prob = self.model(audio_tensor, self.sample_rate).item()

            is_speech = speech_prob > 0.5
            processing_time = time.time() - start_time
            return VADResult(speech_prob, is_speech, self.model_name, processing_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)


class OptimizedWebRTCVAD:
    def __init__(self):
        self.model_name = "WebRTC-VAD"
        self.sample_rate = 16000
        self.frame_duration = 30  # ms; WebRTC VAD accepts 10, 20, or 30 ms frames
        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
        if WEBRTC_AVAILABLE:
            try:
                self.vad = webrtcvad.Vad(3)  # aggressiveness 3 = most aggressive
                print(f"✅ {self.model_name} loaded successfully")
            except Exception:
                self.vad = None
        else:
            self.vad = None

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        if self.vad is None or len(audio) == 0:
            # Energy-based fallback when WebRTC VAD is unavailable
            energy = np.sum(audio ** 2) if len(audio) > 0 else 0
            threshold = 0.01
            probability = min(energy / threshold, 1.0)
            is_speech = energy > threshold
            return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time)
        try:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
            audio_int16 = (audio * 32767).astype(np.int16)

            # Probability = fraction of frames classified as speech
            speech_frames = 0
            total_frames = 0
            for i in range(0, len(audio_int16) - self.frame_size, self.frame_size):
                frame = audio_int16[i:i + self.frame_size].tobytes()
                if self.vad.is_speech(frame, self.sample_rate):
                    speech_frames += 1
                total_frames += 1

            probability = speech_frames / max(total_frames, 1)
            is_speech = probability > 0.3
            return VADResult(probability, is_speech, self.model_name, time.time() - start_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)
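
# A hedged sketch (illustrative, not used by the demo): OptimizedSileroVAD.predict
# scores a single 512-sample window (center-cropped from longer input), so a
# multi-second clip collapses to one probability. One way to recover a
# per-window timeline; `window` and `hop` are hypothetical parameters, not
# from the original code.
def sliding_silero_probs(vad, audio: np.ndarray, window: int = 512, hop: int = 512):
    """Return one speech probability per `window`-sample slice of `audio`."""
    probs = []
    for start in range(0, max(len(audio) - window + 1, 1), hop):
        # predict() pads slices shorter than 512 samples, so the tail is safe
        probs.append(vad.predict(audio[start:start + window]).probability)
    return probs
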
class OptimizedEPANNs:
    def __init__(self):
        self.model_name = "E-PANNs"
        # Nominal PANNs rate; note that AudioProcessor delivers 16 kHz audio,
        # so the heuristic constants below absorb the mismatch.
        self.sample_rate = 32000
        print(f"✅ {self.model_name} initialized")

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        try:
            if len(audio) == 0:
                return VADResult(0.0, False, self.model_name, time.time() - start_time)
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Simple spectral analysis as a lightweight stand-in for the full model
            if LIBROSA_AVAILABLE:
                mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=64)
                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
                spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
                speech_score = (energy + 100) / 50 + spectral_centroid / 10000
            else:
                # Scipy fallback
                from scipy import signal
                f, t, Sxx = signal.spectrogram(audio, self.sample_rate)
                energy = np.mean(10 * np.log10(Sxx + 1e-10))
                speech_score = (energy + 100) / 50

            probability = float(np.clip(speech_score, 0, 1))
            is_speech = probability > 0.6
            return VADResult(probability, is_speech, self.model_name, time.time() - start_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)


# ===== SIMPLIFIED AUDIO PROCESSOR =====
class AudioProcessor:
    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def process_audio(self, audio):
        if audio is None:
            return np.array([])
        try:
            if isinstance(audio, tuple):
                sample_rate, audio_data = audio
                # Resample only when librosa is available; otherwise the
                # original rate is passed through unchanged.
                if sample_rate != self.sample_rate and LIBROSA_AVAILABLE:
                    audio_data = librosa.resample(
                        audio_data.astype(float),
                        orig_sr=sample_rate,
                        target_sr=self.sample_rate
                    )
            else:
                audio_data = audio

            if len(audio_data.shape) > 1:
                audio_data = audio_data.mean(axis=1)

            # Peak-normalize to [-1, 1]
            if np.max(np.abs(audio_data)) > 0:
                audio_data = audio_data / np.max(np.abs(audio_data))
            return audio_data
        except Exception as e:
            print(f"Audio processing error: {e}")
            return np.array([])
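
# A hedged usage sketch (not called by the app): Gradio's numpy audio
# component yields (sample_rate, samples) tuples, typically int16, and
# AudioProcessor folds that into normalized mono float, resampled to 16 kHz
# when librosa is present. The 48 kHz rate and 440 Hz tone below are
# assumptions chosen purely for illustration.
def _processor_smoke_test() -> np.ndarray:
    rate = 48000
    t = np.linspace(0, 1.0, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)
    return AudioProcessor().process_audio((rate, tone))
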
# ===== MAIN APPLICATION =====
class VADDemo:
    def __init__(self):
        print("🎤 Initializing VAD Demo...")
        self.processor = AudioProcessor()
        self.models = {
            'Silero-VAD': OptimizedSileroVAD(),
            'WebRTC-VAD': OptimizedWebRTCVAD(),
            'E-PANNs': OptimizedEPANNs()
        }
        print("🎤 VAD Demo initialized successfully")
        print(f"📊 Available models: {list(self.models.keys())}")

    def process_audio_simple(self, audio, model_a: str, model_b: str, threshold: float):
        if audio is None:
            return "🔇 No audio detected", "Ready to process audio..."
        try:
            # Process audio
            processed_audio = self.processor.process_audio(audio)
            if len(processed_audio) == 0:
                return "🎵 Processing audio...", "No audio data processed"

            # Get predictions from selected models
            selected_models = [model_a, model_b] if model_a != model_b else [model_a]
            vad_results = {}
            for model_name in selected_models:
                if model_name in self.models:
                    result = self.models[model_name].predict(processed_audio)
                    # Apply the user-selected threshold uniformly so the
                    # slider takes effect (each model otherwise uses its own
                    # built-in cutoff).
                    result.is_speech = result.probability >= threshold
                    vad_results[model_name] = result

            # Create status message
            speech_detected = any(result.is_speech for result in vad_results.values())
            status_msg = "🎙️ SPEECH DETECTED" if speech_detected else "🔇 No speech detected"

            # Create simple details string
            details_lines = [f"📊 **Analysis Results** (Threshold: {threshold:.2f})"]
            details_lines.append("")
            for name, result in vad_results.items():
                status_icon = "🟢" if result.is_speech else "🔴"
                details_lines.append(f"{status_icon} **{name}**:")
                details_lines.append(f"  • Probability: {result.probability:.3f}")
                details_lines.append(f"  • Detection: {'SPEECH' if result.is_speech else 'NO SPEECH'}")
                details_lines.append(f"  • Processing: {result.processing_time * 1000:.1f}ms")
                details_lines.append("")

            # Add audio info
            details_lines.append("🎵 **Audio Info**:")
            details_lines.append(f"  • Duration: {len(processed_audio) / self.processor.sample_rate:.2f} seconds")
            details_lines.append(f"  • Samples: {len(processed_audio):,}")
            details_lines.append(f"  • Max amplitude: {np.max(np.abs(processed_audio)):.3f}")

            details_text = "\n".join(details_lines)
            return status_msg, details_text
        except Exception as e:
            print(f"Processing error: {e}")
            return f"❌ Error: {str(e)}", f"Error details: {str(e)}"


# Initialize demo
print("🎤 Initializing VAD Demo...")
demo_app = VADDemo()
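
# A hedged sketch: the pipeline can also be driven headlessly for quick
# checks, bypassing the Gradio UI entirely. The white-noise input, model
# pairing, and threshold value are illustrative assumptions.
def _headless_example() -> None:
    rate = 16000
    noise = (0.1 * np.random.default_rng(0).standard_normal(rate)).astype(np.float32)
    status, details = demo_app.process_audio_simple((rate, noise), "Silero-VAD", "E-PANNs", 0.5)
    print(status)
    print(details)
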
# ===== ULTRA-SIMPLIFIED GRADIO INTERFACE =====
print("🚀 Launching VAD Demo...")

# Create minimal interface without problematic components
with gr.Blocks(title="VAD Demo - Speech Detection") as interface:
    gr.Markdown("""
    # 🎤 VAD Demo: Multi-Model Speech Detection

    **Compare AI models for voice activity detection**

    | Model | Type | Description |
    |-------|------|-------------|
    | **Silero-VAD** | Neural Network | Production-ready VAD (1.8M params) |
    | **WebRTC-VAD** | Signal Processing | Google's real-time VAD |
    | **E-PANNs** | Deep Learning | Efficient audio analysis |

    **Instructions:** Record audio → Select models → Adjust threshold → Click Process
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎛️ **Controls**")
            model_a = gr.Dropdown(
                choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs"],
                value="Silero-VAD",
                label="Model A"
            )
            model_b = gr.Dropdown(
                choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs"],
                value="WebRTC-VAD",
                label="Model B"
            )
            threshold_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.5,
                step=0.05,
                label="Detection Threshold"
            )
            process_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### 🎙️ **Audio Input**")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio (2-4 seconds recommended)"
            )

    gr.Markdown("### 📊 **Results**")
    with gr.Row():
        status_display = gr.Textbox(
            label="Detection Status",
            value="🔇 Ready to process speech",
            interactive=False,
            container=True
        )
    with gr.Row():
        details_output = gr.Textbox(
            label="Analysis Details",
            lines=15,
            interactive=False,
            container=True,
            show_copy_button=True
        )

    # Event handlers - only button click to avoid conflicts
    process_btn.click(
        fn=demo_app.process_audio_simple,
        inputs=[audio_input, model_a, model_b, threshold_slider],
        outputs=[status_display, details_output],
        show_progress=True
    )

    gr.Markdown("""
    ---
    ### 🔬 **Research Context**

    This demo supports research in **privacy-preserving audio processing** for smart home applications.
    The framework enables **selective speech removal** while preserving environmental sounds.

    **Applications:**
    - 🏠 Smart home privacy protection
    - 📊 GDPR-compliant audio processing
    - 🎯 Real-time voice activity detection
    - 🔊 Environmental sound preservation

    **Citation:** *Speech Removal Framework for Privacy-Preserving Audio Recordings*, WASPAA 2025

    **⚡ CPU Optimized** | **🆓 Hugging Face Spaces** | **🎯 WASPAA Demo Ready**
    """)

# Launch for HF Spaces
if __name__ == "__main__":
    interface.launch(
        share=True,  # Creates a public share link when run locally; ignored on HF Spaces
        server_name="0.0.0.0",
        server_port=7860
    )
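
# A hedged setup note: the demo degrades gracefully when optional packages
# are missing, but the full experience assumes roughly this environment
# (package list inferred from the imports above; versions are not pinned by
# the original code):
#
#   pip install gradio torch numpy scipy librosa webrtcvad
#
# Run locally with `python app.py`; on Hugging Face Spaces a file named
# app.py is launched automatically.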