import gradio as gr
import numpy as np
import torch
import time
import warnings
from dataclasses import dataclass

# Suppress warnings
warnings.filterwarnings('ignore')

# Optional imports with fallbacks
try:
    import librosa
    LIBROSA_AVAILABLE = True
    print("✅ Librosa available")
except ImportError:
    LIBROSA_AVAILABLE = False
    print("⚠️ Librosa not available, using scipy fallback")

try:
    import webrtcvad
    WEBRTC_AVAILABLE = True
    print("✅ WebRTC VAD available")
except ImportError:
    WEBRTC_AVAILABLE = False
    print("⚠️ WebRTC VAD not available, using fallback")

print("🚀 Creating VAD Demo instance...")


# ===== SIMPLIFIED DATA STRUCTURES =====
@dataclass
class VADResult:
    probability: float
    is_speech: bool
    model_name: str
    processing_time: float


# ===== SIMPLIFIED MODEL IMPLEMENTATIONS =====
class OptimizedSileroVAD:
    def __init__(self):
        self.model = None
        self.sample_rate = 16000
        self.model_name = "Silero-VAD"
        self.load_model()

    def load_model(self):
        try:
            self.model, _ = torch.hub.load(
                repo_or_dir='snakers4/silero-vad',
                model='silero_vad',
                force_reload=False,
                onnx=False
            )
            self.model.eval()
            print(f"✅ {self.model_name} loaded successfully")
        except Exception as e:
            print(f"❌ Error loading {self.model_name}: {e}")
            self.model = None

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        if self.model is None or len(audio) == 0:
            return VADResult(0.0, False, f"{self.model_name} (unavailable)", time.time() - start_time)
        try:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Silero requires exactly 512 samples per call at 16 kHz:
            # center-crop longer input, zero-pad shorter input.
            required_samples = 512
            if len(audio) != required_samples:
                if len(audio) > required_samples:
                    start_idx = (len(audio) - required_samples) // 2
                    audio_chunk = audio[start_idx:start_idx + required_samples]
                else:
                    audio_chunk = np.pad(audio, (0, required_samples - len(audio)), 'constant')
            else:
                audio_chunk = audio

            audio_tensor = torch.FloatTensor(audio_chunk).unsqueeze(0)
            with torch.no_grad():
                speech_prob = self.model(audio_tensor, self.sample_rate).item()

            is_speech = speech_prob > 0.5
            processing_time = time.time() - start_time
            return VADResult(speech_prob, is_speech, self.model_name, processing_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)


class OptimizedWebRTCVAD:
    def __init__(self):
        self.model_name = "WebRTC-VAD"
        self.sample_rate = 16000
        self.frame_duration = 30  # ms; WebRTC VAD accepts 10, 20, or 30 ms frames
        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
        if WEBRTC_AVAILABLE:
            try:
                self.vad = webrtcvad.Vad(3)  # aggressiveness 3 = most aggressive
                print(f"✅ {self.model_name} loaded successfully")
            except Exception:
                self.vad = None
        else:
            self.vad = None

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        if self.vad is None or len(audio) == 0:
            # Energy-based fallback when WebRTC VAD is unavailable
            energy = np.sum(audio ** 2) if len(audio) > 0 else 0
            threshold = 0.01
            probability = min(energy / threshold, 1.0)
            is_speech = energy > threshold
            return VADResult(probability, is_speech, f"{self.model_name} (fallback)", time.time() - start_time)
        try:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
            audio_int16 = (audio * 32767).astype(np.int16)

            # Probability = fraction of frames classified as speech
            speech_frames = 0
            total_frames = 0
            for i in range(0, len(audio_int16) - self.frame_size, self.frame_size):
                frame = audio_int16[i:i + self.frame_size].tobytes()
                if self.vad.is_speech(frame, self.sample_rate):
                    speech_frames += 1
                total_frames += 1

            probability = speech_frames / max(total_frames, 1)
            is_speech = probability > 0.3
            return VADResult(probability, is_speech, self.model_name, time.time() - start_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)
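
# A hedged sketch (illustrative, not used by the demo): OptimizedSileroVAD.predict
# scores a single 512-sample window (center-cropped from longer input), so a
# multi-second clip collapses to one probability. One way to recover a
# per-window timeline; `window` and `hop` are hypothetical parameters, not
# from the original code.
def sliding_silero_probs(vad, audio: np.ndarray, window: int = 512, hop: int = 512):
    """Return one speech probability per `window`-sample slice of `audio`."""
    probs = []
    for start in range(0, max(len(audio) - window + 1, 1), hop):
        # predict() pads slices shorter than 512 samples, so the tail is safe
        probs.append(vad.predict(audio[start:start + window]).probability)
    return probs
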
class OptimizedEPANNs:
    def __init__(self):
        self.model_name = "E-PANNs"
        # Nominal PANNs rate; note that AudioProcessor delivers 16 kHz audio,
        # so the heuristic constants below absorb the mismatch.
        self.sample_rate = 32000
        print(f"✅ {self.model_name} initialized")

    def predict(self, audio: np.ndarray) -> VADResult:
        start_time = time.time()
        try:
            if len(audio) == 0:
                return VADResult(0.0, False, self.model_name, time.time() - start_time)
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)

            # Simple spectral analysis as a lightweight stand-in for the full model
            if LIBROSA_AVAILABLE:
                mel_spec = librosa.feature.melspectrogram(y=audio, sr=self.sample_rate, n_mels=64)
                energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
                spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
                speech_score = (energy + 100) / 50 + spectral_centroid / 10000
            else:
                # Scipy fallback
                from scipy import signal
                f, t, Sxx = signal.spectrogram(audio, self.sample_rate)
                energy = np.mean(10 * np.log10(Sxx + 1e-10))
                speech_score = (energy + 100) / 50

            probability = float(np.clip(speech_score, 0, 1))
            is_speech = probability > 0.6
            return VADResult(probability, is_speech, self.model_name, time.time() - start_time)
        except Exception as e:
            print(f"Error in {self.model_name}: {e}")
            return VADResult(0.0, False, self.model_name, time.time() - start_time)


# ===== SIMPLIFIED AUDIO PROCESSOR =====
class AudioProcessor:
    def __init__(self, sample_rate=16000):
        self.sample_rate = sample_rate

    def process_audio(self, audio):
        if audio is None:
            return np.array([])
        try:
            if isinstance(audio, tuple):
                sample_rate, audio_data = audio
                # Resample only when librosa is available; otherwise the
                # original rate is passed through unchanged.
                if sample_rate != self.sample_rate and LIBROSA_AVAILABLE:
                    audio_data = librosa.resample(
                        audio_data.astype(float),
                        orig_sr=sample_rate,
                        target_sr=self.sample_rate
                    )
            else:
                audio_data = audio

            if len(audio_data.shape) > 1:
                audio_data = audio_data.mean(axis=1)

            # Peak-normalize to [-1, 1]
            if np.max(np.abs(audio_data)) > 0:
                audio_data = audio_data / np.max(np.abs(audio_data))
            return audio_data
        except Exception as e:
            print(f"Audio processing error: {e}")
            return np.array([])
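
# A hedged usage sketch (not called by the app): Gradio's numpy audio
# component yields (sample_rate, samples) tuples, typically int16, and
# AudioProcessor folds that into normalized mono float, resampled to 16 kHz
# when librosa is present. The 48 kHz rate and 440 Hz tone below are
# assumptions chosen purely for illustration.
def _processor_smoke_test() -> np.ndarray:
    rate = 48000
    t = np.linspace(0, 1.0, rate, endpoint=False)
    tone = (0.5 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)
    return AudioProcessor().process_audio((rate, tone))
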
# ===== MAIN APPLICATION =====
class VADDemo:
    def __init__(self):
        print("🎤 Initializing VAD Demo...")
        self.processor = AudioProcessor()
        self.models = {
            'Silero-VAD': OptimizedSileroVAD(),
            'WebRTC-VAD': OptimizedWebRTCVAD(),
            'E-PANNs': OptimizedEPANNs()
        }
        print("🎤 VAD Demo initialized successfully")
        print(f"📊 Available models: {list(self.models.keys())}")

    def process_audio_simple(self, audio, model_a: str, model_b: str, threshold: float):
        if audio is None:
            return "🔇 No audio detected", "Ready to process audio..."
        try:
            # Process audio
            processed_audio = self.processor.process_audio(audio)
            if len(processed_audio) == 0:
                return "🎵 Processing audio...", "No audio data processed"

            # Get predictions from selected models
            selected_models = [model_a, model_b] if model_a != model_b else [model_a]
            vad_results = {}
            for model_name in selected_models:
                if model_name in self.models:
                    result = self.models[model_name].predict(processed_audio)
                    # Apply the user-selected threshold uniformly so the
                    # slider takes effect (each model otherwise uses its own
                    # built-in cutoff).
                    result.is_speech = result.probability >= threshold
                    vad_results[model_name] = result

            # Create status message
            speech_detected = any(result.is_speech for result in vad_results.values())
            status_msg = "🎙️ SPEECH DETECTED" if speech_detected else "🔇 No speech detected"

            # Create simple details string
            details_lines = [f"📊 **Analysis Results** (Threshold: {threshold:.2f})"]
            details_lines.append("")
            for name, result in vad_results.items():
                status_icon = "🟢" if result.is_speech else "🔴"
                details_lines.append(f"{status_icon} **{name}**:")
                details_lines.append(f"  • Probability: {result.probability:.3f}")
                details_lines.append(f"  • Detection: {'SPEECH' if result.is_speech else 'NO SPEECH'}")
                details_lines.append(f"  • Processing: {result.processing_time * 1000:.1f}ms")
                details_lines.append("")

            # Add audio info
            details_lines.append("🎵 **Audio Info**:")
            details_lines.append(f"  • Duration: {len(processed_audio) / self.processor.sample_rate:.2f} seconds")
            details_lines.append(f"  • Samples: {len(processed_audio):,}")
            details_lines.append(f"  • Max amplitude: {np.max(np.abs(processed_audio)):.3f}")

            details_text = "\n".join(details_lines)
            return status_msg, details_text
        except Exception as e:
            print(f"Processing error: {e}")
            return f"❌ Error: {str(e)}", f"Error details: {str(e)}"


# Initialize demo
print("🎤 Initializing VAD Demo...")
demo_app = VADDemo()
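
# A hedged sketch: the pipeline can also be driven headlessly for quick
# checks, bypassing the Gradio UI entirely. The white-noise input, model
# pairing, and threshold value are illustrative assumptions.
def _headless_example() -> None:
    rate = 16000
    noise = (0.1 * np.random.default_rng(0).standard_normal(rate)).astype(np.float32)
    status, details = demo_app.process_audio_simple((rate, noise), "Silero-VAD", "E-PANNs", 0.5)
    print(status)
    print(details)
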
# ===== ULTRA-SIMPLIFIED GRADIO INTERFACE =====
print("🚀 Launching VAD Demo...")

# Create minimal interface without problematic components
with gr.Blocks(title="VAD Demo - Speech Detection") as interface:
    gr.Markdown("""
    # 🎤 VAD Demo: Multi-Model Speech Detection

    **Compare AI models for voice activity detection**

    | Model | Type | Description |
    |-------|------|-------------|
    | **Silero-VAD** | Neural Network | Production-ready VAD (1.8M params) |
    | **WebRTC-VAD** | Signal Processing | Google's real-time VAD |
    | **E-PANNs** | Deep Learning | Efficient audio analysis |

    **Instructions:** Record audio → Select models → Adjust threshold → Click Process
    """)

    with gr.Row():
        with gr.Column():
            gr.Markdown("### 🎛️ **Controls**")
            model_a = gr.Dropdown(
                choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs"],
                value="Silero-VAD",
                label="Model A"
            )
            model_b = gr.Dropdown(
                choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs"],
                value="WebRTC-VAD",
                label="Model B"
            )
            threshold_slider = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.5,
                step=0.05,
                label="Detection Threshold"
            )
            process_btn = gr.Button("🎤 Process Audio", variant="primary", size="lg")
        with gr.Column():
            gr.Markdown("### 🎙️ **Audio Input**")
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio (2-4 seconds recommended)"
            )

    gr.Markdown("### 📊 **Results**")
    with gr.Row():
        status_display = gr.Textbox(
            label="Detection Status",
            value="🔇 Ready to process speech",
            interactive=False,
            container=True
        )
    with gr.Row():
        details_output = gr.Textbox(
            label="Analysis Details",
            lines=15,
            interactive=False,
            container=True,
            show_copy_button=True
        )

    # Event handlers - only button click to avoid conflicts
    process_btn.click(
        fn=demo_app.process_audio_simple,
        inputs=[audio_input, model_a, model_b, threshold_slider],
        outputs=[status_display, details_output],
        show_progress=True
    )

    gr.Markdown("""
    ---
    ### 🔬 **Research Context**

    This demo supports research in **privacy-preserving audio processing** for smart home applications.
    The framework enables **selective speech removal** while preserving environmental sounds.

    **Applications:**
    - 🏠 Smart home privacy protection
    - 📊 GDPR-compliant audio processing
    - 🎯 Real-time voice activity detection
    - 🔊 Environmental sound preservation

    **Citation:** *Speech Removal Framework for Privacy-Preserving Audio Recordings*, WASPAA 2025

    **⚡ CPU Optimized** | **🆓 Hugging Face Spaces** | **🎯 WASPAA Demo Ready**
    """)

# Launch for HF Spaces
if __name__ == "__main__":
    interface.launch(
        share=True,  # Creates a public share link when run locally; ignored on HF Spaces
        server_name="0.0.0.0",
        server_port=7860
    )
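
# A hedged setup note: the demo degrades gracefully when optional packages
# are missing, but the full experience assumes roughly this environment
# (package list inferred from the imports above; versions are not pinned by
# the original code):
#
#   pip install gradio torch numpy scipy librosa webrtcvad
#
# Run locally with `python app.py`; on Hugging Face Spaces a file named
# app.py is launched automatically.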