Gabriel Bibbó committed
Commit 96f8e9f · 1 Parent(s): 5bbaead

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Files changed (1)
  1. app.py +106 -63
app.py CHANGED
@@ -237,6 +237,16 @@ class OptimizedEPANNs:
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
 
+            # Ensure minimum length (1 second) using wrap mode instead of zero padding
+            min_samples = self.sample_rate # 1 second
+            if len(audio_resampled) < min_samples:
+                if LIBROSA_AVAILABLE:
+                    audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')
+                else:
+                    # Fallback: repeat the signal
+                    repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
+                    audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+
             mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
             energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
             spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio_resampled, sr=self.sample_rate))
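Note on the hunk above: wrap-mode padding repeats the clip instead of appending silence, so a very short chunk keeps roughly its original spectral statistics rather than being diluted by zeros. A standalone sketch of the two code paths, assuming a 16 kHz rate and a 0.5 s chunk (values chosen purely for illustration):

    import numpy as np
    import librosa

    sample_rate = 16000
    min_samples = sample_rate                     # 1 second target
    chunk = np.random.randn(sample_rate // 2)     # 0.5 s chunk, too short

    # Librosa path: mode='wrap' is forwarded to np.pad, so the clip is cycled
    wrapped = librosa.util.fix_length(chunk, size=min_samples, mode='wrap')

    # NumPy fallback path: tile the chunk and trim to the target length
    repeat_factor = int(np.ceil(min_samples / len(chunk)))
    tiled = np.tile(chunk, repeat_factor)[:min_samples]

    np.testing.assert_allclose(wrapped, tiled)    # both repeat the same samples
    print(len(wrapped) / sample_rate)             # 1.0 second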
@@ -317,10 +327,15 @@ class OptimizedPANNs:
                audio
            )
 
-            # Ensure minimum length for PANNs (need at least 1 second)
+            # Ensure minimum length for PANNs (1 second) using wrap mode instead of zero padding
            min_samples = self.sample_rate # 1 second
            if len(audio_resampled) < min_samples:
-                audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant')
+                if LIBROSA_AVAILABLE:
+                    audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')
+                else:
+                    # Fallback: repeat the signal
+                    repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
+                    audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
 
            clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
                                                 input_sr=self.sample_rate)
@@ -373,8 +388,15 @@ class OptimizedAST:
                self.feature_extractor = ASTFeatureExtractor.from_pretrained(model_name)
                self.model = ASTForAudioClassification.from_pretrained(model_name)
                self.model.to(self.device)
+
+                # Use FP16 for faster inference on GPU
+                if self.device.type == 'cuda':
+                    self.model = self.model.half()
+                    print(f"✅ {self.model_name} loaded with FP16 optimization")
+                else:
+                    print(f"✅ {self.model_name} loaded successfully")
+
                self.model.eval()
-                print(f"✅ {self.model_name} loaded successfully")
            else:
                print(f"⚠️ {self.model_name} not available, using fallback")
                self.model = None
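Note on the hunk above: half precision only pays off when the weights and the inputs end up in the same dtype on the GPU; the later inference hunk converts the inputs accordingly. A minimal sketch of the same pattern using a hypothetical TinyNet module as a stand-in for the AST model (CUDA optional, falls back to FP32 on CPU):

    import torch

    class TinyNet(torch.nn.Module):          # hypothetical stand-in for the AST model
        def __init__(self):
            super().__init__()
            self.proj = torch.nn.Linear(128, 2)

        def forward(self, x):
            return self.proj(x)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = TinyNet().to(device)
    if device.type == 'cuda':
        model = model.half()                 # weights -> float16
    model.eval()

    x = torch.randn(1, 128, device=device)   # features arrive as float32
    if device.type == 'cuda':
        x = x.half() if x.dtype == torch.float32 else x   # match the model's dtype

    with torch.no_grad():
        out = model(x)
    print(out.dtype)                          # torch.float16 on GPU, torch.float32 on CPU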
@@ -421,45 +443,50 @@ class OptimizedAST:
            if len(audio.shape) > 1:
                audio = audio.mean(axis=1)
 
-            # Use longer context for AST - preferably 2 seconds
-            if full_audio is not None and len(full_audio) >= 2 * self.sample_rate:
-                # Take 2-second window centered around current timestamp
+            # Use longer context for AST - preferably 4 seconds for better performance
+            if full_audio is not None and len(full_audio) >= 4 * self.sample_rate:
+                # Take 4-second window centered around current timestamp
                center_pos = int(timestamp * self.sample_rate)
-                window_size = self.sample_rate # 1 second each side
+                window_size = 2 * self.sample_rate # 2 seconds each side
 
                start_pos = max(0, center_pos - window_size)
                end_pos = min(len(full_audio), center_pos + window_size)
 
-                # Ensure we have at least 2 seconds
-                if end_pos - start_pos < 2 * self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + 2 * self.sample_rate)
-                    if end_pos - start_pos < 2 * self.sample_rate:
-                        start_pos = max(0, end_pos - 2 * self.sample_rate)
+                # Ensure we have at least 4 seconds
+                if end_pos - start_pos < 4 * self.sample_rate:
+                    end_pos = min(len(full_audio), start_pos + 4 * self.sample_rate)
+                    if end_pos - start_pos < 4 * self.sample_rate:
+                        start_pos = max(0, end_pos - 4 * self.sample_rate)
 
                audio_for_ast = full_audio[start_pos:end_pos]
            else:
                audio_for_ast = audio
 
-            # Ensure minimum length for AST (2 seconds preferred, minimum 1 second)
-            min_samples = 2 * self.sample_rate # 2 seconds
+            # Ensure minimum length for AST (4 seconds preferred, minimum 2 seconds)
+            min_samples = 4 * self.sample_rate # 4 seconds for better performance
            if len(audio_for_ast) < min_samples:
                audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant')
 
-            # Truncate if too long (AST can handle up to ~10s, but we'll use 3s max for efficiency)
-            max_samples = 3 * self.sample_rate
+            # Truncate if too long (AST can handle up to ~10s, but we'll use 5s max for efficiency)
+            max_samples = 5 * self.sample_rate
            if len(audio_for_ast) > max_samples:
                audio_for_ast = audio_for_ast[:max_samples]
 
-            # Feature extraction with proper AST parameters
+            # Feature extraction with proper AST parameters (closer to 1024 frames)
            inputs = self.feature_extractor(
                audio_for_ast,
                sampling_rate=self.sample_rate,
                return_tensors="pt",
                max_length=1024, # Proper AST context
+                padding="max_length", # Ensure consistent length
                truncation=True
            )
 
+            # Move inputs to correct device and dtype
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
+            if self.device.type == 'cuda' and hasattr(self.model, 'half'):
+                # Convert inputs to FP16 if model is in FP16
+                inputs = {k: v.half() if v.dtype == torch.float32 else v for k, v in inputs.items()}
 
            with torch.no_grad():
                outputs = self.model(**inputs)
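Note on the hunk above: the window logic clamps a 4-second span around the current timestamp to the ends of the recording, then grows it toward whichever side still has room. A worked example of that indexing in plain NumPy, assuming 16 kHz audio and a timestamp near the start of a 10 s clip:

    import numpy as np

    sample_rate = 16000
    full_audio = np.zeros(10 * sample_rate)        # 10 s recording
    timestamp = 0.25                               # near the start, so the window must clamp

    center_pos = int(timestamp * sample_rate)
    window_size = 2 * sample_rate                  # 2 s on each side, 4 s total

    start_pos = max(0, center_pos - window_size)
    end_pos = min(len(full_audio), center_pos + window_size)

    # If clamping shortened the window, extend it toward whichever side has room
    if end_pos - start_pos < 4 * sample_rate:
        end_pos = min(len(full_audio), start_pos + 4 * sample_rate)
        if end_pos - start_pos < 4 * sample_rate:
            start_pos = max(0, end_pos - 4 * sample_rate)

    audio_for_ast = full_audio[start_pos:end_pos]
    print(start_pos, end_pos, len(audio_for_ast) / sample_rate)   # 0 64000 4.0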
@@ -477,21 +504,23 @@ class OptimizedAST:
 
            if speech_indices:
                speech_prob = probs[0, speech_indices].mean().item()
-                # Apply more reasonable thresholding for AST
-                if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
-                    speech_prob = min(speech_prob * 3, 0.7) # Moderate boost, cap at 0.7
+                # Apply more reasonable thresholding for AST with lower threshold
+                if speech_prob < 0.15 and np.sum(audio_for_ast ** 2) > 0.001:
+                    speech_prob = min(speech_prob * 2.5, 0.6) # Moderate boost, cap at 0.6
            else:
                # Fallback to energy-based detection with higher threshold
                energy = np.sum(audio_for_ast ** 2) / len(audio_for_ast) # Normalize by length
                speech_prob = min(energy * 50, 1.0)
 
-            result = VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
+            # Use lower threshold specifically for AST (0.25 instead of 0.4)
+            is_speech_ast = speech_prob > 0.25
+            result = VADResult(float(speech_prob), is_speech_ast, self.model_name, time.time()-start_time, timestamp)
 
            # Cache the result
            self.prediction_cache[cache_key] = result
 
-            # Clean old cache entries (keep only last 10 seconds)
-            cache_keys_to_remove = [k for k in self.prediction_cache.keys() if k < cache_key - 10]
+            # Clean old cache entries (keep only last 30 seconds for longer sessions)
+            cache_keys_to_remove = [k for k in self.prediction_cache.keys() if k < cache_key - 30]
            for k in cache_keys_to_remove:
                del self.prediction_cache[k]
 
@@ -523,16 +552,25 @@ class AudioProcessor:
        self.fmin = 20
        self.fmax = 8000
 
-        self.window_size = 0.064
-        self.hop_size = 0.032
+        self.base_window = 0.064
+        self.base_hop = 0.032
+
+        # Model-specific window sizes (each model gets appropriate context)
+        self.model_windows = {
+            "Silero-VAD": 0.064, # 64ms as required
+            "WebRTC-VAD": 0.03, # 30ms frames
+            "E-PANNs": 1.0, # 1 second minimum
+            "PANNs": 1.0, # 1 second minimum
+            "AST": 2.0 # 2 seconds for better performance
+        }
 
        # Model-specific hop sizes for efficiency
        self.model_hop_sizes = {
            "Silero-VAD": 0.032,
            "WebRTC-VAD": 0.03,
-            "E-PANNs": 1.0,
-            "PANNs": 1.0,
-            "AST": 1.0 # Process AST only once per second
+            "E-PANNs": 0.5, # Process every 0.5s
+            "PANNs": 0.5, # Process every 0.5s
+            "AST": 0.5 # Process every 0.5s
        }
 
        self.delay_compensation = 0.0
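Note on the hunk above: the two tables jointly set how much context each model sees per prediction (window) and how often it runs (hop). A quick arithmetic check at 16 kHz, with the values copied from the dictionaries above:

    sample_rate = 16000
    model_windows   = {"Silero-VAD": 0.064, "WebRTC-VAD": 0.03, "E-PANNs": 1.0, "PANNs": 1.0, "AST": 2.0}
    model_hop_sizes = {"Silero-VAD": 0.032, "WebRTC-VAD": 0.03, "E-PANNs": 0.5, "PANNs": 0.5, "AST": 0.5}

    for name, window in model_windows.items():
        hop = model_hop_sizes[name]
        print(f"{name}: {int(window * sample_rate)} samples per chunk, "
              f"{1 / hop:.1f} predictions per second")
    # e.g. Silero-VAD: 1024 samples per chunk, 31.2 predictions per second
    #      AST: 32000 samples per chunk, 2.0 predictions per second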
@@ -697,8 +735,8 @@ class AudioProcessor:
        if len(audio_data) == 0 or len(vad_results) == 0:
            return 0.0
 
-        window_size = int(self.sample_rate * self.window_size)
-        hop_size = int(self.sample_rate * self.hop_size)
+        window_size = int(self.sample_rate * self.base_window)
+        hop_size = int(self.sample_rate * self.base_hop)
 
        energy_signal = []
        for i in range(0, len(audio_data) - window_size, hop_size):
@@ -715,14 +753,14 @@ class AudioProcessor:
        vad_times = np.array([r.timestamp for r in vad_results])
        vad_probs = np.array([r.probability for r in vad_results])
 
-        energy_times = np.arange(len(energy_signal)) * self.hop_size
+        energy_times = np.arange(len(energy_signal)) * self.base_hop
        vad_interp = np.interp(energy_times, vad_times, vad_probs)
        vad_interp = (vad_interp - np.mean(vad_interp)) / (np.std(vad_interp) + 1e-8)
 
        if len(energy_signal) > 10 and len(vad_interp) > 10:
            correlation = np.correlate(energy_signal, vad_interp, mode='full')
            delay_samples = np.argmax(correlation) - len(vad_interp) + 1
-            delay_seconds = delay_samples * self.hop_size
+            delay_seconds = delay_samples * self.base_hop
 
            max_corr = np.max(correlation) / (len(vad_interp) * np.std(energy_signal) * np.std(vad_interp))
            if max_corr > self.correlation_threshold:
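Note on the hunk above: with np.correlate in 'full' mode, the peak index minus (len(v) - 1) is the lag of the first argument relative to the second, which is exactly the delay_samples expression. A synthetic check with an impulse shifted by a known number of frames (hop of 32 ms assumed, matching base_hop):

    import numpy as np

    base_hop = 0.032                     # seconds per energy frame, as in AudioProcessor
    true_lag = 5                         # energy track trails the VAD track by 5 frames

    vad_interp = np.zeros(200)
    vad_interp[50] = 1.0                 # speech onset in the VAD probability track
    energy_signal = np.zeros(200)
    energy_signal[50 + true_lag] = 1.0   # same onset, seen 5 frames later in the energy track

    correlation = np.correlate(energy_signal, vad_interp, mode='full')
    delay_samples = np.argmax(correlation) - len(vad_interp) + 1
    delay_seconds = delay_samples * base_hop
    print(delay_samples, round(delay_seconds, 3))   # 5 0.16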
@@ -804,20 +842,23 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
    model_b_data = {'times': [], 'probs': []}
 
    for result in vad_results:
-        if result.model_name.startswith(model_a):
+        # Fix model name filtering - remove suffixes like (cached), (fallback), (error)
+        model_base_name = result.model_name.split(' ')[0].split('(')[0]
+        if model_base_name == model_a or result.model_name.startswith(model_a):
            model_a_data['times'].append(result.timestamp)
            model_a_data['probs'].append(result.probability)
-        elif result.model_name.startswith(model_b):
+        elif model_base_name == model_b or result.model_name.startswith(model_b):
            model_b_data['times'].append(result.timestamp)
            model_b_data['probs'].append(result.probability)
 
-    if len(model_a_data['times']) > 1:
+    if len(model_a_data['times']) > 0:
        fig.add_trace(
            go.Scatter(
                x=model_a_data['times'],
                y=model_a_data['probs'],
-                mode='lines',
+                mode='lines+markers', # Add markers to show single points
                line=dict(color='yellow', width=3),
+                marker=dict(size=6, color='yellow'),
                name=f'{model_a} Probability',
                hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
                showlegend=True
@@ -825,13 +866,14 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
        row=1, col=1, secondary_y=True
    )
 
-    if len(model_b_data['times']) > 1:
+    if len(model_b_data['times']) > 0:
        fig.add_trace(
            go.Scatter(
                x=model_b_data['times'],
                y=model_b_data['probs'],
-                mode='lines',
+                mode='lines+markers', # Add markers to show single points
                line=dict(color='orange', width=3),
+                marker=dict(size=6, color='orange'),
                name=f'{model_b} Probability',
                hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
                showlegend=True
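Note on the two plotting hunks above: split(' ') strips spaced suffixes and the extra split('(') strips unspaced ones, so decorated names such as "AST (cached)" or "PANNs(fallback)" no longer break the trace filtering. A quick illustration with made-up name strings:

    def base_name(model_name: str) -> str:
        # Drop decorations such as " (cached)", " (fallback)" or "(error)"
        return model_name.split(' ')[0].split('(')[0]

    for raw in ["AST", "AST (cached)", "PANNs(fallback)", "E-PANNs (error)"]:
        print(f"{raw!r:24} -> {base_name(raw)}")   # every variant maps to the bare model name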
@@ -959,30 +1001,30 @@ class VADDemo:
        if len(processed_audio) == 0:
            return None, "🎵 Processing audio...", "No audio data processed"
 
-        window_samples = int(self.processor.sample_rate * self.processor.window_size)
-        hop_samples = int(self.processor.sample_rate * self.processor.hop_size)
        vad_results = []
-
        selected_models = list(set([model_a, model_b]))
 
-        # Process each window with model-specific hop sizes for efficiency
-        for i in range(0, len(processed_audio) - window_samples, hop_samples):
-            timestamp = i / self.processor.sample_rate
-            chunk = processed_audio[i:i + window_samples]
-
-            for model_name in selected_models:
-                if model_name in self.models:
-                    # Check if we should process this model at this timestamp
-                    model_hop = self.processor.model_hop_sizes.get(model_name, self.processor.hop_size)
-                    if i % int(model_hop * self.processor.sample_rate) == 0:
-                        # Special handling for AST - pass full audio for context
-                        if model_name == 'AST':
-                            result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
-                        else:
-                            result = self.models[model_name].predict(chunk, timestamp)
-
-                        result.is_speech = result.probability > threshold
-                        vad_results.append(result)
+        # Process each model with its specific window and hop size
+        for model_name in selected_models:
+            if model_name in self.models:
+                window_size = self.processor.model_windows[model_name]
+                hop_size = self.processor.model_hop_sizes[model_name]
+
+                window_samples = int(self.processor.sample_rate * window_size)
+                hop_samples = int(self.processor.sample_rate * hop_size)
+
+                for i in range(0, len(processed_audio) - window_samples, hop_samples):
+                    timestamp = i / self.processor.sample_rate
+                    chunk = processed_audio[i:i + window_samples]
+
+                    # Special handling for AST - pass full audio for context
+                    if model_name == 'AST':
+                        result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                    else:
+                        result = self.models[model_name].predict(chunk, timestamp)
+
+                    result.is_speech = result.probability > threshold
+                    vad_results.append(result)
 
        delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
        onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
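Note on the hunk above: the rewritten loop iterates models first and time second, so each model advances by its own hop, and only the AST branch receives the full clip for context. A dry run of that schedule with a stub predictor (the stub and the 4 s silent clip are purely illustrative):

    import numpy as np

    sample_rate = 16000
    processed_audio = np.zeros(4 * sample_rate)    # 4 s of silence as a stand-in clip
    model_windows = {"Silero-VAD": 0.064, "AST": 2.0}
    model_hop_sizes = {"Silero-VAD": 0.032, "AST": 0.5}

    def stub_predict(name, chunk, timestamp, full_audio=None):
        # Placeholder for self.models[name].predict(...): just records the call shape
        return (name, round(timestamp, 3), len(chunk), full_audio is not None)

    calls = []
    for model_name in model_windows:
        window_samples = int(sample_rate * model_windows[model_name])
        hop_samples = int(sample_rate * model_hop_sizes[model_name])
        for i in range(0, len(processed_audio) - window_samples, hop_samples):
            timestamp = i / sample_rate
            chunk = processed_audio[i:i + window_samples]
            if model_name == 'AST':
                calls.append(stub_predict(model_name, chunk, timestamp, full_audio=processed_audio))
            else:
                calls.append(stub_predict(model_name, chunk, timestamp))

    print(sum(c[0] == 'Silero-VAD' for c in calls), "Silero-VAD calls,",
          sum(c[0] == 'AST' for c in calls), "AST calls")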
@@ -993,17 +1035,18 @@ class VADDemo:
        )
 
        speech_detected = any(result.is_speech for result in vad_results)
-        total_speech_time = sum(1 for r in vad_results if r.is_speech) * self.processor.hop_size
+        total_speech_chunks = sum(1 for r in vad_results if r.is_speech)
 
        if speech_detected:
-            status_msg = f"🎙️ SPEECH DETECTED - {total_speech_time:.1f}s total"
+            status_msg = f"🎙️ SPEECH DETECTED - {total_speech_chunks} active chunks"
        else:
            status_msg = f"🔇 No speech detected"
 
        # Simplified details
        model_summaries = {}
        for result in vad_results:
-            name = result.model_name.split(' ')[0]
+            # Fix model name filtering - remove suffixes like (cached), (fallback)
+            name = result.model_name.split(' ')[0].split('(')[0]
            if name not in model_summaries:
                model_summaries[name] = {'probs': [], 'speech_chunks': 0, 'total_chunks': 0}
            summary = model_summaries[name]
@@ -1096,7 +1139,7 @@ def create_interface():
 
        model_b = gr.Dropdown(
            choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="AST",
+            value="PANNs",
            label="Model B (Bottom Panel)"
        )
 