Gabriel Bibbó committed
Commit 0ea20e3 · 1 Parent(s): 96f8e9f

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Files changed (1): app.py (+122, -60)
app.py CHANGED
@@ -141,12 +141,18 @@ class OptimizedSileroVAD:
             audio = audio.mean(axis=1)

         required_samples = 512
+        # Silero requires exactly 512 samples, handle this precisely
         if len(audio) != required_samples:
             if len(audio) > required_samples:
+                # Take center portion to avoid edge effects
                 start_idx = (len(audio) - required_samples) // 2
                 audio_chunk = audio[start_idx:start_idx + required_samples]
             else:
-                audio_chunk = np.pad(audio, (0, required_samples - len(audio)), 'constant')
+                # Pad symmetrically instead of just at the end
+                pad_total = required_samples - len(audio)
+                pad_left = pad_total // 2
+                pad_right = pad_total - pad_left
+                audio_chunk = np.pad(audio, (pad_left, pad_right), 'reflect')
         else:
             audio_chunk = audio

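
To see what the new padding branch does in isolation, here is a standalone sketch (not part of app.py; the helper name, the 480-sample chunk and the 16 kHz rate are illustrative):

```python
import numpy as np

REQUIRED_SAMPLES = 512  # Silero-VAD expects exactly 512 samples per call at 16 kHz

def to_silero_chunk(audio: np.ndarray) -> np.ndarray:
    """Center-crop or symmetrically reflect-pad a mono float array to 512 samples."""
    if len(audio) > REQUIRED_SAMPLES:
        start = (len(audio) - REQUIRED_SAMPLES) // 2
        return audio[start:start + REQUIRED_SAMPLES]
    if len(audio) < REQUIRED_SAMPLES:
        pad_total = REQUIRED_SAMPLES - len(audio)
        pad_left = pad_total // 2
        # 'reflect' mirrors the signal at its edges instead of inserting silence
        return np.pad(audio, (pad_left, pad_total - pad_left), mode='reflect')
    return audio

chunk = to_silero_chunk(np.random.randn(480).astype(np.float32))
print(chunk.shape)  # (512,)
```

Reflect padding avoids the hard step down to zeros that the old constant padding introduced at the chunk boundary.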
 
@@ -194,7 +200,9 @@ class OptimizedWebRTCVAD:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        audio_int16 = (audio * 32767).astype(np.int16)
+        # Properly convert to int16 with clipping to avoid saturation
+        audio_clipped = np.clip(audio, -1.0, 1.0)
+        audio_int16 = (audio_clipped * 32767).astype(np.int16)

         speech_frames = 0
         total_frames = 0
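
The saturation fix is simply a clip before the int16 cast; a standalone sketch with synthetic values shows the difference:

```python
import numpy as np

audio = np.array([0.5, 1.2, -1.4], dtype=np.float32)  # overshoots happen after resampling or gain

# Casting out-of-range floats straight to int16 overflows (typically wrapping to garbage values)
unclipped = (audio * 32767).astype(np.int16)

# Clipping first makes loud samples saturate cleanly at full scale instead
clipped = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

print(unclipped)  # the 1.2 and -1.4 samples come out wrapped / implementation-defined
print(clipped)    # [ 16383  32767 -32767]
```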
@@ -237,8 +245,8 @@ class OptimizedEPANNs:
                                              orig_sr=16000,
                                              target_sr=self.sample_rate)

-            # Ensure minimum length (1 second) using wrap mode instead of zero padding
-            min_samples = self.sample_rate  # 1 second
+            # Ensure minimum length (6 seconds) using wrap mode instead of zero padding
+            min_samples = 6 * self.sample_rate  # 6 seconds
             if len(audio_resampled) < min_samples:
                 if LIBROSA_AVAILABLE:
                     audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')

@@ -327,8 +335,8 @@ class OptimizedPANNs:
                 audio
             )

-            # Ensure minimum length for PANNs (1 second) using wrap mode instead of zero padding
-            min_samples = self.sample_rate  # 1 second
+            # Ensure minimum length for PANNs (10 seconds) using wrap mode instead of zero padding
+            min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
             if len(audio_resampled) < min_samples:
                 if LIBROSA_AVAILABLE:
                     audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')
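
This hunk and the E-PANNs one above both lean on librosa.util.fix_length with mode='wrap', which tiles the clip instead of appending silence. A standalone sketch (synthetic 0.5 s clip, 16 kHz assumed):

```python
import numpy as np
import librosa

sr = 16000
clip = np.random.randn(sr // 2).astype(np.float32)   # 0.5 s of audio
min_samples = 10 * sr                                 # the PANNs path above uses 10 s

# mode='wrap' is forwarded to np.pad, so the clip is repeated rather than zero-padded
padded = librosa.util.fix_length(clip, size=min_samples, mode='wrap')
print(padded.shape)                                   # (160000,)

# Pure-NumPy equivalent if librosa is unavailable (mirrors the AST fallback further down)
tiled = np.tile(clip, int(np.ceil(min_samples / len(clip))))[:min_samples]
print(np.allclose(padded, tiled))                     # True: both give a periodic extension
```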
@@ -443,32 +451,37 @@ class OptimizedAST:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        # Use longer context for AST - preferably 4 seconds for better performance
-        if full_audio is not None and len(full_audio) >= 4 * self.sample_rate:
-            # Take 4-second window centered around current timestamp
+        # Use longer context for AST - preferably 6.4 seconds (1024 frames)
+        if full_audio is not None and len(full_audio) >= 6.4 * self.sample_rate:
+            # Take 6.4-second window centered around current timestamp
             center_pos = int(timestamp * self.sample_rate)
-            window_size = 2 * self.sample_rate  # 2 seconds each side
+            window_size = int(3.2 * self.sample_rate)  # 3.2 seconds each side

             start_pos = max(0, center_pos - window_size)
             end_pos = min(len(full_audio), center_pos + window_size)

-            # Ensure we have at least 4 seconds
-            if end_pos - start_pos < 4 * self.sample_rate:
-                end_pos = min(len(full_audio), start_pos + 4 * self.sample_rate)
-                if end_pos - start_pos < 4 * self.sample_rate:
-                    start_pos = max(0, end_pos - 4 * self.sample_rate)
+            # Ensure we have at least 6.4 seconds
+            if end_pos - start_pos < 6.4 * self.sample_rate:
+                end_pos = min(len(full_audio), start_pos + int(6.4 * self.sample_rate))
+                if end_pos - start_pos < 6.4 * self.sample_rate:
+                    start_pos = max(0, end_pos - int(6.4 * self.sample_rate))

             audio_for_ast = full_audio[start_pos:end_pos]
         else:
             audio_for_ast = audio

-        # Ensure minimum length for AST (4 seconds preferred, minimum 2 seconds)
-        min_samples = 4 * self.sample_rate  # 4 seconds for better performance
+        # Ensure minimum length for AST (6.4 seconds for 1024 frames)
+        min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
         if len(audio_for_ast) < min_samples:
-            audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant')
+            if LIBROSA_AVAILABLE:
+                audio_for_ast = librosa.util.fix_length(audio_for_ast, size=min_samples, mode='wrap')
+            else:
+                # Fallback: repeat the signal
+                repeat_factor = int(np.ceil(min_samples / len(audio_for_ast)))
+                audio_for_ast = np.tile(audio_for_ast, repeat_factor)[:min_samples]

-        # Truncate if too long (AST can handle up to ~10s, but we'll use 5s max for efficiency)
-        max_samples = 5 * self.sample_rate
+        # Truncate if too long (AST can handle up to ~10s, but we'll use 8s max for efficiency)
+        max_samples = 8 * self.sample_rate
         if len(audio_for_ast) > max_samples:
            audio_for_ast = audio_for_ast[:max_samples]
@@ -557,20 +570,29 @@ class AudioProcessor:

         # Model-specific window sizes (each model gets appropriate context)
         self.model_windows = {
-            "Silero-VAD": 0.064,  # 64ms as required
+            "Silero-VAD": 0.032,  # 32ms exactly as required (512 samples)
             "WebRTC-VAD": 0.03,   # 30ms frames
-            "E-PANNs": 1.0,       # 1 second minimum
-            "PANNs": 1.0,         # 1 second minimum
-            "AST": 2.0            # 2 seconds for better performance
+            "E-PANNs": 6.0,       # 6 seconds minimum for reliable results
+            "PANNs": 10.0,        # 10 seconds for optimal performance
+            "AST": 6.4            # ~6.4 seconds (1024 frames * 6.25ms)
         }

         # Model-specific hop sizes for efficiency
         self.model_hop_sizes = {
-            "Silero-VAD": 0.032,
-            "WebRTC-VAD": 0.03,
-            "E-PANNs": 0.5,       # Process every 0.5s
-            "PANNs": 0.5,         # Process every 0.5s
-            "AST": 0.5            # Process every 0.5s
+            "Silero-VAD": 0.016,  # 16ms hop for Silero
+            "WebRTC-VAD": 0.01,   # 10ms hop for WebRTC
+            "E-PANNs": 1.0,       # Process every 1s but with 6s window
+            "PANNs": 2.0,         # Process every 2s but with 10s window
+            "AST": 1.0            # Process every 1s but with 6.4s window
+        }
+
+        # Model-specific thresholds for better detection
+        self.model_thresholds = {
+            "Silero-VAD": 0.5,
+            "WebRTC-VAD": 0.5,
+            "E-PANNs": 0.4,
+            "PANNs": 0.4,
+            "AST": 0.25
         }

         self.delay_compensation = 0.0
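
At the 16 kHz processing rate implied elsewhere in the diff, the new windows translate to fixed sample counts, and the per-model thresholds are looked up with a fallback to the global slider value (see the processing-loop hunk below). A quick standalone sketch (round() is used here to avoid float-truncation surprises; app.py itself uses a plain int() cast):

```python
SR = 16000  # assumed processing rate, consistent with the 512-sample / 32 ms Silero window

model_windows = {"Silero-VAD": 0.032, "WebRTC-VAD": 0.03, "E-PANNs": 6.0, "PANNs": 10.0, "AST": 6.4}
model_thresholds = {"Silero-VAD": 0.5, "WebRTC-VAD": 0.5, "E-PANNs": 0.4, "PANNs": 0.4, "AST": 0.25}

global_threshold = 0.5
for name, win_s in model_windows.items():
    window_samples = int(round(SR * win_s))
    threshold = model_thresholds.get(name, global_threshold)  # .get() keeps unknown models usable
    print(f"{name:11s} window={window_samples:6d} samples  threshold={threshold}")
# e.g. Silero-VAD window=   512 samples, AST window=102400 samples
```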
@@ -822,32 +844,52 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )

     if len(time_frames) > 0:
-        # Add threshold lines to both panels
-        fig.add_hline(
-            y=threshold,
+        # Add threshold lines using add_shape to avoid secondary axis bug
+        fig.add_shape(
+            type="line",
+            x0=time_frames[0], x1=time_frames[-1],
+            y0=threshold, y1=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            annotation_text=f'Threshold: {threshold:.2f}',
-            annotation_position="top right",
-            row=1, col=1, secondary_y=True
+            row=1, col=1,
+            yref="y2"  # Reference to secondary y-axis
         )
-        fig.add_hline(
-            y=threshold,
+        fig.add_shape(
+            type="line",
+            x0=time_frames[0], x1=time_frames[-1],
+            y0=threshold, y1=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            annotation_text=f'Threshold: {threshold:.2f}',
-            annotation_position="top right",
-            row=2, col=1, secondary_y=True
+            row=2, col=1,
+            yref="y4"  # Reference to secondary y-axis of second subplot
+        )
+
+        # Add threshold annotations
+        fig.add_annotation(
+            x=time_frames[-1] * 0.95, y=threshold,
+            text=f'Threshold: {threshold:.2f}',
+            showarrow=False,
+            font=dict(color='cyan', size=10),
+            row=1, col=1,
+            yref="y2"
+        )
+        fig.add_annotation(
+            x=time_frames[-1] * 0.95, y=threshold,
+            text=f'Threshold: {threshold:.2f}',
+            showarrow=False,
+            font=dict(color='cyan', size=10),
+            row=2, col=1,
+            yref="y4"
         )

     model_a_data = {'times': [], 'probs': []}
     model_b_data = {'times': [], 'probs': []}

     for result in vad_results:
-        # Fix model name filtering - remove suffixes like (cached), (fallback), (error)
-        model_base_name = result.model_name.split(' ')[0].split('(')[0]
-        if model_base_name == model_a or result.model_name.startswith(model_a):
+        # Fix model name filtering - remove suffixes properly and consistently
+        base_name = result.model_name.split('(')[0].strip()
+        if base_name == model_a:
             model_a_data['times'].append(result.timestamp)
             model_a_data['probs'].append(result.probability)
-        elif model_base_name == model_b or result.model_name.startswith(model_b):
+        elif base_name == model_b:
             model_b_data['times'].append(result.timestamp)
             model_b_data['probs'].append(result.probability)
@@ -881,8 +923,8 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
             row=2, col=1, secondary_y=True
         )

-    model_a_events = [e for e in onsets_offsets if e.model_name.startswith(model_a)]
-    model_b_events = [e for e in onsets_offsets if e.model_name.startswith(model_b)]
+    model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
+    model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]

     for event in model_a_events:
         if event.onset_time >= 0 and event.onset_time <= time_frames[-1]:
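
The new filter reduces names like "PANNs (cached)" to their base model name before comparing; the example strings below are illustrative:

```python
for raw in ["PANNs", "PANNs (cached)", "AST (fallback)", "Silero-VAD (error)"]:
    print(repr(raw), "->", repr(raw.split('(')[0].strip()))
# 'PANNs' -> 'PANNs'
# 'PANNs (cached)' -> 'PANNs'
# 'AST (fallback)' -> 'AST'
# 'Silero-VAD (error)' -> 'Silero-VAD'
```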
@@ -1009,21 +1051,39 @@ class VADDemo:
             if model_name in self.models:
                 window_size = self.processor.model_windows[model_name]
                 hop_size = self.processor.model_hop_sizes[model_name]
+                model_threshold = self.processor.model_thresholds.get(model_name, threshold)

                 window_samples = int(self.processor.sample_rate * window_size)
                 hop_samples = int(self.processor.sample_rate * hop_size)

-                for i in range(0, len(processed_audio) - window_samples, hop_samples):
+                # For large models, ensure we have enough audio
+                if len(processed_audio) < window_samples:
+                    # If audio is too short, repeat it to reach minimum length
+                    repeat_factor = int(np.ceil(window_samples / len(processed_audio)))
+                    extended_audio = np.tile(processed_audio, repeat_factor)[:window_samples]
+                else:
+                    extended_audio = processed_audio
+
+                for i in range(0, len(extended_audio) - window_samples, hop_samples):
                     timestamp = i / self.processor.sample_rate
-                    chunk = processed_audio[i:i + window_samples]

-                    # Special handling for AST - pass full audio for context
+                    # Extract window centered around current position
+                    start_pos = max(0, i)
+                    end_pos = min(len(extended_audio), i + window_samples)
+                    chunk = extended_audio[start_pos:end_pos]
+
+                    # Ensure chunk has the right length
+                    if len(chunk) < window_samples:
+                        chunk = np.pad(chunk, (0, window_samples - len(chunk)), 'wrap')
+
+                    # Special handling for different models
                     if model_name == 'AST':
-                        result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                        result = self.models[model_name].predict(chunk, timestamp, full_audio=extended_audio)
                     else:
                         result = self.models[model_name].predict(chunk, timestamp)

-                    result.is_speech = result.probability > threshold
+                    # Use model-specific threshold
+                    result.is_speech = result.probability > model_threshold
                     vad_results.append(result)

         delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
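
The reworked loop guarantees every chunk is exactly window_samples long even when the recording is shorter than one window: the whole clip is tiled first, and a short trailing chunk is 'wrap'-padded. A standalone sketch of those two mechanisms (values are synthetic; variable names mirror the hunk):

```python
import numpy as np

sr = 16000
window_samples = 10 * sr          # e.g. the PANNs window from the hunks above
processed_audio = np.random.randn(3 * sr).astype(np.float32)   # only 3 s recorded

# 1) Clip shorter than one window: tile it until it covers a full window
if len(processed_audio) < window_samples:
    repeat_factor = int(np.ceil(window_samples / len(processed_audio)))
    extended_audio = np.tile(processed_audio, repeat_factor)[:window_samples]
else:
    extended_audio = processed_audio
print(len(extended_audio) / sr)   # 10.0

# 2) A chunk that runs past the end: pad it cyclically back up to the window size
chunk = extended_audio[8 * sr:]   # last 2 s only
chunk = np.pad(chunk, (0, window_samples - len(chunk)), 'wrap')
print(len(chunk) == window_samples)   # True
```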
@@ -1045,30 +1105,32 @@ class VADDemo:
         # Simplified details
         model_summaries = {}
         for result in vad_results:
-            # Fix model name filtering - remove suffixes like (cached), (fallback)
-            name = result.model_name.split(' ')[0].split('(')[0]
-            if name not in model_summaries:
-                model_summaries[name] = {'probs': [], 'speech_chunks': 0, 'total_chunks': 0}
-            summary = model_summaries[name]
+            # Fix model name filtering - remove suffixes properly
+            base_name = result.model_name.split('(')[0].strip()
+            if base_name not in model_summaries:
+                model_summaries[base_name] = {'probs': [], 'speech_chunks': 0, 'total_chunks': 0}
+            summary = model_summaries[base_name]
             summary['probs'].append(result.probability)
             summary['total_chunks'] += 1
             if result.is_speech:
                 summary['speech_chunks'] += 1

-        details_lines = [f"**Analysis Results** (Threshold: {threshold:.2f})"]
+        details_lines = [f"**Analysis Results** (Global Threshold: {threshold:.2f})"]

         for model_name, summary in model_summaries.items():
             avg_prob = np.mean(summary['probs']) if summary['probs'] else 0
             speech_ratio = (summary['speech_chunks'] / summary['total_chunks']) if summary['total_chunks'] > 0 else 0
+            model_thresh = self.processor.model_thresholds.get(model_name, threshold)

             status_icon = "🟢" if speech_ratio > 0.5 else "🟡" if speech_ratio > 0.2 else "🔴"
-            details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech")
+            details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech (thresh: {model_thresh:.2f})")

         if onsets_offsets:
             details_lines.append(f"\n**Speech Events**: {len(onsets_offsets)} detected")
             for i, event in enumerate(onsets_offsets[:5]):  # Show first 5 only
                 duration = event.offset_time - event.onset_time if event.offset_time > event.onset_time else 0
-                details_lines.append(f"• {event.model_name}: {event.onset_time:.2f}s - {event.offset_time:.2f}s ({duration:.2f}s)")
+                event_model = event.model_name.split('(')[0].strip()
+                details_lines.append(f"• {event_model}: {event.onset_time:.2f}s - {event.offset_time:.2f}s ({duration:.2f}s)")

         details_text = "\n".join(details_lines)

@@ -1139,7 +1201,7 @@ def create_interface():

                 model_b = gr.Dropdown(
                     choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-                    value="PANNs",
+                    value="E-PANNs",
                     label="Model B (Bottom Panel)"
                 )

 