Gabriel Bibbó
committed on
Commit
·
bd1af2c
1
Parent(s):
dac6057
adjust app.py
Browse files
app.py
CHANGED
|
@@ -679,7 +679,7 @@ class AudioProcessor:
|
|
| 679 |
"WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
|
| 680 |
"E-PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 681 |
"PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 682 |
-
"AST": 0.
|
| 683 |
}
|
| 684 |
|
| 685 |
# Model-specific thresholds for better detection
|
|
@@ -745,9 +745,12 @@ class AudioProcessor:
|
|
| 745 |
mel_spec = np.dot(mel_basis, power_spec)
|
| 746 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 747 |
|
| 748 |
-
# CAMBIO 2: Usar librosa.frames_to_time para consistencia con center=
|
| 749 |
frames = np.arange(mel_spec_db.shape[1])
|
| 750 |
-
time_frames = librosa.frames_to_time(
|
|
|
|
|
|
|
|
|
|
| 751 |
|
| 752 |
return mel_spec_db, time_frames
|
| 753 |
else:
|
|
@@ -995,35 +998,69 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 995 |
model_b_data['times'].append(result.timestamp)
|
| 996 |
model_b_data['probs'].append(result.probability)
|
| 997 |
|
| 998 |
-
|
| 999 |
-
|
| 1000 |
-
|
| 1001 |
-
|
| 1002 |
-
|
| 1003 |
-
|
| 1004 |
-
|
| 1005 |
-
|
| 1006 |
-
|
| 1007 |
-
|
| 1008 |
-
|
| 1009 |
-
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
| 1014 |
-
|
| 1015 |
-
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
| 1021 |
-
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1027 |
|
| 1028 |
model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
|
| 1029 |
model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]
|
|
@@ -1208,9 +1245,14 @@ class VADDemo:
|
|
| 1208 |
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1209 |
chunk = processed_audio[start_pos:end_pos]
|
| 1210 |
|
| 1211 |
-
# Pad if necessary (with
|
| 1212 |
if len(chunk) < window_samples:
|
| 1213 |
-
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1214 |
|
| 1215 |
if window_count < 3: # Log first 3 windows
|
| 1216 |
debug_info.append(f"  🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
|
@@ -1243,9 +1285,13 @@ class VADDemo:
|
|
| 1243 |
|
| 1244 |
delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
|
| 1245 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1246 |
# CORRECTED: Use global threshold with delay compensation and min duration
|
| 1247 |
onsets_offsets = self.processor.detect_onset_offset_advanced(
|
| 1248 |
-
vad_results, threshold, apply_delay=
|
| 1249 |
)
|
| 1250 |
|
| 1251 |
debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
|
|
|
|
| 679 |
"WebRTC-VAD": 0.03, # 30ms hop for WebRTC (match frame duration)
|
| 680 |
"E-PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 681 |
"PANNs": 0.05, # CHANGED from 0.1 to 0.05 for 20Hz
|
| 682 |
+
"AST": 0.1 # IMPROVED: Better resolution (10 Hz) while maintaining performance
|
| 683 |
}
|
| 684 |
|
| 685 |
# Model-specific thresholds for better detection
|
|
|
|
| 745 |
mel_spec = np.dot(mel_basis, power_spec)
|
| 746 |
mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)
|
| 747 |
|
| 748 |
+
# CAMBIO 2: Usar librosa.frames_to_time para consistencia con center=False
|
| 749 |
frames = np.arange(mel_spec_db.shape[1])
|
| 750 |
+
time_frames = librosa.frames_to_time(
|
| 751 |
+
frames, sr=self.sample_rate, hop_length=self.hop_length,
|
| 752 |
+
n_fft=self.n_fft, initial_time=0.0 # CRITICAL: Fix offset for center=False
|
| 753 |
+
)
|
| 754 |
|
| 755 |
return mel_spec_db, time_frames
|
| 756 |
else:
|
|
|
|
| 998 |
model_b_data['times'].append(result.timestamp)
|
| 999 |
model_b_data['probs'].append(result.probability)
|
| 1000 |
|
| 1001 |
+
# IMPROVEMENT: Use common high-resolution time grid for better alignment
|
| 1002 |
+
if len(time_frames) > 0:
|
| 1003 |
+
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1004 |
+
|
| 1005 |
+
if len(model_a_data['times']) > 1:
|
| 1006 |
+
# Interpolate to common grid for smooth visualization
|
| 1007 |
+
interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'], left=0, right=0)
|
| 1008 |
+
fig.add_trace(
|
| 1009 |
+
go.Scatter(
|
| 1010 |
+
x=common_times,
|
| 1011 |
+
y=interp_probs_a,
|
| 1012 |
+
mode='lines',
|
| 1013 |
+
line=dict(color='yellow', width=3),
|
| 1014 |
+
name=f'{model_a} Probability',
|
| 1015 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1016 |
+
showlegend=True
|
| 1017 |
+
),
|
| 1018 |
+
row=1, col=1, secondary_y=True
|
| 1019 |
+
)
|
| 1020 |
+
elif len(model_a_data['times']) == 1:
|
| 1021 |
+
# Single point fallback
|
| 1022 |
+
fig.add_trace(
|
| 1023 |
+
go.Scatter(
|
| 1024 |
+
x=model_a_data['times'],
|
| 1025 |
+
y=model_a_data['probs'],
|
| 1026 |
+
mode='markers',
|
| 1027 |
+
marker=dict(size=8, color='yellow'),
|
| 1028 |
+
name=f'{model_a} Probability',
|
| 1029 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1030 |
+
showlegend=True
|
| 1031 |
+
),
|
| 1032 |
+
row=1, col=1, secondary_y=True
|
| 1033 |
+
)
|
| 1034 |
+
|
| 1035 |
+
if len(model_b_data['times']) > 1:
|
| 1036 |
+
# Interpolate to common grid for smooth visualization
|
| 1037 |
+
interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'], left=0, right=0)
|
| 1038 |
+
fig.add_trace(
|
| 1039 |
+
go.Scatter(
|
| 1040 |
+
x=common_times,
|
| 1041 |
+
y=interp_probs_b,
|
| 1042 |
+
mode='lines',
|
| 1043 |
+
line=dict(color='orange', width=3),
|
| 1044 |
+
name=f'{model_b} Probability',
|
| 1045 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1046 |
+
showlegend=True
|
| 1047 |
+
),
|
| 1048 |
+
row=2, col=1, secondary_y=True
|
| 1049 |
+
)
|
| 1050 |
+
elif len(model_b_data['times']) == 1:
|
| 1051 |
+
# Single point fallback
|
| 1052 |
+
fig.add_trace(
|
| 1053 |
+
go.Scatter(
|
| 1054 |
+
x=model_b_data['times'],
|
| 1055 |
+
y=model_b_data['probs'],
|
| 1056 |
+
mode='markers',
|
| 1057 |
+
marker=dict(size=8, color='orange'),
|
| 1058 |
+
name=f'{model_b} Probability',
|
| 1059 |
+
hovertemplate='Time: %{x:.2f}s<br>Probability: %{y:.3f}<extra></extra>',
|
| 1060 |
+
showlegend=True
|
| 1061 |
+
),
|
| 1062 |
+
row=2, col=1, secondary_y=True
|
| 1063 |
+
)
|
| 1064 |
|
| 1065 |
model_a_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_a]
|
| 1066 |
model_b_events = [e for e in onsets_offsets if e.model_name.split('(')[0].strip() == model_b]
|
|
|
|
| 1245 |
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1246 |
chunk = processed_audio[start_pos:end_pos]
|
| 1247 |
|
| 1248 |
+
# Pad if necessary (with reflection, not zeros to avoid artificial silence)
|
| 1249 |
if len(chunk) < window_samples:
|
| 1250 |
+
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
|
| 1251 |
+
|
| 1252 |
+
# Skip chunks with excessive padding to avoid skewed predictions
|
| 1253 |
+
padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
|
| 1254 |
+
if padding_ratio > 0.5:
|
| 1255 |
+
continue # Skip heavily padded chunks
|
| 1256 |
|
| 1257 |
if window_count < 3: # Log first 3 windows
|
| 1258 |
debug_info.append(f"  🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
|
|
|
| 1285 |
|
| 1286 |
delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
|
| 1287 |
|
| 1288 |
+
# CRITICAL: Apply delay compensation to ALL VAD timestamps, not just events
|
| 1289 |
+
for result in vad_results:
|
| 1290 |
+
result.timestamp -= delay_compensation
|
| 1291 |
+
|
| 1292 |
# CORRECTED: Use global threshold with delay compensation and min duration
|
| 1293 |
onsets_offsets = self.processor.detect_onset_offset_advanced(
|
| 1294 |
+
vad_results, threshold, apply_delay=0.0, min_duration=0.12 # delay already applied above
|
| 1295 |
)
|
| 1296 |
|
| 1297 |
debug_info.append(f"\n🎭 **EVENTS**: {len(onsets_offsets)} onset/offset pairs detected")
|