Gabriel Bibbó
committed on
Commit
·
5e8e920
1
Parent(s):
301bbc8
adjust app.py
Browse files
app.py
CHANGED
|
@@ -695,7 +695,7 @@ class AudioProcessor:
|
|
| 695 |
}
|
| 696 |
|
| 697 |
self.delay_compensation = 0.0
|
| 698 |
-
self.correlation_threshold = 0.
|
| 699 |
|
| 700 |
def process_audio(self, audio):
|
| 701 |
if audio is None:
|
|
@@ -1005,8 +1005,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 1005 |
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1006 |
|
| 1007 |
if len(model_a_data['times']) > 1:
|
| 1008 |
-
#
|
| 1009 |
-
|
|
|
|
|
|
|
| 1010 |
fig.add_trace(
|
| 1011 |
go.Scatter(
|
| 1012 |
x=common_times,
|
|
@@ -1035,8 +1037,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
|
|
| 1035 |
)
|
| 1036 |
|
| 1037 |
if len(model_b_data['times']) > 1:
|
| 1038 |
-
#
|
| 1039 |
-
|
|
|
|
|
|
|
| 1040 |
fig.add_trace(
|
| 1041 |
go.Scatter(
|
| 1042 |
x=common_times,
|
|
@@ -1239,8 +1243,8 @@ class VADDemo:
|
|
| 1239 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1240 |
|
| 1241 |
for i in range(0, len(processed_audio), hop_samples):
|
| 1242 |
-
#
|
| 1243 |
-
timestamp =
|
| 1244 |
|
| 1245 |
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1246 |
start_pos = max(0, i - window_samples // 2)
|
|
@@ -1257,7 +1261,7 @@ class VADDemo:
|
|
| 1257 |
continue # Skip heavily padded chunks
|
| 1258 |
|
| 1259 |
if window_count < 3: # Log first 3 windows
|
| 1260 |
-
debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s (
|
| 1261 |
|
| 1262 |
# Call predict with the chunk
|
| 1263 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
@@ -1462,7 +1466,7 @@ def create_interface():
|
|
| 1462 |
---
|
| 1463 |
**Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
|
| 1464 |
|
| 1465 |
-
**Note**:
|
| 1466 |
""")
|
| 1467 |
|
| 1468 |
return interface
|
|
|
|
| 695 |
}
|
| 696 |
|
| 697 |
self.delay_compensation = 0.0
|
| 698 |
+
self.correlation_threshold = 0.5 # REDUCED: More sensitive delay detection
|
| 699 |
|
| 700 |
def process_audio(self, audio):
|
| 701 |
if audio is None:
|
|
|
|
| 1005 |
common_times = np.linspace(0, time_frames[-1], 1000) # High-res grid
|
| 1006 |
|
| 1007 |
if len(model_a_data['times']) > 1:
|
| 1008 |
+
# IMPROVED: Use first probability for extrapolation instead of 0
|
| 1009 |
+
first_prob_a = model_a_data['probs'][0]
|
| 1010 |
+
interp_probs_a = np.interp(common_times, model_a_data['times'], model_a_data['probs'],
|
| 1011 |
+
left=first_prob_a, right=model_a_data['probs'][-1])
|
| 1012 |
fig.add_trace(
|
| 1013 |
go.Scatter(
|
| 1014 |
x=common_times,
|
|
|
|
| 1037 |
)
|
| 1038 |
|
| 1039 |
if len(model_b_data['times']) > 1:
|
| 1040 |
+
# IMPROVED: Use first probability for extrapolation instead of 0
|
| 1041 |
+
first_prob_b = model_b_data['probs'][0]
|
| 1042 |
+
interp_probs_b = np.interp(common_times, model_b_data['times'], model_b_data['probs'],
|
| 1043 |
+
left=first_prob_b, right=model_b_data['probs'][-1])
|
| 1044 |
fig.add_trace(
|
| 1045 |
go.Scatter(
|
| 1046 |
x=common_times,
|
|
|
|
| 1243 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1244 |
|
| 1245 |
for i in range(0, len(processed_audio), hop_samples):
|
| 1246 |
+
# CORRECTED: Timestamp at START of window to align with spectrogram from 0s
|
| 1247 |
+
timestamp = i / self.processor.sample_rate
|
| 1248 |
|
| 1249 |
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1250 |
start_pos = max(0, i - window_samples // 2)
|
|
|
|
| 1261 |
continue # Skip heavily padded chunks
|
| 1262 |
|
| 1263 |
if window_count < 3: # Log first 3 windows
|
| 1264 |
+
debug_info.append(f"   🔄 Window {window_count}: t={timestamp:.2f}s (start), chunk_size={len(chunk)}")
|
| 1265 |
|
| 1266 |
# Call predict with the chunk
|
| 1267 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
|
|
| 1466 |
---
|
| 1467 |
**Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
|
| 1468 |
|
| 1469 |
+
**Note**: Perfect temporal alignment achieved - prediction curves now start from 0s and align precisely with spectrogram features.
|
| 1470 |
""")
|
| 1471 |
|
| 1472 |
return interface
|