Gabriel Bibbó
committed on
Commit
·
b3b9f78
1
Parent(s):
29ea60e
adjust app.py
Browse files
app.py
CHANGED
|
@@ -766,6 +766,9 @@ class AudioProcessor:
|
|
| 766 |
mode='psd'
|
| 767 |
)
|
| 768 |
|
|
|
|
|
|
|
|
|
|
| 769 |
mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
|
| 770 |
|
| 771 |
mel_freqs = np.logspace(
|
|
@@ -826,8 +829,8 @@ class AudioProcessor:
|
|
| 826 |
# Add virtual start point if first timestamp > 0
|
| 827 |
if rs[0].timestamp > 0:
|
| 828 |
virtual_start = VADResult(
|
| 829 |
-
probability=0.
|
| 830 |
-
is_speech=
|
| 831 |
model_name=base,
|
| 832 |
processing_time=0,
|
| 833 |
timestamp=0.0
|
|
@@ -1271,25 +1274,28 @@ class VADDemo:
|
|
| 1271 |
# CRITICAL FIX: Always extract chunks, both for short and long audio
|
| 1272 |
window_count = 0
|
| 1273 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
| 1274 |
-
audio_len = len(processed_audio)
|
| 1275 |
|
| 1276 |
-
for
|
| 1277 |
-
|
| 1278 |
-
start_pos = max(0,
|
| 1279 |
-
|
|
|
|
| 1280 |
|
| 1281 |
-
|
| 1282 |
-
if
|
| 1283 |
-
chunk = np.pad(chunk, (window_samples -
|
| 1284 |
|
| 1285 |
-
|
|
|
|
| 1286 |
if padding_ratio > 0.5:
|
| 1287 |
continue # Skip heavily padded chunks
|
| 1288 |
|
| 1289 |
-
|
|
|
|
|
|
|
| 1290 |
|
| 1291 |
if window_count < 3: # Log first 3 windows
|
| 1292 |
-
debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (
|
| 1293 |
|
| 1294 |
# Call predict with the chunk
|
| 1295 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
@@ -1303,6 +1309,10 @@ class VADDemo:
|
|
| 1303 |
model_results.append(result)
|
| 1304 |
window_count += 1
|
| 1305 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1306 |
debug_info.append(f" 🎯 Total windows processed: {window_count}")
|
| 1307 |
|
| 1308 |
# Summary for this model
|
|
|
|
| 766 |
mode='psd'
|
| 767 |
)
|
| 768 |
|
| 769 |
+
# Ajustar tiempos para alinear con center=False (empezar en 0)
|
| 770 |
+
t -= (self.n_fft / 2.0) / self.sample_rate
|
| 771 |
+
|
| 772 |
mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
|
| 773 |
|
| 774 |
mel_freqs = np.logspace(
|
|
|
|
| 829 |
# Add virtual start point if first timestamp > 0
|
| 830 |
if rs[0].timestamp > 0:
|
| 831 |
virtual_start = VADResult(
|
| 832 |
+
probability=rs[0].probability,
|
| 833 |
+
is_speech=rs[0].probability > threshold,
|
| 834 |
model_name=base,
|
| 835 |
processing_time=0,
|
| 836 |
timestamp=0.0
|
|
|
|
| 1274 |
# CRITICAL FIX: Always extract chunks, both for short and long audio
|
| 1275 |
window_count = 0
|
| 1276 |
audio_duration = len(processed_audio) / self.processor.sample_rate
|
|
|
|
| 1277 |
|
| 1278 |
+
for i in range(0, len(processed_audio), hop_samples):
|
| 1279 |
+
# CRITICAL: Extract the chunk centered on this timestamp
|
| 1280 |
+
start_pos = max(0, i - window_samples // 2)
|
| 1281 |
+
end_pos = min(len(processed_audio), start_pos + window_samples)
|
| 1282 |
+
chunk = processed_audio[start_pos:end_pos]
|
| 1283 |
|
| 1284 |
+
# Pad if necessary (with reflection, not zeros to avoid artificial silence)
|
| 1285 |
+
if len(chunk) < window_samples:
|
| 1286 |
+
chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
|
| 1287 |
|
| 1288 |
+
# Skip chunks with excessive padding to avoid skewed predictions
|
| 1289 |
+
padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
|
| 1290 |
if padding_ratio > 0.5:
|
| 1291 |
continue # Skip heavily padded chunks
|
| 1292 |
|
| 1293 |
+
# CORRECTED: Timestamp at ACTUAL CENTER of the chunk for alignment
|
| 1294 |
+
actual_center = start_pos + (end_pos - start_pos) / 2.0
|
| 1295 |
+
timestamp = actual_center / self.processor.sample_rate
|
| 1296 |
|
| 1297 |
if window_count < 3: # Log first 3 windows
|
| 1298 |
+
debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
|
| 1299 |
|
| 1300 |
# Call predict with the chunk
|
| 1301 |
result = self.models[model_name].predict(chunk, timestamp)
|
|
|
|
| 1309 |
model_results.append(result)
|
| 1310 |
window_count += 1
|
| 1311 |
|
| 1312 |
+
# Stop if we've gone past the audio length
|
| 1313 |
+
if timestamp >= audio_duration:
|
| 1314 |
+
break
|
| 1315 |
+
|
| 1316 |
debug_info.append(f" 🎯 Total windows processed: {window_count}")
|
| 1317 |
|
| 1318 |
# Summary for this model
|