Gabriel Bibbó committed
Commit · bce828d · 1 Parent(s): 0ea20e3

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

app.py CHANGED
@@ -174,13 +174,13 @@ class OptimizedWebRTCVAD:
     def __init__(self):
         self.model_name = "WebRTC-VAD"
         self.sample_rate = 16000
-        self.frame_duration = 30
-        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
+        self.frame_duration = 30  # Only 10, 20, or 30 ms are supported
+        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)  # 480 samples for 30ms
 
         if WEBRTC_AVAILABLE:
             try:
                 self.vad = webrtcvad.Vad(3)
-                print(f"✅ {self.model_name} loaded successfully")
+                print(f"✅ {self.model_name} loaded successfully (frame size: {self.frame_size} samples)")
             except:
                 self.vad = None
         else:
@@ -204,10 +204,16 @@ class OptimizedWebRTCVAD:
         audio_clipped = np.clip(audio, -1.0, 1.0)
         audio_int16 = (audio_clipped * 32767).astype(np.int16)
 
+        # Ensure we have enough samples for at least one frame
+        if len(audio_int16) < self.frame_size:
+            # Pad to frame size
+            audio_int16 = np.pad(audio_int16, (0, self.frame_size - len(audio_int16)), 'constant')
+
         speech_frames = 0
         total_frames = 0
 
-
+        # Process exact frame sizes only
+        for i in range(0, len(audio_int16) - self.frame_size + 1, self.frame_size):
             frame = audio_int16[i:i + self.frame_size].tobytes()
             if self.vad.is_speech(frame, self.sample_rate):
                 speech_frames += 1
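The loop above only ever hands webrtcvad complete 30 ms frames, padding shorter input up to one frame first. A minimal standalone sketch of that pattern (assuming the `webrtcvad` package is installed; the constants and the synthetic noise input are illustrative, not taken from app.py):

```python
import numpy as np
import webrtcvad

SAMPLE_RATE = 16000
FRAME_MS = 30                                  # webrtcvad accepts only 10, 20 or 30 ms
FRAME_SIZE = SAMPLE_RATE * FRAME_MS // 1000    # 480 samples per frame

vad = webrtcvad.Vad(3)                         # aggressiveness 0-3

# Illustrative input: 50 ms of low-level noise as int16 PCM
audio = (np.random.randn(800) * 1000).astype(np.int16)

# Pad so at least one complete frame exists, then step in whole frames only
if len(audio) < FRAME_SIZE:
    audio = np.pad(audio, (0, FRAME_SIZE - len(audio)), 'constant')

speech_frames = 0
total_frames = 0
for i in range(0, len(audio) - FRAME_SIZE + 1, FRAME_SIZE):
    frame = audio[i:i + FRAME_SIZE].tobytes()  # exactly 960 bytes per 30 ms frame
    if vad.is_speech(frame, SAMPLE_RATE):
        speech_frames += 1
    total_frames += 1

print(f"speech ratio: {speech_frames / max(total_frames, 1):.2f}")
```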
@@ -245,15 +251,11 @@ class OptimizedEPANNs:
                                              orig_sr=16000,
                                              target_sr=self.sample_rate)
 
-        # Ensure minimum length (6 seconds) using
+        # Ensure minimum length (6 seconds) using constant padding instead of wrap
         min_samples = 6 * self.sample_rate  # 6 seconds
         if len(audio_resampled) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding with small value instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
         energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
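The same wrap-to-constant substitution appears again in the PANNs and AST hunks below. A small sketch of what the two `np.pad` modes do to a short clip (the 32 kHz rate and the 1 s test tone are assumptions for illustration only):

```python
import numpy as np

sr = 32000                        # illustrative sample rate
min_samples = 6 * sr              # 6-second minimum, as in the E-PANNs path

t = np.linspace(0, 1.0, sr, endpoint=False)
clip = 0.1 * np.sin(2 * np.pi * 440 * t)     # 1 s test tone, well below the minimum

pad = (0, min_samples - len(clip))
wrapped = np.pad(clip, pad, 'wrap')                            # old: the tone repeats for all 6 s
padded = np.pad(clip, pad, 'constant', constant_values=0.0)    # new: 5 s of silence appended

print(np.abs(wrapped[sr:]).mean())   # non-zero: wrapping fabricates periodic energy
print(np.abs(padded[sr:]).mean())    # ~0.0: constant padding adds no energy
```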
@@ -335,15 +337,11 @@ class OptimizedPANNs:
             audio
         )
 
-        # Ensure minimum length for PANNs (10 seconds) using
+        # Ensure minimum length for PANNs (10 seconds) using constant padding instead of wrap
         min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
         if len(audio_resampled) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
                                              input_sr=self.sample_rate)
@@ -473,12 +471,8 @@ class OptimizedAST:
         # Ensure minimum length for AST (6.4 seconds for 1024 frames)
         min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
         if len(audio_for_ast) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_for_ast)))
-                audio_for_ast = np.tile(audio_for_ast, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant', constant_values=0.0)
 
         # Truncate if too long (AST can handle up to ~10s, but we'll use 8s max for efficiency)
         max_samples = 8 * self.sample_rate
@@ -571,7 +565,7 @@ class AudioProcessor:
         # Model-specific window sizes (each model gets appropriate context)
         self.model_windows = {
             "Silero-VAD": 0.032,  # 32ms exactly as required (512 samples)
-            "WebRTC-VAD": 0.03,  # 30ms frames
+            "WebRTC-VAD": 0.03,  # 30ms frames (480 samples)
             "E-PANNs": 6.0,  # 6 seconds minimum for reliable results
             "PANNs": 10.0,  # 10 seconds for optimal performance
             "AST": 6.4  # ~6.4 seconds (1024 frames * 6.25ms)
@@ -579,8 +573,8 @@ class AudioProcessor:
 
         # Model-specific hop sizes for efficiency
         self.model_hop_sizes = {
-            "Silero-VAD": 0.016,  # 16ms hop for Silero
-            "WebRTC-VAD": 0.
+            "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
+            "WebRTC-VAD": 0.03,  # 30ms hop for WebRTC (match frame duration)
             "E-PANNs": 1.0,  # Process every 1s but with 6s window
             "PANNs": 2.0,  # Process every 2s but with 10s window
             "AST": 1.0  # Process every 1s but with 6.4s window
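Together, these window/hop pairs decide how many analysis windows a recording yields per model; a quick sketch of that arithmetic (the 16 kHz rate and the 4 s clip length are assumptions, the dictionary values are copied from the two hunks above):

```python
sample_rate = 16000
model_windows = {"Silero-VAD": 0.032, "WebRTC-VAD": 0.03,
                 "E-PANNs": 6.0, "PANNs": 10.0, "AST": 6.4}
model_hop_sizes = {"Silero-VAD": 0.016, "WebRTC-VAD": 0.03,
                   "E-PANNs": 1.0, "PANNs": 2.0, "AST": 1.0}

clip_seconds = 4.0                          # example recording length
n_samples = int(sample_rate * clip_seconds)

for name, window in model_windows.items():
    window_samples = int(sample_rate * window)
    hop_samples = int(sample_rate * model_hop_sizes[name])
    if n_samples < window_samples:
        n_windows = 0                       # why short clips need the single-pass fallback below
    else:
        n_windows = (n_samples - window_samples) // hop_samples + 1
    print(f"{name:10s}: {n_windows} windows of {window_samples} samples")
```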
@@ -1056,35 +1050,40 @@ class VADDemo:
         window_samples = int(self.processor.sample_rate * window_size)
         hop_samples = int(self.processor.sample_rate * hop_size)
 
-        #
+        # Critical fix: Always process at least once, even if audio is shorter than window
         if len(processed_audio) < window_samples:
-            #
-
-
-        else:
-            extended_audio = processed_audio
-
-        for i in range(0, len(extended_audio) - window_samples, hop_samples):
-            timestamp = i / self.processor.sample_rate
-
-            # Extract window centered around current position
-            start_pos = max(0, i)
-            end_pos = min(len(extended_audio), i + window_samples)
-            chunk = extended_audio[start_pos:end_pos]
-
-            # Ensure chunk has the right length
-            if len(chunk) < window_samples:
-                chunk = np.pad(chunk, (0, window_samples - len(chunk)), 'wrap')
+            # Audio is shorter than required window - process once with available audio
+            chunk = processed_audio
+            timestamp = 0.0
 
             # Special handling for different models
             if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=
+                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
             else:
                 result = self.models[model_name].predict(chunk, timestamp)
 
             # Use model-specific threshold
             result.is_speech = result.probability > model_threshold
             vad_results.append(result)
+        else:
+            # Audio is long enough - process in sliding windows
+            for i in range(0, len(processed_audio) - window_samples + 1, hop_samples):
+                timestamp = i / self.processor.sample_rate
+
+                # Extract window
+                start_pos = i
+                end_pos = min(len(processed_audio), i + window_samples)
+                chunk = processed_audio[start_pos:end_pos]
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
 
         delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
         onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
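Stripped of the model calls, the control flow introduced above is a short-clip fallback plus a sliding-window loop. A minimal sketch of just that chunking logic (the `chunk_audio` helper and the silent test signal are illustrative, not part of app.py):

```python
import numpy as np

def chunk_audio(processed_audio, sample_rate, window_size, hop_size):
    """Yield (timestamp, chunk) pairs the way the fixed loop does."""
    window_samples = int(sample_rate * window_size)
    hop_samples = int(sample_rate * hop_size)

    if len(processed_audio) < window_samples:
        # Shorter than one window: analyse the whole clip once at t = 0
        yield 0.0, processed_audio
        return

    # Long enough: slide a full window across the clip
    for i in range(0, len(processed_audio) - window_samples + 1, hop_samples):
        yield i / sample_rate, processed_audio[i:i + window_samples]

# Illustrative use: 3 s of silence against a 6 s window still produces one chunk
audio = np.zeros(3 * 16000, dtype=np.float32)
for timestamp, chunk in chunk_audio(audio, 16000, window_size=6.0, hop_size=1.0):
    print(timestamp, len(chunk))
```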
@@ -1201,7 +1200,7 @@ def create_interface():
 
             model_b = gr.Dropdown(
                 choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-                value="
+                value="WebRTC-VAD",
                 label="Model B (Bottom Panel)"
             )
 
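The new default simply pre-selects WebRTC-VAD when the interface loads. A tiny Gradio sketch of the same pattern in isolation (assuming `gradio` is installed; the surrounding Blocks layout here is illustrative, not the app's actual layout):

```python
import gradio as gr

with gr.Blocks() as demo:
    model_b = gr.Dropdown(
        choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
        value="WebRTC-VAD",              # pre-selected default shown on load
        label="Model B (Bottom Panel)"
    )
    selection = gr.Textbox(label="Selected model")
    model_b.change(lambda m: m, inputs=model_b, outputs=selection)

# demo.launch()
```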
@@ -1247,6 +1246,8 @@ def create_interface():
         gr.Markdown("""
         ---
         **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
+
+        **Note**: Large models (PANNs: 10s, E-PANNs: 6s, AST: 6.4s) work best with longer recordings. Short clips will be padded appropriately.
         """)
 
     return interface