Gabriel Bibbó committed on
Commit bce828d · 1 Parent(s): 0ea20e3

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Files changed (1): app.py +48 -47
app.py CHANGED
@@ -174,13 +174,13 @@ class OptimizedWebRTCVAD:
     def __init__(self):
         self.model_name = "WebRTC-VAD"
         self.sample_rate = 16000
-        self.frame_duration = 30
-        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
+        self.frame_duration = 30  # Only 10, 20, or 30 ms are supported
+        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)  # 480 samples for 30ms
 
         if WEBRTC_AVAILABLE:
             try:
                 self.vad = webrtcvad.Vad(3)
-                print(f"✅ {self.model_name} loaded successfully")
+                print(f"✅ {self.model_name} loaded successfully (frame size: {self.frame_size} samples)")
             except:
                 self.vad = None
         else:
@@ -204,10 +204,16 @@ class OptimizedWebRTCVAD:
         audio_clipped = np.clip(audio, -1.0, 1.0)
         audio_int16 = (audio_clipped * 32767).astype(np.int16)
 
+        # Ensure we have enough samples for at least one frame
+        if len(audio_int16) < self.frame_size:
+            # Pad to frame size
+            audio_int16 = np.pad(audio_int16, (0, self.frame_size - len(audio_int16)), 'constant')
+
         speech_frames = 0
         total_frames = 0
 
-        for i in range(0, len(audio_int16) - self.frame_size, self.frame_size):
+        # Process exact frame sizes only
+        for i in range(0, len(audio_int16) - self.frame_size + 1, self.frame_size):
            frame = audio_int16[i:i + self.frame_size].tobytes()
            if self.vad.is_speech(frame, self.sample_rate):
                speech_frames += 1
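The loop-bound change above is the crux of the WebRTC fix: py-webrtcvad only accepts frames of exactly 10, 20, or 30 ms, and the old exclusive `range()` bound silently dropped the final frame whenever the buffer length was an exact multiple of the frame size. A minimal sketch (not part of the commit; plain NumPy with a made-up buffer size) of the off-by-one:

```python
import numpy as np

sample_rate = 16000
frame_size = int(sample_rate * 30 / 1000)   # 480 samples per 30 ms frame

audio_int16 = np.zeros(3 * frame_size, dtype=np.int16)  # exactly 3 frames long

# Old bound: range() stops before len - frame_size, losing the last full frame.
old_starts = list(range(0, len(audio_int16) - frame_size, frame_size))
# New bound: the +1 makes the final full frame reachable.
new_starts = list(range(0, len(audio_int16) - frame_size + 1, frame_size))

print(old_starts)  # [0, 480]       -> only 2 of 3 frames processed
print(new_starts)  # [0, 480, 960]  -> all 3 frames processed
```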
@@ -245,15 +251,11 @@ class OptimizedEPANNs:
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
 
-        # Ensure minimum length (6 seconds) using wrap mode instead of zero padding
+        # Ensure minimum length (6 seconds) using constant padding instead of wrap
         min_samples = 6 * self.sample_rate  # 6 seconds
         if len(audio_resampled) < min_samples:
-            if LIBROSA_AVAILABLE:
-                audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding with small value instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
         energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
@@ -335,15 +337,11 @@ class OptimizedPANNs:
             audio
         )
 
-        # Ensure minimum length for PANNs (10 seconds) using wrap mode instead of zero padding
+        # Ensure minimum length for PANNs (10 seconds) using constant padding instead of wrap
         min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
         if len(audio_resampled) < min_samples:
-            if LIBROSA_AVAILABLE:
-                audio_resampled = librosa.util.fix_length(audio_resampled, size=min_samples, mode='wrap')
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
                                              input_sr=self.sample_rate)
@@ -473,12 +471,8 @@ class OptimizedAST:
         # Ensure minimum length for AST (6.4 seconds for 1024 frames)
         min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
         if len(audio_for_ast) < min_samples:
-            if LIBROSA_AVAILABLE:
-                audio_for_ast = librosa.util.fix_length(audio_for_ast, size=min_samples, mode='wrap')
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_for_ast)))
-                audio_for_ast = np.tile(audio_for_ast, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant', constant_values=0.0)
 
         # Truncate if too long (AST can handle up to ~10s, but we'll use 8s max for efficiency)
         max_samples = 8 * self.sample_rate
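All three large-model paths (E-PANNs, PANNs, AST) get the same padding change: the old `librosa.util.fix_length(..., mode='wrap')` tiled short clips, which repeats speech energy across the padded region and can saturate clip-level predictions (the "AST saturation" in the commit title); the hotfix pads with silence instead. A small sketch (not from the commit; toy values) contrasting the two NumPy pad modes involved:

```python
import numpy as np

clip = np.array([0.1, -0.2, 0.3])   # a toy "short recording"
target = 8                          # a toy minimum length

wrapped = np.pad(clip, (0, target - len(clip)), mode='wrap')
zeroed = np.pad(clip, (0, target - len(clip)), mode='constant', constant_values=0.0)

print(wrapped)  # [ 0.1 -0.2  0.3  0.1 -0.2  0.3  0.1 -0.2]  signal repeats (periodic)
print(zeroed)   # [ 0.1 -0.2  0.3  0.   0.   0.   0.   0. ]  silence after the clip
```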
@@ -571,7 +565,7 @@ class AudioProcessor:
         # Model-specific window sizes (each model gets appropriate context)
         self.model_windows = {
             "Silero-VAD": 0.032,  # 32ms exactly as required (512 samples)
-            "WebRTC-VAD": 0.03,   # 30ms frames
+            "WebRTC-VAD": 0.03,   # 30ms frames (480 samples)
             "E-PANNs": 6.0,       # 6 seconds minimum for reliable results
             "PANNs": 10.0,        # 10 seconds for optimal performance
             "AST": 6.4            # ~6.4 seconds (1024 frames * 6.25ms)
@@ -579,8 +573,8 @@ class AudioProcessor:
 
         # Model-specific hop sizes for efficiency
         self.model_hop_sizes = {
-            "Silero-VAD": 0.016,  # 16ms hop for Silero
-            "WebRTC-VAD": 0.01,   # 10ms hop for WebRTC
+            "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
+            "WebRTC-VAD": 0.03,   # 30ms hop for WebRTC (match frame duration)
             "E-PANNs": 1.0,       # Process every 1s but with 6s window
             "PANNs": 2.0,         # Process every 2s but with 10s window
             "AST": 1.0            # Process every 1s but with 6.4s window
@@ -1056,35 +1050,40 @@ class VADDemo:
         window_samples = int(self.processor.sample_rate * window_size)
         hop_samples = int(self.processor.sample_rate * hop_size)
 
-        # For large models, ensure we have enough audio
+        # Critical fix: Always process at least once, even if audio is shorter than window
         if len(processed_audio) < window_samples:
-            # If audio is too short, repeat it to reach minimum length
-            repeat_factor = int(np.ceil(window_samples / len(processed_audio)))
-            extended_audio = np.tile(processed_audio, repeat_factor)[:window_samples]
-        else:
-            extended_audio = processed_audio
-
-        for i in range(0, len(extended_audio) - window_samples, hop_samples):
-            timestamp = i / self.processor.sample_rate
-
-            # Extract window centered around current position
-            start_pos = max(0, i)
-            end_pos = min(len(extended_audio), i + window_samples)
-            chunk = extended_audio[start_pos:end_pos]
-
-            # Ensure chunk has the right length
-            if len(chunk) < window_samples:
-                chunk = np.pad(chunk, (0, window_samples - len(chunk)), 'wrap')
+            # Audio is shorter than required window - process once with available audio
+            chunk = processed_audio
+            timestamp = 0.0
 
             # Special handling for different models
             if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=extended_audio)
+                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
             else:
                 result = self.models[model_name].predict(chunk, timestamp)
 
             # Use model-specific threshold
             result.is_speech = result.probability > model_threshold
             vad_results.append(result)
+        else:
+            # Audio is long enough - process in sliding windows
+            for i in range(0, len(processed_audio) - window_samples + 1, hop_samples):
+                timestamp = i / self.processor.sample_rate
+
+                # Extract window
+                start_pos = i
+                end_pos = min(len(processed_audio), i + window_samples)
+                chunk = processed_audio[start_pos:end_pos]
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
 
         delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
         onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
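This hunk is the "PANNs execution" part of the commit title. The old path tiled short clips up to exactly `window_samples`, but `range(0, len(extended_audio) - window_samples, hop_samples)` then becomes `range(0, 0, hop)`, which is empty, so the model never ran at all on clips shorter than its window (10 s for PANNs). A sketch (not from the commit; hypothetical 5 s clip) of the failure and the fixed control flow:

```python
import numpy as np

sample_rate = 16000
window_samples = 10 * sample_rate            # PANNs needs a 10 s window
hop_samples = 2 * sample_rate                # 2 s hop
processed_audio = np.zeros(5 * sample_rate)  # hypothetical 5 s clip

# Old approach: tile to exactly one window, then slide -- the loop never runs.
extended = np.tile(processed_audio, 2)[:window_samples]
print(list(range(0, len(extended) - window_samples, hop_samples)))  # [] -> zero predictions

# Hotfix: short clips are processed once in full; long clips slide as before.
if len(processed_audio) < window_samples:
    chunks = [(processed_audio, 0.0)]
else:
    chunks = [(processed_audio[i:i + window_samples], i / sample_rate)
              for i in range(0, len(processed_audio) - window_samples + 1, hop_samples)]
print(len(chunks))  # 1 -> at least one prediction is always produced
```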
@@ -1201,7 +1200,7 @@ def create_interface():
 
         model_b = gr.Dropdown(
             choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-            value="E-PANNs",
+            value="WebRTC-VAD",
             label="Model B (Bottom Panel)"
         )
 
@@ -1247,6 +1246,8 @@ def create_interface():
     gr.Markdown("""
     ---
     **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
+
+    **Note**: Large models (PANNs: 10s, E-PANNs: 6s, AST: 6.4s) work best with longer recordings. Short clips will be padded appropriately.
     """)
 
     return interface
 