Gabriel Bibbó committed on
Commit
bcae560
1 Parent(s): 9d07682

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Files changed (1)
  1. app.py +63 -34
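Context for the PANNs part of the fix: the diff below replaces the removed self.processor.fast_resample() call with librosa resampling when available and a plain linear-interpolation fallback otherwise, converting the 16 kHz input to the 32 kHz PANNs rate. A minimal standalone sketch of that fallback is shown here; the function name linear_resample is illustrative only and does not exist in app.py.

    import numpy as np

    def linear_resample(audio: np.ndarray, orig_sr: int = 16000, target_sr: int = 32000) -> np.ndarray:
        # Map the original sample indices onto a new grid at the target rate
        # and linearly interpolate, mirroring the np.interp fallback in the diff.
        resample_factor = target_sr / orig_sr
        new_length = int(len(audio) * resample_factor)
        new_positions = np.linspace(0, len(audio) - 1, new_length)
        return np.interp(new_positions, np.arange(len(audio)), audio)

This keeps the demo dependency-free when librosa is missing, at the cost of no anti-aliasing filtering; it is a stopgap, not a replacement for librosa.resample.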
app.py CHANGED
@@ -230,8 +230,9 @@ class OptimizedEPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        # Resample to E-PANNs sample rate
+        # Convert audio to target sample rate for E-PANNs
         if LIBROSA_AVAILABLE:
+            # Resample to E-PANNs sample rate if needed
             audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
@@ -270,7 +271,6 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()
 
     def load_model(self):
@@ -303,8 +303,19 @@ class OptimizedPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        # Fast resampling to PANNs sample rate
-        audio_resampled = self.processor.fast_resample(audio, 16000, self.sample_rate)
+        # Convert audio to PANNs sample rate
+        if LIBROSA_AVAILABLE:
+            audio_resampled = librosa.resample(audio.astype(float),
+                                               orig_sr=16000,
+                                               target_sr=self.sample_rate)
+        else:
+            # Simple resampling fallback
+            resample_factor = self.sample_rate / 16000
+            audio_resampled = np.interp(
+                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                np.arange(len(audio)),
+                audio
+            )
 
         # Ensure minimum length for PANNs (need at least 1 second)
         min_samples = self.sample_rate  # 1 second
@@ -373,11 +384,17 @@ class OptimizedAST:
         start_time = time.time()
 
         if self.model is None or len(audio) == 0:
-            # Simple energy-based fallback
+            # Enhanced fallback using spectral features
            if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy * 20, 1.0)
-                is_speech = probability > 0.2
+                if LIBROSA_AVAILABLE:
+                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
+                    # Combine multiple features for better speech detection
+                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
+                else:
+                    probability = min(energy * 50, 1.0)
+                is_speech = probability > 0.3
             else:
                 probability = 0.0
                 is_speech = False
@@ -387,16 +404,33 @@ class OptimizedAST:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        # Use 1 second minimum for AST
-        if len(audio) < self.sample_rate:
-            audio = np.pad(audio, (0, self.sample_rate - len(audio)), 'constant')
+        # Use longer context for AST - take from full audio if available
+        if full_audio is not None and len(full_audio) > self.sample_rate:
+            # Take 3-second window centered around current timestamp
+            center_pos = int(timestamp * self.sample_rate)
+            window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
+
+            start_pos = max(0, center_pos - window_size)
+            end_pos = min(len(full_audio), center_pos + window_size)
+
+            # Ensure we have at least 1 second
+            if end_pos - start_pos < self.sample_rate:
+                end_pos = min(len(full_audio), start_pos + self.sample_rate)
+
+            audio_for_ast = full_audio[start_pos:end_pos]
+        else:
+            audio_for_ast = audio
+
+        # Ensure minimum length for AST
+        if len(audio_for_ast) < self.sample_rate:
+            audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
 
-        # Feature extraction
+        # Feature extraction with proper AST parameters
         inputs = self.feature_extractor(
-            audio,
+            audio_for_ast,
             sampling_rate=self.sample_rate,
             return_tensors="pt",
-            max_length=1024,
+            max_length=1024,  # Proper AST context
             truncation=True
         )
 
@@ -418,23 +452,23 @@ class OptimizedAST:
 
             if speech_indices:
                 speech_prob = probs[0, speech_indices].mean().item()
+                # Boost the probability if it's too low but there's clear audio content
+                if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                    speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
             else:
-                # Fallback to energy
-                energy = np.sum(audio ** 2)
-                speech_prob = min(energy * 10, 1.0)
+                # Fallback to energy-based detection
+                energy = np.sum(audio_for_ast ** 2)
+                speech_prob = min(energy * 20, 1.0)
 
-            # Ensure reasonable range
-            speech_prob = np.clip(speech_prob, 0.0, 1.0)
-
-            return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
+            return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)
 
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            # Simple fallback
+            # Enhanced fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy * 15, 1.0)
-                is_speech = energy > 0.01
+                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
+                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
@@ -477,7 +511,6 @@ class AudioProcessor:
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)
 
-        # Simple peak normalization
         if np.max(np.abs(audio_data)) > 0:
             audio_data = audio_data / np.max(np.abs(audio_data))
 
@@ -707,11 +740,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
        )
 
    if len(time_frames) > 0:
-        # Add threshold lines to both panels with layer='above' to show over spectrograms
+        # Add threshold lines to both panels
        fig.add_hline(
            y=threshold,
            line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
            annotation_text=f'Threshold: {threshold:.2f}',
            annotation_position="top right",
            row=1, col=1, secondary_y=True
@@ -719,7 +751,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
        fig.add_hline(
            y=threshold,
            line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
            annotation_text=f'Threshold: {threshold:.2f}',
            annotation_position="top right",
            row=2, col=1, secondary_y=True
@@ -809,7 +840,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
        height=500,
        title_text="Real-Time Speech Visualizer",
        showlegend=True,
-        uirevision="const",  # Preserve zoom/pan when updating
        legend=dict(
            x=1.02,
            y=1,
@@ -874,10 +904,6 @@ class VADDemo:
 
        print("🎤 Real-time VAD Demo initialized successfully")
        print(f"📊 Available models: {list(self.models.keys())}")
-
-# Initialize demo globally for callbacks
-print("🎤 Initializing VAD Demo...")
-demo_app = VADDemo()
 
    def process_audio_with_events(self, audio, model_a, model_b, threshold):
        if audio is None:
@@ -895,7 +921,7 @@ demo_app = VADDemo()
 
        selected_models = list(set([model_a, model_b]))
 
-        # Process each window - simplified without complex scheduling
+        # Process each window individually for all models
        for i in range(0, len(processed_audio) - window_samples, hop_samples):
            timestamp = i / self.processor.sample_rate
            chunk = processed_audio[i:i + window_samples]
@@ -971,7 +997,6 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====
 
 def create_interface():
-
     # Load logos
     logos = load_logos()
 
@@ -1080,5 +1105,9 @@ def create_interface():
 
 # Create and launch interface
 if __name__ == "__main__":
+    # Initialize demo
+    print("🎤 Initializing VAD Demo...")
+    demo_app = VADDemo()
+
     interface = create_interface()
     interface.launch(share=True, debug=False)
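For reference, the AST saturation fix above hinges on giving the model more temporal context: instead of scoring only the short analysis chunk, it slices a window of up to 3 seconds from the full recording, centered on the current timestamp, and pads to at least 1 second. A self-contained sketch of that slicing logic follows; the function name centered_window and the 16 kHz default rate are illustrative assumptions (app.py uses the AST extractor's own sample rate).

    import numpy as np

    def centered_window(full_audio: np.ndarray, timestamp: float,
                        sample_rate: int = 16000, half_window_s: float = 1.5) -> np.ndarray:
        # Center a window of 2 * half_window_s seconds on `timestamp`,
        # clamped to the bounds of the recording.
        center = int(timestamp * sample_rate)
        half = int(half_window_s * sample_rate)
        start = max(0, center - half)
        end = min(len(full_audio), center + half)
        # Guarantee at least one second of audio, zero-padding if the clip is short.
        if end - start < sample_rate:
            end = min(len(full_audio), start + sample_rate)
        window = full_audio[start:end]
        if len(window) < sample_rate:
            window = np.pad(window, (0, sample_rate - len(window)), 'constant')
        return window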