Gabriel Bibbó committed on
Commit ec04aee · 1 Parent(s): 3891a49

Simplified interface with AST optimization
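
The AST optimization in this commit trims or pads each clip to at most two seconds and caches the computed speech probability under a hash of the raw samples, so identical windows skip the model forward pass. The snippet below is a minimal, standalone sketch of that caching pattern; `CachedScorer`, `score_fn`, and the dummy energy-based scorer are illustrative stand-ins, not names from app.py.

```python
import numpy as np

class CachedScorer:
    """Sketch of the commit's optimization: trim/pad the clip, then cache the
    speech probability keyed by a hash of the raw samples."""

    def __init__(self, score_fn, sample_rate=16000, max_cache=50):
        self.score_fn = score_fn      # stand-in for the expensive AST forward pass
        self.sample_rate = sample_rate
        self.max_cache = max_cache    # same 50-entry cap as the diff uses
        self.cache = {}

    def __call__(self, audio: np.ndarray) -> float:
        # Keep at most 2 s (taken from the middle) and at least 0.5 s of audio
        max_len = self.sample_rate * 2
        min_len = self.sample_rate // 2
        if len(audio) > max_len:
            start = (len(audio) - max_len) // 2
            audio = audio[start:start + max_len]
        elif len(audio) < min_len:
            audio = np.pad(audio, (0, min_len - len(audio)), 'constant')

        key = hash(audio.tobytes())   # identical windows hit the cache
        if key in self.cache:
            return self.cache[key]

        prob = self.score_fn(audio)   # model call happens only on a cache miss
        if len(self.cache) < self.max_cache:
            self.cache[key] = prob
        return prob

# Example with a dummy energy scorer standing in for the AST model
scorer = CachedScorer(score_fn=lambda a: float(np.clip(np.sum(a ** 2) / 0.01, 0, 1)))
clip = np.zeros(16000, dtype=np.float32)
print(scorer(clip), scorer(clip))  # second call is served from the cache
```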

Files changed (1): app.py +103 -195
app.py CHANGED
@@ -333,6 +333,8 @@ class OptimizedAST:
         self.model = None
         self.feature_extractor = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        # Cache for features to avoid recomputing
+        self.feature_cache = {}
         self.load_model()

     def load_model(self):
@@ -356,12 +358,12 @@ class OptimizedAST:

         if self.model is None or len(audio) == 0:
             if len(audio) > 0:
+                # Fast fallback using energy and spectral features
+                energy = np.sum(audio ** 2)
                 if LIBROSA_AVAILABLE:
                     spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-                    energy = np.sum(audio ** 2)
                     probability = min((energy * spectral_centroid) / 10000, 1.0)
                 else:
-                    energy = np.sum(audio ** 2)
                     probability = min(energy / 0.01, 1.0)
                 is_speech = probability > 0.5
             else:
@@ -373,40 +375,63 @@ class OptimizedAST:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)

-            # Ensure minimum length for AST (typically needs longer sequences)
-            min_samples = self.sample_rate # 1 second minimum
-            if len(audio) < min_samples:
-                audio = np.pad(audio, (0, min_samples - len(audio)), 'constant')
-
-            inputs = self.feature_extractor(audio, sampling_rate=self.sample_rate, return_tensors="pt")
-            inputs = {k: v.to(self.device) for k, v in inputs.items()}
-
-            with torch.no_grad():
-                outputs = self.model(**inputs)
-                logits = outputs.logits
-                probs = torch.sigmoid(logits)
+            # OPTIMIZATION: Use smaller chunks for faster processing
+            # AST can work with shorter sequences than the full required length
+            max_length = self.sample_rate * 2 # Max 2 seconds to keep it fast
+            if len(audio) > max_length:
+                # Take the middle part of the audio for better representation
+                start_idx = (len(audio) - max_length) // 2
+                audio = audio[start_idx:start_idx + max_length]
+            elif len(audio) < self.sample_rate // 2: # If less than 0.5 seconds
+                # Pad to minimum length
+                audio = np.pad(audio, (0, self.sample_rate // 2 - len(audio)), 'constant')

-            label2id = self.model.config.label2id
-            speech_indices = []
-            for lbl, idx in label2id.items():
-                if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
-                    speech_indices.append(idx)
+            # Create a hash for caching (to avoid recomputing same features)
+            audio_hash = hash(audio.tobytes())

-            if speech_indices:
-                speech_prob = probs[0, speech_indices].mean().item()
+            if audio_hash in self.feature_cache:
+                speech_prob = self.feature_cache[audio_hash]
             else:
-                # Fallback: use average of first few probabilities
-                speech_prob = probs[0, :10].mean().item()
+                # Feature extraction with reduced parameters for speed
+                inputs = self.feature_extractor(
+                    audio,
+                    sampling_rate=self.sample_rate,
+                    return_tensors="pt",
+                    max_length=512, # Reduced from default for speed
+                    truncation=True
+                )
+                inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                with torch.no_grad():
+                    outputs = self.model(**inputs)
+                    logits = outputs.logits
+                    probs = torch.sigmoid(logits)
+
+                label2id = self.model.config.label2id
+                speech_indices = []
+                for lbl, idx in label2id.items():
+                    if any(word in lbl.lower() for word in ['speech', 'voice', 'talk', 'conversation', 'speaking', 'human']):
+                        speech_indices.append(idx)
+
+                if speech_indices:
+                    speech_prob = probs[0, speech_indices].mean().item()
+                else:
+                    # Fallback: use average of first few probabilities
+                    speech_prob = probs[0, :10].mean().item()
+
+                # Cache the result if audio is not too long (to prevent memory issues)
+                if len(self.feature_cache) < 50: # Limit cache size
+                    self.feature_cache[audio_hash] = speech_prob

             return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)

         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
+            # Fast fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                threshold = 0.01
-                probability = min(energy / threshold, 1.0)
-                is_speech = energy > threshold
+                probability = min(energy / 0.01, 1.0)
+                is_speech = energy > 0.01
             else:
                 probability = 0.0
                 is_speech = False
@@ -628,7 +653,7 @@ class AudioProcessor:
             print(f"Delay estimation error: {e}")
             return 0.0

-# ===== ENHANCED VISUALIZATION (Complete GitHub Implementation) =====
+# ===== ENHANCED VISUALIZATION =====

 def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
                          onsets_offsets: List[OnsetOffset], processor: AudioProcessor,
@@ -811,28 +836,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
                 secondary_y=True
             )

-        if hasattr(processor, 'delay_compensation') and processor.delay_compensation != 0:
-            fig.add_annotation(
-                text=f"Delay Compensation: {processor.delay_compensation*1000:.1f}ms",
-                xref="paper", yref="paper",
-                x=0.02, y=0.98,
-                showarrow=False,
-                bgcolor="yellow",
-                bordercolor="black",
-                borderwidth=1
-            )
-
-        resolution_text = f"Resolution: {processor.n_fft}-point FFT, {processor.hop_length}-sample hop"
-        fig.add_annotation(
-            text=resolution_text,
-            xref="paper", yref="paper",
-            x=0.02, y=0.02,
-            showarrow=False,
-            bgcolor="lightblue",
-            bordercolor="black",
-            borderwidth=1
-        )
-
         return fig

     except Exception as e:
@@ -900,80 +903,37 @@ class VADDemo:
         speech_detected = any(result.is_speech for result in vad_results)
         total_speech_time = sum(1 for r in vad_results if r.is_speech) * self.processor.hop_size

-        delay_info = f" | Delay: {delay_compensation*1000:.1f}ms" if delay_compensation != 0 else ""
-
         if speech_detected:
-            status_msg = f"🎙️ SPEECH DETECTED - {total_speech_time:.1f}s total{delay_info}"
+            status_msg = f"🎙️ SPEECH DETECTED - {total_speech_time:.1f}s total"
         else:
-            status_msg = f"🔇 No speech detected{delay_info}"
-
-        details_lines = [
-            f"📊 **Advanced VAD Analysis** (Threshold: {threshold:.2f})",
-            f"📏 **Audio Duration**: {len(processed_audio)/self.processor.sample_rate:.2f} seconds",
-            f"🎯 **Processing Windows**: {len(vad_results)} ({self.processor.window_size*1000:.0f}ms each)",
-            f"⏱️ **Time Resolution**: {self.processor.hop_size*1000:.0f}ms hop size (ultra-smooth)",
-            f"🔧 **Delay Compensation**: {delay_compensation*1000:.1f}ms",
-            ""
-        ]
+            status_msg = f"🔇 No speech detected"

+        # Simplified details
         model_summaries = {}
         for result in vad_results:
             name = result.model_name.split(' ')[0]
             if name not in model_summaries:
-                model_summaries[name] = {
-                    'probs': [], 'speech_chunks': 0, 'total_chunks': 0,
-                    'avg_time': 0, 'max_prob': 0, 'min_prob': 1, 'full_name': result.model_name
-                }
+                model_summaries[name] = {'probs': [], 'speech_chunks': 0, 'total_chunks': 0}
             summary = model_summaries[name]
             summary['probs'].append(result.probability)
             summary['total_chunks'] += 1
-            summary['avg_time'] += result.processing_time
-            summary['max_prob'] = max(summary['max_prob'], result.probability)
-            summary['min_prob'] = min(summary['min_prob'], result.probability)
             if result.is_speech:
                 summary['speech_chunks'] += 1

+        details_lines = [f"**Analysis Results** (Threshold: {threshold:.2f})"]
+
         for model_name, summary in model_summaries.items():
             avg_prob = np.mean(summary['probs']) if summary['probs'] else 0
-            std_prob = np.std(summary['probs']) if summary['probs'] else 0
             speech_ratio = (summary['speech_chunks'] / summary['total_chunks']) if summary['total_chunks'] > 0 else 0
-            avg_time = (summary['avg_time'] / summary['total_chunks']) * 1000 if summary['total_chunks'] > 0 else 0

             status_icon = "🟢" if speech_ratio > 0.5 else "🟡" if speech_ratio > 0.2 else "🔴"
-            details_lines.extend([
-                f"{status_icon} **{summary['full_name']}**:",
-                f" • Probability: {avg_prob:.3f} (±{std_prob:.3f}) [{summary['min_prob']:.3f}-{summary['max_prob']:.3f}]",
-                f" • Speech Detection: {speech_ratio*100:.1f}% ({summary['speech_chunks']}/{summary['total_chunks']} windows)",
-                f" • Processing Speed: {avg_time:.1f}ms/window (RTF: {avg_time/32:.3f})",
-                ""
-            ])
+            details_lines.append(f"{status_icon} **{model_name}**: {avg_prob:.3f} avg prob, {speech_ratio*100:.1f}% speech")

         if onsets_offsets:
-            details_lines.append("🎯 **Speech Events (with Delay Compensation)**:")
-            total_speech_duration = 0
-            for i, event in enumerate(onsets_offsets[:10]):
-                if event.offset_time > event.onset_time:
-                    duration = event.offset_time - event.onset_time
-                    total_speech_duration += duration
-                    details_lines.append(
-                        f" • {event.model_name}: {event.onset_time:.2f}s → {event.offset_time:.2f}s "
-                        f"({duration:.2f}s, conf: {event.confidence:.3f})"
-                    )
-                else:
-                    details_lines.append(
-                        f" • {event.model_name}: {event.onset_time:.2f}s → ongoing (conf: {event.confidence:.3f})"
-                    )
-
-            if len(onsets_offsets) > 10:
-                details_lines.append(f" • ... and {len(onsets_offsets) - 10} more events")
-
-            speech_percentage = (total_speech_duration / (len(processed_audio)/self.processor.sample_rate)) * 100
-            details_lines.extend([
-                "",
-                f"📈 **Summary**: {total_speech_duration:.2f}s speech ({speech_percentage:.1f}% of audio)"
-            ])
-        else:
-            details_lines.append("🎯 **Speech Events**: No clear onset/offset boundaries detected")
+            details_lines.append(f"\n**Speech Events**: {len(onsets_offsets)} detected")
+            for i, event in enumerate(onsets_offsets[:5]): # Show first 5 only
+                duration = event.offset_time - event.onset_time if event.offset_time > event.onset_time else 0
+                details_lines.append(f"• {event.model_name}: {event.onset_time:.2f}s - {event.offset_time:.2f}s ({duration:.2f}s)")

         details_text = "\n".join(details_lines)

@@ -991,39 +951,39 @@ demo_app = VADDemo()

 # ===== GRADIO INTERFACE =====

-print("🚀 Launching Real-time VAD Demo...")
-
 def create_interface():
-    with gr.Blocks(title="VAD Demo - Real-time Speech Detection", theme=gr.themes.Soft()) as interface:
+    with gr.Blocks(title="VAD Demo - Voice Activity Detection", theme=gr.themes.Soft()) as interface:

+        # Header with logos
         gr.Markdown("""
-        # 🎤 VAD Demo: Real-time Speech Detection Framework v3
-
-        **Multi-Model Voice Activity Detection with Advanced Onset/Offset Detection**
-
-        ✨ **Ultra-High Resolution Features**:
-        - 🟢 **Green markers**: Speech onset detection with delay compensation
-        - 🔴 **Red markers**: Speech offset detection
-        - 📊 **Ultra-HD spectrograms**: 2048-point FFT, 256-sample hop (8x temporal resolution)
-        - 💫 **Separated probability curves**: Model A (yellow) in top panel, Model B (orange) in bottom
-        - 🔧 **Auto delay correction**: Cross-correlation-based compensation
-        - 📈 **Threshold visualization**: Cyan threshold line on both panels
-        - 🎨 **Matched color palettes**: Same Viridis colorscale for both spectrograms
-
-        | Model | Type | Description |
-        |-------|------|-------------|
-        | **Silero-VAD** | Neural Network | Production-ready VAD (1.8M params) |
-        | **WebRTC-VAD** | Signal Processing | Google's real-time VAD |
-        | **E-PANNs** | Deep Learning | Efficient audio analysis |
-        | **PANNs** | Deep CNN | Large-scale pretrained audio networks |
-        | **AST** | Transformer | Audio Spectrogram Transformer |
-
-        **Instructions:** Record audio → Select models → Adjust threshold → Analyze!
+        <div style="text-align: center; margin-bottom: 20px;">
+            <h1>🎤 VAD Demo - Voice Activity Detection</h1>
+            <p><strong>Multi-Model Real-time Speech Detection Framework</strong></p>
+        </div>
         """)

+        # Logos section
         with gr.Row():
             with gr.Column():
-                gr.Markdown("### 🎛️ **Advanced Controls**")
+                gr.HTML("""
+                <div style="display: flex; justify-content: center; align-items: center; gap: 20px; margin: 20px 0; flex-wrap: wrap;">
+                    <img src="file/ai4s_banner.png" alt="AI4S" style="height: 60px; object-fit: contain;">
+                    <img src="file/surrey_logo.png" alt="University of Surrey" style="height: 60px; object-fit: contain;">
+                    <img src="file/EPSRC_logo.png" alt="EPSRC" style="height: 60px; object-fit: contain;">
+                    <img src="file/CVSSP_logo.png" alt="CVSSP" style="height: 60px; object-fit: contain;">
+                </div>
+                """)
+
+        # Main interface
+        with gr.Row():
+            with gr.Column(scale=1):
+                gr.Markdown("### 🎛️ Controls")
+
+                audio_input = gr.Audio(
+                    sources=["microphone"],
+                    type="numpy",
+                    label="Record Audio"
+                )

                 model_a = gr.Dropdown(
                     choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
@@ -1042,56 +1002,28 @@ def create_interface():
                     maximum=1.0,
                     value=0.5,
                     step=0.01,
-                    label="Detection Threshold (with hysteresis)"
+                    label="Detection Threshold"
                 )

-                process_btn = gr.Button("🎤 Advanced Analysis", variant="primary", size="lg")
+                process_btn = gr.Button("🎤 Analyze", variant="primary", size="lg")

-                gr.Markdown("""
-                ### 📖 **Enhanced Features**
-                1. 🎙️ **Record**: High-quality audio capture
-                2. 🔧 **Compare**: Different models in each panel
-                3. ⚙️ **Threshold**: Cyan line shows threshold level on both panels
-                4. 📈 **Curves**: Yellow (Model A) and orange (Model B) probability curves
-                5. 🔄 **Auto-sync**: Automatic delay compensation
-                6. 👀 **Events**: Model-specific onset/offset detection per panel!
-
-                ### 🎨 **Visualization Elements**
-                - **🟢 Green lines**: Speech onset (▲ markers) - model-specific per panel
-                - **🔴 Red lines**: Speech offset (▼ markers) - model-specific per panel
-                - **🔵 Cyan line**: Detection threshold (same on both panels)
-                - **🟡 Yellow curve**: Model A probability (top panel only)
-                - **🟠 Orange curve**: Model B probability (bottom panel only)
-                - **Ultra-HD spectrograms**: 2048-point FFT, same Viridis colorscale
-                """)
-
-            with gr.Column():
-                gr.Markdown("### 🎙️ **Audio Input**")
-
-                audio_input = gr.Audio(
-                    sources=["microphone"],
-                    type="numpy",
-                    label="Record Audio (3-15 seconds recommended)"
+            with gr.Column(scale=2):
+                status_display = gr.Textbox(
+                    label="Status",
+                    value="🔇 Ready to analyze audio",
+                    interactive=False
                 )

-        gr.Markdown("### 📊 **Real-Time Speech Visualizer Dashboard**")
+        # Results
+        gr.Markdown("### 📊 Results")

         with gr.Row():
-            plot_output = gr.Plot(label="Advanced VAD Analysis with Complete Feature Set")
+            plot_output = gr.Plot(label="Speech Detection Visualization")

-        with gr.Row():
-            with gr.Column():
-                status_display = gr.Textbox(
-                    label="🎯 Real-time Status",
-                    value="🔇 Ready for advanced speech analysis",
-                    interactive=False
-                )
-
         with gr.Row():
             details_output = gr.Textbox(
-                label="📋 Comprehensive Analysis Report",
-                lines=25,
-                max_lines=30,
+                label="Analysis Details",
+                lines=10,
                 interactive=False
             )

@@ -1102,34 +1034,10 @@ def create_interface():
             outputs=[plot_output, status_display, details_output]
         )

+        # Footer
         gr.Markdown("""
         ---
-        ### 🔬 **Research Context - WASPAA 2025**
-
-        This demo implements the complete **speech removal framework** from our WASPAA 2025 paper:
-
-        **🎯 Core Innovations:**
-        - **Advanced Onset/Offset Detection**: Sub-frame precision with delay compensation
-        - **Multi-Model Architecture**: Real-time comparison of 5 VAD approaches
-        - **High-Resolution Analysis**: 2048-point FFT with 256-sample hop (ultra-smooth)
-        - **Adaptive Thresholding**: Hysteresis-based decision boundaries
-        - **Cross-Correlation Sync**: Automatic delay compensation up to ±100ms
-
-        **🏠 Real-World Applications:**
-        - Smart home privacy: Remove conversations, keep environmental sounds
-        - GDPR audio compliance: Privacy-aware dataset processing
-        - Call center automation: Real-time speech/silence detection
-        - Voice assistant optimization: Precise wake-word boundaries
-
-        **📊 Performance Metrics:**
-        - **Precision**: 94.2% on CHiME-Home dataset
-        - **Recall**: 91.8% with optimized thresholds
-        - **Latency**: <50ms processing time (Real-Time Factor: 0.05)
-        - **Resolution**: 16ms time resolution, 128 mel bins (ultra-high definition)
-
-        **Citation:** *Speech Removal Framework for Privacy-Preserving Audio Recordings*, WASPAA 2025
-
-        **⚡ CPU Optimized** | **🆓 Hugging Face Spaces** | **🎯 Production Ready**
+        **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
         """)

     return interface