Gabriel Bibbó committed
Commit · bce828d · 1 Parent(s): 0ea20e3

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

app.py CHANGED
@@ -174,13 +174,13 @@ class OptimizedWebRTCVAD:
     def __init__(self):
         self.model_name = "WebRTC-VAD"
         self.sample_rate = 16000
-        self.frame_duration = 30
-        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)
+        self.frame_duration = 30  # Only 10, 20, or 30 ms are supported
+        self.frame_size = int(self.sample_rate * self.frame_duration / 1000)  # 480 samples for 30ms
 
         if WEBRTC_AVAILABLE:
             try:
                 self.vad = webrtcvad.Vad(3)
-                print(f"✅ {self.model_name} loaded successfully")
+                print(f"✅ {self.model_name} loaded successfully (frame size: {self.frame_size} samples)")
             except:
                 self.vad = None
         else:
@@ -204,10 +204,16 @@ class OptimizedWebRTCVAD:
         audio_clipped = np.clip(audio, -1.0, 1.0)
         audio_int16 = (audio_clipped * 32767).astype(np.int16)
 
+        # Ensure we have enough samples for at least one frame
+        if len(audio_int16) < self.frame_size:
+            # Pad to frame size
+            audio_int16 = np.pad(audio_int16, (0, self.frame_size - len(audio_int16)), 'constant')
+
         speech_frames = 0
         total_frames = 0
 
-
+        # Process exact frame sizes only
+        for i in range(0, len(audio_int16) - self.frame_size + 1, self.frame_size):
             frame = audio_int16[i:i + self.frame_size].tobytes()
             if self.vad.is_speech(frame, self.sample_rate):
                 speech_frames += 1
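The loop above only ever hands webrtcvad complete 30 ms frames, padding shorter input up to one frame first. A minimal standalone sketch of that pattern (assuming the `webrtcvad` package is installed; the constants and the synthetic noise input are illustrative, not taken from app.py):

```python
import numpy as np
import webrtcvad

SAMPLE_RATE = 16000
FRAME_MS = 30                                  # webrtcvad accepts only 10, 20 or 30 ms
FRAME_SIZE = SAMPLE_RATE * FRAME_MS // 1000    # 480 samples per frame

vad = webrtcvad.Vad(3)                         # aggressiveness 0-3

# Illustrative input: 50 ms of low-level noise as int16 PCM
audio = (np.random.randn(800) * 1000).astype(np.int16)

# Pad so at least one complete frame exists, then step in whole frames only
if len(audio) < FRAME_SIZE:
    audio = np.pad(audio, (0, FRAME_SIZE - len(audio)), 'constant')

speech_frames = 0
total_frames = 0
for i in range(0, len(audio) - FRAME_SIZE + 1, FRAME_SIZE):
    frame = audio[i:i + FRAME_SIZE].tobytes()  # exactly 960 bytes per 30 ms frame
    if vad.is_speech(frame, SAMPLE_RATE):
        speech_frames += 1
    total_frames += 1

print(f"speech ratio: {speech_frames / max(total_frames, 1):.2f}")
```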
@@ -245,15 +251,11 @@ class OptimizedEPANNs:
                                              orig_sr=16000,
                                              target_sr=self.sample_rate)
 
-        # Ensure minimum length (6 seconds) using
+        # Ensure minimum length (6 seconds) using constant padding instead of wrap
         min_samples = 6 * self.sample_rate  # 6 seconds
         if len(audio_resampled) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding with small value instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         mel_spec = librosa.feature.melspectrogram(y=audio_resampled, sr=self.sample_rate, n_mels=64)
         energy = np.mean(librosa.power_to_db(mel_spec, ref=np.max))
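The same wrap-to-constant substitution appears again in the PANNs and AST hunks below. A small sketch of what the two `np.pad` modes do to a short clip (the 32 kHz rate and the 1 s test tone are assumptions for illustration only):

```python
import numpy as np

sr = 32000                        # illustrative sample rate
min_samples = 6 * sr              # 6-second minimum, as in the E-PANNs path

t = np.linspace(0, 1.0, sr, endpoint=False)
clip = 0.1 * np.sin(2 * np.pi * 440 * t)     # 1 s test tone, well below the minimum

pad = (0, min_samples - len(clip))
wrapped = np.pad(clip, pad, 'wrap')                            # old: the tone repeats for all 6 s
padded = np.pad(clip, pad, 'constant', constant_values=0.0)    # new: 5 s of silence appended

print(np.abs(wrapped[sr:]).mean())   # non-zero: wrapping fabricates periodic energy
print(np.abs(padded[sr:]).mean())    # ~0.0: constant padding adds no energy
```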
@@ -335,15 +337,11 @@ class OptimizedPANNs:
             audio
         )
 
-        # Ensure minimum length for PANNs (10 seconds) using
+        # Ensure minimum length for PANNs (10 seconds) using constant padding instead of wrap
         min_samples = 10 * self.sample_rate  # 10 seconds for optimal performance
         if len(audio_resampled) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_resampled)))
-                audio_resampled = np.tile(audio_resampled, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_resampled = np.pad(audio_resampled, (0, min_samples - len(audio_resampled)), 'constant', constant_values=0.0)
 
         clip_probs, _ = self.model.inference(audio_resampled[np.newaxis, :],
                                              input_sr=self.sample_rate)
@@ -473,12 +471,8 @@ class OptimizedAST:
         # Ensure minimum length for AST (6.4 seconds for 1024 frames)
         min_samples = int(6.4 * self.sample_rate)  # 6.4 seconds
         if len(audio_for_ast) < min_samples:
-
-
-            else:
-                # Fallback: repeat the signal
-                repeat_factor = int(np.ceil(min_samples / len(audio_for_ast)))
-                audio_for_ast = np.tile(audio_for_ast, repeat_factor)[:min_samples]
+            # Use constant padding instead of wrap to avoid artificial periodicity
+            audio_for_ast = np.pad(audio_for_ast, (0, min_samples - len(audio_for_ast)), 'constant', constant_values=0.0)
 
         # Truncate if too long (AST can handle up to ~10s, but we'll use 8s max for efficiency)
         max_samples = 8 * self.sample_rate
@@ -571,7 +565,7 @@ class AudioProcessor:
         # Model-specific window sizes (each model gets appropriate context)
         self.model_windows = {
             "Silero-VAD": 0.032,  # 32ms exactly as required (512 samples)
-            "WebRTC-VAD": 0.03,  # 30ms frames
+            "WebRTC-VAD": 0.03,  # 30ms frames (480 samples)
             "E-PANNs": 6.0,  # 6 seconds minimum for reliable results
             "PANNs": 10.0,  # 10 seconds for optimal performance
             "AST": 6.4  # ~6.4 seconds (1024 frames * 6.25ms)
@@ -579,8 +573,8 @@ class AudioProcessor:
 
         # Model-specific hop sizes for efficiency
         self.model_hop_sizes = {
-            "Silero-VAD": 0.016,  # 16ms hop for Silero
-            "WebRTC-VAD": 0.
+            "Silero-VAD": 0.016,  # 16ms hop for Silero (512 samples window)
+            "WebRTC-VAD": 0.03,  # 30ms hop for WebRTC (match frame duration)
             "E-PANNs": 1.0,  # Process every 1s but with 6s window
             "PANNs": 2.0,  # Process every 2s but with 10s window
             "AST": 1.0  # Process every 1s but with 6.4s window
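Together, these window/hop pairs decide how many analysis windows a recording yields per model; a quick sketch of that arithmetic (the 16 kHz rate and the 4 s clip length are assumptions, the dictionary values are copied from the two hunks above):

```python
sample_rate = 16000
model_windows = {"Silero-VAD": 0.032, "WebRTC-VAD": 0.03,
                 "E-PANNs": 6.0, "PANNs": 10.0, "AST": 6.4}
model_hop_sizes = {"Silero-VAD": 0.016, "WebRTC-VAD": 0.03,
                   "E-PANNs": 1.0, "PANNs": 2.0, "AST": 1.0}

clip_seconds = 4.0                          # example recording length
n_samples = int(sample_rate * clip_seconds)

for name, window in model_windows.items():
    window_samples = int(sample_rate * window)
    hop_samples = int(sample_rate * model_hop_sizes[name])
    if n_samples < window_samples:
        n_windows = 0                       # why short clips need the single-pass fallback below
    else:
        n_windows = (n_samples - window_samples) // hop_samples + 1
    print(f"{name:10s}: {n_windows} windows of {window_samples} samples")
```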
@@ -1056,35 +1050,40 @@ class VADDemo:
         window_samples = int(self.processor.sample_rate * window_size)
         hop_samples = int(self.processor.sample_rate * hop_size)
 
-        #
+        # Critical fix: Always process at least once, even if audio is shorter than window
         if len(processed_audio) < window_samples:
-            #
-
-
-        else:
-            extended_audio = processed_audio
-
-        for i in range(0, len(extended_audio) - window_samples, hop_samples):
-            timestamp = i / self.processor.sample_rate
-
-            # Extract window centered around current position
-            start_pos = max(0, i)
-            end_pos = min(len(extended_audio), i + window_samples)
-            chunk = extended_audio[start_pos:end_pos]
-
-            # Ensure chunk has the right length
-            if len(chunk) < window_samples:
-                chunk = np.pad(chunk, (0, window_samples - len(chunk)), 'wrap')
+            # Audio is shorter than required window - process once with available audio
+            chunk = processed_audio
+            timestamp = 0.0
 
             # Special handling for different models
             if model_name == 'AST':
-                result = self.models[model_name].predict(chunk, timestamp, full_audio=
+                result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
             else:
                 result = self.models[model_name].predict(chunk, timestamp)
 
             # Use model-specific threshold
             result.is_speech = result.probability > model_threshold
             vad_results.append(result)
+        else:
+            # Audio is long enough - process in sliding windows
+            for i in range(0, len(processed_audio) - window_samples + 1, hop_samples):
+                timestamp = i / self.processor.sample_rate
+
+                # Extract window
+                start_pos = i
+                end_pos = min(len(processed_audio), i + window_samples)
+                chunk = processed_audio[start_pos:end_pos]
+
+                # Special handling for different models
+                if model_name == 'AST':
+                    result = self.models[model_name].predict(chunk, timestamp, full_audio=processed_audio)
+                else:
+                    result = self.models[model_name].predict(chunk, timestamp)
+
+                # Use model-specific threshold
+                result.is_speech = result.probability > model_threshold
+                vad_results.append(result)
 
         delay_compensation = self.processor.estimate_delay_compensation(processed_audio, vad_results)
         onsets_offsets = self.processor.detect_onset_offset_advanced(vad_results, threshold)
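Stripped of the model calls, the control flow introduced above is a short-clip fallback plus a sliding-window loop. A minimal sketch of just that chunking logic (the `chunk_audio` helper and the silent test signal are illustrative, not part of app.py):

```python
import numpy as np

def chunk_audio(processed_audio, sample_rate, window_size, hop_size):
    """Yield (timestamp, chunk) pairs the way the fixed loop does."""
    window_samples = int(sample_rate * window_size)
    hop_samples = int(sample_rate * hop_size)

    if len(processed_audio) < window_samples:
        # Shorter than one window: analyse the whole clip once at t = 0
        yield 0.0, processed_audio
        return

    # Long enough: slide a full window across the clip
    for i in range(0, len(processed_audio) - window_samples + 1, hop_samples):
        yield i / sample_rate, processed_audio[i:i + window_samples]

# Illustrative use: 3 s of silence against a 6 s window still produces one chunk
audio = np.zeros(3 * 16000, dtype=np.float32)
for timestamp, chunk in chunk_audio(audio, 16000, window_size=6.0, hop_size=1.0):
    print(timestamp, len(chunk))
```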
@@ -1201,7 +1200,7 @@ def create_interface():
 
             model_b = gr.Dropdown(
                 choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
-                value="
+                value="WebRTC-VAD",
                 label="Model B (Bottom Panel)"
             )
 
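The new default simply pre-selects WebRTC-VAD when the interface loads. A tiny Gradio sketch of the same pattern in isolation (assuming `gradio` is installed; the surrounding Blocks layout here is illustrative, not the app's actual layout):

```python
import gradio as gr

with gr.Blocks() as demo:
    model_b = gr.Dropdown(
        choices=["Silero-VAD", "WebRTC-VAD", "E-PANNs", "PANNs", "AST"],
        value="WebRTC-VAD",              # pre-selected default shown on load
        label="Model B (Bottom Panel)"
    )
    selection = gr.Textbox(label="Selected model")
    model_b.change(lambda m: m, inputs=model_b, outputs=selection)

# demo.launch()
```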
@@ -1247,6 +1246,8 @@ def create_interface():
         gr.Markdown("""
         ---
         **Models**: Silero-VAD, WebRTC-VAD, E-PANNs, PANNs, AST | **Research**: WASPAA 2025 | **Institution**: University of Surrey, CVSSP
+
+        **Note**: Large models (PANNs: 10s, E-PANNs: 6s, AST: 6.4s) work best with longer recordings. Short clips will be padded appropriately.
         """)
 
     return interface