Gabriel Bibbó
committed
Commit · bcae560 · 1 Parent(s): 9d07682

Hotfix: Restore basic functionality - fix AST saturation and PANNs execution

Browse files
app.py
CHANGED
@@ -230,8 +230,9 @@ class OptimizedEPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        #
+        # Convert audio to target sample rate for E-PANNs
         if LIBROSA_AVAILABLE:
+            # Resample to E-PANNs sample rate if needed
             audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
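The E-PANNs path relies on librosa.resample to move 16 kHz microphone input to the model's native rate. A minimal standalone sketch of that call, for reference (the noise input and the 32 kHz target are illustrative, not taken from app.py):

```python
import numpy as np
import librosa

# 0.5 s of noise captured at 16 kHz, resampled to an assumed 32 kHz model rate
x = np.random.randn(8000).astype(float)
y = librosa.resample(x, orig_sr=16000, target_sr=32000)
print(y.shape)  # (16000,) -- duration preserved, sample count doubled
```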
@@ -270,7 +271,6 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()

     def load_model(self):
@@ -303,8 +303,19 @@ class OptimizedPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        #
-
+        # Convert audio to PANNs sample rate
+        if LIBROSA_AVAILABLE:
+            audio_resampled = librosa.resample(audio.astype(float),
+                                               orig_sr=16000,
+                                               target_sr=self.sample_rate)
+        else:
+            # Simple resampling fallback
+            resample_factor = self.sample_rate / 16000
+            audio_resampled = np.interp(
+                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
+                np.arange(len(audio)),
+                audio
+            )

         # Ensure minimum length for PANNs (need at least 1 second)
         min_samples = self.sample_rate  # 1 second
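When librosa is missing, the PANNs path now falls back to plain linear interpolation. A hedged sketch of the same idea as a free function (the name linear_resample is illustrative); unlike librosa.resample it applies no anti-aliasing filter, which is tolerable when upsampling 16 kHz audio but would alias when downsampling:

```python
import numpy as np

def linear_resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample a 1-D signal by linear interpolation (crude, filter-free fallback)."""
    factor = target_sr / orig_sr
    new_idx = np.linspace(0, len(audio) - 1, int(len(audio) * factor))
    return np.interp(new_idx, np.arange(len(audio)), audio)

# 1 s of a 440 Hz tone at 16 kHz, stretched to 32 kHz: 16000 -> 32000 samples
tone = np.sin(2 * np.pi * 440 * np.arange(16000) / 16000)
assert len(linear_resample(tone, 16000, 32000)) == 32000
```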
@@ -373,11 +384,17 @@ class OptimizedAST:
         start_time = time.time()

         if self.model is None or len(audio) == 0:
-            #
+            # Enhanced fallback using spectral features
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-
-
+                if LIBROSA_AVAILABLE:
+                    spectral_features = librosa.feature.spectral_rolloff(y=audio, sr=self.sample_rate)
+                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
+                    # Combine multiple features for better speech detection
+                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
+                else:
+                    probability = min(energy * 50, 1.0)
+                is_speech = probability > 0.3
             else:
                 probability = 0.0
                 is_speech = False
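The enhanced fallback mixes frame energy with the mean spectral centroid when the AST model cannot run. A self-contained sketch of that scoring rule, with the constants copied from the diff (the function name and the 16 kHz rate are assumptions):

```python
import numpy as np
import librosa

def fallback_speech_score(audio: np.ndarray, sr: int = 16000) -> float:
    """Crude speech score from signal energy plus mean spectral centroid, capped at 1.0."""
    energy = np.sum(audio ** 2)
    centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=sr))
    return float(min((energy * 100 + centroid / 500) / 2, 1.0))

# A decision can then be thresholded as in the diff: is_speech = score > 0.3
```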
@@ -387,16 +404,33 @@ class OptimizedAST:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)

-        # Use
-        if len(
-
+        # Use longer context for AST - take from full audio if available
+        if full_audio is not None and len(full_audio) > self.sample_rate:
+            # Take 3-second window centered around current timestamp
+            center_pos = int(timestamp * self.sample_rate)
+            window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
+
+            start_pos = max(0, center_pos - window_size)
+            end_pos = min(len(full_audio), center_pos + window_size)
+
+            # Ensure we have at least 1 second
+            if end_pos - start_pos < self.sample_rate:
+                end_pos = min(len(full_audio), start_pos + self.sample_rate)
+
+            audio_for_ast = full_audio[start_pos:end_pos]
+        else:
+            audio_for_ast = audio
+
+        # Ensure minimum length for AST
+        if len(audio_for_ast) < self.sample_rate:
+            audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')

-        # Feature extraction
+        # Feature extraction with proper AST parameters
         inputs = self.feature_extractor(
-
+            audio_for_ast,
             sampling_rate=self.sample_rate,
             return_tensors="pt",
-            max_length=1024,
+            max_length=1024,  # Proper AST context
             truncation=True
         )

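This hunk addresses the AST saturation by gathering roughly three seconds of context centred on the current timestamp and padding anything shorter than one second. A standalone sketch of that index arithmetic (the helper name is illustrative; the 16 kHz rate is an assumption):

```python
import numpy as np

def context_window(full_audio: np.ndarray, timestamp: float, sr: int = 16000) -> np.ndarray:
    """Return up to 3 s of audio centred on `timestamp`, zero-padded to at least 1 s."""
    center = int(timestamp * sr)
    half = int(1.5 * sr)                                  # 1.5 s on each side
    start = max(0, center - half)
    end = min(len(full_audio), center + half)
    if end - start < sr:                                  # enforce the 1 s minimum
        end = min(len(full_audio), start + sr)
    window = full_audio[start:end]
    if len(window) < sr:
        window = np.pad(window, (0, sr - len(window)), 'constant')
    return window

# e.g. at t = 2.0 s inside 5 s of audio the window spans samples 8000..56000 (3 s)
audio = np.zeros(5 * 16000)
assert len(context_window(audio, 2.0)) == 3 * 16000
```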
@@ -418,23 +452,23 @@ class OptimizedAST:

         if speech_indices:
             speech_prob = probs[0, speech_indices].mean().item()
+            # Boost the probability if it's too low but there's clear audio content
+            if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
+                speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
         else:
-            # Fallback to energy
-            energy = np.sum(
-            speech_prob = min(energy *
+            # Fallback to energy-based detection
+            energy = np.sum(audio_for_ast ** 2)
+            speech_prob = min(energy * 20, 1.0)

-
-            speech_prob = np.clip(speech_prob, 0.0, 1.0)
-
-            return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
+        return VADResult(float(speech_prob), speech_prob > 0.4, self.model_name, time.time()-start_time, timestamp)

         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            #
+            # Enhanced fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy *
-                is_speech = energy > 0.
+                probability = min(energy * 30, 1.0)  # More aggressive energy scaling
+                is_speech = energy > 0.002
             else:
                 probability = 0.0
                 is_speech = False
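The post-processing also rescues near-zero AST scores when the window clearly contains energy, and the decision threshold drops from 0.5 to 0.4. A small sketch of that adjustment using the diff's constants (the function name is illustrative):

```python
import numpy as np

def adjust_speech_prob(speech_prob: float, chunk: np.ndarray) -> float:
    """Boost very low model scores when the analysed chunk has clear signal energy."""
    if speech_prob < 0.1 and np.sum(chunk ** 2) > 0.001:
        speech_prob = min(speech_prob * 5, 0.8)  # boost, but never report certainty
    return speech_prob

print(adjust_speech_prob(0.05, np.full(16000, 0.1)))  # 0.25
```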
@@ -477,7 +511,6 @@ class AudioProcessor:
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)

-        # Simple peak normalization
         if np.max(np.abs(audio_data)) > 0:
             audio_data = audio_data / np.max(np.abs(audio_data))

@@ -707,11 +740,10 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )

     if len(time_frames) > 0:
-        # Add threshold lines to both panels
+        # Add threshold lines to both panels
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=1, col=1, secondary_y=True
@@ -719,7 +751,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
-            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=2, col=1, secondary_y=True
@@ -809,7 +840,6 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         height=500,
         title_text="Real-Time Speech Visualizer",
         showlegend=True,
-        uirevision="const",  # Preserve zoom/pan when updating
         legend=dict(
             x=1.02,
             y=1,
@@ -874,10 +904,6 @@ class VADDemo:

         print("🎤 Real-time VAD Demo initialized successfully")
         print(f"📊 Available models: {list(self.models.keys())}")
-
-# Initialize demo globally for callbacks
-print("🎤 Initializing VAD Demo...")
-demo_app = VADDemo()

     def process_audio_with_events(self, audio, model_a, model_b, threshold):
         if audio is None:
@@ -895,7 +921,7 @@ demo_app = VADDemo()

         selected_models = list(set([model_a, model_b]))

-        # Process each window
+        # Process each window individually for all models
         for i in range(0, len(processed_audio) - window_samples, hop_samples):
             timestamp = i / self.processor.sample_rate
             chunk = processed_audio[i:i + window_samples]
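The loop above slides a fixed analysis window with a constant hop over the processed audio; the actual window and hop sizes are defined earlier in app.py and are not shown in this hunk, so the values below are purely illustrative. A sketch of the index arithmetic the loop relies on:

```python
sample_rate = 16000
window_samples = int(0.5 * sample_rate)   # assumed 0.5 s analysis window
hop_samples = int(0.25 * sample_rate)     # assumed 0.25 s hop (50% overlap)
n_samples = 5 * sample_rate               # 5 s of audio

starts = range(0, n_samples - window_samples, hop_samples)
timestamps = [i / sample_rate for i in starts]
print(len(timestamps), timestamps[0], timestamps[-1])  # 18 windows, t = 0.0 .. 4.25 s
```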
@@ -971,7 +997,6 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====

 def create_interface():
-
     # Load logos
     logos = load_logos()

@@ -1080,5 +1105,9 @@ def create_interface():

 # Create and launch interface
 if __name__ == "__main__":
+    # Initialize demo
+    print("🎤 Initializing VAD Demo...")
+    demo_app = VADDemo()
+
     interface = create_interface()
     interface.launch(share=True, debug=False)