Gabriel Bibbó committed
Commit · 9d07682 · 1 Parent(s): e60e716
Hotfix: Restore basic functionality - fix AST saturation and PANNs execution
app.py CHANGED
@@ -230,9 +230,8 @@ class OptimizedEPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        #
+        # Resample to E-PANNs sample rate
         if LIBROSA_AVAILABLE:
-            # Resample to E-PANNs sample rate if needed
             audio_resampled = librosa.resample(audio.astype(float),
                                                orig_sr=16000,
                                                target_sr=self.sample_rate)
@@ -271,6 +270,7 @@ class OptimizedPANNs:
         self.sample_rate = 32000
         self.model = None
         self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.processor = AudioProcessor()  # For fast resampling
         self.load_model()
 
     def load_model(self):
@@ -303,19 +303,8 @@ class OptimizedPANNs:
         if len(audio.shape) > 1:
             audio = audio.mean(axis=1)
 
-        #
-        if LIBROSA_AVAILABLE:
-            audio_resampled = librosa.resample(audio.astype(float),
-                                               orig_sr=16000,
-                                               target_sr=self.sample_rate)
-        else:
-            # Simple resampling fallback
-            resample_factor = self.sample_rate / 16000
-            audio_resampled = np.interp(
-                np.linspace(0, len(audio) - 1, int(len(audio) * resample_factor)),
-                np.arange(len(audio)),
-                audio
-            )
+        # Fast resampling to PANNs sample rate
+        audio_resampled = self.processor.fast_resample(audio, 16000, self.sample_rate)
 
         # Ensure minimum length for PANNs (need at least 1 second)
         min_samples = self.sample_rate  # 1 second
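The replacement line above calls self.processor.fast_resample(audio, 16000, self.sample_rate), but AudioProcessor.fast_resample itself is not shown in this diff. A minimal sketch of what such a helper could look like, assuming plain linear interpolation in the spirit of the removed np.interp fallback; the name and signature are taken from the call site, not from the repository:

    import numpy as np

    def fast_resample(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
        """Hypothetical AudioProcessor.fast_resample: linear-interpolation resampling."""
        if len(audio) == 0 or orig_sr == target_sr:
            return audio.astype(float)
        n_out = int(round(len(audio) * target_sr / orig_sr))
        # Map each output sample position back onto the input index axis and interpolate
        positions = np.linspace(0, len(audio) - 1, n_out)
        return np.interp(positions, np.arange(len(audio)), audio.astype(float))

On the PANNs path this would be called as fast_resample(audio, 16000, 32000), upsampling 16 kHz input to the model's 32 kHz rate set in the constructor above.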
@@ -384,17 +373,11 @@ class OptimizedAST:
         start_time = time.time()
 
         if self.model is None or len(audio) == 0:
-            #
+            # Simple energy-based fallback
             if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-
-                if LIBROSA_AVAILABLE:
-                    spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=audio, sr=self.sample_rate))
-                    # Combine multiple features for better speech detection
-                    probability = min((energy * 100 + spectral_centroid / 500) / 2, 1.0)
-                else:
-                    probability = min(energy * 50, 1.0)
-                is_speech = probability > 0.3
+                probability = min(energy * 20, 1.0)
+                is_speech = probability > 0.2
             else:
                 probability = 0.0
                 is_speech = False
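Pulled out on its own, the energy fallback added above reduces to the following helper (an illustrative sketch; the function name is hypothetical, while the scale of 20 and the 0.2 threshold come straight from the hunk):

    import numpy as np

    def energy_fallback(chunk: np.ndarray, scale: float = 20.0, threshold: float = 0.2):
        """Crude VAD used when the AST model is unavailable: window energy mapped to [0, 1]."""
        energy = float(np.sum(chunk ** 2))
        probability = min(energy * scale, 1.0)
        return probability, probability > threshold

Because 20 is much smaller than the removed energy * 100 blend, the fallback probability no longer pins at 1.0 as readily, which appears to be the saturation the commit title refers to.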
@@ -404,33 +387,16 @@ class OptimizedAST:
             if len(audio.shape) > 1:
                 audio = audio.mean(axis=1)
 
-            # Use
-            if
-
-                center_pos = int(timestamp * self.sample_rate)
-                window_size = int(1.5 * self.sample_rate)  # 1.5 seconds each side
-
-                start_pos = max(0, center_pos - window_size)
-                end_pos = min(len(full_audio), center_pos + window_size)
-
-                # Ensure we have at least 1 second
-                if end_pos - start_pos < self.sample_rate:
-                    end_pos = min(len(full_audio), start_pos + self.sample_rate)
-
-                audio_for_ast = full_audio[start_pos:end_pos]
-            else:
-                audio_for_ast = audio
-
-            # Ensure minimum length for AST
-            if len(audio_for_ast) < self.sample_rate:
-                audio_for_ast = np.pad(audio_for_ast, (0, self.sample_rate - len(audio_for_ast)), 'constant')
+            # Use 1 second minimum for AST
+            if len(audio) < self.sample_rate:
+                audio = np.pad(audio, (0, self.sample_rate - len(audio)), 'constant')
 
-            # Feature extraction
+            # Feature extraction
             inputs = self.feature_extractor(
-                audio_for_ast,
+                audio,
                 sampling_rate=self.sample_rate,
                 return_tensors="pt",
-                max_length=1024,
+                max_length=1024,
                 truncation=True
             )
 
@@ -452,23 +418,23 @@ class OptimizedAST:
 
                 if speech_indices:
                     speech_prob = probs[0, speech_indices].mean().item()
-                    # Boost the probability if it's too low but there's clear audio content
-                    if speech_prob < 0.1 and np.sum(audio_for_ast ** 2) > 0.001:
-                        speech_prob = min(speech_prob * 5, 0.8)  # Boost but cap at 0.8
                 else:
-                    # Fallback to energy
-                    energy = np.sum(
-                    speech_prob = min(energy *
+                    # Fallback to energy
+                    energy = np.sum(audio ** 2)
+                    speech_prob = min(energy * 10, 1.0)
 
-
+                # Ensure reasonable range
+                speech_prob = np.clip(speech_prob, 0.0, 1.0)
+
+                return VADResult(float(speech_prob), speech_prob > 0.5, self.model_name, time.time()-start_time, timestamp)
 
         except Exception as e:
             print(f"Error in {self.model_name}: {e}")
-            #
+            # Simple fallback
            if len(audio) > 0:
                 energy = np.sum(audio ** 2)
-                probability = min(energy *
-                is_speech = energy > 0.
+                probability = min(energy * 15, 1.0)
+                is_speech = energy > 0.01
             else:
                 probability = 0.0
                 is_speech = False
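The added return statement constructs VADResult(probability, is_speech, model_name, processing_time, timestamp); the class itself lies outside this diff. A plausible shape, inferred from that call and from the List[VADResult] annotation on create_realtime_plot below (field names are assumptions):

    from dataclasses import dataclass

    @dataclass
    class VADResult:
        probability: float       # speech probability in [0, 1]
        is_speech: bool          # probability compared against a threshold (0.5 in the return above)
        model_name: str          # e.g. "AST", "PANNs", "E-PANNs"
        processing_time: float   # seconds spent on this analysis window
        timestamp: float         # position of the window within the input audio, in seconds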
@@ -511,6 +477,7 @@ class AudioProcessor:
         if len(audio_data.shape) > 1:
             audio_data = audio_data.mean(axis=1)
 
+        # Simple peak normalization
         if np.max(np.abs(audio_data)) > 0:
             audio_data = audio_data / np.max(np.abs(audio_data))
 
@@ -740,10 +707,11 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
     )
 
     if len(time_frames) > 0:
-        # Add threshold lines to both panels
+        # Add threshold lines to both panels with layer='above' to show over spectrograms
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
+            layer='above',
            annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=1, col=1, secondary_y=True
@@ -751,6 +719,7 @@
         fig.add_hline(
             y=threshold,
             line=dict(color='cyan', width=2, dash='dash'),
+            layer='above',
             annotation_text=f'Threshold: {threshold:.2f}',
             annotation_position="top right",
             row=2, col=1, secondary_y=True
@@ -840,6 +809,7 @@ def create_realtime_plot(audio_data: np.ndarray, vad_results: List[VADResult],
         height=500,
         title_text="Real-Time Speech Visualizer",
         showlegend=True,
+        uirevision="const",  # Preserve zoom/pan when updating
         legend=dict(
             x=1.02,
             y=1,
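Both Plotly additions in the hunks above are standard figure options rather than project code: layer='above' is forwarded by add_hline to the underlying shape so the dashed threshold line is drawn on top of heatmap traces, and uirevision preserves user zoom/pan across figure updates as long as its value stays the same. A small self-contained illustration (the heatmap stands in for the demo's spectrogram panel):

    import numpy as np
    import plotly.graph_objects as go

    fig = go.Figure(go.Heatmap(z=np.random.rand(20, 50)))   # placeholder spectrogram
    fig.add_hline(
        y=10,
        line=dict(color='cyan', width=2, dash='dash'),
        layer='above',                      # draw over the heatmap instead of underneath it
        annotation_text='Threshold: 0.50',
        annotation_position='top right',
    )
    fig.update_layout(uirevision='const')   # zoom/pan state survives later figure updates
    fig.show()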
@@ -904,6 +874,10 @@ class VADDemo:
 
         print("🎤 Real-time VAD Demo initialized successfully")
         print(f"📊 Available models: {list(self.models.keys())}")
+
+    # Initialize demo globally for callbacks
+    print("🎤 Initializing VAD Demo...")
+    demo_app = VADDemo()
 
     def process_audio_with_events(self, audio, model_a, model_b, threshold):
         if audio is None:
@@ -921,7 +895,7 @@ class VADDemo:
 
         selected_models = list(set([model_a, model_b]))
 
-        # Process each window
+        # Process each window - simplified without complex scheduling
         for i in range(0, len(processed_audio) - window_samples, hop_samples):
             timestamp = i / self.processor.sample_rate
             chunk = processed_audio[i:i + window_samples]
@@ -997,6 +971,7 @@ demo_app = VADDemo()
 # ===== GRADIO INTERFACE =====
 
 def create_interface():
+
     # Load logos
     logos = load_logos()
 
@@ -1105,9 +1080,5 @@ def create_interface():
 
 # Create and launch interface
 if __name__ == "__main__":
-    # Initialize demo
-    print("🎤 Initializing VAD Demo...")
-    demo_app = VADDemo()
-
     interface = create_interface()
     interface.launch(share=True, debug=False)