Gabriel Bibbó committed on
Commit
b3b9f78
1 Parent(s): 29ea60e

adjust app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -13
app.py CHANGED
@@ -766,6 +766,9 @@ class AudioProcessor:
766
  mode='psd'
767
  )
768
 
 
 
 
769
  mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
770
 
771
  mel_freqs = np.logspace(
@@ -826,8 +829,8 @@ class AudioProcessor:
826
  # Add virtual start point if first timestamp > 0
827
  if rs[0].timestamp > 0:
828
  virtual_start = VADResult(
829
- probability=0.0,
830
- is_speech=False,
831
  model_name=base,
832
  processing_time=0,
833
  timestamp=0.0
@@ -1271,25 +1274,28 @@ class VADDemo:
1271
  # CRITICAL FIX: Always extract chunks, both for short and long audio
1272
  window_count = 0
1273
  audio_duration = len(processed_audio) / self.processor.sample_rate
1274
- audio_len = len(processed_audio)
1275
 
1276
- for current_pos in range(hop_samples, audio_len + hop_samples, hop_samples):
1277
- current_pos = min(current_pos, audio_len)
1278
- start_pos = max(0, current_pos - window_samples)
1279
- chunk = processed_audio[start_pos:current_pos]
 
1280
 
1281
- orig_len = len(chunk)
1282
- if orig_len < window_samples:
1283
- chunk = np.pad(chunk, (window_samples - orig_len, 0), mode='constant')
1284
 
1285
- padding_ratio = (window_samples - orig_len) / window_samples
 
1286
  if padding_ratio > 0.5:
1287
  continue # Skip heavily padded chunks
1288
 
1289
- timestamp = current_pos / self.processor.sample_rate
 
 
1290
 
1291
  if window_count < 3: # Log first 3 windows
1292
- debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (end), chunk_size={orig_len} (padded to {len(chunk)})")
1293
 
1294
  # Call predict with the chunk
1295
  result = self.models[model_name].predict(chunk, timestamp)
@@ -1303,6 +1309,10 @@ class VADDemo:
1303
  model_results.append(result)
1304
  window_count += 1
1305
 
 
 
 
 
1306
  debug_info.append(f" 🎯 Total windows processed: {window_count}")
1307
 
1308
  # Summary for this model
 
766
  mode='psd'
767
  )
768
 
769
+ # Ajustar tiempos para alinear con center=False (empezar en 0)
770
+ t -= (self.n_fft / 2.0) / self.sample_rate
771
+
772
  mel_spec_db = np.zeros((self.n_mels, Sxx.shape[1]))
773
 
774
  mel_freqs = np.logspace(
 
829
  # Add virtual start point if first timestamp > 0
830
  if rs[0].timestamp > 0:
831
  virtual_start = VADResult(
832
+ probability=rs[0].probability,
833
+ is_speech=rs[0].probability > threshold,
834
  model_name=base,
835
  processing_time=0,
836
  timestamp=0.0
 
1274
  # CRITICAL FIX: Always extract chunks, both for short and long audio
1275
  window_count = 0
1276
  audio_duration = len(processed_audio) / self.processor.sample_rate
 
1277
 
1278
+ for i in range(0, len(processed_audio), hop_samples):
1279
+ # CRITICAL: Extract the chunk centered on this timestamp
1280
+ start_pos = max(0, i - window_samples // 2)
1281
+ end_pos = min(len(processed_audio), start_pos + window_samples)
1282
+ chunk = processed_audio[start_pos:end_pos]
1283
 
1284
+ # Pad if necessary (with reflection, not zeros to avoid artificial silence)
1285
+ if len(chunk) < window_samples:
1286
+ chunk = np.pad(chunk, (0, window_samples - len(chunk)), mode='reflect')
1287
 
1288
+ # Skip chunks with excessive padding to avoid skewed predictions
1289
+ padding_ratio = (window_samples - (end_pos - start_pos)) / window_samples
1290
  if padding_ratio > 0.5:
1291
  continue # Skip heavily padded chunks
1292
 
1293
+ # CORRECTED: Timestamp at ACTUAL CENTER of the chunk for alignment
1294
+ actual_center = start_pos + (end_pos - start_pos) / 2.0
1295
+ timestamp = actual_center / self.processor.sample_rate
1296
 
1297
  if window_count < 3: # Log first 3 windows
1298
+ debug_info.append(f" 🔄 Window {window_count}: t={timestamp:.2f}s (center), chunk_size={len(chunk)}")
1299
 
1300
  # Call predict with the chunk
1301
  result = self.models[model_name].predict(chunk, timestamp)
 
1309
  model_results.append(result)
1310
  window_count += 1
1311
 
1312
+ # Stop if we've gone past the audio length
1313
+ if timestamp >= audio_duration:
1314
+ break
1315
+
1316
  debug_info.append(f" 🎯 Total windows processed: {window_count}")
1317
 
1318
  # Summary for this model