data:
  train_dir: /home/jacob/datasets/musdb18/train       # Path to train subfolder of MUSDB18 dataset
  test_dir: /home/jacob/datasets/musdb18/test         # Path to test subfolder of MUSDB18 dataset
  segment_seconds: 6.0                                # Length of audio segments for training [s]   
  pct_train: 0.2                                      # Fraction of the full dataset to use for training (training on the full set takes ~15 hrs per epoch)
  pct_test: 0.1                                       # Fraction of the full dataset to use for testing
  overlap: 0.1                                        # Overlap between segments for chunked inference [s]
  sample_rate: 44100                                  # Sample rate for audio files [Hz]
  channels: 2                                         # Number of audio channels (1 = mono, 2 = stereo)                   
  random_segments: False                              # Whether to use random segments during training
  augment: True                                       # Whether to use data augmentation (gain adjustment and channel swapping)

model:
  name: Audio-Text-HTDemucs                           # Model name
  model_dim: 384                                      # Model dimension
  text_dim: 512                                       # Text embedding dimension (laion/clap-htsat-unfused is 512)                
  num_heads: 8                                        # Number of attention heads for text cross-attention layer     
  device: cpu                                        # Device to use for training (cuda for GPU or cpu)       
  use_amp: False                                      # Whether to use automatic mixed precision (AMP) during training - WORK IN PROGRESS
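  # Commented-out GPU variant of the device setting above (a sketch, assuming a
  # CUDA-capable device is available; uncomment to train on GPU):
  # device: cuda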

training:
  batch_size: 8                                       # Batch size for training         
  num_workers: 0                                      # Number of DataLoader workers
  num_epochs: 20                                      # Number of training epochs
  optimizer:  
    name: AdamW
    lr: 1e-4                                          # Learning rate
    weight_decay: 1e-2                                # Weight decay for optimizer
    grad_clip: 5.0                                    # Gradient clipping value (set to null to disable)
  loss_weights:
    sdr: 0.9                                          # Weight for SDR loss             
    sisdr_weight: 0.1                                 # Weight for SI-SDR loss; total loss is (sdr * SDR_loss) + (sisdr_weight * SI-SDR_loss)
  use_L1_comb_loss: False                              # Whether to use L1 combination loss
  L1_comb_loss: 
    sdr_weight: 1.0                                   # Weight for SDR in L1 combination loss
    l1_weight: 0.1                                    # Weight for L1 loss in L1 combination loss                                  
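  # Note: the exact combination is defined in the training code; presumably, when
  # use_L1_comb_loss is True the total loss becomes
  # (sdr_weight * SDR_loss) + (l1_weight * L1_loss) = 1.0 * SDR_loss + 0.1 * L1_loss
  # with the values above.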
  #resume_from: null                                   # Path to checkpoint to resume training from (set to null to train from scratch)
  resume_from: checkpoints/2025_11_30_batch4/best_model.pt

wandb:
  use_wandb: False                                    # Whether to use Weights & Biases for experiment tracking
  project: audio-text-htdemucs                        # Wandb project name
  run_name: null                                      # Wandb run name (null lets wandb auto-generate one)
  log_every: 50                                       # Log to wandb every N batches
  validate_every: 1                                   # Validate every N epochs
  save_every: 5                                       # Save model checkpoint every N epochs
  checkpoint_dir: checkpoints/2025_12_06/             # Directory to save model checkpoints
  output_dir: results/2025_12_06                      # Directory to save inference results
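  # Example values for a tracked run (hypothetical; uncomment and adjust the keys above instead of duplicating them):
  # use_wandb: True
  # run_name: 2025_12_06_batch8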