Model: TextConditionedSeparator( (clap): ClapModel( (text_model): ClapTextModel( (embeddings): ClapTextEmbeddings( (word_embeddings): Embedding(50265, 768, padding_idx=1) (position_embeddings): Embedding(514, 768, padding_idx=1) (token_type_embeddings): Embedding(1, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): ClapTextEncoder( (layer): ModuleList( (0-11): 12 x ClapTextLayer( (attention): ClapTextAttention( (self): ClapTextSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): ClapTextSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): ClapTextIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): ClapTextOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): ClapTextPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) (text_projection): ClapProjectionLayer( (linear1): Linear(in_features=768, out_features=512, bias=True) (activation): ReLU() (linear2): Linear(in_features=512, out_features=512, bias=True) ) (audio_model): ClapAudioModel( (audio_encoder): ClapAudioEncoder( (patch_embed): ClapAudioPatchEmbed( (proj): Conv2d(1, 96, kernel_size=(4, 4), stride=(4, 4)) (norm): LayerNorm((96,), eps=1e-05, elementwise_affine=True) ) (layers): ModuleList( (0): ClapAudioStage( (blocks): ModuleList( (0-1): 2 x ClapAudioLayer( (layernorm_before): LayerNorm((96,), eps=1e-05, elementwise_affine=True) (attention): ClapAudioAttention( (self): ClapAudioSelfAttention( (query): Linear(in_features=96, out_features=96, bias=True) (key): Linear(in_features=96, out_features=96, bias=True) (value): Linear(in_features=96, out_features=96, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) (output): ClapAudioSelfOutput( (dense): Linear(in_features=96, out_features=96, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) ) (drop_path): Identity() (layernorm_after): LayerNorm((96,), eps=1e-05, elementwise_affine=True) (intermediate): ClapAudioIntermediate( (dense): Linear(in_features=96, out_features=384, bias=True) (intermediate_act_fn): GELUActivation() ) (output): ClapAudioOutput( (dense): Linear(in_features=384, out_features=96, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) (downsample): ClapAudioPatchMerging( (reduction): Linear(in_features=384, out_features=192, bias=False) (norm): LayerNorm((384,), eps=1e-05, elementwise_affine=True) ) ) (1): ClapAudioStage( (blocks): ModuleList( (0-1): 2 x ClapAudioLayer( (layernorm_before): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (attention): ClapAudioAttention( (self): ClapAudioSelfAttention( (query): Linear(in_features=192, out_features=192, bias=True) (key): Linear(in_features=192, out_features=192, bias=True) (value): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) (output): ClapAudioSelfOutput( (dense): Linear(in_features=192, out_features=192, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) ) (drop_path): Identity() (layernorm_after): LayerNorm((192,), eps=1e-05, elementwise_affine=True) (intermediate): ClapAudioIntermediate( (dense): Linear(in_features=192, out_features=768, bias=True) (intermediate_act_fn): GELUActivation() ) (output): ClapAudioOutput( (dense): Linear(in_features=768, out_features=192, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) (downsample): ClapAudioPatchMerging( (reduction): Linear(in_features=768, out_features=384, bias=False) (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) (2): ClapAudioStage( (blocks): ModuleList( (0-5): 6 x ClapAudioLayer( (layernorm_before): LayerNorm((384,), eps=1e-05, elementwise_affine=True) (attention): ClapAudioAttention( (self): ClapAudioSelfAttention( (query): Linear(in_features=384, out_features=384, bias=True) (key): Linear(in_features=384, out_features=384, bias=True) (value): Linear(in_features=384, out_features=384, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) (output): ClapAudioSelfOutput( (dense): Linear(in_features=384, out_features=384, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) ) (drop_path): Identity() (layernorm_after): LayerNorm((384,), eps=1e-05, elementwise_affine=True) (intermediate): ClapAudioIntermediate( (dense): Linear(in_features=384, out_features=1536, bias=True) (intermediate_act_fn): GELUActivation() ) (output): ClapAudioOutput( (dense): Linear(in_features=1536, out_features=384, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) (downsample): ClapAudioPatchMerging( (reduction): Linear(in_features=1536, out_features=768, bias=False) (norm): LayerNorm((1536,), eps=1e-05, elementwise_affine=True) ) ) (3): ClapAudioStage( (blocks): ModuleList( (0-1): 2 x ClapAudioLayer( (layernorm_before): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (attention): ClapAudioAttention( (self): ClapAudioSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) (output): ClapAudioSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.0, inplace=False) ) ) (drop_path): Identity() (layernorm_after): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (intermediate): ClapAudioIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): ClapAudioOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) ) (batch_norm): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True) (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (avgpool): AdaptiveAvgPool1d(output_size=1) ) ) (audio_projection): ClapProjectionLayer( (linear1): Linear(in_features=768, out_features=512, bias=True) (activation): ReLU() (linear2): Linear(in_features=512, out_features=512, bias=True) ) ) (z_encoder): PatchConv1d( (conv): Conv1d(1, 256, kernel_size=(16,), stride=(8,)) ) (text_proj): Linear(in_features=512, out_features=256, bias=True) (z_proj): Linear(in_features=256, out_features=256, bias=True) (cross): CrossAttention( (attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) ) (ln1): LayerNorm((256,), eps=1e-05, elementwise_affine=True) (ff): MLP( (fc1): Linear(in_features=256, out_features=1024, bias=True) (fc2): Linear(in_features=1024, out_features=256, bias=True) (act): GELU(approximate='none') ) (ln2): LayerNorm((256,), eps=1e-05, elementwise_affine=True) ) (transformer): TransformerEncoder( (layers): ModuleList( (0-5): 6 x TransformerEncoderLayer( (self_attn): MultiheadAttention( (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True) ) (linear1): Linear(in_features=256, out_features=1024, bias=True) (dropout): Dropout(p=0.1, inplace=False) (linear2): Linear(in_features=1024, out_features=256, bias=True) (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True) (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True) (dropout1): Dropout(p=0.1, inplace=False) (dropout2): Dropout(p=0.1, inplace=False) ) ) ) (spec_decoder): Sequential( (0): Linear(in_features=256, out_features=256, bias=True) (1): GELU(approximate='none') (2): Linear(in_features=256, out_features=2049, bias=True) ) ) output waveform shape: torch.Size([2, 1, 48000]) output spectrogram shape: torch.Size([2, 12001, 2049])