# i2vedit / config / customize_train.yaml
# weiyuyeh's picture
# init
# a45ed83
# Top-level settings: model source, output location, seed, and feature toggles.

# Pretrained diffusers model path.
# Don't change
pretrained_model_path: "stabilityai/stable-video-diffusion-img2vid"
# The folder where your training outputs will be placed.
# Don't change
output_dir: "/home/user/app/outputs"
seed: 23
num_steps: 25

# NOTE(review): both attention options below are enabled, although their
# comments give disjoint PyTorch version ranges; confirm which one the
# consumer prefers when both are true.
# xFormers must be installed for best memory savings and performance (< PyTorch 2.0).
enable_xformers_memory_efficient_attention: true
# Use scaled dot product attention (only available with PyTorch >= 2.0).
enable_torch_2_attn: true

use_sarp: true

use_motion_lora: true
train_motion_lora_only: false
retrain_motion_lora: true

use_inversed_latents: true
use_attention_matching: true
use_consistency_attention_control: false
dtype: "fp16"

visualize_attention_store: false
# Explicit null (previously an empty value). Example list: [0, 5, 10, 15, 20, 24]
visualize_attention_store_steps: null

save_last_frames: true
# Explicit null (previously an empty value).
load_from_last_frames_latents: null
# data_params
# Input video, reference keyframe(s), and sampling/output options.
# NOTE(review): indentation was missing in the reviewed copy; the keys below
# are nested under `data_params` based on key order — confirm against the loader.
data_params:
  # Don't change
  video_path: "/home/user/app/upload/source_and_edits/source.mp4"
  # Don't change
  keyframe_paths:
    - "/home/user/app/upload/source_and_edits/ref.jpg"
  start_t: 0
  end_t: 1.6
  sample_fps: 10
  chunk_size: 16
  overlay_size: 1
  normalize: true
  output_fps: 3
  save_sampled_frame: true
  output_res: [576, 576]
  pad_to_fit: true
  begin_clip_id: 0
  end_clip_id: 1
# Settings for training the motion LoRA.
# NOTE(review): indentation was missing in the reviewed copy. The nesting below
# (including `train_data` and `validation_data` as children of this section) is
# reconstructed from key order and from the repeated `width` / `height` /
# `pad_to_fit` keys, which cannot share one mapping — confirm against the loader.
train_motion_lora_params:
  cache_latents: true
  cached_latent_dir: null  # e.g. /path/to/cached_latents
  lora_rank: 32
  # Use LoRA for the UNet model.
  use_unet_lora: true
  # LoRA dropout: the probability of randomly zeroing out elements. Helps prevent overfitting.
  # See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
  lora_unet_dropout: 0.1
  # The only time you want this off is if you're doing full LoRA training.
  save_pretrained_model: false
  # Learning rate for AdamW.
  # Written with a decimal point so YAML 1.1 loaders (e.g. PyYAML) read a float
  # instead of the string "5e-4".
  learning_rate: 5.0e-4
  # Weight decay. Higher = more regularization. Lower = closer to dataset.
  # Decimal point added for the same reason as `learning_rate`.
  adam_weight_decay: 1.0e-2
  # Maximum number of train steps. Model is saved after training.
  max_train_steps: 600
  # Saves a model every nth step.
  checkpointing_steps: 100
  # How many steps to do for validation if sample_preview is enabled.
  validation_steps: 100
  # Whether or not we want to use mixed precision with accelerate.
  mixed_precision: "fp16"
  # Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
  # If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.
  # NOTE(review): the comment above says "text encoder", while the second key
  # below targets the image encoder — confirm which is meant.
  gradient_checkpointing: true
  image_encoder_gradient_checkpointing: true

  train_data:
    # The width and height in which you want your training data to be resized to.
    width: 576
    height: 576
    # This will find the closest aspect ratio to your input width and height.
    # For example, 576x576 width and height with a video of resolution 1280x720 will be resized to 576x256
    # NOTE(review): the two comment lines above do not describe `use_data_aug`;
    # they look like a leftover from another option — confirm.
    use_data_aug: null  # e.g. "controlnet"
    pad_to_fit: true

  validation_data:
    # Whether or not to sample preview during training (requires more VRAM).
    sample_preview: true
    # The number of frames to sample during validation.
    num_frames: 16
    # Height and width of validation sample.
    width: 576
    height: 576
    pad_to_fit: true
    # Scale of spatial LoRAs, default is 0.
    spatial_scale: 0
    # Scale of noise prior, i.e. the scale of inversion noises.
    noise_prior:
      # - 0.0
      - 1.0
# Parameters for SARP; presumably read when the top-level `use_sarp` is true — confirm.
sarp_params:
  sarp_noise_scale: 0.005
# Settings for attention matching.
# NOTE(review): indentation was missing in the reviewed copy; the nesting below
# is reconstructed from key order. In particular, the `*_replace_steps` keys,
# `spatial_attention_chunk_size`, and `params` are placed as siblings of
# `control_mode` rather than inside it — confirm against the loader.
attention_matching_params:
  best_checkpoint_index: 500
  lora_scale: 1.0
  # LoRA path.
  lora_dir: null
  max_guidance_scale: 2.0
  disk_store: true
  load_attention_store: null
  load_consistency_attention_store: null
  load_consistency_train_attention_store: null
  # Attention sub-modules registered per transformer block type.
  registered_modules:
    BasicTransformerBlock:
      - "attn1"
      # - "attn2"
    TemporalBasicTransformerBlock:
      - "attn1"
      # - "attn2"
  control_mode:
    spatial_self: "masked_copy"
    temporal_self: "copy_v2"
  cross_replace_steps: 0.0
  temporal_self_replace_steps: 1.0
  spatial_self_replace_steps: 1.0
  spatial_attention_chunk_size: 1

  # Per-edit thresholds, keyed by edit name.
  params:
    edit0:
      temporal_step_thr: [0.5, 0.8]
      mask_thr: [0.35, 0.35]
    edit1:
      temporal_step_thr: [0.5, 0.8]
      mask_thr: [0.35, 0.35]
# Settings for long-video handling.
# NOTE(review): indentation was missing in the reviewed copy; the nesting below
# is reconstructed from key order — confirm against the loader.
long_video_params:
  mode: "skip-interval"
  registered_modules:
    # Every entry is commented out, so this key is null (not an empty list);
    # written explicitly to keep the parsed value unchanged.
    BasicTransformerBlock: null
    # - "attn1"
    # - "attn2"
    TemporalBasicTransformerBlock:
      - "attn1"
      # - "attn2"