weiyuyeh committed on
Commit a45ed83 · 1 Parent(s): 0daf590
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ mydata/source_and_edits/source.mp4 filter=lfs diff=lfs merge=lfs -text
+ mydata/source_and_edits/white.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,12 @@
+ cache
+ ckpts
+ trash
+ outputs
+ run.sh
+ __pycache__
+ *_tmp
+ *.mp4
+ *.png
+ i2vedit.egg-info/
+ !mydata/**/*.mp4
+ customize_train_local.yaml
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
  title: I2vedit
- emoji: 📚
- colorFrom: yellow
- colorTo: indigo
+ emoji: 📈
+ colorFrom: purple
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.38.2
+ sdk_version: 5.32.1
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,173 @@
1
+ import gradio as gr
2
+ import subprocess
3
+ import os
4
+ import shutil
5
+ import sys
6
+
7
+ target_paths = {
8
+ "video": "/home/user/app/upload/source_and_edits/source.mp4",
9
+ "image": "/home/user/app/upload/source_and_edits/ref.jpg",
10
+ "config": "/home/user/app/upload/config/customize_train.yaml",
11
+ "lora": "/home/user/app/upload/lora/lora.pt",
12
+ "output_l": "/home/user/app/outputs/train_motion_lora",
13
+ "output_r": "/home/user/app/outputs/ref.mp4",
14
+ "zip": "/home/user/app/outputs/train_motion_lora.zip",
15
+ }
16
+
17
+
18
+ def zip_outputs():
19
+ if os.path.exists(target_paths["zip"]):
20
+ os.remove(target_paths["zip"])
21
+ shutil.make_archive(target_paths["zip"].replace(".zip", ""), 'zip', root_dir=target_paths["output_l"])
22
+ return target_paths["zip"]
23
+
24
+ def output_video():
25
+ if os.path.exists(target_paths["output_r"]):
26
+ return target_paths["output_r"]
27
+ return None
28
+
29
+
30
+ def start_training_stream():
31
+ process = subprocess.Popen(
32
+ ["python", "main.py", "--config=" + target_paths["config"]],
33
+ stdout=subprocess.PIPE,
34
+ stderr=subprocess.STDOUT,
35
+ text=True,
36
+ bufsize=1,
37
+ universal_newlines=True
38
+ )
39
+
40
+ output = []
41
+ for line in process.stdout:
42
+ output.append(line)
43
+ yield "".join(output)
44
+
45
+ def install_i2vedit():
46
+ try:
47
+ import i2vedit
48
+ print("i2vedit already installed")
49
+ except ImportError:
50
+ print("Installing i2vedit...")
51
+ subprocess.check_call([sys.executable, "-m", "pip", "install", "-e", "./i2vedit"])
52
+ print("i2vedit installed")
53
+
54
+
55
+ def install_package(package_name):
56
+ try:
57
+ result = subprocess.run(
58
+ [sys.executable, "-m", "pip", "install", package_name],
59
+ stdout=subprocess.PIPE,
60
+ stderr=subprocess.PIPE,
61
+ text=True,
62
+ )
63
+ output = result.stdout + "\n" + result.stderr
64
+ return output
65
+ except Exception as e:
66
+ return f"Error: {str(e)}"
67
+
68
+
69
+ def show_package(pkg_name):
70
+ try:
71
+ result = subprocess.run(
72
+ [sys.executable, "-m", "pip", "show", pkg_name],
73
+ stdout=subprocess.PIPE,
74
+ stderr=subprocess.PIPE,
75
+ text=True,
76
+ )
77
+ return result.stdout if result.stdout else result.stderr
78
+ except Exception as e:
79
+ return str(e)
80
+
81
+
82
+ def uninstall_package(package_name):
83
+ try:
84
+ result = subprocess.run(
85
+ [sys.executable, "-m", "pip", "uninstall", package_name, "-y"],
86
+ stdout=subprocess.PIPE,
87
+ stderr=subprocess.PIPE,
88
+ text=True,
89
+ )
90
+ output = result.stdout + "\n" + result.stderr
91
+ return output
92
+ except Exception as e:
93
+ return f"Error: {str(e)}"
94
+
95
+
96
+
97
+ def save_files(video_file, image_file, config_file, lora_file=None):
98
+ os.makedirs(os.path.dirname(target_paths["video"]), exist_ok=True)
99
+ os.makedirs(os.path.dirname(target_paths["config"]), exist_ok=True)
100
+
101
+ shutil.copy(video_file.name, target_paths["video"])
102
+ shutil.copy(image_file.name, target_paths["image"])
103
+ shutil.copy(config_file.name, target_paths["config"])
104
+ if lora_file:
105
+ os.makedirs(os.path.dirname(target_paths["lora"]), exist_ok=True)
106
+ shutil.copy(lora_file.name, target_paths["lora"])
107
+ return "Files uploaded and saved successfully!"
108
+
109
+
110
+ install_i2vedit()
111
+ install_package("huggingface_hub==0.25.1")
112
+ install_package("diffusers==0.25.1")
113
+ install_package("gradio==5.0.0")
114
+ uninstall_package("datasets")
115
+ print("package version set complete")
116
+
117
+
118
+ with gr.Blocks(theme=gr.themes.Origin()) as demo:
119
+ gr.Markdown("## Upload your files first")
120
+ with gr.Row():
121
+ video_input = gr.File(label="Source video", file_types=[".mp4"])
122
+ image_input = gr.File(label="Edited keyframe image", file_types=[".jpg", ".jpeg", ".png"])
123
+ config_input = gr.File(label="Config file", file_types=[".yaml", ".yml"])
124
+ lora_input = gr.File(label="LoRA file", file_types=[".pt"])
125
+
126
+ upload_button = gr.Button("Upload and save")
127
+ output = gr.Textbox(label="Status")
128
+
129
+
130
+ gr.Markdown("## Training")
131
+ with gr.Column():
132
+ log_output = gr.Textbox(label="Training Log", lines=20)
133
+ train_btn = gr.Button("Start Training")
134
+
135
+ gr.Markdown("## Pip Installer")
136
+ with gr.Column():
137
+ with gr.Row():
138
+ pkg_input = gr.Textbox(lines=1, placeholder="Package to install, e.g. diffusers or numpy==1.2.0")
139
+ install_output = gr.Textbox(label="Install Output", lines=10)
140
+ install_btn = gr.Button("Install Package")
141
+
142
+ gr.Markdown("## Pip Uninstaller")
143
+ with gr.Column():
144
+ with gr.Row():
145
+ pkg_input2 = gr.Textbox(lines=1, placeholder="Package to uninstall, e.g. diffusers or numpy")
146
+ uninstall_output = gr.Textbox(label="Uninstall Output", lines=10)
147
+ uninstall_btn = gr.Button("Uninstall Package")
148
+
149
+ gr.Markdown("## Pip show")
150
+ with gr.Column():
151
+ with gr.Row():
152
+ show_input = gr.Textbox(label="Package name (e.g. diffusers)")
153
+ show_output = gr.Textbox(label="Package info", lines=10)
154
+ show_btn = gr.Button("pip show")
155
+
156
+ gr.Markdown("## Download lora")
157
+ with gr.Column():
158
+ file_output = gr.File(label="Click to download", interactive=True)
159
+ download_btn = gr.Button("Download LoRA")
160
+
161
+ gr.Markdown("## Download results")
162
+ with gr.Column():
163
+ file_output2 = gr.File(label="Click to download", interactive=True)
164
+ download_btn2 = gr.Button("Download results")
165
+
166
+ show_btn.click(fn=show_package, inputs=show_input, outputs=show_output)
167
+ download_btn.click(fn=zip_outputs, outputs=file_output)
168
+ download_btn2.click(fn=output_video, outputs=file_output2)
169
+ install_btn.click(fn=install_package, inputs=pkg_input, outputs=install_output)
170
+ train_btn.click(fn=start_training_stream, outputs=log_output)
171
+ uninstall_btn.click(fn=uninstall_package, inputs=pkg_input2, outputs=uninstall_output)
172
+ upload_button.click(fn=save_files, inputs=[video_input, image_input, config_input, lora_input], outputs=output)
173
+ demo.launch()
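
A note on zip_outputs above: shutil.make_archive appends the ".zip" suffix itself, which is why the code strips it from target_paths["zip"] before archiving. A minimal, self-contained sketch of the same pattern, using stdlib-only temporary paths in place of the app's output directories:

import os
import shutil
import tempfile

with tempfile.TemporaryDirectory() as workdir:
    # Stand-in for target_paths["output_l"] (the trained motion-LoRA folder).
    output_dir = os.path.join(workdir, "train_motion_lora")
    os.makedirs(output_dir)
    with open(os.path.join(output_dir, "lora.pt"), "wb") as f:
        f.write(b"dummy checkpoint bytes")

    # Stand-in for target_paths["zip"]; make_archive adds ".zip" to the base name.
    zip_path = os.path.join(workdir, "train_motion_lora.zip")
    shutil.make_archive(zip_path.replace(".zip", ""), "zip", root_dir=output_dir)
    assert os.path.exists(zip_path)
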
config/customize_subsequent_edit.yaml ADDED
@@ -0,0 +1,152 @@
1
+ # Pretrained diffusers model path.
2
+ pretrained_model_path: "ckpts/stable-video-diffusion-img2vid"
3
+ # The folder where your training outputs will be placed.
4
+ output_dir: "./acc"
5
+ seed: 23
6
+ num_steps: 25
7
+ # Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
8
+ enable_xformers_memory_efficient_attention: True
9
+ # Use scaled dot product attention (Only available with >= Torch 2.0)
10
+ enable_torch_2_attn: True
11
+
12
+ use_sarp: true
13
+
14
+ use_motion_lora: true
15
+ train_motion_lora_only: false
16
+ retrain_motion_lora: true
17
+
18
+ use_inversed_latents: true
19
+ use_attention_matching: true
20
+ use_consistency_attention_control: true
21
+ dtype: fp16
22
+
23
+ visualize_attention_store: false
24
+ visualize_attention_store_steps: [0, 5, 10, 15, 20, 24]
25
+
26
+ save_last_frames: True
27
+ load_from_last_frames_latents:
28
+ - "./cache/item1/i2vedit_2024-05-11T15-53-54/clip_0_lastframe_0.pt"
29
+ - "./cache/item1/i2vedit_2024-05-11T15-53-54/clip_0_lastframe_1.pt"
30
+ load_from_previous_consistency_edit_controller:
31
+ - "./cache/item1/i2vedit_2024-05-11T15-53-54/consistency_edit0_attention_store"
32
+ - "./cache/item1/i2vedit_2024-05-11T15-53-54/consistency_edit1_attention_store"
33
+ load_from_previous_consistency_store_controller:
34
+ "./cache/item1/i2vedit_2024-05-11T15-53-54/consistency_attention_store"
35
+
36
+ # data_params
37
+ data_params:
38
+ video_path: "../datasets/svdedit/item1/source.mp4"
39
+ keyframe_paths:
40
+ - "../datasets/svdedit/tmp/edit0.png"
41
+ - "../datasets/svdedit/tmp/edit1.png"
42
+ start_t: 0
43
+ end_t: -1
44
+ sample_fps: 7
45
+ chunk_size: 16
46
+ overlay_size: 1
47
+ normalize: true
48
+ output_fps: 7
49
+ save_sampled_frame: true
50
+ output_res: [576, 1024]
51
+ pad_to_fit: false
52
+ begin_clip_id: 1
53
+ end_clip_id: 2
54
+
55
+ train_motion_lora_params:
56
+ cache_latents: true
57
+ cached_latent_dir: null #/path/to/cached_latents
58
+ lora_rank: 32
59
+ # Use LoRA for the UNET model.
60
+ use_unet_lora: True
61
+ # LoRA Dropout. This sets the probability of randomly zeroing out LoRA elements, which helps prevent overfitting.
62
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
63
+ lora_unet_dropout: 0.1
64
+ # The only time you want this off is if you're doing full LoRA training.
65
+ save_pretrained_model: False
66
+ # Learning rate for AdamW
67
+ learning_rate: 5e-4
68
+ # Weight decay. Higher = more regularization. Lower = closer to dataset.
69
+ adam_weight_decay: 1e-2
70
+ # Maximum number of train steps. Model is saved after training.
71
+ max_train_steps: 250
72
+ # Saves a model every nth step.
73
+ checkpointing_steps: 250
74
+ # How many steps to do for validation if sample_preview is enabled.
75
+ validation_steps: 300
76
+ # Whether or not we want to use mixed precision with accelerate
77
+ mixed_precision: "fp16"
78
+ # Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
79
+ # If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.
80
+ gradient_checkpointing: True
81
+ image_encoder_gradient_checkpointing: True
82
+
83
+ train_data:
84
+ # The width and height in which you want your training data to be resized to.
85
+ width: 896
86
+ height: 512
87
+ # This will find the closest aspect ratio to your input width and height.
88
+ # For example, 512x512 width and height with a video of resolution 1280x720 will be resized to 512x256
89
+ use_data_aug: ~ #"controlnet"
90
+ pad_to_fit: false
91
+
92
+ validation_data:
93
+ # Whether or not to sample preview during training (Requires more VRAM).
94
+ sample_preview: True
95
+ # The number of frames to sample during validation.
96
+ num_frames: 14
97
+ # Height and width of validation sample.
98
+ width: 1024
99
+ height: 576
100
+ pad_to_fit: false
101
+ # scale of spatial LoRAs, default is 0
102
+ spatial_scale: 0
103
+ # scale of noise prior, i.e. the scale of inversion noises
104
+ noise_prior:
105
+ - 0.0
106
+ #- 1.0
107
+
108
+ sarp_params:
109
+ sarp_noise_scale: 0.005
110
+
111
+ attention_matching_params:
112
+ best_checkpoint_index: 250
113
+ lora_scale: 1.0
114
+ # lora path
115
+ lora_dir: ~
116
+ max_guidance_scale: 2.0
117
+ disk_store: True
118
+ load_attention_store: "./cache/item1/attention_store"
119
+ load_consistency_attention_store: ~
120
+ registered_modules:
121
+ BasicTransformerBlock:
122
+ - "attn1"
123
+ #- "attn2"
124
+ TemporalBasicTransformerBlock:
125
+ - "attn1"
126
+ #- "attn2"
127
+ control_mode:
128
+ spatial_self: "masked_copy"
129
+ temporal_self: "copy_v2"
130
+ cross_replace_steps: 0.0
131
+ temporal_self_replace_steps: 1.0
132
+ spatial_self_replace_steps: 1.0
133
+ spatial_attention_chunk_size: 1
134
+
135
+ params:
136
+ edit0:
137
+ temporal_step_thr: [0.5, 0.8]
138
+ mask_thr: [0.35, 0.35]
139
+ edit1:
140
+ temporal_step_thr: [0.5, 0.8]
141
+ mask_thr: [0.35, 0.35]
142
+
143
+ long_video_params:
144
+ mode: "skip-interval"
145
+ registered_modules:
146
+ BasicTransformerBlock:
147
+ #- "attn1"
148
+ #- "attn2"
149
+ TemporalBasicTransformerBlock:
150
+ - "attn1"
151
+ #- "attn2"
152
+
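
For reference, main.py (not included in this commit) is launched with --config=<path> pointing at a YAML like the one above. A minimal sketch of reading a few of its fields, assuming plain PyYAML; the project may well use a different loader such as OmegaConf:

import yaml  # assumes PyYAML is available

with open("config/customize_subsequent_edit.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["output_dir"])                             # "./acc"
print(cfg["data_params"]["chunk_size"])              # 16
print(cfg["attention_matching_params"]["lora_dir"])  # None ("~" in YAML)
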
config/customize_train.yaml ADDED
@@ -0,0 +1,149 @@
1
+ # Pretrained diffusers model path.
2
+ # Don't change
3
+ pretrained_model_path: "stabilityai/stable-video-diffusion-img2vid"
4
+ # The folder where your training outputs will be placed.
5
+ # Don't change
6
+ output_dir: "/home/user/app/outputs"
7
+ seed: 23
8
+ num_steps: 25
9
+ # Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
10
+ enable_xformers_memory_efficient_attention: True
11
+ # Use scaled dot product attention (Only available with >= Torch 2.0)
12
+ enable_torch_2_attn: True
13
+
14
+ use_sarp: true
15
+
16
+ use_motion_lora: true
17
+ train_motion_lora_only: false
18
+ retrain_motion_lora: true
19
+
20
+ use_inversed_latents: true
21
+ use_attention_matching: true
22
+ use_consistency_attention_control: false
23
+ dtype: fp16
24
+
25
+ visualize_attention_store: false
26
+ visualize_attention_store_steps: #[0, 5, 10, 15, 20, 24]
27
+
28
+ save_last_frames: True
29
+ load_from_last_frames_latents:
30
+
31
+ # data_params
32
+ data_params:
33
+ # Don't change
34
+ video_path: "/home/user/app/upload/source_and_edits/source.mp4"
35
+ # Don't change
36
+ keyframe_paths:
37
+ - "/home/user/app/upload/source_and_edits/ref.jpg"
38
+ start_t: 0
39
+ end_t: 1.6
40
+ sample_fps: 10
41
+ chunk_size: 16
42
+ overlay_size: 1
43
+ normalize: true
44
+ output_fps: 3
45
+ save_sampled_frame: true
46
+ output_res: [576, 576]
47
+ pad_to_fit: true
48
+ begin_clip_id: 0
49
+ end_clip_id: 1
50
+
51
+ train_motion_lora_params:
52
+ cache_latents: true
53
+ cached_latent_dir: null #/path/to/cached_latents
54
+ lora_rank: 32
55
+ # Use LoRA for the UNET model.
56
+ use_unet_lora: True
57
+ # LoRA Dropout. This sets the probability of randomly zeroing out LoRA elements, which helps prevent overfitting.
58
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
59
+ lora_unet_dropout: 0.1
60
+ # The only time you want this off is if you're doing full LoRA training.
61
+ save_pretrained_model: False
62
+ # Learning rate for AdamW
63
+ learning_rate: 5e-4
64
+ # Weight decay. Higher = more regularization. Lower = closer to dataset.
65
+ adam_weight_decay: 1e-2
66
+ # Maximum number of train steps. Model is saved after training.
67
+ max_train_steps: 600
68
+ # Saves a model every nth step.
69
+ checkpointing_steps: 100
70
+ # How many steps to do for validation if sample_preview is enabled.
71
+ validation_steps: 100
72
+ # Whether or not we want to use mixed precision with accelerate
73
+ mixed_precision: "fp16"
74
+ # Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
75
+ # If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.
76
+ gradient_checkpointing: True
77
+ image_encoder_gradient_checkpointing: True
78
+
79
+ train_data:
80
+ # The width and height in which you want your training data to be resized to.
81
+ width: 576
82
+ height: 576
83
+ # This will find the closest aspect ratio to your input width and height.
84
+ # For example, 576x576 width and height with a video of resolution 1280x720 will be resized to 576x256
85
+ use_data_aug: ~ #"controlnet"
86
+ pad_to_fit: true
87
+
88
+ validation_data:
89
+ # Whether or not to sample preview during training (Requires more VRAM).
90
+ sample_preview: True
91
+ # The number of frames to sample during validation.
92
+ num_frames: 16
93
+ # Height and width of validation sample.
94
+ width: 576
95
+ height: 576
96
+ pad_to_fit: true
97
+ # scale of spatial LoRAs, default is 0
98
+ spatial_scale: 0
99
+ # scale of noise prior, i.e. the scale of inversion noises
100
+ noise_prior:
101
+ #- 0.0
102
+ - 1.0
103
+
104
+ sarp_params:
105
+ sarp_noise_scale: 0.005
106
+
107
+ attention_matching_params:
108
+ best_checkpoint_index: 500
109
+ lora_scale: 1.0
110
+ # lora path
111
+ lora_dir: ~
112
+ max_guidance_scale: 2.0
113
+
114
+ disk_store: True
115
+ load_attention_store: ~
116
+ load_consistency_attention_store: ~
117
+ load_consistency_train_attention_store: ~
118
+ registered_modules:
119
+ BasicTransformerBlock:
120
+ - "attn1"
121
+ #- "attn2"
122
+ TemporalBasicTransformerBlock:
123
+ - "attn1"
124
+ #- "attn2"
125
+ control_mode:
126
+ spatial_self: "masked_copy"
127
+ temporal_self: "copy_v2"
128
+ cross_replace_steps: 0.0
129
+ temporal_self_replace_steps: 1.0
130
+ spatial_self_replace_steps: 1.0
131
+ spatial_attention_chunk_size: 1
132
+
133
+ params:
134
+ edit0:
135
+ temporal_step_thr: [0.5, 0.8]
136
+ mask_thr: [0.35, 0.35]
137
+ edit1:
138
+ temporal_step_thr: [0.5, 0.8]
139
+ mask_thr: [0.35, 0.35]
140
+
141
+ long_video_params:
142
+ mode: "skip-interval"
143
+ registered_modules:
144
+ BasicTransformerBlock:
145
+ #- "attn1"
146
+ #- "attn2"
147
+ TemporalBasicTransformerBlock:
148
+ - "attn1"
149
+ #- "attn2"
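
As a sanity check on data_params above: int((end_t - start_t) * sample_fps) = int(1.6 * 10) = 16 sampled frames, which fits in a single chunk of chunk_size 16, consistent with begin_clip_id 0 and end_clip_id 1. A small sketch of the chunk/overlay arithmetic, mirroring VideoIO.read_video_iter in i2vedit/data.py later in this commit:

def clip_ranges(num_frames: int, chunk_size: int, overlay_size: int):
    """Reproduce the chunking loop from VideoIO.read_video_iter (sketch only)."""
    ranges, begin = [], 0
    while begin < num_frames:
        start = max(begin - overlay_size, 0)  # re-read overlapping frames for continuity
        end = min(start + chunk_size, num_frames)
        ranges.append((start, end))
        begin = end
    return ranges

num_frames = int((1.6 - 0) * 10)  # start_t, end_t, sample_fps from this config
print(num_frames, clip_ranges(num_frames, chunk_size=16, overlay_size=1))
# -> 16 [(0, 16)], i.e. a single clip
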
config/customize_train_multi.yaml ADDED
@@ -0,0 +1,149 @@
1
+ # Pretrained diffusers model path.
2
+ # Don't change
3
+ pretrained_model_path: "stabilityai/stable-video-diffusion-img2vid"
4
+ # The folder where your training outputs will be placed.
5
+ # Don't change
6
+ output_dir: "/home/user/app/outputs"
7
+ seed: 23
8
+ num_steps: 25
9
+ # Xformers must be installed for best memory savings and performance (< Pytorch 2.0)
10
+ enable_xformers_memory_efficient_attention: True
11
+ # Use scaled dot product attention (Only available with >= Torch 2.0)
12
+ enable_torch_2_attn: True
13
+
14
+ use_sarp: true
15
+
16
+ use_motion_lora: true
17
+ train_motion_lora_only: false
18
+ retrain_motion_lora: true
19
+
20
+ use_inversed_latents: true
21
+ use_attention_matching: true
22
+ use_consistency_attention_control: true
23
+ dtype: fp16
24
+
25
+ visualize_attention_store: false
26
+ visualize_attention_store_steps: #[0, 5, 10, 15, 20, 24]
27
+
28
+ save_last_frames: True
29
+ load_from_last_frames_latents:
30
+
31
+ # data_params
32
+ data_params:
33
+ # Don't change
34
+ video_path: "/home/user/app/upload/source_and_edits/source.mp4"
35
+ # Don't change
36
+ keyframe_paths:
37
+ - "/home/user/app/upload/source_and_edits/ref.jpg"
38
+ start_t: 0
39
+ end_t: 4.0
40
+ sample_fps: 10
41
+ chunk_size: 12
42
+ overlay_size: 3
43
+ normalize: true
44
+ output_fps: 10
45
+ save_sampled_frame: true
46
+ output_res: [768, 768]
47
+ pad_to_fit: true
48
+ begin_clip_id: 0
49
+ end_clip_id: 4
50
+
51
+ train_motion_lora_params:
52
+ cache_latents: true
53
+ cached_latent_dir: null #/path/to/cached_latents
54
+ lora_rank: 32
55
+ # Use LoRA for the UNET model.
56
+ use_unet_lora: True
57
+ # LoRA Dropout. This sets the probability of randomly zeroing out LoRA elements, which helps prevent overfitting.
58
+ # See: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
59
+ lora_unet_dropout: 0.1
60
+ # The only time you want this off is if you're doing full LoRA training.
61
+ save_pretrained_model: False
62
+ # Learning rate for AdamW
63
+ learning_rate: 5e-4
64
+ # Weight decay. Higher = more regularization. Lower = closer to dataset.
65
+ adam_weight_decay: 1e-2
66
+ # Maximum number of train steps. Model is saved after training.
67
+ max_train_steps: 600
68
+ # Saves a model every nth step.
69
+ checkpointing_steps: 200
70
+ # How many steps to do for validation if sample_preview is enabled.
71
+ validation_steps: 200
72
+ # Whether or not we want to use mixed precision with accelerate
73
+ mixed_precision: "fp16"
74
+ # Trades VRAM usage for speed. You lose roughly 20% of training speed, but save a lot of VRAM.
75
+ # If you need to save more VRAM, it can also be enabled for the text encoder, but reduces speed x2.
76
+ gradient_checkpointing: True
77
+ image_encoder_gradient_checkpointing: True
78
+
79
+ train_data:
80
+ # The width and height in which you want your training data to be resized to.
81
+ width: 768
82
+ height: 768
83
+ # This will find the closest aspect ratio to your input width and height.
84
+ # For example, 768x768 width and height with a video of resolution 1280x720 will be resized to 768x256
85
+ use_data_aug: ~ #"controlnet"
86
+ pad_to_fit: true
87
+
88
+ validation_data:
89
+ # Whether or not to sample preview during training (Requires more VRAM).
90
+ sample_preview: True
91
+ # The number of frames to sample during validation.
92
+ num_frames: 8
93
+ # Height and width of validation sample.
94
+ width: 768
95
+ height: 768
96
+ pad_to_fit: true
97
+ # scale of spatial LoRAs, default is 0
98
+ spatial_scale: 0
99
+ # scale of noise prior, i.e. the scale of inversion noises
100
+ noise_prior:
101
+ #- 0.0
102
+ - 1.0
103
+
104
+ sarp_params:
105
+ sarp_noise_scale: 0.005
106
+
107
+ attention_matching_params:
108
+ best_checkpoint_index: 600
109
+ lora_scale: 1.0
110
+ # lora path
111
+ lora_dir: ~
112
+ max_guidance_scale: 2.0
113
+
114
+ disk_store: True
115
+ load_attention_store: ~
116
+ load_consistency_attention_store: ~
117
+ load_consistency_train_attention_store: ~
118
+ registered_modules:
119
+ BasicTransformerBlock:
120
+ - "attn1"
121
+ #- "attn2"
122
+ TemporalBasicTransformerBlock:
123
+ - "attn1"
124
+ #- "attn2"
125
+ control_mode:
126
+ spatial_self: "masked_copy"
127
+ temporal_self: "copy_v2"
128
+ cross_replace_steps: 0.0
129
+ temporal_self_replace_steps: 1.0
130
+ spatial_self_replace_steps: 1.0
131
+ spatial_attention_chunk_size: 1
132
+
133
+ params:
134
+ edit0:
135
+ temporal_step_thr: [0.5, 0.8]
136
+ mask_thr: [0.35, 0.35]
137
+ edit1:
138
+ temporal_step_thr: [0.5, 0.8]
139
+ mask_thr: [0.35, 0.35]
140
+
141
+ long_video_params:
142
+ mode: "skip-interval"
143
+ registered_modules:
144
+ BasicTransformerBlock:
145
+ #- "attn1"
146
+ #- "attn2"
147
+ TemporalBasicTransformerBlock:
148
+ - "attn1"
149
+ #- "attn2"
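
Running the clip_ranges sketch shown after customize_train.yaml with this config's values (int(4.0 * 10) = 40 sampled frames, chunk_size 12, overlay_size 3) yields five overlapping clips, (0, 12), (9, 21), (18, 30), (27, 39) and (36, 40), which is consistent with begin_clip_id 0 and end_clip_id 4 here; the exact end-clip semantics are decided in main.py, which is not part of this commit.
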
i2vedit/__init__.py ADDED
File without changes
i2vedit/data.py ADDED
@@ -0,0 +1,317 @@
1
+ import os
2
+ import decord
3
+ import imageio
4
+ import numpy as np
5
+ import PIL
6
+ from PIL import Image
7
+ from einops import rearrange, repeat
8
+
9
+ from torchvision.transforms import Resize, Pad, InterpolationMode, ToTensor
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ from torch.utils.data import Dataset
14
+
15
+ #from i2vedit.utils.augment import ControlNetDataAugmentation, ColorDataAugmentation
16
+ # from utils.euler_utils import tensor_to_vae_latent
17
+
18
+ class ResolutionControl(object):
19
+
20
+ def __init__(self, input_res, output_res, pad_to_fit=False, fill=0, **kwargs):
21
+
22
+ self.ih, self.iw = input_res
23
+ self.output_res = output_res
24
+ self.pad_to_fit = pad_to_fit
25
+ self.fill=fill
26
+
27
+ def pad_with_ratio(self, frames, res, fill=0):
28
+ if isinstance(frames, torch.Tensor):
29
+ original_dim = frames.ndim
30
+ if frames.ndim > 4:
31
+ batch_size = frames.shape[0]
32
+ frames = rearrange(frames, "b f c h w -> (b f) c h w")
33
+ _, _, ih, iw = frames.shape
34
+ elif isinstance(frames, PIL.Image.Image):
35
+ iw, ih = frames.size
36
+ assert ih == self.ih and iw == self.iw, "resolution doesn't match."
37
+ #print("ih, iw", ih, iw)
38
+ i_ratio = ih / iw
39
+ h, w = res
40
+ #print("h,w", h ,w)
41
+ n_ratio = h / w
42
+ if i_ratio > n_ratio:
43
+ nw = int(ih / h * w)
44
+ #print("nw", nw)
45
+ frames = Pad(((nw - iw)//2,0), fill=fill)(frames)
46
+ else:
47
+ nh = int(iw / w * h)
48
+ frames = Pad((0,(nh - ih)//2), fill=fill)(frames)
49
+ #print("after pad", frames.shape)
50
+ if isinstance(frames, torch.Tensor):
51
+ if original_dim > 4:
52
+ frames = rearrange(frames, "(b f) c h w -> b f c h w", b=batch_size)
53
+
54
+ return frames
55
+
56
+ def return_to_original_res(self, frames):
57
+ if isinstance(frames, torch.Tensor):
58
+ original_dim = frames.ndim
59
+ if frames.ndim > 4:
60
+ batch_size = frames.shape[0]
61
+ frames = rearrange(frames, "b f c h w -> (b f) c h w")
62
+ _, _, h, w = frames.shape
63
+ elif isinstance(frames, PIL.Image.Image):
64
+ w, h = frames.size
65
+ #print("original res", (self.ih, self.iw))
66
+ #print("current res", (h, w))
67
+ assert h == self.output_res[0] and w == self.output_res[1], "resolution doesn't match."
68
+ n_ratio = h / w
69
+ ih, iw = self.ih, self.iw
70
+ i_ratio = ih / iw
71
+ if self.pad_to_fit:
72
+ if i_ratio > n_ratio:
73
+ nw = int(ih / h * w)
74
+ frames = Resize((ih, iw+2*(nw - iw)//2), interpolation=InterpolationMode.BILINEAR, antialias=True)(frames)
75
+ if isinstance(frames, torch.Tensor):
76
+ frames = frames[...,:,(nw - iw)//2:-(nw - iw)//2]
77
+ elif isinstance(frames, PIL.Image.Image):
78
+ frames = frames.crop(((nw - iw)//2,0,iw+(nw - iw)//2,ih))
79
+ else:
80
+ nh = int(iw / w * h)
81
+ frames = Resize((ih+2*(nh - ih)//2, iw), interpolation=InterpolationMode.BILINEAR, antialias=True)(frames)
82
+ if isinstance(frames, torch.Tensor):
83
+ frames = frames[...,(nh - ih)//2:-(nh - ih)//2,:]
84
+ elif isinstance(frames, PIL.Image.Image):
85
+ frames = frames.crop((0,(nh - ih)//2,iw,ih+(nh - ih)//2))
86
+ else:
87
+ frames = Resize((ih, iw), interpolation=InterpolationMode.BILINEAR, antialias=True)(frames)
88
+
89
+ if isinstance(frames, torch.Tensor):
90
+ if original_dim > 4:
91
+ frames = rearrange(frames, "(b f) c h w -> b f c h w", b=batch_size)
92
+
93
+ return frames
94
+
95
+ def __call__(self, frames):
96
+ if self.pad_to_fit:
97
+ frames = self.pad_with_ratio(frames, self.output_res, fill=self.fill)
98
+
99
+ if isinstance(frames, torch.Tensor):
100
+ original_dim = frames.ndim
101
+ if frames.ndim > 4:
102
+ batch_size = frames.shape[0]
103
+ frames = rearrange(frames, "b f c h w -> (b f) c h w")
104
+ frames = (frames + 1) / 2.
105
+
106
+ frames = Resize(tuple(self.output_res), interpolation=InterpolationMode.BILINEAR, antialias=True)(frames)
107
+ if isinstance(frames, torch.Tensor):
108
+ if original_dim > 4:
109
+ frames = rearrange(frames, "(b f) c h w -> b f c h w", b=batch_size)
110
+ frames = frames * 2 - 1
111
+
112
+ return frames
113
+
114
+ def callback(self, frames):
115
+ return self.return_to_original_res(frames)
116
+
117
+ class VideoIO(object):
118
+
119
+ def __init__(
120
+ self,
121
+ video_path,
122
+ keyframe_paths,
123
+ output_dir,
124
+ device,
125
+ dtype,
126
+ start_t:int=0,
127
+ end_t:int=-1,
128
+ sample_fps:int=-1,
129
+ chunk_size: int=14,
130
+ overlay_size: int=-1,
131
+ normalize: bool=True,
132
+ output_fps: int=-1,
133
+ save_sampled_video: bool=True,
134
+ **kwargs
135
+ ):
136
+ self.video_path = video_path
137
+ self.keyframe_paths = keyframe_paths
138
+ self.device = device
139
+ self.dtype = dtype
140
+ self.start_t = start_t
141
+ self.end_t = end_t
142
+ self.sample_fps = sample_fps
143
+ self.chunk_size = chunk_size
144
+ self.overlay_size = overlay_size
145
+ self.normalize = normalize
146
+ self.save_sampled_video = save_sampled_video
147
+
148
+
149
+
150
+ vr = decord.VideoReader(video_path)
151
+ initial_fps = vr.get_avg_fps()
152
+ self.initial_fps = initial_fps
153
+
154
+ if output_fps == -1: output_fps = initial_fps
155
+
156
+ self.video_writer_list = []
157
+ for keyframe_path in keyframe_paths:
158
+ fname, ext = os.path.splitext(os.path.basename(keyframe_path))
159
+ output_video_path = os.path.join(output_dir, fname+".mp4")
160
+ self.video_writer_list.append( imageio.get_writer(output_video_path, fps=output_fps) )
161
+
162
+ if save_sampled_video:
163
+ fname, ext = os.path.splitext(os.path.basename(video_path))
164
+ output_sampled_video_path = os.path.join(output_dir, fname+f"_from{start_t}s_to{end_t}s{ext}")
165
+ self.sampled_video_writer = imageio.get_writer(output_sampled_video_path, fps=output_fps)
166
+
167
+ def read_keyframe_iter(self):
168
+ for keyframe_path in self.keyframe_paths:
169
+ image = Image.open(keyframe_path).convert("RGB")
170
+ yield image
171
+
172
+ def read_video_iter(self):
173
+ vr = decord.VideoReader(self.video_path)
174
+ if self.sample_fps == -1: self.sample_fps = self.initial_fps
175
+ if self.end_t == -1:
176
+ self.end_t = len(vr) / self.initial_fps
177
+ else:
178
+ self.end_t = min(len(vr) / self.initial_fps, self.end_t)
179
+ if self.overlay_size == -1: self.overlay_size = 0
180
+ assert 0 <= self.start_t < self.end_t
181
+ assert self.sample_fps > 0
182
+
183
+ start_f_ind = int(self.start_t * self.initial_fps)
184
+ end_f_ind = int(self.end_t * self.initial_fps)
185
+ num_f = int((self.end_t - self.start_t) * self.sample_fps)
186
+ sample_idx = np.linspace(start_f_ind, end_f_ind, num_f, endpoint=False).astype(int)
187
+ print("sample_idx", sample_idx)
188
+
189
+ assert len(sample_idx) > 0, f"sample_idx is empty!"
190
+
191
+ begin_frame_idx = 0
192
+ while begin_frame_idx < len(sample_idx):
193
+ self.begin_frame_idx = begin_frame_idx
194
+ begin_frame_idx = max(begin_frame_idx - self.overlay_size, 0)
195
+ next_frame_idx = min(begin_frame_idx + self.chunk_size, len(sample_idx))
196
+
197
+ video = vr.get_batch(sample_idx[begin_frame_idx:next_frame_idx])
198
+ begin_frame_idx = next_frame_idx
199
+
200
+ if self.save_sampled_video:
201
+ overlay_size = 0 if self.begin_frame_idx == 0 else self.overlay_size
202
+ print(type(video))
203
+ for frame in video[overlay_size:]:
204
+ self.sampled_video_writer.append_data(frame.detach().cpu().numpy())
205
+
206
+ video = torch.Tensor(video).to(self.device).to(self.dtype)
207
+ video = rearrange(video, "f h w c -> f c h w")
208
+
209
+ if self.normalize:
210
+ video = video / 127.5 - 1.0
211
+
212
+ yield video
213
+
214
+ def write_video(self, video, video_id, resctrl: ResolutionControl = None):
215
+ '''
216
+ video:
217
+ '''
218
+ overlay_size = 0 if self.begin_frame_idx == 0 else self.overlay_size
219
+ for img in video[overlay_size:]:
220
+ if resctrl is not None:
221
+ img = resctrl.callback(img)
222
+ self.video_writer_list[video_id].append_data(np.array(img))
223
+
224
+ def close(self):
225
+ for video_writer in self.video_writer_list:
226
+ video_writer.close()
227
+ if self.save_sampled_video:
228
+ self.sampled_video_writer.close()
229
+ self.begin_frame_idx = 0
230
+
231
+
232
+ class SingleClipDataset(Dataset):
233
+
234
+ # data_aug_class = {
235
+ # "rsfnet": ColorDataAugmentation,
236
+ # "controlnet": ControlNetDataAugmentation
237
+ # }
238
+
239
+ def __init__(
240
+ self,
241
+ inversion_noise,
242
+ video_clip,
243
+ keyframe,
244
+ firstframe,
245
+ height,
246
+ width,
247
+ use_data_aug=None,
248
+ pad_to_fit=False,
249
+ keyframe_latent=None
250
+ ):
251
+
252
+ self.resctrl = ResolutionControl(video_clip.shape[-2:],(height,width),pad_to_fit,fill=-1)
253
+
254
+ video_clip = rearrange(video_clip, "1 f c h w -> f c h w")
255
+ keyframe = rearrange(keyframe, "1 f c h w -> f c h w")
256
+ firstframe = rearrange(firstframe, "1 f c h w -> f c h w")
257
+
258
+ if inversion_noise is not None:
259
+ inversion_noise = rearrange(inversion_noise, "1 f c h w -> f c h w")
260
+
261
+ if use_data_aug is not None:
262
+ if use_data_aug in self.data_aug_class:  # note: data_aug_class is commented out above, so use_data_aug must stay unset (~) in the configs
263
+ self.data_augment = self.data_aug_class[use_data_aug]()
264
+ use_data_aug = True
265
+ print(f"Augmentation mode: {use_data_aug} is implemented.")
266
+ else:
267
+ raise NotImplementedError(f"Augmentation mode: {use_data_aug} is not implemented!")
268
+ else:
269
+ use_data_aug = False
270
+
271
+ self.video_clip = video_clip
272
+ self.keyframe = keyframe
273
+ self.firstframe = firstframe
274
+ self.inversion_noise = inversion_noise
275
+ self.use_data_aug = use_data_aug
276
+ self.keyframe_latent = keyframe_latent
277
+
278
+ @staticmethod
279
+ def __getname__(): return 'single_clip'
280
+
281
+ def __len__(self):
282
+ return 1
283
+
284
+ def __getitem__(self, index):
285
+
286
+ motion_values = torch.Tensor([127.])
287
+
288
+ pixel_values = self.resctrl(self.video_clip)
289
+ refer_pixel_values = self.resctrl(self.keyframe)
290
+ cross_pixel_values = self.resctrl(self.firstframe)
291
+
292
+ if self.use_data_aug:
293
+ print("pixel_values before augment", refer_pixel_values.min(), refer_pixel_values.max())
294
+ #pixel_values, refer_pixel_values, cross_pixel_values = \
295
+ #self.data_augment.augment(
296
+ # torch.cat([pixel_values, refer_pixel_values, cross_pixel_values], dim=0)
297
+ #).tensor_split([pixel_values.shape[0],pixel_values.shape[0]+refer_pixel_values.shape[0]],dim=0)
298
+ refer_pixel_values = self.data_augment.augment(refer_pixel_values)
299
+ print("pixel_values after augment", refer_pixel_values.min(), refer_pixel_values.max())
300
+
301
+ outputs = {
302
+ "pixel_values": pixel_values,
303
+ "refer_pixel_values": refer_pixel_values,
304
+ "cross_pixel_values": cross_pixel_values,
305
+ "motion_values": motion_values,
306
+ 'dataset': self.__getname__(),
307
+ }
308
+
309
+ if self.inversion_noise is not None:
310
+ outputs.update({
311
+ "inversion_noise": self.inversion_noise
312
+ })
313
+ if self.keyframe_latent is not None:
314
+ outputs.update({
315
+ "refer_latents": self.keyframe_latent
316
+ })
317
+ return outputs
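
A short usage sketch of ResolutionControl above, with a dummy tensor standing in for real video frames, to illustrate the resize round trip (shapes are illustrative only):

import torch
from i2vedit.data import ResolutionControl

# Source frames: (batch, frames, channels, H, W) in [-1, 1], e.g. a 720p clip.
frames = torch.zeros(1, 14, 3, 720, 1280)

resctrl = ResolutionControl(
    input_res=(720, 1280),   # must match the frames' spatial size
    output_res=(576, 1024),  # working resolution for the model
    pad_to_fit=False,
    fill=-1,                 # padding value in normalized [-1, 1] space
)

model_input = resctrl(frames)             # (optionally padded and) resized
print(model_input.shape)                  # torch.Size([1, 14, 3, 576, 1024])
restored = resctrl.callback(model_input)  # back to the original resolution
print(restored.shape)                     # torch.Size([1, 14, 3, 720, 1280])
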
i2vedit/inference.py ADDED
@@ -0,0 +1,89 @@
1
+ import argparse
2
+ import os
3
+ import platform
4
+ import re
5
+ import warnings
6
+ import imageio
7
+ import random
8
+ from typing import Optional
9
+ from tqdm import trange
10
+ from einops import rearrange
11
+
12
+ import torch
13
+ from torch import Tensor
14
+ from torch.nn.functional import interpolate
15
+ from diffusers import StableVideoDiffusionPipeline, EulerDiscreteScheduler
16
+ from diffusers import TextToVideoSDPipeline
17
+
18
+ from i2vedit.train import export_to_video, handle_memory_attention, load_primary_models, unet_and_text_g_c, freeze_models
19
+ from i2vedit.utils.lora_handler import LoraHandler
20
+ from i2vedit.utils.model_utils import P2PStableVideoDiffusionPipeline
21
+
22
+
23
+ def initialize_pipeline(
24
+ model: str,
25
+ device: str = "cuda",
26
+ xformers: bool = False,
27
+ sdp: bool = False,
28
+ lora_path: str = "",
29
+ lora_rank: int = 64,
30
+ lora_scale: float = 1.0,
31
+ load_spatial_lora: bool = False,
32
+ dtype = torch.float16
33
+ ):
34
+ with warnings.catch_warnings():
35
+ warnings.simplefilter("ignore")
36
+
37
+ scheduler, feature_extractor, image_encoder, vae, unet = load_primary_models(model)
38
+
39
+ # Freeze any necessary models
40
+ freeze_models([vae, image_encoder, unet])
41
+
42
+ # Enable xformers if available
43
+ handle_memory_attention(xformers, sdp, unet)
44
+
45
+ lora_manager_temporal = LoraHandler(
46
+ version="cloneofsimo",
47
+ use_unet_lora=True,
48
+ use_image_lora=False,
49
+ save_for_webui=False,
50
+ only_for_webui=False,
51
+ unet_replace_modules=["TemporalBasicTransformerBlock"],
52
+ image_encoder_replace_modules=None,
53
+ lora_bias=None
54
+ )
55
+
56
+ unet_lora_params, unet_negation = lora_manager_temporal.add_lora_to_model(
57
+ True, unet, lora_manager_temporal.unet_replace_modules, 0, lora_path, r=lora_rank, scale=lora_scale)
58
+
59
+ if load_spatial_lora:
60
+ lora_manager_spatial = LoraHandler(
61
+ version="cloneofsimo",
62
+ use_unet_lora=True,
63
+ use_image_lora=False,
64
+ save_for_webui=False,
65
+ only_for_webui=False,
66
+ unet_replace_modules=["BasicTransformerBlock"],
67
+ image_encoder_replace_modules=None,
68
+ lora_bias=None
69
+ )
70
+
71
+ spatial_lora_path = lora_path.replace("temporal", "spatial")
72
+ unet_lora_params, unet_negation = lora_manager_spatial.add_lora_to_model(
73
+ True, unet, lora_manager_spatial.unet_replace_modules, 0, spatial_lora_path, r=lora_rank, scale=lora_scale)
74
+
75
+ unet.eval()
76
+ image_encoder.eval()
77
+ unet_and_text_g_c(unet, image_encoder, False, False)
78
+
79
+ pipe = P2PStableVideoDiffusionPipeline.from_pretrained(
80
+ model,
81
+ scheduler=scheduler,
82
+ feature_extractor=feature_extractor,
83
+ image_encoder=image_encoder.to(device=device, dtype=dtype),
84
+ vae=vae.to(device=device, dtype=dtype),
85
+ unet=unet.to(device=device, dtype=dtype)
86
+ )
87
+ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
88
+
89
+ return pipe
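
A hypothetical call to initialize_pipeline above, mirroring the values used in the configs; the lora_path layout is an assumption and depends on how the training stage saves its temporal LoRA checkpoints:

import torch
from i2vedit.inference import initialize_pipeline

pipe = initialize_pipeline(
    model="stabilityai/stable-video-diffusion-img2vid",
    device="cuda",
    xformers=False,
    sdp=True,
    lora_path="outputs/train_motion_lora/checkpoint-500/temporal/lora.pt",  # hypothetical path
    lora_rank=32,     # lora_rank in the configs
    lora_scale=1.0,   # attention_matching_params.lora_scale
    load_spatial_lora=False,
    dtype=torch.float16,
)
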
i2vedit/prompt_attention/__init__.py ADDED
File without changes
i2vedit/prompt_attention/attention_register.py ADDED
@@ -0,0 +1,250 @@
1
+ """
2
+ register the attention controller into the UNet of stable diffusion
3
+ Build a customized attention function `_attention'
4
+ Replace the original attention function with `forward' and `spatial_temporal_forward' in attention_controlled_forward function
5
+ Most of spatial_temporal_forward is copied directly from `video_diffusion/models/attention.py'
6
+ TODO FIXME: merge redundant code with attention.py
7
+ """
8
+
9
+ import torch
10
+ from torch import nn
11
+ import torch.nn.functional as F
12
+ from torch.utils.checkpoint import checkpoint
13
+
14
+ import logging
15
+ from einops import rearrange, repeat
16
+ import math
17
+ from inspect import isfunction
18
+ from typing import Any, Optional
19
+ from packaging import version
20
+
21
+ from diffusers.models.attention_processor import AttnProcessor2_0, Attention
22
+ from diffusers.utils import USE_PEFT_BACKEND
23
+
24
+ class AttnControllerProcessor:
25
+
26
+ def __init__(self, consistency_controller, controller, place_in_unet, attention_type):
27
+
28
+ self.consistency_controller = consistency_controller
29
+ self.controller = controller
30
+ self.place_in_unet = place_in_unet
31
+ self.attention_type = attention_type
32
+
33
+ def __call__(
34
+ self,
35
+ attn: Attention,
36
+ hidden_states: torch.FloatTensor,
37
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
38
+ attention_mask: Optional[torch.FloatTensor] = None,
39
+ temb: Optional[torch.FloatTensor] = None,
40
+ scale: float = 1.0,
41
+ ) -> torch.FloatTensor:
42
+ residual = hidden_states
43
+ if attn.spatial_norm is not None:
44
+ hidden_states = attn.spatial_norm(hidden_states, temb)
45
+
46
+ input_ndim = hidden_states.ndim
47
+
48
+ if input_ndim == 4:
49
+ batch_size, channel, height, width = hidden_states.shape
50
+ hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
51
+
52
+ batch_size, sequence_length, _ = (
53
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
54
+ )
55
+
56
+ if attention_mask is not None:
57
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
58
+ # scaled_dot_product_attention expects attention_mask shape to be
59
+ # (batch, heads, source_length, target_length)
60
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
61
+
62
+ if attn.group_norm is not None:
63
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
64
+
65
+ args = () if USE_PEFT_BACKEND else (scale,)
66
+ query = attn.to_q(hidden_states, *args)
67
+
68
+ is_cross = True
69
+ if encoder_hidden_states is None:
70
+ encoder_hidden_states = hidden_states
71
+ is_cross = False
72
+ elif attn.norm_cross:
73
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
74
+
75
+ key = attn.to_k(encoder_hidden_states, *args)
76
+ value = attn.to_v(encoder_hidden_states, *args)
77
+
78
+ inner_dim = key.shape[-1]
79
+ head_dim = inner_dim // attn.heads
80
+
81
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
82
+
83
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
84
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
85
+
86
+ if self.consistency_controller is not None:
87
+ key = self.consistency_controller(
88
+ key, is_cross, f"{self.place_in_unet}_{self.attention_type}_k"
89
+ )
90
+ value = self.consistency_controller(
91
+ value, is_cross, f"{self.place_in_unet}_{self.attention_type}_v"
92
+ )
93
+
94
+ # the output of sdp = (batch, num_heads, seq_len, head_dim)
95
+ # TODO: add support for attn.scale when we move to Torch 2.1
96
+ if self.controller is not None:
97
+ hidden_states = self.controller.attention_control(
98
+ self.place_in_unet, self.attention_type, is_cross,
99
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False,
100
+ )
101
+ else:
102
+ hidden_states = F.scaled_dot_product_attention(
103
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
104
+ )
105
+
106
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
107
+ hidden_states = hidden_states.to(query.dtype)
108
+
109
+ # linear proj
110
+ hidden_states = attn.to_out[0](hidden_states, *args)
111
+ # dropout
112
+ hidden_states = attn.to_out[1](hidden_states)
113
+
114
+ if input_ndim == 4:
115
+ hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
116
+
117
+ if attn.residual_connection:
118
+ hidden_states = hidden_states + residual
119
+
120
+ hidden_states = hidden_states / attn.rescale_output_factor
121
+
122
+ return hidden_states
123
+
124
+ def register_attention_control(
125
+ model,
126
+ controller=None,
127
+ consistency_controller=None,
128
+ find_modules = {},
129
+ consistency_find_modules = {},
130
+ undo=False
131
+ ):
132
+ "Connect a model with a controller"
133
+ class DummyController:
134
+
135
+ def __call__(self, *args):
136
+ return args[0]
137
+
138
+ def __init__(self):
139
+ self.num_att_layers = 0
140
+
141
+ #if controller is None:
142
+ # controller = DummyController()
143
+
144
+ f_keys = list(set(find_modules.keys()).difference(set(consistency_find_modules.keys())))
145
+ c_keys = list(set(consistency_find_modules.keys()).difference(set(find_modules.keys())))
146
+ common_keys = list(set(find_modules.keys()).intersection(set(consistency_find_modules.keys())))
147
+ new_find_modules = {}
148
+ for f_key in f_keys:
149
+ new_find_modules.update({
150
+ f_key: find_modules[f_key]
151
+ })
152
+ new_consistency_find_modules = {}
153
+ for c_key in c_keys:
154
+ new_consistency_find_modules.update({
155
+ c_key: consistency_find_modules[c_key]
156
+ })
157
+ common_modules = {}
158
+ for key in common_keys:
159
+ find_modules[key] = [] if find_modules[key] is None else find_modules[key]
160
+ consistency_find_modules[key] = [] if consistency_find_modules[key] is None else consistency_find_modules[key]
161
+ f_list = list(set(find_modules[key]).difference(set(consistency_find_modules[key])))
162
+ c_list = list(set(consistency_find_modules[key]).difference(set(find_modules[key])))
163
+ common_list = list(set(find_modules[key]).intersection(set(consistency_find_modules[key])))
164
+ if len(f_list) > 0:
165
+ new_find_modules.update({key: f_list})
166
+ if len(c_list) > 0:
167
+ new_consistency_find_modules.update({key: c_list})
168
+ if len(common_list) > 0:
169
+ common_modules.update({key: common_list})
170
+
171
+ find_modules = new_find_modules
172
+ consistency_find_modules = new_consistency_find_modules
173
+
174
+ print("common_modules", common_modules)
175
+ print("find_modules", find_modules)
176
+ print("consistency_find_modules", consistency_find_modules)
177
+ print("controller", controller, "consistency_controller", consistency_controller)
178
+
179
+ def register_recr(net_, count1, count2, place_in_unet):
180
+
181
+ if net_[1].__class__.__name__ == 'BasicTransformerBlock':
182
+ attention_type = 'spatial'
183
+ elif net_[1].__class__.__name__ == 'TemporalBasicTransformerBlock':
184
+ attention_type = 'temporal'
185
+
186
+ control1, control2 = None, None
187
+ if net_[1].__class__.__name__ in common_modules.keys():
188
+ control1, control2 = consistency_controller, controller
189
+ module_list = common_modules[net_[1].__class__.__name__]
190
+ elif net_[1].__class__.__name__ in find_modules.keys():
191
+ control1, control2 = None, controller
192
+ module_list = find_modules[net_[1].__class__.__name__]
193
+ elif net_[1].__class__.__name__ in consistency_find_modules.keys():
194
+ control1, control2 = consistency_controller, None
195
+ module_list = consistency_find_modules[net_[1].__class__.__name__]
196
+
197
+ if any([control is not None for control in [control1, control2]]):
198
+
199
+ if module_list is not None and 'attn1' in module_list:
200
+ if undo:
201
+ net_[1].attn1.set_processor(AttnProcessor2_0())
202
+ else:
203
+ net_[1].attn1.set_processor(AttnControllerProcessor(control1, control2, place_in_unet, attention_type = attention_type))
204
+ if control1 is not None: count1 += 1
205
+ if control2 is not None: count2 += 1
206
+
207
+ if module_list is not None and 'attn2' in module_list:
208
+ if undo:
209
+ net_[1].attn2.set_processor(AttnProcessor2_0())
210
+ else:
211
+ net_[1].attn2.set_processor(AttnControllerProcessor(control1, control2, place_in_unet, attention_type = attention_type))
212
+ if control1 is not None: count1 += 1
213
+ if control2 is not None: count2 += 1
214
+
215
+ return count1, count2
216
+
217
+ elif hasattr(net_[1], 'children'):
218
+ for net in net_[1].named_children():
219
+ count1, count2 = register_recr(net, count1, count2, place_in_unet)
220
+
221
+ return count1, count2
222
+
223
+ cross_att_count1 = 0
224
+ cross_att_count2 = 0
225
+ sub_nets = model.named_children()
226
+ for net in sub_nets:
227
+ if "down" in net[0]:
228
+ c1, c2 = register_recr(net, 0, 0, "down")
229
+ cross_att_count1 += c1
230
+ cross_att_count2 += c2
231
+ elif "up" in net[0]:
232
+ c1, c2 = register_recr(net, 0, 0, "up")
233
+ cross_att_count1 += c1
234
+ cross_att_count2 += c2
235
+ elif "mid" in net[0]:
236
+ c1, c2 = register_recr(net, 0, 0, "mid")
237
+ cross_att_count1 += c1
238
+ cross_att_count2 += c2
239
+ if undo:
240
+ print(f"Number of attention layer unregistered for controller: {cross_att_count2}")
241
+ print(f"Number of attention layer unregistered for consistency_controller: {cross_att_count1}")
242
+ else:
243
+ print(f"Number of attention layer registered for controller: {cross_att_count2}")
244
+ if controller is not None:
245
+ controller.num_att_layers = cross_att_count2
246
+ print(f"Number of attention layer registered for consistency_controller: {cross_att_count1}")
247
+ if consistency_controller is not None:
248
+ consistency_controller.num_att_layers = cross_att_count1
249
+
250
+
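
A short wiring sketch for register_attention_control above, using the same registered_modules layout as the configs (attn1 of both block types); loading the SVD UNet is only needed to have something to register against:

from diffusers import UNetSpatioTemporalConditionModel
from i2vedit.prompt_attention.attention_register import register_attention_control
from i2vedit.prompt_attention.attention_store import AttentionStore  # defined in the next file

unet = UNetSpatioTemporalConditionModel.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid", subfolder="unet"
)
store = AttentionStore(save_self_attention=True, save_latents=False, disk_store=False)
modules = {"BasicTransformerBlock": ["attn1"], "TemporalBasicTransformerBlock": ["attn1"]}

# Swap in AttnControllerProcessor on the matching attention modules...
register_attention_control(unet, controller=store, find_modules=modules)
# ...run inversion / editing, then restore the default AttnProcessor2_0.
register_attention_control(unet, controller=store, find_modules=modules, undo=True)
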
i2vedit/prompt_attention/attention_store.py ADDED
@@ -0,0 +1,305 @@
1
+ """
2
+ Code of attention storer AttentionStore, which is a base class for attention editor in attention_util.py
3
+
4
+ """
5
+
6
+ import abc
7
+ import os
8
+ import copy
9
+ import shutil
10
+ import torch
11
+ import torch.nn.functional as F
12
+ from packaging import version
13
+ from einops import rearrange
14
+ import math
15
+
16
+ from i2vedit.prompt_attention.common.util import get_time_string
17
+
18
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
19
+ SDP_IS_AVAILABLE = True
20
+ from torch.backends.cuda import SDPBackend, sdp_kernel
21
+
22
+ BACKEND_MAP = {
23
+ SDPBackend.MATH: {
24
+ "enable_math": True,
25
+ "enable_flash": False,
26
+ "enable_mem_efficient": False,
27
+ },
28
+ SDPBackend.FLASH_ATTENTION: {
29
+ "enable_math": False,
30
+ "enable_flash": True,
31
+ "enable_mem_efficient": False,
32
+ },
33
+ SDPBackend.EFFICIENT_ATTENTION: {
34
+ "enable_math": False,
35
+ "enable_flash": False,
36
+ "enable_mem_efficient": True,
37
+ },
38
+ None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
39
+ }
40
+ else:
41
+ from contextlib import nullcontext
42
+
43
+ SDP_IS_AVAILABLE = False
44
+ sdp_kernel = nullcontext
45
+ BACKEND_MAP = {}
46
+ logpy.warn(
47
+ f"No SDP backend available, likely because you are running in pytorch "
48
+ f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
49
+ f"You might want to consider upgrading."
50
+ )
51
+
52
+ class AttentionControl(abc.ABC):
53
+
54
+ def step_callback(self, x_t):
55
+ self.cur_att_layer = 0
56
+ self.cur_step += 1
57
+ self.between_steps()
58
+ return x_t
59
+
60
+ def between_steps(self):
61
+ return
62
+
63
+ @property
64
+ def num_uncond_att_layers(self):
65
+ """I guess the diffusion of google has some unconditional attention layer
66
+ No unconditional attention layer in Stable diffusion
67
+
68
+ Returns:
69
+ _type_: _description_
70
+ """
71
+ # return self.num_att_layers if config_dict['LOW_RESOURCE'] else 0
72
+ return 0
73
+
74
+ @abc.abstractmethod
75
+ def forward (self, attn, is_cross: bool, place_in_unet: str):
76
+ raise NotImplementedError
77
+
78
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
79
+ if self.cur_att_layer >= self.num_uncond_att_layers:
80
+ if self.LOW_RESOURCE or 'mask' in place_in_unet:
81
+ # For inversion without null text file
82
+ attn = self.forward(attn, is_cross, place_in_unet)
83
+ else:
84
+ # For classifier-free guidance scale!=1
85
+ h = attn.shape[0]
86
+ attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
87
+ self.cur_att_layer += 1
88
+
89
+ return attn
90
+
91
+ def reset(self):
92
+ self.cur_step = 0
93
+ self.cur_att_layer = 0
94
+
95
+ def __init__(self,
96
+ ):
97
+ self.LOW_RESOURCE = False # assume the edit have cfg
98
+ self.cur_step = 0
99
+ self.num_att_layers = -1
100
+ self.cur_att_layer = 0
101
+
102
+
103
+ class AttentionStore(AttentionControl):
104
+ def step_callback(self, x_t):
105
+
106
+ x_t = super().step_callback(x_t)
107
+ if self.save_latents:
108
+ self.latents_store.append(x_t.cpu().detach())
109
+ return x_t
110
+
111
+ @staticmethod
112
+ def get_empty_store():
113
+ return {"down_spatial_q_cross": [], "mid_spatial_q_cross": [], "up_spatial_q_cross": [],
114
+ "down_spatial_k_cross": [], "mid_spatial_k_cross": [], "up_spatial_k_cross": [],
115
+ "down_spatial_mask_cross": [], "mid_spatial_mask_cross": [], "up_spatial_mask_cross": [],
116
+ "down_temporal_cross": [], "mid_temporal_cross": [], "up_temporal_cross": [],
117
+ "down_spatial_q_self": [], "mid_spatial_q_self": [], "up_spatial_q_self": [],
118
+ "down_spatial_k_self": [], "mid_spatial_k_self": [], "up_spatial_k_self": [],
119
+ "down_spatial_mask_self": [], "mid_spatial_mask_self": [], "up_spatial_mask_self": [],
120
+ "down_spatial_self": [], "mid_spatial_self": [], "up_spatial_self": [],
121
+ "down_temporal_self": [], "mid_temporal_self": [], "up_temporal_self": []}
122
+
123
+ @staticmethod
124
+ def get_empty_cross_store():
125
+ return {"down_spatial_q_cross": [], "mid_spatial_q_cross": [], "up_spatial_q_cross": [],
126
+ "down_spatial_k_cross": [], "mid_spatial_k_cross": [], "up_spatial_k_cross": [],
127
+ "down_spatial_mask_cross": [], "mid_spatial_mask_cross": [], "up_spatial_mask_cross": [],
128
+ "down_temporal_cross": [], "mid_temporal_cross": [], "up_temporal_cross": [],
129
+ }
130
+
131
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
132
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
133
+ if attn.shape[-2] <= 8*9*8*16: # avoid memory overload
134
+ # print(f"Store attention map {key} of shape {attn.shape}")
135
+ if (is_cross or self.save_self_attention or 'mask' in key):
136
+ if False:#attn.shape[-2] >= 4*9*4*16:
137
+ append_tensor = attn.cpu().detach()
138
+ else:
139
+ append_tensor = attn
140
+ self.step_store[key].append(copy.deepcopy(append_tensor))
141
+ # FIXME: Are these deepcopy all necessary?
142
+ # self.step_store[key].append(append_tensor)
143
+ return attn
144
+
145
+ def between_steps(self):
146
+ if len(self.attention_store) == 0:
147
+ self.attention_store = {key: self.step_store[key] for key in self.step_store if 'mask' in key}
148
+ else:
149
+ for key in self.attention_store:
150
+ if 'mask' in key:
151
+ for i in range(len(self.attention_store[key])):
152
+ self.attention_store[key][i] += self.step_store[key][i]
153
+
154
+ if self.disk_store:
155
+ path = self.store_dir + f'/{self.cur_step:03d}.pt'
156
+ if self.load_attention_store is None:
157
+ torch.save(copy.deepcopy(self.step_store), path)
158
+ self.attention_store_all_step.append(path)
159
+ else:
160
+ self.attention_store_all_step.append(copy.deepcopy(self.step_store))
161
+ self.step_store = self.get_empty_store()
162
+
163
+ def get_average_attention(self):
164
+ "divide the attention map value in attention store by denoising steps"
165
+ average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store if 'mask' in key}
166
+ return average_attention
167
+
168
+
169
+ def reset(self):
170
+ super(AttentionStore, self).reset()
171
+ self.step_store = self.get_empty_store()
172
+ self.attention_store_all_step = []
173
+ if self.disk_store:
174
+ if self.load_attention_store is not None:
175
+ flist = sorted(os.listdir(self.load_attention_store), key=lambda x: int(x[:-3]))
176
+ self.attention_store_all_step = [
177
+ os.path.join(self.load_attention_store, fn) for fn in flist
178
+ ]
179
+ self.attention_store = {}
180
+
181
+ def __init__(self,
182
+ save_self_attention:bool=True,
183
+ save_latents:bool=True,
184
+ disk_store=False,
185
+ load_attention_store:str=None,
186
+ store_path:str=None
187
+ ):
188
+ super(AttentionStore, self).__init__()
189
+ self.disk_store = disk_store
190
+ if load_attention_store is not None:
191
+ if not os.path.exists(load_attention_store):
192
+ print(f"can not load attentions from {load_attention_store}: file doesn't exist.")
193
+ load_attention_store = None
194
+ else:
195
+ assert self.disk_store, f"can not load attentions from {load_attention_store} because disk_store is disabled."
196
+ self.attention_store_all_step = []
197
+ if self.disk_store:
198
+ if load_attention_store is not None:
199
+ self.store_dir = load_attention_store
200
+ flist = sorted([fpath for fpath in os.listdir(load_attention_store) if "inverted" not in fpath], key=lambda x: int(x[:-3]))
201
+ self.attention_store_all_step = [
202
+ os.path.join(load_attention_store, fn) for fn in flist
203
+ ]
204
+ else:
205
+ if store_path is None:
206
+ time_string = get_time_string()
207
+ path = f'./trash/{self.__class__.__name__}_attention_cache_{time_string}'
208
+ else:
209
+ path = store_path
210
+ os.makedirs(path, exist_ok=True)
211
+ self.store_dir = path
212
+ else:
213
+ self.store_dir = None
214
+ self.step_store = self.get_empty_store()
215
+ self.attention_store = {}
216
+ self.save_self_attention = save_self_attention
217
+ self.latents_store = []
218
+
219
+ self.save_latents = save_latents
220
+ self.load_attention_store = load_attention_store
221
+
222
+ def delete(self):
223
+ if self.disk_store:
224
+ try:
225
+ shutil.rmtree(self.store_dir)
226
+ print(f"Successfully remove {self.store_dir}")
227
+ except:
228
+ print(f"Fail to remove {self.store_dir}")
229
+
230
+ def attention_control(
231
+ self, place_in_unet, attention_type, is_cross,
232
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
233
+ ):
234
+ if attention_type == "temporal":
235
+
236
+ return self.temporal_attention_control(
237
+ place_in_unet, attention_type, is_cross,
238
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
239
+ )
240
+
241
+ elif attention_type == "spatial":
242
+
243
+ return self.spatial_attention_control(
244
+ place_in_unet, attention_type, is_cross,
245
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
246
+ )
247
+
248
+ def temporal_attention_control(
249
+ self, place_in_unet, attention_type, is_cross,
250
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
251
+ ):
252
+
253
+ h = q.shape[1]
254
+ q, k, v = map(lambda t: rearrange(t, "b h n d -> (b h) n d"), (q, k, v))
255
+ attention_scores = torch.baddbmm(
256
+ torch.empty(q.shape[0], q.shape[1], k.shape[1], dtype=q.dtype, device=q.device),
257
+ q,
258
+ k.transpose(-1, -2),
259
+ beta=0,
260
+ alpha=1 / math.sqrt(q.size(-1)),
261
+ )
262
+
263
+ if attn_mask is not None:
264
+ if attn_mask.dtype == torch.bool:
265
+ attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
266
+ attention_scores = attention_scores + attn_mask
267
+
268
+ attention_probs = attention_scores.softmax(dim=-1)
269
+
270
+ # cast back to the original dtype
271
+ attention_probs = attention_probs.to(v.dtype)
272
+
273
+ # START OF CORE FUNCTION
274
+ # Record during inversion and edit the attention probs during editing
275
+ attention_probs = rearrange(
276
+ self.__call__(
277
+ rearrange(attention_probs, "(b h) n d -> b h n d", h=h),
278
+ is_cross,
279
+ f'{place_in_unet}_{attention_type}'
280
+ ),
281
+ "b h n d -> (b h) n d"
282
+ )
283
+ # END OF CORE FUNCTION
284
+
285
+ # compute attention output
286
+ hidden_states = torch.bmm(attention_probs, v)
287
+
288
+ # reshape hidden_states
289
+ hidden_states = rearrange(hidden_states, "(b h) n d -> b h n d", h=h)
290
+
291
+ return hidden_states
292
+
293
+ def spatial_attention_control(
294
+ self, place_in_unet, attention_type, is_cross,
295
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
296
+ ):
297
+
298
+ q = self.__call__(q, is_cross, f"{place_in_unet}_{attention_type}_q")
299
+ k = self.__call__(k, is_cross, f"{place_in_unet}_{attention_type}_k")
300
+
301
+ hidden_states = F.scaled_dot_product_attention(
302
+ q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal
303
+ )
304
+
305
+ return hidden_states
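The attention store above is normally driven through UNet hooks, so a small usage sketch may help. The snippet below is not part of the commit; it only uses the constructor and methods visible in this diff, and the cache path and tensor shape are illustrative assumptions.

# Hedged sketch: driving AttentionStore by hand (assumed path and dummy shape).
import torch
from i2vedit.prompt_attention.attention_store import AttentionStore

store = AttentionStore(
    save_self_attention=True,
    save_latents=True,
    disk_store=True,                     # per-step stores are torch.save()-d under store_path
    store_path="./cache/attn_example",   # hypothetical cache directory
)

# Attention layers call the store like a controller with (attn, is_cross, place_in_unet);
# "down_temporal" + "_self" matches one of the buckets returned by get_empty_store().
attn_probs = torch.rand(1, 8, 72, 72)    # (batch, heads, query_len, key_len), illustrative
_ = store(attn_probs, is_cross=False, place_in_unet="down_temporal")

store.between_steps()                    # flush the current step to disk and reset step_store
print(len(store.attention_store_all_step))   # one .pt path per completed denoising step
store.delete()                           # removes store_dir once the maps are no longer needed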
i2vedit/prompt_attention/attention_util.py ADDED
@@ -0,0 +1,621 @@
1
+ """
2
+ Collect all functions in the prompt_attention folder.
3
+ Provide an API `make_controller` that returns an initialized AttentionControlEdit object for the main validation loop.
4
+ """
5
+
6
+ from typing import Optional, Union, Tuple, List, Dict
7
+ import abc
8
+ import numpy as np
9
+ import copy
10
+ import math
11
+ from einops import rearrange
12
+ import os
13
+
14
+ import torch
15
+ import torch.nn as nn
16
+ import torch.nn.functional as F
17
+ from packaging import version
18
+
19
+ from i2vedit.prompt_attention.visualization import (
20
+ show_cross_attention,
21
+ show_self_attention_comp,
22
+ show_self_attention,
23
+ show_self_attention_distance,
24
+ calculate_attention_mask,
25
+ show_avg_difference_maps
26
+ )
27
+ from i2vedit.prompt_attention.attention_store import AttentionStore, AttentionControl
28
+ from i2vedit.prompt_attention.attention_register import register_attention_control
29
+
30
+ device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
31
+
32
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
33
+ SDP_IS_AVAILABLE = True
34
+ from torch.backends.cuda import SDPBackend, sdp_kernel
35
+
36
+ BACKEND_MAP = {
37
+ SDPBackend.MATH: {
38
+ "enable_math": True,
39
+ "enable_flash": False,
40
+ "enable_mem_efficient": False,
41
+ },
42
+ SDPBackend.FLASH_ATTENTION: {
43
+ "enable_math": False,
44
+ "enable_flash": True,
45
+ "enable_mem_efficient": False,
46
+ },
47
+ SDPBackend.EFFICIENT_ATTENTION: {
48
+ "enable_math": False,
49
+ "enable_flash": False,
50
+ "enable_mem_efficient": True,
51
+ },
52
+ None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
53
+ }
54
+ else:
55
+ from contextlib import nullcontext
56
+
57
+ SDP_IS_AVAILABLE = False
58
+ sdp_kernel = nullcontext
59
+ BACKEND_MAP = {}
60
+ print(  # console warning; no logger is configured in this module
61
+ f"No SDP backend available, likely because you are running in pytorch "
62
+ f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
63
+ f"You might want to consider upgrading."
64
+ )
65
+
66
+
67
+
68
+ class EmptyControl:
69
+
70
+
71
+ def step_callback(self, x_t):
72
+ return x_t
73
+
74
+ def between_steps(self):
75
+ return
76
+
77
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
78
+ return attn
79
+
80
+
81
+ class AttentionControlEdit(AttentionStore, abc.ABC):
82
+ """Decide self or cross-attention. Call the reweighting cross attention module
83
+
84
+ Args:
85
+ AttentionStore (_type_): ([1, 4, 8, 64, 64])
86
+ abc (_type_): [8, 8, 1024, 77]
87
+ """
88
+ def get_all_last_latents(self, overlay_size):
89
+ return [latents[:,-overlay_size:,...] for latents in self.latents_store]
90
+
91
+
92
+ def step_callback(self, x_t):
93
+ x_t = super().step_callback(x_t)
94
+ x_t_device = x_t.device
95
+ x_t_dtype = x_t.dtype
96
+
97
+ # if self.previous_latents is not None:
98
+ # # replace latents
99
+ # step_in_store = self.cur_step - 1
100
+ # previous_latents = self.previous_latents[step_in_store]
101
+ # x_t[:,:len(previous_latents),...] = previous_latents.to(x_t_device, x_t_dtype)
102
+ if self.latent_blend:
103
+
104
+ avg_attention = self.get_average_attention()
105
+ masks = []
106
+ for key in avg_attention:
107
+ if 'down' in key and 'mask' in key:
108
+ for attn in avg_attention[key]:
109
+ if attn.shape[-2] == 8 * 9:
110
+ masks.append( attn )
111
+ mask = sum(masks) / len(masks)
112
+ mask[mask > 0.2] = 1.0
113
+ if self.use_inversion_attention and self.additional_attention_store is not None:
114
+ step_in_store = len(self.additional_attention_store.latents_store) - self.cur_step
115
+ elif self.additional_attention_store is None:
116
+ pass
117
+ else:
118
+ step_in_store = self.cur_step - 1
119
+
120
+ inverted_latents = self.additional_attention_store.latents_store[step_in_store]
121
+ inverted_latents = inverted_latents.to(device =x_t_device, dtype=x_t_dtype)
122
+
123
+ x_t = (1 - mask) * inverted_latents + mask * x_t
124
+
125
+ self.step_in_store_atten_dict = None
126
+
127
+ return x_t
128
+
129
+ def replace_self_attention(self, attn_base, attn_replace, reshaped_mask=None, key=None):
130
+
131
+ target_device = attn_replace.device
132
+ target_dtype = attn_replace.dtype
133
+ attn_base = attn_base.to(target_device, dtype=target_dtype)
134
+
135
+ if "temporal" in key:
136
+
137
+ if self.control_mode["temporal_self"] == "copy_v2":
138
+ if self.cur_step < int(self.temporal_step_thr[0] * self.num_steps):
139
+ return attn_base
140
+ if self.cur_step >= int(self.temporal_step_thr[1] * self.num_steps):
141
+ return attn_replace
142
+ if ('down' in key and self.current_pos<4) or \
143
+ ('up' in key and self.current_pos>1):
144
+ return attn_replace
145
+ return attn_base
146
+
147
+ else:
148
+ raise NotImplementedError
149
+
150
+ elif "spatial" in key:
151
+
152
+ raise NotImplementedError
153
+
154
+ def replace_cross_attention(self, attn_base, attn_replace, key=None):
155
+ raise NotImplementedError
156
+
157
+ def update_attention_position_dict(self, current_attention_key):
158
+ self.attention_position_counter_dict[current_attention_key] +=1
159
+
160
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
161
+ super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
162
+
163
+ if 'mask' in place_in_unet:
164
+ return attn
165
+
166
+ if (not is_cross and 'temporal' in place_in_unet and (self.cur_step < self.num_temporal_self_replace[0] or self.cur_step >=self.num_temporal_self_replace[1])):
167
+ if self.control_mode["temporal_self"] == "copy" or \
168
+ self.control_mode["temporal_self"] == "copy_v2":
169
+ return attn
170
+
171
+ if (not is_cross and 'spatial' in place_in_unet and (self.cur_step < self.num_spatial_self_replace[0] or self.cur_step >=self.num_spatial_self_replace[1])):
172
+ if self.control_mode["spatial_self"] == "copy":
173
+ return attn
174
+
175
+ if (is_cross and (self.cur_step < self.num_cross_replace[0] or self.cur_step >= self.num_cross_replace[1])):
176
+ return attn
177
+
178
+ if True:#'temporal' in place_in_unet:
179
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
180
+ current_pos = self.attention_position_counter_dict[key]
181
+
182
+ if self.use_inversion_attention and self.additional_attention_store is not None:
183
+ step_in_store = len(self.additional_attention_store.attention_store_all_step) - self.cur_step -1
184
+ elif self.additional_attention_store is None:
185
+ return attn
186
+
187
+ else:
188
+ step_in_store = self.cur_step
189
+
190
+ step_in_store_atten_dict = self.additional_attention_store.attention_store_all_step[step_in_store]
191
+
192
+ if isinstance(step_in_store_atten_dict, str):
193
+ if self.step_in_store_atten_dict is None:
194
+ step_in_store_atten_dict = torch.load(step_in_store_atten_dict)
195
+ self.step_in_store_atten_dict = step_in_store_atten_dict
196
+ else:
197
+ step_in_store_atten_dict = self.step_in_store_atten_dict
198
+
199
+ # Note that attn is appended to step_store;
201
+ # if attn was recorded clean -> noisy, the step order has to be reversed here.
201
+ #print(key)
202
+ attn_base = step_in_store_atten_dict[key][current_pos]
203
+ self.current_pos = current_pos
204
+
205
+ self.update_attention_position_dict(key)
206
+ # save in format of [temporal, head, resolution, text_embedding]
207
+ attn_base, attn_replace = attn_base, attn
208
+
209
+ if not is_cross:
210
+ attn = self.replace_self_attention(attn_base, attn_replace, None, key)
211
+
212
+ #elif is_cross and (self.num_cross_replace[0] <= self.cur_step < self.num_cross_replace[1]):
213
+ elif is_cross:
214
+ attn = self.replace_cross_attention(attn_base, attn_replace, key)
215
+
216
+ return attn
217
+
218
+ else:
219
+
220
+ raise NotImplementedError("Due to CUDA RAM limit, direct replace functions for spatial are not implemented.")
221
+
222
+ def between_steps(self):
223
+
224
+ super().between_steps()
225
+
226
+
227
+
228
+ self.step_store = self.get_empty_store()
229
+
230
+ self.attention_position_counter_dict = {
231
+ 'down_spatial_q_cross': 0,
232
+ 'mid_spatial_q_cross': 0,
233
+ 'up_spatial_q_cross': 0,
234
+ 'down_spatial_k_cross': 0,
235
+ 'mid_spatial_k_cross': 0,
236
+ 'up_spatial_k_cross': 0,
237
+ 'down_spatial_mask_cross': 0,
238
+ 'mid_spatial_mask_cross': 0,
239
+ 'up_spatial_mask_cross': 0,
240
+ 'down_spatial_q_self': 0,
241
+ 'mid_spatial_q_self': 0,
242
+ 'up_spatial_q_self': 0,
243
+ 'down_spatial_k_self': 0,
244
+ 'mid_spatial_k_self': 0,
245
+ 'up_spatial_k_self': 0,
246
+ 'down_spatial_mask_self': 0,
247
+ 'mid_spatial_mask_self': 0,
248
+ 'up_spatial_mask_self': 0,
249
+ 'down_temporal_cross': 0,
250
+ 'mid_temporal_cross': 0,
251
+ 'up_temporal_cross': 0,
252
+ 'down_temporal_self': 0,
253
+ 'mid_temporal_self': 0,
254
+ 'up_temporal_self': 0
255
+ }
256
+ return
257
+
258
+ def __init__(self, num_steps: int,
259
+ cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
260
+ temporal_self_replace_steps: Union[float, Tuple[float, float]],
261
+ spatial_self_replace_steps: Union[float, Tuple[float, float]],
262
+ control_mode={"temporal_self":"copy","spatial_self":"copy"},
263
+ spatial_attention_chunk_size = 1,
264
+ additional_attention_store: AttentionStore =None,
265
+ use_inversion_attention: bool=False,
266
+ save_self_attention: bool=True,
267
+ save_latents: bool=True,
268
+ disk_store=False,
269
+ *args, **kwargs
270
+ ):
271
+ super(AttentionControlEdit, self).__init__(
272
+ save_self_attention=save_self_attention,
273
+ save_latents=save_latents,
274
+ disk_store=disk_store)
275
+ self.additional_attention_store = additional_attention_store
276
+ if type(temporal_self_replace_steps) is float:
277
+ temporal_self_replace_steps = 0, temporal_self_replace_steps
278
+ if type(spatial_self_replace_steps) is float:
279
+ spatial_self_replace_steps = 0, spatial_self_replace_steps
280
+ if type(cross_replace_steps) is float:
281
+ cross_replace_steps = 0, cross_replace_steps
282
+ self.num_temporal_self_replace = int(num_steps * temporal_self_replace_steps[0]), int(num_steps * temporal_self_replace_steps[1])
283
+ self.num_spatial_self_replace = int(num_steps * spatial_self_replace_steps[0]), int(num_steps * spatial_self_replace_steps[1])
284
+ self.num_cross_replace = int(num_steps * cross_replace_steps[0]), int(num_steps * cross_replace_steps[1])
285
+ self.control_mode = control_mode
286
+ self.spatial_attention_chunk_size = spatial_attention_chunk_size
287
+ self.step_in_store_atten_dict = None
288
+ # We need to know the current position in attention
289
+ self.prev_attention_key_name = 0
290
+ self.use_inversion_attention = use_inversion_attention
291
+ self.attention_position_counter_dict = {
292
+ 'down_spatial_q_cross': 0,
293
+ 'mid_spatial_q_cross': 0,
294
+ 'up_spatial_q_cross': 0,
295
+ 'down_spatial_k_cross': 0,
296
+ 'mid_spatial_k_cross': 0,
297
+ 'up_spatial_k_cross': 0,
298
+ 'down_spatial_mask_cross': 0,
299
+ 'mid_spatial_mask_cross': 0,
300
+ 'up_spatial_mask_cross': 0,
301
+ 'down_spatial_q_self': 0,
302
+ 'mid_spatial_q_self': 0,
303
+ 'up_spatial_q_self': 0,
304
+ 'down_spatial_k_self': 0,
305
+ 'mid_spatial_k_self': 0,
306
+ 'up_spatial_k_self': 0,
307
+ 'down_spatial_mask_self': 0,
308
+ 'mid_spatial_mask_self': 0,
309
+ 'up_spatial_mask_self': 0,
310
+ 'down_temporal_cross': 0,
311
+ 'mid_temporal_cross': 0,
312
+ 'up_temporal_cross': 0,
313
+ 'down_temporal_self': 0,
314
+ 'mid_temporal_self': 0,
315
+ 'up_temporal_self': 0
316
+ }
317
+ self.mask_thr = kwargs.get("mask_thr", 0.35)
318
+ self.latent_blend = kwargs.get('latent_blend', False)
319
+
320
+ self.temporal_step_thr = kwargs.get("temporal_step_thr", [0.4,0.8])
321
+ self.num_steps = num_steps
322
+
323
+ def spatial_attention_control(
324
+ self, place_in_unet, attention_type, is_cross,
325
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False
326
+ ):
327
+
328
+ return self.spatial_attention_matching(
329
+ place_in_unet, attention_type, is_cross,
330
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False,
331
+ mode = self.control_mode["spatial_self"]
332
+ )
333
+
334
+
335
+ def spatial_attention_matching(
336
+ self, place_in_unet, attention_type, is_cross,
337
+ q, k, v, attn_mask, dropout_p=0.0, is_causal=False,
338
+ mode = "matching"
339
+ ):
340
+ place_in_unet = f"{place_in_unet}_{attention_type}"
341
+ with sdp_kernel(**BACKEND_MAP[None]):
342
+ # print("register", q.shape, k.shape, v.shape)
343
+
344
+ # fetch inversion q and k
345
+ key_q = f"{place_in_unet}_q_{'cross' if is_cross else 'self'}"
346
+ key_k = f"{place_in_unet}_k_{'cross' if is_cross else 'self'}"
347
+ current_pos_q = self.attention_position_counter_dict[key_q]
348
+ current_pos_k = self.attention_position_counter_dict[key_k]
349
+
350
+ if self.use_inversion_attention and self.additional_attention_store is not None:
351
+ step_in_store = len(self.additional_attention_store.attention_store_all_step) - self.cur_step -1
352
+ else:
353
+ step_in_store = self.cur_step
354
+
355
+ step_in_store_atten_dict = self.additional_attention_store.attention_store_all_step[step_in_store]
356
+
357
+ if isinstance(step_in_store_atten_dict, str):
358
+ if self.step_in_store_atten_dict is None:
359
+ step_in_store_atten_dict = torch.load(step_in_store_atten_dict)
360
+ self.step_in_store_atten_dict = step_in_store_atten_dict
361
+ else:
362
+ step_in_store_atten_dict = self.step_in_store_atten_dict
363
+
364
+ q0s = step_in_store_atten_dict[key_q][current_pos_q].to(q.device)
365
+ k0s = step_in_store_atten_dict[key_k][current_pos_k].to(k.device)
366
+
367
+ self.update_attention_position_dict(key_q)
368
+ self.update_attention_position_dict(key_k)
369
+
370
+ qs, ks, vs = q, k, v
371
+
372
+ h = q.shape[1]
373
+ res = int(np.sqrt(q.shape[-2] / (9*16)))
374
+ if res == 0:
375
+ res = 1
376
+ #res = int(np.sqrt(q.shape[-2] / (8*14)))
377
+ bs = self.spatial_attention_chunk_size
378
+ if bs is None: bs = qs.shape[0]
379
+ N = qs.shape[0] // bs
380
+ assert qs.shape[0] % bs == 0
381
+ i1st, n1st = qs.shape[0]//2//bs, qs.shape[0]//2%bs
382
+ outs = []
383
+ masks = []
384
+
385
+ # this might reduce time costs but will introduce inaccurate motions
386
+ # if current_pos_q >= 6 and 'up' in key_q:
387
+ # return F.scaled_dot_product_attention(
388
+ # q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal
389
+ # )
390
+
391
+ for i in range(N):
392
+ q = qs[i*bs:(i+1)*bs,...].type(torch.float32)
393
+ k = ks[i*bs:(i+1)*bs,...].type(torch.float32)
394
+ v = vs[i*bs:(i+1)*bs,...].type(torch.float32)
395
+
396
+ q, k, v = map(lambda t: rearrange(t, "b h n d -> (b h) n d"), (q, k, v))
397
+
398
+ with torch.autocast("cuda", enabled=False):
399
+ attention_scores = torch.baddbmm(
400
+ torch.empty(q.shape[0], q.shape[1], k.shape[1], dtype=q.dtype, device=q.device),
401
+ q,
402
+ k.transpose(-1, -2),
403
+ beta=0,
404
+ alpha=1 / math.sqrt(q.size(-1)),
405
+ )
406
+
407
+ if attn_mask is not None:
408
+ if attn_mask.dtype == torch.bool:
409
+ attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
410
+ attention_scores = attention_scores + attn_mask
411
+
412
+ attention_probs = attention_scores.softmax(dim=-1).to(vs.dtype)
413
+
414
+ # only compute conditional output
415
+ if i >= N//2:
416
+
417
+ q0 = q0s[(i-N//2)*bs:(i-N//2+1)*bs,...].type(torch.float32)
418
+ k0 = k0s[(i-N//2)*bs:(i-N//2+1)*bs,...].type(torch.float32)
419
+
420
+ q0, k0 = map(lambda t: rearrange(t, "b h n d -> (b h) n d"), (q0, k0))
421
+
422
+ with torch.autocast("cuda", enabled=False):
423
+ attention_scores_0 = torch.baddbmm(
424
+ torch.empty(q0.shape[0], q0.shape[1], k0.shape[1], dtype=q0.dtype, device=q0.device),
425
+ q0,
426
+ k0.transpose(-1, -2),
427
+ beta=0,
428
+ alpha=1 / math.sqrt(q0.size(-1)),
429
+ )
430
+
431
+ attention_probs_0 = attention_scores_0.softmax(dim=-1).to(vs.dtype)
432
+
433
+ attention_probs, attention_probs_0 = \
434
+ map(lambda t: rearrange(t, "(b h) n d -> b h n d", h=h),
435
+ (attention_probs, attention_probs_0))
436
+
437
+ if mode == "masked_copy":
438
+
439
+ mask = torch.sum(
440
+ torch.mean(
441
+ torch.abs(attention_probs_0 - attention_probs),
442
+ dim=1
443
+ ),
444
+ dim=2
445
+ ).reshape(bs,1,-1,1).clamp(0,2)/2.0
446
+ mask_thr = (self.mask_thr[1]-self.mask_thr[0]) / (qs.shape[0]//2)*(i-N//2) + self.mask_thr[0]
447
+ mask_tmp = mask.clone()
448
+ mask[mask>=mask_thr] = 1.0
449
+ masks.append(mask)
450
+
451
+ # apply mask
452
+ attention_probs = (1 - mask) * attention_probs_0 + mask * attention_probs
453
+
454
+ else:
455
+ raise NotImplementedError
456
+
457
+ attention_probs = rearrange(attention_probs, "b h n d -> (b h) n d")
458
+
459
+ # compute attention output
460
+ hidden_states = torch.bmm(attention_probs, v)
461
+
462
+ # reshape hidden_states
463
+ hidden_states = rearrange(hidden_states, "(b h) n d -> b h n d", h=h)
464
+
465
+ outs.append(hidden_states)
466
+
467
+ if mode == "masked_copy":
468
+
469
+ # masks = rearrange(torch.cat(masks, 0), "b 1 (h w) 1 -> h (b w)", h=res*9)
470
+ masks = torch.cat(masks, 0)
471
+ #print(f"{place_in_unet}_masked_copy")
472
+ # save mask
473
+ _ = self.__call__(masks, is_cross, f"{place_in_unet}_mask")
474
+
475
+ return torch.cat(outs, 0)
476
+
477
+ class ConsistencyAttentionControl(AttentionStore, abc.ABC):
478
+ """Decide self or cross-attention. Call the reweighting cross attention module
479
+
480
+ Args:
481
+ AttentionStore (_type_): ([1, 4, 8, 64, 64])
482
+ abc (_type_): [8, 8, 1024, 77]
483
+ """
484
+ def step_callback(self, x_t):
485
+ x_t = super().step_callback(x_t)
486
+ x_t_device = x_t.device
487
+ x_t_dtype = x_t.dtype
488
+
489
+ # if self.previous_latents is not None:
490
+ # # replace latents
491
+ # step_in_store = self.cur_step - 1
492
+ # previous_latents = self.previous_latents[step_in_store]
493
+ # x_t[:,:len(previous_latents),...] = previous_latents.to(x_t_device, x_t_dtype)
494
+
495
+ self.step_in_store_atten_dict = None
496
+
497
+ return x_t
498
+
499
+ def update_attention_position_dict(self, current_attention_key):
500
+ self.attention_position_counter_dict[current_attention_key] +=1
501
+
502
+ def __call__(self, attn, is_cross: bool, place_in_unet: str):
503
+ if self.cur_att_layer >= self.num_uncond_att_layers:
504
+ attn = self.forward(attn, is_cross, place_in_unet)
505
+
506
+ self.cur_att_layer += 1
507
+
508
+ return attn
509
+
510
+ def set_cur_step(self, step: int = 0):
511
+ self.cur_step = step
512
+
513
+ def forward(self, attn, is_cross: bool, place_in_unet: str):
514
+ super(ConsistencyAttentionControl, self).forward(attn, is_cross, place_in_unet)
515
+
516
+ key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
517
+ current_pos = self.attention_position_counter_dict[key]
518
+
519
+ if self.use_inversion_attention and self.additional_attention_store is not None:
520
+ step_in_store = len(self.additional_attention_store.attention_store_all_step) - self.cur_step -1
521
+ elif self.additional_attention_store is None:
522
+ return attn
523
+
524
+ else:
525
+ step_in_store = self.cur_step
526
+
527
+ step_in_store_atten_dict = self.additional_attention_store.attention_store_all_step[step_in_store]
528
+
529
+ if isinstance(step_in_store_atten_dict, str):
530
+ if self.step_in_store_atten_dict is None:
531
+ step_in_store_atten_dict = torch.load(step_in_store_atten_dict)
532
+ self.step_in_store_atten_dict = step_in_store_atten_dict
533
+ else:
534
+ step_in_store_atten_dict = self.step_in_store_atten_dict
535
+
536
+ # Note that attn is appended to step_store;
537
+ # if attn was recorded clean -> noisy, the step order has to be reversed here.
538
+ #print("consistency", key)
539
+ attn_base = step_in_store_atten_dict[key][current_pos].to(attn.device, attn.dtype)
540
+ attn_base = attn_base.detach()
541
+
542
+ self.update_attention_position_dict(key)
543
+ # save in format of [temporal, head, resolution, text_embedding]
544
+
545
+ attn = torch.cat([attn_base, attn], dim=2)
546
+
547
+ return attn
548
+
549
+ @staticmethod
550
+ def get_empty_store():
551
+ return {
552
+ "down_temporal_k_self": [], "mid_temporal_k_self": [], "up_temporal_k_self": [],
553
+ "down_temporal_v_self": [], "mid_temporal_v_self": [], "up_temporal_v_self": []
554
+ }
555
+
556
+ def between_steps(self):
557
+
558
+ super().between_steps()
559
+
560
+ self.step_store = self.get_empty_store()
561
+
562
+ self.attention_position_counter_dict = {
563
+ 'down_temporal_k_self': 0,
564
+ 'mid_temporal_k_self': 0,
565
+ 'up_temporal_k_self': 0,
566
+ 'down_temporal_v_self': 0,
567
+ 'mid_temporal_v_self': 0,
568
+ 'up_temporal_v_self': 0
569
+ }
570
+ return
571
+
572
+ def __init__(self,
573
+ additional_attention_store: AttentionStore =None,
574
+ use_inversion_attention: bool=False,
575
+ load_attention_store: str = None,
576
+ save_self_attention: bool=True,
577
+ save_latents: bool=True,
578
+ disk_store=False,
579
+ store_path:str=None
580
+ ):
581
+ super(ConsistencyAttentionControl, self).__init__(
582
+ save_self_attention=save_self_attention,
583
+ load_attention_store=load_attention_store,
584
+ save_latents=save_latents,
585
+ disk_store=disk_store,
586
+ store_path=store_path
587
+ )
588
+
589
+ self.additional_attention_store = additional_attention_store
590
+ self.step_in_store_atten_dict = None
591
+ # We need to know the current position in attention
592
+ self.use_inversion_attention = use_inversion_attention
593
+ self.attention_position_counter_dict = {
594
+ 'down_temporal_k_self': 0,
595
+ 'mid_temporal_k_self': 0,
596
+ 'up_temporal_k_self': 0,
597
+ 'down_temporal_v_self': 0,
598
+ 'mid_temporal_v_self': 0,
599
+ 'up_temporal_v_self': 0
600
+ }
601
+
602
+
603
+
604
+ def make_controller(
605
+ cross_replace_steps: Dict[str, float], self_replace_steps: float=0.0,
606
+ additional_attention_store=None, use_inversion_attention = False,
607
+ NUM_DDIM_STEPS=None,
608
+ save_path = None,
609
+ save_self_attention = True,
610
+ disk_store = False
611
+ ) -> AttentionControlEdit:
612
+ controller = AttentionControlEdit(NUM_DDIM_STEPS,
613
+ cross_replace_steps=cross_replace_steps,
614
+ temporal_self_replace_steps=self_replace_steps, spatial_self_replace_steps=self_replace_steps,
615
+ additional_attention_store=additional_attention_store,
616
+ use_inversion_attention = use_inversion_attention,
617
+ save_self_attention = save_self_attention,
618
+ disk_store=disk_store
619
+ )
620
+ return controller
621
+
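For orientation, here is a hedged sketch of how the editing controller above could be paired with an inversion-time store; the step count, thresholds, and cache path are assumptions, not values taken from the repository.

# Hedged sketch: wiring an inversion AttentionStore into AttentionControlEdit.
from i2vedit.prompt_attention.attention_store import AttentionStore
from i2vedit.prompt_attention.attention_util import AttentionControlEdit

inversion_store = AttentionStore(
    disk_store=True,
    load_attention_store="./cache/attn_example",  # hypothetical directory written during inversion
)

controller = AttentionControlEdit(
    num_steps=30,                                  # must match the sampler's denoising steps
    cross_replace_steps=0.8,
    temporal_self_replace_steps=1.0,
    spatial_self_replace_steps=1.0,
    control_mode={"temporal_self": "copy_v2", "spatial_self": "masked_copy"},
    additional_attention_store=inversion_store,
    use_inversion_attention=True,
    mask_thr=(0.35, 0.6),                          # masked_copy reads this as a (low, high) pair
    temporal_step_thr=[0.4, 0.8],
)

# register_attention_control(...) from attention_register.py (not shown in this excerpt) is
# then expected to hook the UNet so the controller runs at every attention layer.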
i2vedit/prompt_attention/common/__init__.py ADDED
File without changes
i2vedit/prompt_attention/common/image_util.py ADDED
@@ -0,0 +1,192 @@
1
+ import os
2
+ import math
3
+ import textwrap
4
+
5
+ import imageio
6
+ import numpy as np
7
+ from typing import Sequence
8
+ import requests
9
+ import cv2
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import torch
13
+ from torchvision import transforms
14
+ from einops import rearrange
15
+
16
+
17
+ IMAGE_EXTENSION = (".jpg", ".jpeg", ".png", ".ppm", ".bmp", ".pgm", ".tif", ".tiff", ".webp", ".JPEG")
18
+
19
+ FONT_URL = "https://raw.github.com/googlefonts/opensans/main/fonts/ttf/OpenSans-Regular.ttf"
20
+ FONT_PATH = "./docs/OpenSans-Regular.ttf"
21
+
22
+
23
+ def pad(image: Image.Image, top=0, right=0, bottom=0, left=0, color=(255, 255, 255)) -> Image.Image:
24
+ new_image = Image.new(image.mode, (image.width + right + left, image.height + top + bottom), color)
25
+ new_image.paste(image, (left, top))
26
+ return new_image
27
+
28
+
29
+ def download_font_opensans(path=FONT_PATH):
30
+ font_url = FONT_URL
31
+ response = requests.get(font_url)
32
+ os.makedirs(os.path.dirname(path), exist_ok=True)
33
+ with open(path, "wb") as f:
34
+ f.write(response.content)
35
+
36
+
37
+ def annotate_image_with_font(image: Image.Image, text: str, font: ImageFont.FreeTypeFont) -> Image.Image:
38
+ image_w = image.width
39
+ _, _, text_w, text_h = font.getbbox(text)
40
+ line_size = math.floor(len(text) * image_w / text_w)
41
+
42
+ lines = textwrap.wrap(text, width=line_size)
43
+ padding = text_h * len(lines)
44
+ image = pad(image, top=padding + 3)
45
+
46
+ ImageDraw.Draw(image).text((0, 0), "\n".join(lines), fill=(0, 0, 0), font=font)
47
+ return image
48
+
49
+
50
+ def annotate_image(image: Image.Image, text: str, font_size: int = 15):
51
+ if not os.path.isfile(FONT_PATH):
52
+ download_font_opensans()
53
+ font = ImageFont.truetype(FONT_PATH, size=font_size)
54
+ return annotate_image_with_font(image=image, text=text, font=font)
55
+
56
+
57
+ def make_grid(images: Sequence[Image.Image], rows=None, cols=None) -> Image.Image:
58
+ if isinstance(images[0], np.ndarray):
59
+ images = [Image.fromarray(i) for i in images]
60
+
61
+ if rows is None:
62
+ assert cols is not None
63
+ rows = math.ceil(len(images) / cols)
64
+ else:
65
+ cols = math.ceil(len(images) / rows)
66
+
67
+ w, h = images[0].size
68
+ grid = Image.new("RGB", size=(cols * w, rows * h))
69
+ for i, image in enumerate(images):
70
+ if image.size != (w, h):
71
+ image = image.resize((w, h))
72
+ grid.paste(image, box=(i % cols * w, i // cols * h))
73
+ return grid
74
+
75
+
76
+ def save_images_as_gif(
77
+ images: Sequence[Image.Image],
78
+ save_path: str,
79
+ loop=0,
80
+ duration=100,
81
+ optimize=False,
82
+ ) -> None:
83
+
84
+ images[0].save(
85
+ save_path,
86
+ save_all=True,
87
+ append_images=images[1:],
88
+ optimize=optimize,
89
+ loop=loop,
90
+ duration=duration,
91
+ )
92
+
93
+ def save_images_as_mp4(
94
+ images: Sequence[Image.Image],
95
+ save_path: str,
96
+ ) -> None:
97
+
98
+ writer_edit = imageio.get_writer(
99
+ save_path,
100
+ fps=10)
101
+ for i in images:
102
+ init_image = i.convert("RGB")
103
+ writer_edit.append_data(np.array(init_image))
104
+ writer_edit.close()
105
+
106
+
107
+
108
+ def save_images_as_folder(
109
+ images: Sequence[Image.Image],
110
+ save_path: str,
111
+ ) -> None:
112
+ os.makedirs(save_path, exist_ok=True)
113
+ for index, image in enumerate(images):
114
+ init_image = image
115
+ if len(np.array(init_image).shape) == 3:
116
+ cv2.imwrite(os.path.join(save_path, f"{index:05d}.png"), np.array(init_image)[:, :, ::-1])
117
+ else:
118
+ cv2.imwrite(os.path.join(save_path, f"{index:05d}.png"), np.array(init_image))
119
+
120
+ def log_train_samples(
121
+ train_dataloader,
122
+ save_path,
123
+ num_batch: int = 4,
124
+ ):
125
+ train_samples = []
126
+ for idx, batch in enumerate(train_dataloader):
127
+ if idx >= num_batch:
128
+ break
129
+ train_samples.append(batch["images"])
130
+
131
+ train_samples = torch.cat(train_samples).numpy()
132
+ train_samples = rearrange(train_samples, "b c f h w -> b f h w c")
133
+ train_samples = (train_samples * 0.5 + 0.5).clip(0, 1)
134
+ train_samples = numpy_batch_seq_to_pil(train_samples)
135
+ train_samples = [make_grid(images, cols=int(np.ceil(np.sqrt(len(train_samples))))) for images in zip(*train_samples)]
136
+ # save_images_as_gif(train_samples, save_path)
137
+ save_gif_mp4_folder_type(train_samples, save_path)
138
+
139
+ def log_train_reg_samples(
140
+ train_dataloader,
141
+ save_path,
142
+ num_batch: int = 4,
143
+ ):
144
+ train_samples = []
145
+ for idx, batch in enumerate(train_dataloader):
146
+ if idx >= num_batch:
147
+ break
148
+ train_samples.append(batch["class_images"])
149
+
150
+ train_samples = torch.cat(train_samples).numpy()
151
+ train_samples = rearrange(train_samples, "b c f h w -> b f h w c")
152
+ train_samples = (train_samples * 0.5 + 0.5).clip(0, 1)
153
+ train_samples = numpy_batch_seq_to_pil(train_samples)
154
+ train_samples = [make_grid(images, cols=int(np.ceil(np.sqrt(len(train_samples))))) for images in zip(*train_samples)]
155
+ # save_images_as_gif(train_samples, save_path)
156
+ save_gif_mp4_folder_type(train_samples, save_path)
157
+
158
+
159
+ def save_gif_mp4_folder_type(images, save_path, save_gif=True):
160
+
161
+ if isinstance(images[0], np.ndarray):
162
+ images = [Image.fromarray(i) for i in images]
163
+ elif isinstance(images[0], torch.Tensor):
164
+ images = [transforms.ToPILImage()(i.cpu().clone()[0]) for i in images]
165
+ save_path_mp4 = save_path.replace('gif', 'mp4')
166
+ save_path_folder = save_path.replace('.gif', '')
167
+ if save_gif: save_images_as_gif(images, save_path)
168
+ save_images_as_mp4(images, save_path_mp4)
169
+ save_images_as_folder(images, save_path_folder)
170
+
171
+ # copy from video_diffusion/pipelines/stable_diffusion.py
172
+ def numpy_seq_to_pil(images):
173
+ """
174
+ Convert a numpy image or a batch of images to a PIL image.
175
+ """
176
+ if images.ndim == 3:
177
+ images = images[None, ...]
178
+ images = (images * 255).round().astype("uint8")
179
+ if images.shape[-1] == 1:
180
+ # special case for grayscale (single channel) images
181
+ pil_images = [Image.fromarray(image.squeeze(), mode="L") for image in images]
182
+ else:
183
+ pil_images = [Image.fromarray(image) for image in images]
184
+
185
+ return pil_images
186
+
187
+ # copy from diffusers-0.11.1/src/diffusers/pipeline_utils.py
188
+ def numpy_batch_seq_to_pil(images):
189
+ pil_images = []
190
+ for sequence in images:
191
+ pil_images.append(numpy_seq_to_pil(sequence))
192
+ return pil_images
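A hedged sketch of the helpers above on dummy frames; paths are illustrative, and annotate_image fetches the OpenSans font on first use.

# Hedged sketch: grid, caption and export helpers on three dummy frames.
import os
import numpy as np
from PIL import Image
from i2vedit.prompt_attention.common.image_util import (
    annotate_image, make_grid, save_gif_mp4_folder_type,
)

frames = [Image.fromarray(np.full((64, 64, 3), v, dtype=np.uint8)) for v in (0, 128, 255)]

grid = make_grid(frames, cols=3)                      # 1 x 3 grid of equally sized tiles
labeled = annotate_image(grid, "three dummy frames")  # pads the top and draws the caption

os.makedirs("./trash", exist_ok=True)
save_gif_mp4_folder_type(frames, "./trash/example.gif")  # writes a .gif, an .mp4 and a .png folder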
i2vedit/prompt_attention/common/instantiate_from_config.py ADDED
@@ -0,0 +1,33 @@
1
+ """
2
+ Copy from stable diffusion
3
+ """
4
+ import importlib
5
+
6
+
7
+ def instantiate_from_config(config:dict, **args_from_code):
8
+ """Util funciton to decompose differenct modules using config
9
+
10
+ Args:
11
+ config (dict): with key of "target" and "params", better from yaml
12
+ static
13
+ args_from_code: additional con
14
+
15
+
16
+ Returns:
17
+ a validation/training pipeline, a module
18
+ """
19
+ if not "target" in config:
20
+ if config == '__is_first_stage__':
21
+ return None
22
+ elif config == "__is_unconditional__":
23
+ return None
24
+ raise KeyError("Expected key `target` to instantiate.")
25
+ return get_obj_from_str(config["target"])(**config.get("params", dict()), **args_from_code)
26
+
27
+
28
+ def get_obj_from_str(string, reload=False):
29
+ module, cls = string.rsplit(".", 1)
30
+ if reload:
31
+ module_imp = importlib.import_module(module)
32
+ importlib.reload(module_imp)
33
+ return getattr(importlib.import_module(module, package=None), cls)
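The docstring above describes a "target"/"params" convention; below is a minimal, hedged example that uses a standard-library class as the target.

# Hedged sketch: instantiating an object from a config dict.
from i2vedit.prompt_attention.common.instantiate_from_config import instantiate_from_config

config = {
    "target": "collections.OrderedDict",  # any importable "module.Class" path
    "params": {"a": 1, "b": 2},
}
obj = instantiate_from_config(config, c=3)  # keyword args from code are merged in
print(obj)                                  # OrderedDict with keys a, b, c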
i2vedit/prompt_attention/common/logger.py ADDED
@@ -0,0 +1,17 @@
1
+ import os
2
+ import logging, logging.handlers
3
+ from accelerate.logging import get_logger
4
+
5
+ def get_logger_config_path(logdir):
6
+ # accelerate handles the logger in multiprocessing
7
+ logger = get_logger(__name__)
8
+ logging.basicConfig(
9
+ level=logging.INFO,
10
+ format='%(asctime)s:%(levelname)s : %(message)s',
11
+ datefmt='%a, %d %b %Y %H:%M:%S',
12
+ filename=os.path.join(logdir, 'log.log'),
13
+ filemode='w')
14
+ chlr = logging.StreamHandler()
15
+ chlr.setFormatter(logging.Formatter('%(asctime)s:%(levelname)s : %(message)s'))
16
+ logger.logger.addHandler(chlr)
17
+ return logger
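A hedged usage sketch for the logger helper; the run directory is an assumption and must exist before the call, since the helper only opens log.log inside it.

# Hedged sketch: file + console logging for a run directory.
import os
from i2vedit.prompt_attention.common.logger import get_logger_config_path

log_dir = "./outputs/example_run"    # hypothetical run directory
os.makedirs(log_dir, exist_ok=True)
logger = get_logger_config_path(log_dir)
logger.info("training started")      # goes to ./outputs/example_run/log.log and to stdout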
i2vedit/prompt_attention/common/set_seed.py ADDED
@@ -0,0 +1,28 @@
1
+ import os
2
+ os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'
3
+
4
+ import torch
5
+ import numpy as np
6
+ import random
7
+
8
+ from accelerate.utils import set_seed
9
+
10
+
11
+ def video_set_seed(seed: int):
12
+ """
13
+ Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch`.
14
+
15
+ Args:
16
+ seed (`int`): The seed to set.
17
+ device_specific (`bool`, *optional*, defaults to `False`):
18
+ Whether to differ the seed on each device slightly with `self.process_index`.
19
+ """
20
+ set_seed(seed)
21
+ random.seed(seed)
22
+ np.random.seed(seed)
23
+ torch.manual_seed(seed)
24
+ torch.cuda.manual_seed_all(seed)
25
+ torch.backends.cudnn.benchmark = False
26
+ # torch.use_deterministic_algorithms(True, warn_only=True)
27
+ # [W Context.cpp:82] Warning: efficient_attention_forward_cutlass does not have a deterministic implementation, but you set 'torch.use_deterministic_algorithms(True, warn_only=True)'. You can file an issue at https://github.com/pytorch/pytorch/issues to help us prioritize adding deterministic support for this operation. (function alertNotDeterministic)
28
+
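A short, hedged check that the seeding helper above makes repeated runs reproducible:

# Hedged sketch: same seed, same noise.
import torch
from i2vedit.prompt_attention.common.set_seed import video_set_seed

video_set_seed(33)
a = torch.randn(2, 2)
video_set_seed(33)
b = torch.randn(2, 2)
assert torch.equal(a, b)   # identical tensors after re-seeding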
i2vedit/prompt_attention/common/util.py ADDED
@@ -0,0 +1,73 @@
1
+ import os
2
+ import sys
3
+ import copy
4
+ import inspect
5
+ import datetime
6
+ from typing import List, Tuple, Optional, Dict
7
+
8
+
9
+ def glob_files(
10
+ root_path: str,
11
+ extensions: Tuple[str],
12
+ recursive: bool = True,
13
+ skip_hidden_directories: bool = True,
14
+ max_directories: Optional[int] = None,
15
+ max_files: Optional[int] = None,
16
+ relative_path: bool = False,
17
+ ) -> Tuple[List[str], bool, bool]:
18
+ """glob files with specified extensions
19
+
20
+ Args:
21
+ root_path (str): _description_
22
+ extensions (Tuple[str]): _description_
23
+ recursive (bool, optional): _description_. Defaults to True.
24
+ skip_hidden_directories (bool, optional): _description_. Defaults to True.
25
+ max_directories (Optional[int], optional): max number of directories to search. Defaults to None.
26
+ max_files (Optional[int], optional): max file number limit. Defaults to None.
27
+ relative_path (bool, optional): _description_. Defaults to False.
28
+
29
+ Returns:
30
+ Tuple[List[str], bool, bool]: _description_
31
+ """
32
+ paths = []
33
+ hit_max_directories = False
34
+ hit_max_files = False
35
+ for directory_idx, (directory, _, fnames) in enumerate(os.walk(root_path, followlinks=True)):
36
+ if skip_hidden_directories and os.path.basename(directory).startswith("."):
37
+ continue
38
+
39
+ if max_directories is not None and directory_idx >= max_directories:
40
+ hit_max_directories = True
41
+ break
42
+
43
+ paths += [
44
+ os.path.join(directory, fname)
45
+ for fname in sorted(fnames)
46
+ if fname.lower().endswith(extensions)
47
+ ]
48
+
49
+ if not recursive:
50
+ break
51
+
52
+ if max_files is not None and len(paths) > max_files:
53
+ hit_max_files = True
54
+ paths = paths[:max_files]
55
+ break
56
+
57
+ if relative_path:
58
+ paths = [os.path.relpath(p, root_path) for p in paths]
59
+
60
+ return paths, hit_max_directories, hit_max_files
61
+
62
+
63
+ def get_time_string() -> str:
64
+ x = datetime.datetime.now()
65
+ return f"{(x.year - 2000):02d}{x.month:02d}{x.day:02d}-{x.hour:02d}{x.minute:02d}{x.second:02d}"
66
+
67
+
68
+ def get_function_args() -> Dict:
69
+ frame = sys._getframe(1)
70
+ args, _, _, values = inspect.getargvalues(frame)
71
+ args_dict = copy.deepcopy({arg: values[arg] for arg in args})
72
+
73
+ return args_dict
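A hedged sketch of the file-globbing and timestamp helpers above; the data folder is an assumption.

# Hedged sketch: collect frame files and stamp a run id.
from i2vedit.prompt_attention.common.util import get_time_string, glob_files

paths, hit_max_dirs, hit_max_files = glob_files(
    root_path="./mydata",            # hypothetical folder of extracted frames
    extensions=(".jpg", ".png"),
    recursive=True,
    max_files=100,
    relative_path=True,
)
print(f"found {len(paths)} frames (truncated: {hit_max_files})")
print(f"run id: {get_time_string()}")   # e.g. "240611-142530" (YYMMDD-HHMMSS)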
i2vedit/prompt_attention/ptp_utils.py ADDED
@@ -0,0 +1,199 @@
1
+ '''
2
+ Utility code for image visualization.
3
+ '''
4
+
5
+
6
+ # Copyright 2022 Google LLC
7
+ #
8
+ # Licensed under the Apache License, Version 2.0 (the "License");
9
+ # you may not use this file except in compliance with the License.
10
+ # You may obtain a copy of the License at
11
+ #
12
+ # http://www.apache.org/licenses/LICENSE-2.0
13
+ #
14
+ # Unless required by applicable law or agreed to in writing, software
15
+ # distributed under the License is distributed on an "AS IS" BASIS,
16
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17
+ # See the License for the specific language governing permissions and
18
+ # limitations under the License.
19
+
20
+ import numpy as np
21
+ import torch
22
+ from PIL import Image
23
+ import cv2
24
+ from typing import Optional, Union, Tuple, List, Callable, Dict
25
+
26
+ import datetime
27
+
28
+
29
+ def text_under_image(image: np.ndarray, text: str, text_color: Tuple[int, int, int] = (0, 0, 0)):
30
+ h, w, c = image.shape
31
+ offset = int(h * .2)
32
+ img = np.ones((h + offset, w, c), dtype=np.uint8) * 255
33
+ font = cv2.FONT_HERSHEY_SIMPLEX
34
+ # font = ImageFont.truetype("/usr/share/fonts/truetype/noto/NotoMono-Regular.ttf", font_size)
35
+ img[:h] = image
36
+ textsize = cv2.getTextSize(text, font, 1, 2)[0]
37
+ text_x, text_y = (w - textsize[0]) // 2, h + offset - textsize[1] // 2
38
+ cv2.putText(img, text, (text_x, text_y ), font, 1, text_color, 2)
39
+ return img
40
+
41
+
42
+ def view_images(images, num_rows=1, offset_ratio=0.02, save_path=None):
43
+ if type(images) is list:
44
+ num_empty = len(images) % num_rows
45
+ elif images.ndim == 4:
46
+ num_empty = images.shape[0] % num_rows
47
+ else:
48
+ images = [images]
49
+ num_empty = 0
50
+
51
+ empty_images = np.ones(images[0].shape, dtype=np.uint8) * 255
52
+ images = [image.astype(np.uint8) for image in images] + [empty_images] * num_empty
53
+ num_items = len(images)
54
+
55
+ h, w, c = images[0].shape
56
+ offset = int(h * offset_ratio)
57
+ num_cols = num_items // num_rows
58
+ image_ = np.ones((h * num_rows + offset * (num_rows - 1),
59
+ w * num_cols + offset * (num_cols - 1), 3), dtype=np.uint8) * 255
60
+ for i in range(num_rows):
61
+ for j in range(num_cols):
62
+ image_[i * (h + offset): i * (h + offset) + h:, j * (w + offset): j * (w + offset) + w] = images[
63
+ i * num_cols + j]
64
+
65
+ if save_path is not None:
66
+ pil_img = Image.fromarray(image_)
67
+ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
68
+ pil_img.save(f'{save_path}/{now}.png')
69
+ # display(pil_img)
70
+
71
+
72
+
73
+ def register_attention_control_p2p_deprecated(model, controller):
74
+ "Original code from prompt to prompt"
75
+ def ca_forward(self, place_in_unet):
76
+ to_out = self.to_out
77
+ if type(to_out) is torch.nn.modules.container.ModuleList:
78
+ to_out = self.to_out[0]
79
+ else:
80
+ to_out = self.to_out
81
+
82
+ # def forward(x, encoder_hidden_states=None, attention_mask=None):
83
+ def forward(hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
84
+ batch_size, sequence_length, _ = hidden_states.shape
85
+ attention_mask = self.prepare_attention_mask(attention_mask, sequence_length, batch_size)
86
+
87
+ query = self.to_q(hidden_states)
88
+ query = self.head_to_batch_dim(query)
89
+
90
+ is_cross = encoder_hidden_states is not None
91
+ encoder_hidden_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
92
+ key = self.to_k(encoder_hidden_states)
93
+ value = self.to_v(encoder_hidden_states)
94
+ key = self.head_to_batch_dim(key)
95
+ value = self.head_to_batch_dim(value)
96
+
97
+ attention_probs = self.get_attention_scores(query, key, attention_mask) # [16, 4096, 4096]
98
+ attention_probs = controller(attention_probs, is_cross, place_in_unet)
99
+ hidden_states = torch.bmm(attention_probs, value)
100
+ hidden_states = self.batch_to_head_dim(hidden_states)
101
+
102
+ # linear proj
103
+ hidden_states = self.to_out[0](hidden_states)
104
+ # dropout
105
+ hidden_states = self.to_out[1](hidden_states)
106
+
107
+ return hidden_states
108
+
109
+ return forward
110
+
111
+ class DummyController:
112
+
113
+ def __call__(self, *args):
114
+ return args[0]
115
+
116
+ def __init__(self):
117
+ self.num_att_layers = 0
118
+
119
+ if controller is None:
120
+ controller = DummyController()
121
+
122
+ def register_recr(net_, count, place_in_unet):
123
+ if net_.__class__.__name__ == 'CrossAttention':
124
+ net_.forward = ca_forward(net_, place_in_unet)
125
+ return count + 1
126
+ elif hasattr(net_, 'children'):
127
+ for net__ in net_.children():
128
+ count = register_recr(net__, count, place_in_unet)
129
+ return count
130
+
131
+ cross_att_count = 0
132
+ sub_nets = model.unet.named_children()
133
+ for net in sub_nets:
134
+ if "down" in net[0]:
135
+ cross_att_count += register_recr(net[1], 0, "down")
136
+ elif "up" in net[0]:
137
+ cross_att_count += register_recr(net[1], 0, "up")
138
+ elif "mid" in net[0]:
139
+ cross_att_count += register_recr(net[1], 0, "mid")
140
+
141
+ controller.num_att_layers = cross_att_count
142
+
143
+
144
+ def get_word_inds(text: str, word_place: int, tokenizer):
145
+ split_text = text.split(" ")
146
+ if type(word_place) is str:
147
+ word_place = [i for i, word in enumerate(split_text) if word_place == word]
148
+ elif type(word_place) is int:
149
+ word_place = [word_place]
150
+ out = []
151
+ if len(word_place) > 0:
152
+ words_encode = [tokenizer.decode([item]).strip("#") for item in tokenizer.encode(text)][1:-1]
153
+ cur_len, ptr = 0, 0
154
+
155
+ for i in range(len(words_encode)):
156
+ cur_len += len(words_encode[i])
157
+ if ptr in word_place:
158
+ out.append(i + 1)
159
+ if cur_len >= len(split_text[ptr]):
160
+ ptr += 1
161
+ cur_len = 0
162
+ return np.array(out)
163
+
164
+
165
+ def update_alpha_time_word(alpha, bounds: Union[float, Tuple[float, float]], prompt_ind: int,
166
+ word_inds: Optional[torch.Tensor]=None):
167
+ # Edit the alpha map during attention map editing
168
+ if type(bounds) is float:
169
+ bounds = 0, bounds
170
+ start, end = int(bounds[0] * alpha.shape[0]), int(bounds[1] * alpha.shape[0])
171
+ if word_inds is None:
172
+ word_inds = torch.arange(alpha.shape[2])
173
+ alpha[: start, prompt_ind, word_inds] = 0
174
+ alpha[start: end, prompt_ind, word_inds] = 1
175
+ alpha[end:, prompt_ind, word_inds] = 0
176
+ return alpha
177
+
178
+ import omegaconf
179
+ def get_time_words_attention_alpha(prompts, num_steps,
180
+ cross_replace_steps: Union[float, Dict[str, Tuple[float, float]]],
181
+ tokenizer, max_num_words=77):
182
+ # Builds per-step, per-word alpha weights for cross-attention replacement (adapted from prompt-to-prompt).
183
+ if (type(cross_replace_steps) is not dict) and \
184
+ (type(cross_replace_steps) is not omegaconf.dictconfig.DictConfig):
185
+ cross_replace_steps = {"default_": cross_replace_steps}
186
+ if "default_" not in cross_replace_steps:
187
+ cross_replace_steps["default_"] = (0., 1.)
188
+ alpha_time_words = torch.zeros(num_steps + 1, len(prompts) - 1, max_num_words)
189
+ for i in range(len(prompts) - 1):
190
+ alpha_time_words = update_alpha_time_word(alpha_time_words, cross_replace_steps["default_"],
191
+ i)
192
+ for key, item in cross_replace_steps.items():
193
+ if key != "default_":
194
+ inds = [get_word_inds(prompts[i], key, tokenizer) for i in range(1, len(prompts))]
195
+ for i, ind in enumerate(inds):
196
+ if len(ind) > 0:
197
+ alpha_time_words = update_alpha_time_word(alpha_time_words, item, i, ind)
198
+ alpha_time_words = alpha_time_words.reshape(num_steps + 1, len(prompts) - 1, 1, 1, max_num_words)
199
+ return alpha_time_words
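A hedged sketch of the plain-numpy helpers above, captioning and tiling two dummy frames; the shapes and caption text are illustrative.

# Hedged sketch: caption a frame and view a small batch.
import numpy as np
from i2vedit.prompt_attention import ptp_utils

frame = np.full((64, 64, 3), 200, dtype=np.uint8)
captioned = ptp_utils.text_under_image(frame, "frame 0")   # appends a white strip with the text
batch = np.stack([captioned, captioned], axis=0)
ptp_utils.view_images(batch, num_rows=1, save_path=None)   # pass a directory path to also save a PNG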
i2vedit/prompt_attention/visualization.py ADDED
@@ -0,0 +1,391 @@
1
+ from typing import List
2
+ import os
3
+ import datetime
4
+ import numpy as np
5
+ from PIL import Image
6
+ from einops import rearrange, repeat
7
+ import math
8
+
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from packaging import version
12
+
13
+ from i2vedit.prompt_attention import ptp_utils
14
+ from i2vedit.prompt_attention.common.image_util import save_gif_mp4_folder_type
15
+ from i2vedit.prompt_attention.attention_store import AttentionStore
16
+
17
+ if version.parse(torch.__version__) >= version.parse("2.0.0"):
18
+ SDP_IS_AVAILABLE = True
19
+ from torch.backends.cuda import SDPBackend, sdp_kernel
20
+
21
+ BACKEND_MAP = {
22
+ SDPBackend.MATH: {
23
+ "enable_math": True,
24
+ "enable_flash": False,
25
+ "enable_mem_efficient": False,
26
+ },
27
+ SDPBackend.FLASH_ATTENTION: {
28
+ "enable_math": False,
29
+ "enable_flash": True,
30
+ "enable_mem_efficient": False,
31
+ },
32
+ SDPBackend.EFFICIENT_ATTENTION: {
33
+ "enable_math": False,
34
+ "enable_flash": False,
35
+ "enable_mem_efficient": True,
36
+ },
37
+ None: {"enable_math": True, "enable_flash": True, "enable_mem_efficient": True},
38
+ }
39
+ else:
40
+ from contextlib import nullcontext
41
+
42
+ SDP_IS_AVAILABLE = False
43
+ sdp_kernel = nullcontext
44
+ BACKEND_MAP = {}
45
+ print(  # console warning; no logger is configured in this module
46
+ f"No SDP backend available, likely because you are running in pytorch "
47
+ f"versions < 2.0. In fact, you are using PyTorch {torch.__version__}. "
48
+ f"You might want to consider upgrading."
49
+ )
50
+
51
+ def aggregate_attention(prompts, attention_store: AttentionStore, res: int, from_where: List[str], is_cross: bool, select: int):
52
+ out = []
53
+ attention_maps = attention_store.get_average_attention()
54
+ num_pixels = res ** 2
55
+ for location in from_where:
56
+ for item in attention_maps[f"{location}_{'cross' if is_cross else 'self'}"]:
57
+ if item.dim() == 3:
58
+ if item.shape[1] == num_pixels:
59
+ cross_maps = item.reshape(len(prompts), -1, res, res, item.shape[-1])[select]
60
+ out.append(cross_maps)
61
+ elif item.dim() == 4:
62
+ t, h, res_sq, token = item.shape
63
+ if item.shape[2] == num_pixels:
64
+ cross_maps = item.reshape(len(prompts), t, -1, res, res, item.shape[-1])[select]
65
+ out.append(cross_maps)
66
+
67
+ out = torch.cat(out, dim=-4)
68
+ out = out.sum(-4) / out.shape[-4]
69
+ return out.cpu()
70
+
71
+
72
+ def show_cross_attention(tokenizer, prompts, attention_store: AttentionStore,
73
+ res: int, from_where: List[str], select: int = 0, save_path = None):
74
+ """
75
+ attention_store (AttentionStore):
76
+ ["down", "mid", "up"] X ["self", "cross"]
77
+ 4, 1, 6
78
+ head*res*text_token_len = 8*res*77
79
+ res=1024 -> 64 -> 1024
80
+ res (int): res
81
+ from_where (List[str]): "up", "down'
82
+ """
83
+ if isinstance(prompts, str):
84
+ prompts = [prompts,]
85
+ tokens = tokenizer.encode(prompts[select])
86
+ decoder = tokenizer.decode
87
+
88
+ attention_maps = aggregate_attention(prompts, attention_store, res, from_where, True, select)
89
+ os.makedirs('trash', exist_ok=True)
90
+ attention_list = []
91
+ if attention_maps.dim()==3: attention_maps=attention_maps[None, ...]
92
+ for j in range(attention_maps.shape[0]):
93
+ images = []
94
+ for i in range(len(tokens)):
95
+ image = attention_maps[j, :, :, i]
96
+ image = 255 * image / image.max()
97
+ image = image.unsqueeze(-1).expand(*image.shape, 3)
98
+ image = image.numpy().astype(np.uint8)
99
+ image = np.array(Image.fromarray(image).resize((256, 256)))
100
+ image = ptp_utils.text_under_image(image, decoder(int(tokens[i])))
101
+ images.append(image)
102
+ ptp_utils.view_images(np.stack(images, axis=0), save_path=save_path)
103
+ atten_j = np.concatenate(images, axis=1)
104
+ attention_list.append(atten_j)
105
+ if save_path is not None:
106
+ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S")
107
+ video_save_path = f'{save_path}/{now}.gif'
108
+ save_gif_mp4_folder_type(attention_list, video_save_path)
109
+ return attention_list
110
+
111
+
112
+ def show_self_attention_comp(attention_store: AttentionStore, res: int, from_where: List[str],
113
+ max_com=10, select: int = 0):
114
+ attention_maps = aggregate_attention(attention_store, res, from_where, False, select).numpy().reshape((res ** 2, res ** 2))
115
+ u, s, vh = np.linalg.svd(attention_maps - np.mean(attention_maps, axis=1, keepdims=True))
116
+ images = []
117
+ for i in range(max_com):
118
+ image = vh[i].reshape(res, res)
119
+ image = image - image.min()
120
+ image = 255 * image / image.max()
121
+ image = np.repeat(np.expand_dims(image, axis=2), 3, axis=2).astype(np.uint8)
122
+ image = Image.fromarray(image).resize((256, 256))
123
+ image = np.array(image)
124
+ images.append(image)
125
+ ptp_utils.view_images(np.concatenate(images, axis=1))
126
+
127
+ def show_avg_difference_maps(
128
+ attention_store: AttentionStore,
129
+ save_path = None
130
+ ):
131
+ avg_attention = attention_store.get_average_attention()
132
+ masks = []
133
+ for key in avg_attention:
134
+ if 'mask' in key:
135
+ for cur_pos in range(len(avg_attention[key])):
136
+ mask = avg_attention[key][cur_pos]
137
+ res = mask.shape[0] / 9
138
+ file_path = os.path.join(
139
+ save_path,
140
+ f"avg_key_{key}_curpos_{cur_pos}_res_{res}_mask.png"
141
+ )
142
+ print(key, cur_pos, mask.shape)
143
+ image = 255 * mask #/ attn.max()
144
+ image = image.cpu().numpy().astype(np.uint8)
145
+ image = Image.fromarray(image)
146
+ image.save(file_path)
147
+
148
+
149
+
150
+
151
+ def show_self_attention(
152
+ attention_store: AttentionStore,
153
+ steps: List[int],
154
+ save_path = None,
155
+ inversed = False):
156
+ """
157
+ attention_store (AttentionStore):
158
+ ["down", "mid", "up"] X ["self", "cross"]
159
+ 4, 1, 6
160
+ head*res*text_token_len = 8*res*77
161
+ res=1024 -> 64 -> 1024
162
+ res (int): res
163
+ from_where (List[str]): "up", "down'
164
+ """
165
+ #os.system(f"rm -rf {save_path}")
166
+ os.makedirs(save_path, exist_ok=True)
167
+ for step in steps:
168
+ step_in_store = len(attention_store.attention_store_all_step) - step - 1 if inversed else step
169
+ print("step_in_store", step_in_store)
170
+ step_in_store_atten_dict = attention_store.attention_store_all_step[step_in_store]
171
+ if isinstance(step_in_store_atten_dict, str):
172
+ step_in_store_atten_dict = torch.load(step_in_store_atten_dict)
173
+
174
+ step_in_store_atten_dict_reorg = {}
175
+
176
+ for key in step_in_store_atten_dict:
177
+ if '_q_' not in key and ('_k_' not in key and '_v_' not in key):
178
+ step_in_store_atten_dict_reorg[key] = step_in_store_atten_dict[key]
179
+ elif '_q_' in key:
180
+ step_in_store_atten_dict_reorg[key.replace("_q_","_qxk_")] = \
181
+ [[step_in_store_atten_dict[key][i], \
182
+ step_in_store_atten_dict[key.replace("_q_","_k_")][i] \
183
+ ] \
184
+ for i in range(len(step_in_store_atten_dict[key]))]
185
+
186
+ for key in step_in_store_atten_dict_reorg:
187
+ if '_mask_' not in key and '_qxk_' not in key:
188
+ for cur_pos in range(len(step_in_store_atten_dict_reorg[key])):
189
+ attn = step_in_store_atten_dict_reorg[key][cur_pos]
190
+ attn = torch.mean(attn, dim=1)
191
+ s, t, d = attn.shape
192
+ res = int(np.sqrt(s / (9*16)))
193
+ attn = attn.reshape(res*9,res*16,t,d).permute(2,0,3,1).reshape(t*res*9,d*res*16)
194
+ file_path = os.path.join(
195
+ save_path,
196
+ f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}.png"
197
+ )
198
+ print(step, key, cur_pos, attn.shape)
199
+ image = 255 * attn #/ attn.max()
200
+ image = image.cpu().numpy().astype(np.uint8)
201
+ image = Image.fromarray(image)
202
+ image.save(file_path)
203
+
204
+ elif '_mask_' in key:
205
+ for cur_pos in range(len(step_in_store_atten_dict_reorg[key])):
206
+ mask = step_in_store_atten_dict_reorg[key][cur_pos]
207
+ res = mask.shape[0] / 9
208
+ file_path = os.path.join(
209
+ save_path,
210
+ f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}_mask.png"
211
+ )
212
+ print(step, key, cur_pos, mask.shape)
213
+ image = 255 * mask #/ attn.max()
214
+ image = image.cpu().numpy().astype(np.uint8)
215
+ image = Image.fromarray(image)
216
+ image.save(file_path)
217
+
218
+ else:
219
+ for cur_pos in range(len(step_in_store_atten_dict_reorg[key])):
220
+ q, k = step_in_store_atten_dict_reorg[key][cur_pos]
221
+ q = q.to("cuda").type(torch.float32)
222
+ k = k.to("cuda").type(torch.float32)
223
+ res = int(np.sqrt(q.shape[-2] / (9*16)))
224
+ h = q.shape[1]
225
+ bs = 1
226
+ N = q.shape[0] // bs
227
+ vectors = []
228
+ vectors_diff = []
229
+ for i in range(N):
230
+ attn_prob = calculate_attention_probs(q[i*bs:(i+1)*bs], k[i*bs:(i+1)*bs])
231
+ print("attn_prob 1", attn_prob.min(), attn_prob.max())
232
+ attn_prob = torch.mean(attn_prob, dim=2).reshape(h, res*9, res*16)
233
+ print("attn_prob 2", attn_prob.min(), attn_prob.max())
234
+ attn_prob = torch.mean(attn_prob, dim=0)
235
+ print("attn_prob 3", attn_prob.min(), attn_prob.max())
236
+ vectors.append( attn_prob )
237
+ for i in range(1, len(vectors)):
238
+ vectors_diff.append(vectors[i] - vectors[i-1])
239
+ vectors = torch.cat(vectors, dim=1)
240
+ vectors_diff = torch.cat(vectors_diff, dim=1)
241
+ file_path = os.path.join(
242
+ save_path,
243
+ f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}_vector.png"
244
+ )
245
+ print(step, key, cur_pos, vectors.shape)
246
+ image = 255 * vectors / vectors.max()
247
+ image = image.clamp(0,255).cpu().numpy().astype(np.uint8)
248
+ image = Image.fromarray(image)
249
+ image.save(file_path)
250
+
251
+ file_path = os.path.join(
252
+ save_path,
253
+ f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}_diff.png"
254
+ )
255
+ print(step, key, cur_pos, vectors_diff.shape)
256
+ image = 255 * vectors_diff / vectors_diff.max()
257
+ image = image.clamp(0,255).cpu().numpy().astype(np.uint8)
258
+ image = Image.fromarray(image)
259
+ image.save(file_path)
260
+
261
+
262
+ # else:
263
+ # # only look at the last two frames
264
+ # for cur_pos in range(len(step_in_store_atten_dict_reorg[key])):
265
+ # q, k, v = step_in_store_atten_dict_reorg[key][cur_pos]
266
+ # q = q[-2:,...].to("cuda")
267
+ # k = k[-2:,...].to("cuda")
268
+ # v = v[-2:,...].to("cuda")
269
+ # res = int(np.sqrt(q.shape[-2] / (9*16)))
270
+ # attn = calculate_attention_probs(q,k,v)
271
+ # attn_d = torch.sum(torch.mean(torch.abs(attn[0,...] - attn[1,...]), dim=0), dim=1).reshape(res*9,res*16)
272
+ # print(step, key, cur_pos, attn_d.shape, attn_d.min(), attn_d.max())
273
+ # file_path = os.path.join(
274
+ # save_path,
275
+ # f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}_attn_d.png"
276
+ # )
277
+ # image = (255 * attn_d + 1e-3) / 2.#attn_d.max()
278
+ # image = image.clamp(0,255).cpu().numpy().astype(np.uint8)
279
+ # image = Image.fromarray(image)
280
+ # image.save(file_path)
281
+
282
+ def show_self_attention_distance(
283
+ attention_store: List[AttentionStore],
284
+ steps: List[int],
285
+ save_path = None,
286
+ ):
287
+ """
+ Compare the self-attention stored in two AttentionStores (e.g. inversion vs. editing)
+ and save the per-pixel attention-difference maps as PNG images.
+ attention_store (List[AttentionStore]): exactly two stores; the first is indexed
+ from the end (inversion order), the second from the start.
+ steps (List[int]): denoising steps to visualize.
+ save_path: output directory (removed and recreated).
+ """
296
+ os.system(f"rm -rf {save_path}")
297
+ os.makedirs(save_path, exist_ok=True)
298
+ assert len(attention_store) == 2
299
+ for step in steps:
300
+ step_in_store = [len(attention_store[0].attention_store_all_step) - step - 1, step]
301
+ step_in_store_atten_dict = [attention_store[i].attention_store_all_step[step_in_store[i]] \
302
+ for i in range(2)]
303
+ step_in_store_atten_dict = [ \
304
+ torch.load(step_in_store_atten_dict[i]) \
305
+ if isinstance(step_in_store_atten_dict[i], str) \
306
+ else step_in_store_atten_dict[i] \
307
+ for i in range(2)]
308
+
309
+ step_in_store_atten_dict_reorg = [{},{}]
310
+
311
+ for i in range(2):
312
+ item = step_in_store_atten_dict[i]
313
+ for key in item:
314
+ if '_q_' in key:
315
+ step_in_store_atten_dict_reorg[i][key.replace("_q_","_qxk_")] = \
316
+ [[step_in_store_atten_dict[i][key][j], \
317
+ step_in_store_atten_dict[i][key.replace("_q_","_k_")][j] \
318
+ ] \
319
+ for j in range(len(step_in_store_atten_dict[i][key]))]
320
+
321
+ for key in step_in_store_atten_dict_reorg[1]:
322
+ for cur_pos in range(len(step_in_store_atten_dict_reorg[1][key])):
323
+ q1, k1 = step_in_store_atten_dict_reorg[1][key][cur_pos]
324
+ q0, k0 = step_in_store_atten_dict_reorg[0][key][cur_pos]
325
+ res = int(np.sqrt(q1.shape[-2] / (9*16)))
326
+
327
+ attn_d = calculate_attention_mask(q0, k0, q1, k1, bs=1, device="cuda")
328
+ attn_d = rearrange(attn_d, "b h w -> h (b w)")
329
+
330
+ print(step, key, cur_pos, attn_d.shape, "attnd", attn_d.min(), attn_d.max())
331
+ file_path = os.path.join(
332
+ save_path,
333
+ f"step_{step}_key_{key}_curpos_{cur_pos}_res_{res}_attn_d.png"
334
+ )
335
+ image = 255 * attn_d#attn_d.max()
336
+ image = image.clamp(0,255).cpu().numpy().astype(np.uint8)
337
+ image = Image.fromarray(image)
338
+ image.save(file_path)
339
+
340
+ def calculate_attention_mask(q0, k0, q1, k1, bs=1, device="cuda"):
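+ # Soft difference mask between two attention maps: for each chunk of `bs` frames,
+ # recompute the attention probabilities of (q0, k0) and (q1, k1), average their absolute
+ # difference over heads, sum over key positions, reshape onto the 9x16-aspect spatial
+ # grid, and scale/clamp the result to [0, 1].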
341
+ q1 = q1.to(device)
342
+ k1 = k1.to(device)
343
+ q0 = q0.to(device)
344
+ k0 = k0.to(device)
345
+ res = int(np.sqrt(q1.shape[-2] / (9*16)))
346
+ N = q0.shape[0] // bs
347
+ attn_d = []
348
+ for i in range(N):
349
+ attn0 = calculate_attention_probs(q0[bs*i:bs*(i+1),...],k0[bs*i:bs*(i+1),...])
350
+ attn1 = calculate_attention_probs(q1[bs*i:bs*(i+1),...],k1[bs*i:bs*(i+1),...])
351
+ attn_d_i = torch.sum(torch.mean(torch.abs(attn0 - attn1), dim=1), dim=2).reshape(bs,res*9,res*16)
352
+ attn_d.append( attn_d_i )
353
+ attn_d = torch.cat(attn_d, dim=0) / 2.0
354
+ return attn_d.clamp(0,1)
355
+
356
+ def calculate_attention_probs(q, k, attn_mask=None):
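+ # Explicitly recompute softmax(q k^T / sqrt(d)) instead of using the fused SDPA path,
+ # so the full probability matrix is available for visualization; a boolean attn_mask
+ # is converted to additive -inf masking.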
357
+ with sdp_kernel(**BACKEND_MAP[None]):
358
+ h = q.shape[1]
359
+ q, k = map(lambda t: rearrange(t, "b h n d -> (b h) n d"), (q, k))
360
+
361
+ with torch.autocast("cuda", enabled=False):
362
+ attention_scores = torch.baddbmm(
363
+ torch.empty(q.shape[0], q.shape[1], k.shape[1], dtype=q.dtype, device=q.device),
364
+ q,
365
+ k.transpose(-1, -2),
366
+ beta=0,
367
+ alpha=1 / math.sqrt(q.size(-1)),
368
+ )
369
+ #print("attention_scores", attention_scores.min(), attention_scores.max())
370
+
371
+ if attn_mask is not None:
372
+ if attn_mask.dtype == torch.bool:
373
+ attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
374
+ attention_scores = attention_scores + attn_mask
375
+
376
+ attention_probs = attention_scores.softmax(dim=-1)
377
+ #print("attention_softmax", attention_probs.min(), attention_probs.max())
378
+
379
+ # cast back to the original dtype
380
+ attention_probs = attention_probs.to(q.dtype)
381
+
382
+ # reshape hidden_states
383
+ attention_probs = rearrange(attention_probs, "(b h) n d -> b h n d", h=h)
384
+
385
+ # v = torch.eye(q.shape[-2], device=q.device)
386
+ # v = repeat(v, "... -> b h ...", b=q.shape[0], h=q.shape[1])
387
+ # attention_probs = F.scaled_dot_product_attention(
388
+ # q, k, v, attn_mask=attn_mask
389
+ # ) # scale is dim_head ** -0.5 per default
390
+
391
+ return attention_probs
i2vedit/train.py ADDED
@@ -0,0 +1,1488 @@
1
+ import argparse
2
+ import datetime
3
+ import logging
4
+ import inspect
5
+ import math
6
+ import os
7
+ import random
8
+ import gc
9
+ import copy
10
+ from scipy.stats import anderson
11
+ import imageio
12
+ import numpy as np
13
+
14
+ from typing import Dict, Optional, Tuple, List
15
+ from omegaconf import OmegaConf
16
+ from einops import rearrange, repeat
17
+
18
+ import torch
19
+ import torch.nn.functional as F
20
+ import torch.utils.checkpoint
21
+ import diffusers
22
+ import transformers
23
+
24
+ from torchvision import transforms
25
+ from tqdm.auto import tqdm
26
+ from PIL import Image
27
+
28
+ from accelerate import Accelerator
29
+ from accelerate.logging import get_logger
30
+ from accelerate.utils import set_seed
31
+
32
+ import diffusers
33
+ from diffusers.models import AutoencoderKL
34
+ from diffusers import DDIMScheduler, TextToVideoSDPipeline
35
+ from diffusers.optimization import get_scheduler
36
+ from diffusers.utils.import_utils import is_xformers_available
37
+ from diffusers.models.attention_processor import AttnProcessor2_0, Attention
38
+ from diffusers.models.attention import BasicTransformerBlock
39
+ from diffusers import StableVideoDiffusionPipeline
40
+ from diffusers.models.lora import LoRALinearLayer
41
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler, UNetSpatioTemporalConditionModel
42
+ from diffusers.image_processor import VaeImageProcessor
43
+ from diffusers.optimization import get_scheduler
44
+ from diffusers.training_utils import EMAModel
45
+ from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image
46
+ from diffusers.utils.import_utils import is_xformers_available
47
+ from diffusers.models.unet_3d_blocks import \
48
+ (CrossAttnDownBlockSpatioTemporal,
49
+ DownBlockSpatioTemporal,
50
+ CrossAttnUpBlockSpatioTemporal,
51
+ UpBlockSpatioTemporal)
52
+
53
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
54
+ from transformers.models.clip.modeling_clip import CLIPEncoder
55
+
56
+ from i2vedit.utils.dataset import (
57
+ CachedDataset,
58
+ )
59
+
60
+ from i2vedit.utils.lora_handler import LoraHandler
61
+ from i2vedit.utils.lora import extract_lora_child_module
62
+ from i2vedit.utils.euler_utils import euler_inversion
63
+ from i2vedit.utils.svd_util import SmoothAreaRandomDetection
64
+ from i2vedit.utils.model_utils import (
65
+ tensor_to_vae_latent,
66
+ P2PEulerDiscreteScheduler,
67
+ P2PStableVideoDiffusionPipeline
68
+ )
69
+ from i2vedit.data import ResolutionControl, SingleClipDataset
70
+ from i2vedit.prompt_attention import attention_util
71
+
72
+ already_printed_trainables = False
73
+
74
+ logger = get_logger(__name__, log_level="INFO")
75
+
76
+
77
+ def create_logging(logging, logger, accelerator):
78
+ logging.basicConfig(
79
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
80
+ datefmt="%m/%d/%Y %H:%M:%S",
81
+ level=logging.INFO,
82
+ )
83
+ logger.info(accelerator.state, main_process_only=False)
84
+
85
+
86
+ def accelerate_set_verbose(accelerator):
87
+ if accelerator.is_local_main_process:
88
+ transformers.utils.logging.set_verbosity_warning()
89
+ diffusers.utils.logging.set_verbosity_info()
90
+ else:
91
+ transformers.utils.logging.set_verbosity_error()
92
+ diffusers.utils.logging.set_verbosity_error()
93
+
94
+ def extend_datasets(datasets, dataset_items, extend=False):
95
+ biggest_data_len = max(x.__len__() for x in datasets)
96
+ extended = []
97
+ for dataset in datasets:
98
+ if dataset.__len__() == 0:
99
+ del dataset
100
+ continue
101
+ if dataset.__len__() < biggest_data_len:
102
+ for item in dataset_items:
103
+ if extend and item not in extended and hasattr(dataset, item):
104
+ print(f"Extending {item}")
105
+
106
+ value = getattr(dataset, item)
107
+ value *= biggest_data_len
108
+ value = value[:biggest_data_len]
109
+
110
+ setattr(dataset, item, value)
111
+
112
+ print(f"New {item} dataset length: {dataset.__len__()}")
113
+ extended.append(item)
114
+
115
+
116
+ def export_to_video(video_frames, output_video_path, fps, resctrl:ResolutionControl):
117
+ flattened_video_frames = [img for sublist in video_frames for img in sublist]
118
+ video_writer = imageio.get_writer(output_video_path, fps=fps)
119
+ for img in flattened_video_frames:
120
+ img = resctrl.callback(img)
121
+ video_writer.append_data(np.array(img))
122
+ video_writer.close()
123
+
124
+
125
+ def create_output_folders(output_dir, config, clip_id):
126
+ out_dir = os.path.join(output_dir, f"train_motion_lora/clip_{clip_id}")
127
+
128
+ os.makedirs(out_dir, exist_ok=True)
129
+ os.makedirs(f"{out_dir}/samples", exist_ok=True)
130
+ # OmegaConf.save(config, os.path.join(out_dir, 'config.yaml'))
131
+
132
+ return out_dir
133
+
134
+
135
+ def load_primary_models(pretrained_model_path):
136
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(
137
+ pretrained_model_path, subfolder="scheduler")
138
+ feature_extractor = CLIPImageProcessor.from_pretrained(
139
+ pretrained_model_path, subfolder="feature_extractor", revision=None
140
+ )
141
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
142
+ pretrained_model_path, subfolder="image_encoder", revision=None, variant="fp16"
143
+ )
144
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
145
+ pretrained_model_path, subfolder="vae", revision=None, variant="fp16")
146
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
147
+ pretrained_model_path,
148
+ subfolder="unet",
149
+ low_cpu_mem_usage=True,
150
+ variant="fp16",
151
+ )
152
+
153
+ return noise_scheduler, feature_extractor, image_encoder, vae, unet
154
+
155
+
156
+ def unet_and_text_g_c(unet, image_encoder, unet_enable, image_enable):
157
+ unet.gradient_checkpointing = unet_enable
158
+ unet.mid_block.gradient_checkpointing = unet_enable
159
+ for module in unet.down_blocks + unet.up_blocks:
160
+ if isinstance(module,
161
+ (CrossAttnDownBlockSpatioTemporal,
162
+ DownBlockSpatioTemporal,
163
+ CrossAttnUpBlockSpatioTemporal,
164
+ UpBlockSpatioTemporal)):
165
+ module.gradient_checkpointing = unet_enable
166
+
167
+
168
+ def freeze_models(models_to_freeze):
169
+ for model in models_to_freeze:
170
+ if model is not None: model.requires_grad_(False)
171
+
172
+
173
+ def is_attn(name):
174
+ return ('attn1' or 'attn2' == name.split('.')[-1])
175
+
176
+
177
+ def set_processors(attentions):
178
+ for attn in attentions: attn.set_processor(AttnProcessor2_0())
179
+
180
+
181
+ def set_torch_2_attn(unet):
182
+ optim_count = 0
183
+
184
+ for name, module in unet.named_modules():
185
+ if is_attn(name):
186
+ if isinstance(module, torch.nn.ModuleList):
187
+ for m in module:
188
+ if isinstance(m, BasicTransformerBlock):
189
+ set_processors([m.attn1, m.attn2])
190
+ optim_count += 1
191
+ if optim_count > 0:
192
+ print(f"{optim_count} Attention layers using Scaled Dot Product Attention.")
193
+
194
+
195
+ def handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet):
196
+ try:
197
+ is_torch_2 = hasattr(F, 'scaled_dot_product_attention')
198
+ enable_torch_2 = is_torch_2 and enable_torch_2_attn
199
+
200
+ if enable_xformers_memory_efficient_attention and not enable_torch_2:
201
+ if is_xformers_available():
202
+ from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
203
+ unet.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
204
+ else:
205
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
206
+
207
+ if enable_torch_2:
208
+ set_torch_2_attn(unet)
209
+
210
+ except:
211
+ print("Could not enable memory efficient attention for xformers or Torch 2.0.")
212
+
213
+
214
+ def param_optim(model, condition, extra_params=None, is_lora=False, negation=None):
215
+ extra_params = extra_params if len(extra_params.keys()) > 0 else None
216
+ return {
217
+ "model": model,
218
+ "condition": condition,
219
+ 'extra_params': extra_params,
220
+ 'is_lora': is_lora,
221
+ "negation": negation
222
+ }
223
+
224
+
225
+ def create_optim_params(name='param', params=None, lr=5e-6, extra_params=None):
226
+ params = {
227
+ "name": name,
228
+ "params": params,
229
+ "lr": lr
230
+ }
231
+ if extra_params is not None:
232
+ for k, v in extra_params.items():
233
+ params[k] = v
234
+
235
+ return params
236
+
237
+
238
+ def negate_params(name, negation):
239
+ # We have to do this if we are co-training with LoRA.
240
+ # This ensures that parameter groups aren't duplicated.
241
+ if negation is None: return False
242
+ for n in negation:
243
+ if n in name and 'temp' not in name:
244
+ return True
245
+ return False
246
+
247
+
248
+ def create_optimizer_params(model_list, lr):
249
+ import itertools
250
+ optimizer_params = []
251
+
252
+ for optim in model_list:
253
+ model, condition, extra_params, is_lora, negation = optim.values()
254
+ # Check if we are doing LoRA training.
255
+ if is_lora and condition and isinstance(model, list):
256
+ params = create_optim_params(
257
+ params=itertools.chain(*model),
258
+ extra_params=extra_params
259
+ )
260
+ optimizer_params.append(params)
261
+ continue
262
+
263
+ if is_lora and condition and not isinstance(model, list):
264
+ for n, p in model.named_parameters():
265
+ if 'lora' in n:
266
+ params = create_optim_params(n, p, lr, extra_params)
267
+ optimizer_params.append(params)
268
+ continue
269
+
270
+ # If this is true, we can train it.
271
+ if condition:
272
+ for n, p in model.named_parameters():
273
+ should_negate = 'lora' in n and not is_lora
274
+ if should_negate: continue
275
+
276
+ params = create_optim_params(n, p, lr, extra_params)
277
+ optimizer_params.append(params)
278
+
279
+ return optimizer_params
280
+
281
+
282
+ def get_optimizer(use_8bit_adam):
283
+ if use_8bit_adam:
284
+ try:
285
+ import bitsandbytes as bnb
286
+ except ImportError:
287
+ raise ImportError(
288
+ "Please install bitsandbytes to use 8-bit Adam. You can do so by running `pip install bitsandbytes`"
289
+ )
290
+
291
+ return bnb.optim.AdamW8bit
292
+ else:
293
+ return torch.optim.AdamW
294
+
295
+
296
+ def is_mixed_precision(accelerator):
297
+ weight_dtype = torch.float32
298
+
299
+ if accelerator.mixed_precision == "fp16":
300
+ weight_dtype = torch.float16
301
+
302
+ elif accelerator.mixed_precision == "bf16":
303
+ weight_dtype = torch.bfloat16
304
+
305
+ return weight_dtype
306
+
307
+
308
+ def cast_to_gpu_and_type(model_list, accelerator, weight_dtype):
309
+ for model in model_list:
310
+ if model is not None: model.to(accelerator.device, dtype=weight_dtype)
311
+
312
+
313
+ def inverse_video(pipe, latents, num_steps, image):
314
+ euler_inv_scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
315
+ euler_inv_scheduler.set_timesteps(num_steps)
316
+
317
+ euler_inv_latent = euler_inversion(
318
+ pipe, euler_inv_scheduler, video_latent=latents.to(pipe.device),
319
+ num_inv_steps=num_steps, image=image)[-1]
320
+ return euler_inv_latent
321
+
322
+
323
+ def handle_cache_latents(
324
+ should_cache,
325
+ output_dir,
326
+ train_dataloader,
327
+ train_batch_size,
328
+ vae,
329
+ unet,
330
+ cached_latent_dir=None,
331
+ ):
332
+ # Cache latents by storing them in VRAM.
333
+ # Speeds up training and saves memory by not encoding during the train loop.
334
+ if not should_cache: return None
335
+ vae.to('cuda', dtype=torch.float32)
336
+ #vae.enable_slicing()
337
+
338
+ cached_latent_dir = (
339
+ os.path.abspath(cached_latent_dir) if cached_latent_dir is not None else None
340
+ )
341
+
342
+ if cached_latent_dir is None:
343
+ cache_save_dir = f"{output_dir}/cached_latents"
344
+ os.makedirs(cache_save_dir, exist_ok=True)
345
+
346
+ for i, batch in enumerate(tqdm(train_dataloader, desc="Caching Latents.")):
347
+
348
+ save_name = f"cached_{i}"
349
+ full_out_path = f"{cache_save_dir}/{save_name}.pt"
350
+
351
+ pixel_values = batch['pixel_values'].to('cuda', dtype=torch.float32)
352
+ refer_pixel_values = batch['refer_pixel_values'].to('cuda', dtype=torch.float32)
353
+ cross_pixel_values = batch['cross_pixel_values'].to('cuda', dtype=torch.float32)
354
+ batch['latents'] = tensor_to_vae_latent(pixel_values, vae)
355
+ if batch.get("refer_latents") is None:
356
+ batch['refer_latents'] = tensor_to_vae_latent(refer_pixel_values, vae)
357
+ batch['cross_latents'] = tensor_to_vae_latent(cross_pixel_values, vae)
358
+
359
+ for k, v in batch.items(): batch[k] = v[0]
360
+
361
+ torch.save(batch, full_out_path)
362
+ del pixel_values
363
+ del batch
364
+
365
+ # We do this to avoid fragmentation from casting latents between devices.
366
+ torch.cuda.empty_cache()
367
+ else:
368
+ cache_save_dir = cached_latent_dir
369
+
370
+ return torch.utils.data.DataLoader(
371
+ CachedDataset(cache_dir=cache_save_dir),
372
+ batch_size=train_batch_size,
373
+ shuffle=True,
374
+ num_workers=0
375
+ )
376
+
377
+
378
+ def handle_trainable_modules(model, trainable_modules=None, is_enabled=True, negation=None):
379
+ global already_printed_trainables
380
+
381
+ # This can most definitely be refactored :-)
382
+ unfrozen_params = 0
383
+ if trainable_modules is not None:
384
+ for name, module in model.named_modules():
385
+ for tm in tuple(trainable_modules):
386
+ if tm == 'all':
387
+ model.requires_grad_(is_enabled)
388
+ unfrozen_params = len(list(model.parameters()))
389
+ break
390
+
391
+ if tm in name and 'lora' not in name:
392
+ for m in module.parameters():
393
+ m.requires_grad_(is_enabled)
394
+ if is_enabled: unfrozen_params += 1
395
+
396
+ if unfrozen_params > 0 and not already_printed_trainables:
397
+ already_printed_trainables = True
398
+ print(f"{unfrozen_params} params have been unfrozen for training.")
399
+
400
+ def sample_noise(latents, noise_strength, use_offset_noise=False):
401
+ b, c, f, *_ = latents.shape
402
+ noise_latents = torch.randn_like(latents, device=latents.device)
403
+
404
+ if use_offset_noise:
405
+ offset_noise = torch.randn(b, c, f, 1, 1, device=latents.device)
406
+ noise_latents = noise_latents + noise_strength * offset_noise
407
+
408
+ return noise_latents
409
+
410
+
411
+ def enforce_zero_terminal_snr(betas):
412
+ """
413
+ Corrects noise in diffusion schedulers.
414
+ From: Common Diffusion Noise Schedules and Sample Steps are Flawed
415
+ https://arxiv.org/pdf/2305.08891.pdf
416
+ """
417
+ # Convert betas to alphas_bar_sqrt
418
+ alphas = 1 - betas
419
+ alphas_bar = alphas.cumprod(0)
420
+ alphas_bar_sqrt = alphas_bar.sqrt()
421
+
422
+ # Store old values.
423
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
424
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
425
+
426
+ # Shift so the last timestep is zero.
427
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
428
+
429
+ # Scale so the first timestep is back to the old value.
430
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (
431
+ alphas_bar_sqrt_0 - alphas_bar_sqrt_T
432
+ )
433
+
434
+ # Convert alphas_bar_sqrt to betas
435
+ alphas_bar = alphas_bar_sqrt ** 2
436
+ alphas = alphas_bar[1:] / alphas_bar[:-1]
437
+ alphas = torch.cat([alphas_bar[0:1], alphas])
438
+ betas = 1 - alphas
439
+
440
+ return betas
441
+
442
+
443
+ def should_sample(global_step, validation_steps, validation_data):
444
+ return global_step % validation_steps == 0 and validation_data.sample_preview
445
+
446
+
447
+ def save_pipe(
448
+ path,
449
+ global_step,
450
+ accelerator,
451
+ unet,
452
+ image_encoder,
453
+ vae,
454
+ output_dir,
455
+ lora_manager_spatial: LoraHandler,
456
+ lora_manager_temporal: LoraHandler,
457
+ unet_target_replace_module=None,
458
+ image_target_replace_module=None,
459
+ is_checkpoint=False,
460
+ save_pretrained_model=True,
461
+ ):
462
+ if is_checkpoint:
463
+ save_path = os.path.join(output_dir, f"checkpoint-{global_step}")
464
+ os.makedirs(save_path, exist_ok=True)
465
+ else:
466
+ save_path = output_dir
467
+
468
+ # Save the dtypes so we can continue training at the same precision.
469
+ u_dtype, i_dtype, v_dtype = unet.dtype, image_encoder.dtype, vae.dtype
470
+
471
+ # Copy the model without creating a reference to it. This allows keeping the state of our lora training if enabled.
472
+ unet_out = copy.deepcopy(accelerator.unwrap_model(unet.cpu(), keep_fp32_wrapper=False))
473
+ image_encoder_out = copy.deepcopy(accelerator.unwrap_model(image_encoder.cpu(), keep_fp32_wrapper=False))
474
+ pipeline = P2PStableVideoDiffusionPipeline.from_pretrained(
475
+ path,
476
+ unet=unet_out,
477
+ image_encoder=image_encoder_out,
478
+ vae=accelerator.unwrap_model(vae),
479
+ # torch_dtype=weight_dtype,
480
+ ).to(torch_dtype=torch.float32)
481
+
482
+ # lora_manager_spatial.save_lora_weights(model=copy.deepcopy(pipeline), save_path=save_path+'/spatial', step=global_step)
483
+ if lora_manager_temporal is not None:
484
+ lora_manager_temporal.save_lora_weights(model=copy.deepcopy(pipeline), save_path=save_path+'/temporal', step=global_step)
485
+
486
+ if save_pretrained_model:
487
+ pipeline.save_pretrained(save_path)
488
+
489
+ if is_checkpoint:
490
+ unet, image_encoder = accelerator.prepare(unet, image_encoder)
491
+ models_to_cast_back = [(unet, u_dtype), (image_encoder, i_dtype), (vae, v_dtype)]
492
+ [x[0].to(accelerator.device, dtype=x[1]) for x in models_to_cast_back]
493
+
494
+ logger.info(f"Saved model at {save_path} on step {global_step}")
495
+
496
+ del pipeline
497
+ del unet_out
498
+ del image_encoder_out
499
+ torch.cuda.empty_cache()
500
+ gc.collect()
501
+
502
+ def load_images_from_list(img_list):
503
+ images = []
504
+ valid_extensions = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tiff", ".webp"} # Add or remove extensions as needed
505
+
506
+ # Function to extract frame number from the filename
507
+ def frame_number(filename):
508
+ parts = filename.split('_')
509
+ if len(parts) > 1 and parts[0] == 'frame':
510
+ try:
511
+ return int(parts[1].split('.')[0]) # Extracting the number part
512
+ except ValueError:
513
+ return float('inf') # In case of non-integer part, place this file at the end
514
+ return float('inf') # Non-frame files are placed at the end
515
+
516
+ # Sorting files based on frame number
517
+ #sorted_files = sorted(os.listdir(folder), key=frame_number)
518
+ sorted_files = img_list
519
+
520
+ # Load images in sorted order
521
+ for filename in sorted_files:
522
+ ext = os.path.splitext(filename)[1].lower()
523
+ if ext in valid_extensions:
524
+ img = Image.open(filename).convert('RGB')
525
+ images.append(img)
526
+
527
+ return images
528
+
529
+ # copy from https://github.com/crowsonkb/k-diffusion.git
530
+ def stratified_uniform(shape, group=0, groups=1, dtype=None, device=None):
531
+ """Draws stratified samples from a uniform distribution."""
532
+ if groups <= 0:
533
+ raise ValueError(f"groups must be positive, got {groups}")
534
+ if group < 0 or group >= groups:
535
+ raise ValueError(f"group must be in [0, {groups})")
536
+ n = shape[-1] * groups
537
+ offsets = torch.arange(group, n, groups, dtype=dtype, device=device)
538
+ u = torch.rand(shape, dtype=dtype, device=device)
539
+ return (offsets + u) / n
540
+
541
+ def rand_cosine_interpolated(shape, image_d, noise_d_low, noise_d_high, sigma_data=1., min_value=1e-3, max_value=1e3, device='cpu', dtype=torch.float32):
542
+ """Draws samples from an interpolated cosine timestep distribution (from simple diffusion)."""
543
+
544
+ def logsnr_schedule_cosine(t, logsnr_min, logsnr_max):
545
+ t_min = math.atan(math.exp(-0.5 * logsnr_max))
546
+ t_max = math.atan(math.exp(-0.5 * logsnr_min))
547
+ return -2 * torch.log(torch.tan(t_min + t * (t_max - t_min)))
548
+
549
+ def logsnr_schedule_cosine_shifted(t, image_d, noise_d, logsnr_min, logsnr_max):
550
+ shift = 2 * math.log(noise_d / image_d)
551
+ return logsnr_schedule_cosine(t, logsnr_min - shift, logsnr_max - shift) + shift
552
+
553
+ def logsnr_schedule_cosine_interpolated(t, image_d, noise_d_low, noise_d_high, logsnr_min, logsnr_max):
554
+ logsnr_low = logsnr_schedule_cosine_shifted(
555
+ t, image_d, noise_d_low, logsnr_min, logsnr_max)
556
+ logsnr_high = logsnr_schedule_cosine_shifted(
557
+ t, image_d, noise_d_high, logsnr_min, logsnr_max)
558
+ return torch.lerp(logsnr_low, logsnr_high, t)
559
+
560
+ logsnr_min = -2 * math.log(min_value / sigma_data)
561
+ logsnr_max = -2 * math.log(max_value / sigma_data)
562
+ u = stratified_uniform(
563
+ shape, group=0, groups=1, dtype=dtype, device=device
564
+ )
565
+ logsnr = logsnr_schedule_cosine_interpolated(
566
+ u, image_d, noise_d_low, noise_d_high, logsnr_min, logsnr_max)
567
+ return torch.exp(-logsnr / 2) * sigma_data, u
568
+
569
+
570
+ min_value = 0.002
571
+ max_value = 700
572
+ image_d = 64
573
+ noise_d_low = 32
574
+ noise_d_high = 64
575
+ sigma_data = 0.5
576
+
577
+ def _compute_padding(kernel_size):
578
+ """Compute padding tuple."""
579
+ # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
580
+ # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
581
+ if len(kernel_size) < 2:
582
+ raise AssertionError(kernel_size)
583
+ computed = [k - 1 for k in kernel_size]
584
+
585
+ # for even kernels we need to do asymmetric padding :(
586
+ out_padding = 2 * len(kernel_size) * [0]
587
+
588
+ for i in range(len(kernel_size)):
589
+ computed_tmp = computed[-(i + 1)]
590
+
591
+ pad_front = computed_tmp // 2
592
+ pad_rear = computed_tmp - pad_front
593
+
594
+ out_padding[2 * i + 0] = pad_front
595
+ out_padding[2 * i + 1] = pad_rear
596
+
597
+ return out_padding
598
+
599
+ def _filter2d(input, kernel):
600
+ # prepare kernel
601
+ b, c, h, w = input.shape
602
+ tmp_kernel = kernel[:, None, ...].to(
603
+ device=input.device, dtype=input.dtype)
604
+
605
+ tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)
606
+
607
+ height, width = tmp_kernel.shape[-2:]
608
+
609
+ padding_shape: list[int] = _compute_padding([height, width])
610
+ input = torch.nn.functional.pad(input, padding_shape, mode="reflect")
611
+
612
+ # kernel and input tensor reshape to align element-wise or batch-wise params
613
+ tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
614
+ input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))
615
+
616
+ # convolve the tensor with the kernel.
617
+ output = torch.nn.functional.conv2d(
618
+ input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)
619
+
620
+ out = output.view(b, c, h, w)
621
+ return out
622
+
623
+
624
+ def _gaussian(window_size: int, sigma):
625
+ if isinstance(sigma, float):
626
+ sigma = torch.tensor([[sigma]])
627
+
628
+ batch_size = sigma.shape[0]
629
+
630
+ x = (torch.arange(window_size, device=sigma.device,
631
+ dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)
632
+
633
+ if window_size % 2 == 0:
634
+ x = x + 0.5
635
+
636
+ gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))
637
+
638
+ return gauss / gauss.sum(-1, keepdim=True)
639
+
640
+
641
+ def _gaussian_blur2d(input, kernel_size, sigma):
642
+ if isinstance(sigma, tuple):
643
+ sigma = torch.tensor([sigma], dtype=input.dtype)
644
+ else:
645
+ sigma = sigma.to(dtype=input.dtype)
646
+
647
+ ky, kx = int(kernel_size[0]), int(kernel_size[1])
648
+ bs = sigma.shape[0]
649
+ kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
650
+ kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
651
+ out_x = _filter2d(input, kernel_x[..., None, :])
652
+ out = _filter2d(out_x, kernel_y[..., None])
653
+
654
+ return out
655
+
656
+ def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
657
+ h, w = input.shape[-2:]
658
+ factors = (h / size[0], w / size[1])
659
+
660
+ # First, we have to determine sigma
661
+ # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
662
+ sigmas = (
663
+ max((factors[0] - 1.0) / 2.0, 0.001),
664
+ max((factors[1] - 1.0) / 2.0, 0.001),
665
+ )
666
+
667
+ # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
668
+ # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
669
+ # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
670
+ ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))
671
+
672
+ # Make sure it is odd
673
+ if (ks[0] % 2) == 0:
674
+ ks = ks[0] + 1, ks[1]
675
+
676
+ if (ks[1] % 2) == 0:
677
+ ks = ks[0], ks[1] + 1
678
+
679
+ input = _gaussian_blur2d(input, ks, sigmas)
680
+
681
+ output = torch.nn.functional.interpolate(
682
+ input, size=size, mode=interpolation, align_corners=align_corners)
683
+ return output
684
+
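+ # Fine-tune temporal (and optionally per-clip spatial) motion LoRAs of the SVD UNet on a
+ # single source clip; a consistency attention controller can be registered so training
+ # reuses the attention stored during inversion.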
685
+ def train_motion_lora(
686
+ pretrained_model_path,
687
+ output_dir: str,
688
+ train_dataset: SingleClipDataset,
689
+ validation_data: Dict,
690
+ edited_firstframes: List[Image.Image],
691
+ train_data: Dict,
692
+ validation_images: List[Image.Image],
693
+ validation_images_latents: List[torch.Tensor],
694
+ clip_id: int,
695
+ consistency_controller: attention_util.ConsistencyAttentionControl = None,
696
+ consistency_edit_controller_list: List[attention_util.ConsistencyAttentionControl] = [None,],
697
+ consistency_find_modules: Dict = {},
698
+ single_spatial_lora: bool = False,
699
+ train_temporal_lora: bool = True,
700
+ validation_steps: int = 100,
701
+ trainable_modules: Tuple[str] = None, # Eg: ("attn1", "attn2")
702
+ extra_unet_params=None,
703
+ train_batch_size: int = 1,
704
+ max_train_steps: int = 500,
705
+ learning_rate: float = 5e-5,
706
+ lr_scheduler: str = "constant",
707
+ lr_warmup_steps: int = 0,
708
+ adam_beta1: float = 0.9,
709
+ adam_beta2: float = 0.999,
710
+ adam_weight_decay: float = 1e-2,
711
+ adam_epsilon: float = 1e-08,
712
+ gradient_accumulation_steps: int = 1,
713
+ gradient_checkpointing: bool = False,
714
+ image_encoder_gradient_checkpointing: bool = False,
715
+ checkpointing_steps: int = 500,
716
+ resume_from_checkpoint: Optional[str] = None,
717
+ resume_step: Optional[int] = None,
718
+ mixed_precision: Optional[str] = "fp16",
719
+ use_8bit_adam: bool = False,
720
+ enable_xformers_memory_efficient_attention: bool = True,
721
+ enable_torch_2_attn: bool = False,
722
+ seed: Optional[int] = None,
723
+ use_offset_noise: bool = False,
724
+ rescale_schedule: bool = False,
725
+ offset_noise_strength: float = 0.1,
726
+ extend_dataset: bool = False,
727
+ cache_latents: bool = False,
728
+ cached_latent_dir=None,
729
+ use_unet_lora: bool = False,
730
+ unet_lora_modules: Tuple[str] = [],
731
+ image_encoder_lora_modules: Tuple[str] = [],
732
+ save_pretrained_model: bool = True,
733
+ lora_rank: int = 16,
734
+ lora_path: str = '',
735
+ lora_unet_dropout: float = 0.1,
736
+ logger_type: str = 'tensorboard',
737
+ **kwargs
738
+ ):
739
+
740
+ *_, config = inspect.getargvalues(inspect.currentframe())
741
+
742
+ accelerator = Accelerator(
743
+ gradient_accumulation_steps=gradient_accumulation_steps,
744
+ mixed_precision=mixed_precision,
745
+ log_with=logger_type,
746
+ project_dir=output_dir
747
+ )
748
+
749
+ # Make one log on every process with the configuration for debugging.
750
+ create_logging(logging, logger, accelerator)
751
+
752
+ # Initialize accelerate, transformers, and diffusers warnings
753
+ accelerate_set_verbose(accelerator)
754
+
755
+ # Handle the output folder creation
756
+ if accelerator.is_main_process:
757
+ output_dir = create_output_folders(output_dir, config, clip_id)
758
+
759
+ # Load scheduler, tokenizer and models.
760
+ noise_scheduler, feature_extractor, image_encoder, vae, unet = load_primary_models(pretrained_model_path)
761
+
762
+ # Freeze any necessary models
763
+ freeze_models([vae, image_encoder, unet])
764
+
765
+ # Enable xformers if available
766
+ handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet)
767
+
768
+ # Initialize the optimizer
769
+ optimizer_cls = get_optimizer(use_8bit_adam)
770
+
771
+ # Create parameters to optimize over with a condition (if "condition" is true, optimize it)
772
+ #extra_unet_params = extra_unet_params if extra_unet_params is not None else {}
773
+ #extra_text_encoder_params = extra_unet_params if extra_unet_params is not None else {}
774
+
775
+ # Temporal LoRA
776
+ if train_temporal_lora:
777
+ # one temporal lora
778
+ lora_manager_temporal = LoraHandler(use_unet_lora=use_unet_lora, unet_replace_modules=["TemporalBasicTransformerBlock"])
779
+
780
+ unet_lora_params_temporal, unet_negation_temporal = lora_manager_temporal.add_lora_to_model(
781
+ use_unet_lora, unet, lora_manager_temporal.unet_replace_modules, lora_unet_dropout,
782
+ lora_path + '/temporal/lora/', r=lora_rank)
783
+
784
+ optimizer_temporal = optimizer_cls(
785
+ create_optimizer_params([param_optim(unet_lora_params_temporal, use_unet_lora, is_lora=True,
786
+ extra_params={**{"lr": learning_rate}}
787
+ )], learning_rate),
788
+ lr=learning_rate,
789
+ betas=(adam_beta1, adam_beta2),
790
+ weight_decay=adam_weight_decay,
791
+ eps=adam_epsilon,
792
+ )
793
+
794
+ lr_scheduler_temporal = get_scheduler(
795
+ lr_scheduler,
796
+ optimizer=optimizer_temporal,
797
+ num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
798
+ num_training_steps=max_train_steps * gradient_accumulation_steps,
799
+ )
800
+ else:
801
+ lora_manager_temporal = None
802
+ unet_lora_params_temporal, unet_negation_temporal = [], []
803
+ optimizer_temporal = None
804
+ lr_scheduler_temporal = None
805
+
806
+ # Spatial LoRAs
807
+ if single_spatial_lora:
808
+ spatial_lora_num = 1
809
+ else:
810
+ # one spatial lora for each video
811
+ spatial_lora_num = train_dataset.__len__()
812
+
813
+ lora_managers_spatial = []
814
+ unet_lora_params_spatial_list = []
815
+ optimizer_spatial_list = []
816
+ lr_scheduler_spatial_list = []
817
+ for i in range(spatial_lora_num):
818
+ lora_manager_spatial = LoraHandler(use_unet_lora=use_unet_lora, unet_replace_modules=["BasicTransformerBlock"])
819
+ lora_managers_spatial.append(lora_manager_spatial)
820
+ unet_lora_params_spatial, unet_negation_spatial = lora_manager_spatial.add_lora_to_model(
821
+ use_unet_lora, unet, lora_manager_spatial.unet_replace_modules, lora_unet_dropout,
822
+ lora_path + '/spatial/lora/', r=lora_rank)
823
+
824
+ unet_lora_params_spatial_list.append(unet_lora_params_spatial)
825
+
826
+ optimizer_spatial = optimizer_cls(
827
+ create_optimizer_params([param_optim(unet_lora_params_spatial, use_unet_lora, is_lora=True,
828
+ extra_params={**{"lr": learning_rate}}
829
+ )], learning_rate),
830
+ lr=learning_rate,
831
+ betas=(adam_beta1, adam_beta2),
832
+ weight_decay=adam_weight_decay,
833
+ eps=adam_epsilon,
834
+ )
835
+
836
+ optimizer_spatial_list.append(optimizer_spatial)
837
+
838
+ # Scheduler
839
+ lr_scheduler_spatial = get_scheduler(
840
+ lr_scheduler,
841
+ optimizer=optimizer_spatial,
842
+ num_warmup_steps=lr_warmup_steps * gradient_accumulation_steps,
843
+ num_training_steps=max_train_steps * gradient_accumulation_steps,
844
+ )
845
+ lr_scheduler_spatial_list.append(lr_scheduler_spatial)
846
+
847
+ unet_negation_all = unet_negation_spatial + unet_negation_temporal
848
+
849
+ # DataLoaders creation:
850
+ train_dataloader = torch.utils.data.DataLoader(
851
+ train_dataset,
852
+ batch_size=train_batch_size,
853
+ shuffle=True
854
+ )
855
+
856
+ # Latents caching
857
+ cached_data_loader = handle_cache_latents(
858
+ cache_latents,
859
+ output_dir,
860
+ train_dataloader,
861
+ train_batch_size,
862
+ vae,
863
+ unet,
864
+ cached_latent_dir
865
+ )
866
+
867
+ if cached_data_loader is not None and train_data.get("use_data_aug") is None:
868
+ train_dataloader = cached_data_loader
869
+
870
+ # Prepare everything with our `accelerator`.
871
+ unet, optimizer_temporal, train_dataloader, lr_scheduler_temporal, image_encoder = accelerator.prepare(
872
+ unet,
873
+ optimizer_temporal,
874
+ train_dataloader,
875
+ lr_scheduler_temporal,
876
+ image_encoder
877
+ )
878
+
879
+ # Use Gradient Checkpointing if enabled.
880
+ unet_and_text_g_c(
881
+ unet,
882
+ image_encoder,
883
+ gradient_checkpointing,
884
+ image_encoder_gradient_checkpointing
885
+ )
886
+
887
+ # Enable VAE slicing to save memory.
888
+ #vae.enable_slicing()
889
+
890
+ # For mixed precision training we cast the text_encoder and vae weights to half-precision
891
+ # as these models are only used for inference, keeping weights in full precision is not required.
892
+ weight_dtype = is_mixed_precision(accelerator)
893
+
894
+ # Move text encoders, and VAE to GPU
895
+ models_to_cast = [image_encoder, vae]
896
+ cast_to_gpu_and_type(models_to_cast, accelerator, weight_dtype)
897
+
898
+ # Fix noise schedules to predict light and dark areas if available.
899
+ # if not use_offset_noise and rescale_schedule:
900
+ # noise_scheduler.betas = enforce_zero_terminal_snr(noise_scheduler.betas)
901
+
902
+ # We need to recalculate our total training steps as the size of the training dataloader may have changed.
903
+ num_update_steps_per_epoch = math.ceil(len(train_dataloader) / gradient_accumulation_steps)
904
+
905
+ # Afterwards we recalculate our number of training epochs
906
+ num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
907
+
908
+ # We need to initialize the trackers we use, and also store our configuration.
909
+ # The trackers initialize automatically on the main process.
910
+ if accelerator.is_main_process:
911
+ accelerator.init_trackers("svd-finetune")
912
+
913
+ # Train!
914
+ total_batch_size = train_batch_size * accelerator.num_processes * gradient_accumulation_steps
915
+
916
+ logger.info("***** Running training for motion lora *****")
917
+ logger.info(f" Num examples = {len(train_dataset)}")
918
+ logger.info(f" Num Epochs = {num_train_epochs}")
919
+ logger.info(f" Instantaneous batch size per device = {train_batch_size}")
920
+ logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
921
+ logger.info(f" Gradient Accumulation steps = {gradient_accumulation_steps}")
922
+ logger.info(f" Total optimization steps = {max_train_steps}")
923
+ global_step = 0
924
+ first_epoch = 0
925
+
926
+ def encode_image(pixel_values):
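+ # Resize to 224x224 with antialiasing, map pixel values from [-1, 1] to [0, 1],
+ # normalize with the CLIP feature extractor, and return per-image CLIP embeddings
+ # with a singleton sequence dimension.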
927
+ # pixel_values = pixel_values * 2.0 - 1.0
928
+ pixel_values = _resize_with_antialiasing(pixel_values, (224, 224))
929
+ pixel_values = (pixel_values + 1.0) / 2.0
930
+
931
+ # Normalize the image for CLIP input
932
+ pixel_values = feature_extractor(
933
+ images=pixel_values,
934
+ do_normalize=True,
935
+ do_center_crop=False,
936
+ do_resize=False,
937
+ do_rescale=False,
938
+ return_tensors="pt",
939
+ ).pixel_values
940
+
941
+ pixel_values = pixel_values.to(
942
+ device=accelerator.device, dtype=weight_dtype)
943
+ image_embeddings = image_encoder(pixel_values).image_embeds
944
+ image_embeddings= image_embeddings.unsqueeze(1)
945
+ return image_embeddings
946
+
947
+ def _get_add_time_ids(
948
+ fps,
949
+ motion_bucket_ids, # Expecting a list of tensor floats
950
+ noise_aug_strength,
951
+ dtype,
952
+ batch_size,
953
+ unet=None,
954
+ device=None, # Add a device parameter
955
+ ):
956
+ # Determine the target device
957
+ target_device = device if device is not None else 'cpu'
958
+
959
+ # Ensure motion_bucket_ids is a tensor and on the target device
960
+ if not isinstance(motion_bucket_ids, torch.Tensor):
961
+ motion_bucket_ids = torch.tensor(motion_bucket_ids, dtype=dtype, device=target_device)
962
+ else:
963
+ motion_bucket_ids = motion_bucket_ids.to(device=target_device)
964
+
965
+ # Reshape motion_bucket_ids if necessary
966
+ if motion_bucket_ids.dim() == 1:
967
+ motion_bucket_ids = motion_bucket_ids.view(-1, 1)
968
+
969
+ # Check for batch size consistency
970
+ if motion_bucket_ids.size(0) != batch_size:
971
+ raise ValueError("The length of motion_bucket_ids must match the batch_size.")
972
+
973
+ # Create fps and noise_aug_strength tensors on the target device
974
+ add_time_ids = torch.tensor([fps, noise_aug_strength], dtype=dtype, device=target_device).repeat(batch_size, 1)
975
+
976
+ # Concatenate with motion_bucket_ids
977
+ add_time_ids = torch.cat([add_time_ids, motion_bucket_ids], dim=1)
978
+
979
+ # Checking the dimensions of the added time embedding
980
+ passed_add_embed_dim = unet.config.addition_time_embed_dim * add_time_ids.size(1)
981
+ expected_add_embed_dim = unet.add_embedding.linear_1.in_features
982
+
983
+ if expected_add_embed_dim != passed_add_embed_dim:
984
+ raise ValueError(
985
+ f"Model expects an added time embedding vector of length {expected_add_embed_dim}, "
986
+ f"but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. "
987
+ "Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
988
+ )
989
+
990
+ return add_time_ids
991
+
992
+ # Only show the progress bar once on each machine.
993
+ progress_bar = tqdm(range(global_step, max_train_steps), disable=not accelerator.is_local_main_process)
994
+ progress_bar.set_description("Steps")
995
+
996
+ # set consistency controller
997
+ if consistency_controller is not None:
998
+ consistency_train_controller = attention_util.ConsistencyAttentionControl(
999
+ additional_attention_store=consistency_controller,
1000
+ use_inversion_attention=True,
1001
+ save_self_attention=False,
1002
+ save_latents=False,
1003
+ disk_store=True
1004
+ )
1005
+ attention_util.register_attention_control(
1006
+ unet,
1007
+ None,
1008
+ consistency_train_controller,
1009
+ find_modules={},
1010
+ consistency_find_modules=consistency_find_modules
1011
+ )
1012
+
1013
+ def finetune_unet(batch, step, mask_spatial_lora=False, mask_temporal_lora=False):
1014
+ nonlocal use_offset_noise
1015
+ nonlocal rescale_schedule
1016
+
1017
+
1018
+ # Unfreeze UNET Layers
1019
+ if global_step == 0:
1020
+ already_printed_trainables = False
1021
+ unet.train()
1022
+ handle_trainable_modules(
1023
+ unet,
1024
+ trainable_modules,
1025
+ is_enabled=True,
1026
+ negation=unet_negation_all
1027
+ )
1028
+
1029
+ # Convert videos to latent space
1030
+ #print("use_data_aug", train_data.get("use_data_aug"))
1031
+ if not cache_latents or train_data.get("use_data_aug") is not None:
1032
+ latents = tensor_to_vae_latent(batch["pixel_values"], vae)
1033
+ refer_latents = tensor_to_vae_latent(batch["refer_pixel_values"], vae)
1034
+ cross_latents = tensor_to_vae_latent(batch["cross_pixel_values"], vae)
1035
+ else:
1036
+ latents = batch["latents"]
1037
+ refer_latents = batch["refer_latents"]
1038
+ cross_latents = batch["cross_latents"]
1039
+
1040
+ # Sample noise that we'll add to the latents
1041
+ use_offset_noise = use_offset_noise and not rescale_schedule
1042
+ noise = sample_noise(latents, offset_noise_strength, use_offset_noise)
1043
+ noise_1 = sample_noise(latents, offset_noise_strength, False)
1044
+ bsz = latents.shape[0]
1045
+
1046
+ # Sample a random timestep for each video
1047
+ sigmas, u = rand_cosine_interpolated(shape=[bsz,], image_d=image_d, noise_d_low=noise_d_low, noise_d_high=noise_d_high,
1048
+ sigma_data=sigma_data, min_value=min_value, max_value=max_value)
1049
+ noise_scheduler.set_timesteps(validation_data.num_inference_steps, device=latents.device)
1050
+ all_sigmas = noise_scheduler.sigmas
1051
+ sigmas = sigmas.to(latents.device)
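+ # Map the sampled continuous sigma onto the nearest discrete inference-step index,
+ # so the consistency controller (if any) can be stepped to the matching stored step.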
1052
+ timestep = (validation_data.num_inference_steps - torch.searchsorted(all_sigmas.to(latents.device).flip(dims=(0,)), sigmas, right=False)).clamp(0,validation_data.num_inference_steps-1)[0]
1053
+ u = u.item()
1054
+ if consistency_controller is not None:
1055
+ #timestep = int(u * (validation_data.num_inference_steps-1)+0.5)
1056
+ #print("u", u, "timestep", timestep, "sigmas", sigmas, "all_sigmas", all_sigmas)
1057
+ consistency_train_controller.set_cur_step(timestep)
1058
+ # Add noise to the latents according to the noise magnitude at each timestep
1059
+ # (this is the forward diffusion process)
1060
+ sigmas_reshaped = sigmas.clone()
1061
+ while len(sigmas_reshaped.shape) < len(latents.shape):
1062
+ sigmas_reshaped = sigmas_reshaped.unsqueeze(-1)
1063
+
1064
+ # add noise to the latents or the original image?
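+ # A small amount of noise (strength 0.02) is added to the first-frame conditioning
+ # latents, mirroring SVD's conditioning noise augmentation; the same value is passed
+ # below as noise_aug_strength in the added time ids.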
1065
+ train_noise_aug = 0.02
1066
+ conditional_latents = refer_latents / vae.config.scaling_factor
1067
+ small_noise_latents = conditional_latents + noise_1[:,0:1,:,:,:] * train_noise_aug
1068
+ conditional_latents = small_noise_latents[:, 0, :, :, :]
1069
+
1070
+ noisy_latents = latents + noise * sigmas_reshaped
1071
+
1072
+ timesteps = torch.Tensor(
1073
+ [0.25 * sigma.log() for sigma in sigmas]).to(latents.device)
1074
+
1075
+ inp_noisy_latents = noisy_latents / ((sigmas_reshaped**2 + 1) ** 0.5)
1076
+
1077
+ # *Potentially* Fixes gradient checkpointing training.
1078
+ # See: https://github.com/prigoyal/pytorch_memonger/blob/master/tutorial/Checkpointing_for_PyTorch_models.ipynb
1079
+ if kwargs.get('eval_train', False):
1080
+ unet.eval()
1081
+ image_encoder.eval()
1082
+
1083
+ # Get the text embedding for conditioning.
1084
+ encoder_hidden_states = encode_image(
1085
+ batch["cross_pixel_values"][:, 0, :, :, :])
1086
+ detached_encoder_state = encoder_hidden_states.clone().detach()
1087
+
1088
+ added_time_ids = _get_add_time_ids(
1089
+ 6,
1090
+ batch["motion_values"],
1091
+ train_noise_aug, # noise_aug_strength (train_noise_aug = 0.02, set above)
1092
+ encoder_hidden_states.dtype,
1093
+ bsz,
1094
+ unet,
1095
+ device=latents.device
1096
+ )
1097
+ added_time_ids = added_time_ids.to(latents.device)
1098
+
1099
+ # check out the section 3.2.1 of the original paper https://arxiv.org/abs/2211.09800.
1100
+ conditioning_dropout_prob = kwargs.get('conditioning_dropout_prob')
1101
+ if conditioning_dropout_prob is not None:
1102
+ random_p = torch.rand(
1103
+ bsz, device=latents.device, generator=generator)
1104
+ # Sample masks for the edit prompts.
1105
+ prompt_mask = random_p < 2 * conditioning_dropout_prob
1106
+ prompt_mask = prompt_mask.reshape(bsz, 1, 1)
1107
+ # Final text conditioning.
1108
+ null_conditioning = torch.zeros_like(encoder_hidden_states)
1109
+ encoder_hidden_states = torch.where(
1110
+ prompt_mask, null_conditioning, encoder_hidden_states)
1111
+
1112
+ # Sample masks for the original images.
1113
+ image_mask_dtype = conditional_latents.dtype
1114
+ image_mask = 1 - (
1115
+ (random_p >= conditioning_dropout_prob).to(
1116
+ image_mask_dtype)
1117
+ * (random_p < 3 * conditioning_dropout_prob).to(image_mask_dtype)
1118
+ )
1119
+ image_mask = image_mask.reshape(bsz, 1, 1, 1)
1120
+ # Final image conditioning.
1121
+ conditional_latents = image_mask * conditional_latents
1122
+
1123
+ # Concatenate the `conditional_latents` with the `noisy_latents`.
1124
+ conditional_latents = conditional_latents.unsqueeze(
1125
+ 1).repeat(1, noisy_latents.shape[1], 1, 1, 1)
1126
+ inp_noisy_latents = torch.cat(
1127
+ [inp_noisy_latents, conditional_latents], dim=2)
1128
+
1129
+ # Get the target for loss depending on the prediction type
1130
+ # if noise_scheduler.config.prediction_type == "epsilon":
1131
+ # target = latents # we are computing loss against denoise latents
1132
+ # elif noise_scheduler.config.prediction_type == "v_prediction":
1133
+ # target = noise_scheduler.get_velocity(
1134
+ # latents, noise, timesteps)
1135
+ # else:
1136
+ # raise ValueError(
1137
+ # f"Unknown prediction type {noise_scheduler.config.prediction_type}")
1138
+
1139
+ target = latents
1140
+
1141
+ encoder_hidden_states = detached_encoder_state
1142
+
1143
+ if True:#mask_spatial_lora:
1144
+ loras = extract_lora_child_module(unet, target_replace_module=["BasicTransformerBlock"])
1145
+ for lora_i in loras:
1146
+ lora_i.scale = 0.
1147
+ loss_spatial = None
1148
+ else:
1149
+ loras = extract_lora_child_module(unet, target_replace_module=["BasicTransformerBlock"])
1150
+
1151
+ if spatial_lora_num == 1:
1152
+ for lora_i in loras:
1153
+ lora_i.scale = 1.
1154
+ else:
1155
+ for lora_i in loras:
1156
+ lora_i.scale = 0.
1157
+
1158
+ for lora_idx in range(0, len(loras), spatial_lora_num):
1159
+ loras[lora_idx + step].scale = 1.
1160
+
1161
+ loras = extract_lora_child_module(unet, target_replace_module=["TemporalBasicTransformerBlock"])
1162
+ if len(loras) > 0:
1163
+ for lora_i in loras:
1164
+ lora_i.scale = 0.
1165
+
1166
+ ran_idx = 0#torch.randint(0, noisy_latents.shape[2], (1,)).item()
1167
+
1168
+ #spatial_inp_noisy_latents = inp_noisy_refer_latents[:, ran_idx:ran_idx+1, :, :, :]
1169
+ inp_noisy_spatial_latents = inp_noisy_latents#[:, ran_idx:ran_idx+1, :, :, :]
1170
+
1171
+ target_spatial = latents#[:, ran_idx:ran_idx+1, :, :, :]
1172
+ # Predict the noise residual
1173
+ model_pred = unet(
1174
+ inp_noisy_spatial_latents, timesteps, encoder_hidden_states,
1175
+ added_time_ids
1176
+ ).sample
1177
+
1178
+ sigmas = sigmas_reshaped
1179
+ # Denoise the latents
1180
+ c_out = -sigmas / ((sigmas**2 + 1)**0.5)
1181
+ c_skip = 1 / (sigmas**2 + 1)
1182
+ denoised_latents = model_pred * c_out + c_skip * noisy_latents#[:, ran_idx:ran_idx+1, :, :, :]
1183
+ weighing = (1 + sigmas ** 2) * (sigmas**-2.0)
1184
+
1185
+ # MSE loss
1186
+ loss_spatial = torch.mean(
1187
+ (weighing.float() * (denoised_latents.float() -
1188
+ target_spatial.float()) ** 2).reshape(target_spatial.shape[0], -1),
1189
+ dim=1,
1190
+ )
1191
+ loss_spatial = loss_spatial.mean()
1192
+
1193
+ if mask_temporal_lora:
1194
+ loras = extract_lora_child_module(unet, target_replace_module=["TemporalBasicTransformerBlock"])
1195
+ for lora_i in loras:
1196
+ lora_i.scale = 0.
1197
+ loss_temporal = None
1198
+ else:
1199
+ loras = extract_lora_child_module(unet, target_replace_module=["TemporalBasicTransformerBlock"])
1200
+ for lora_i in loras:
1201
+ lora_i.scale = 1.
1202
+ # Predict the noise residual
1203
+ model_pred = unet(
1204
+ inp_noisy_latents, timesteps, encoder_hidden_states,
1205
+ added_time_ids=added_time_ids,
1206
+ ).sample
1207
+
1208
+ sigmas = sigmas_reshaped
1209
+ # Denoise the latents
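+ # c_skip and c_out are EDM-style preconditioning coefficients (effectively sigma_data = 1):
+ # the raw UNet output is combined with the noisy latents to form a prediction of the clean
+ # latents, which the weighted MSE below compares against the target.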
1210
+ c_out = -sigmas / ((sigmas**2 + 1)**0.5)
1211
+ c_skip = 1 / (sigmas**2 + 1)
1212
+ denoised_latents = model_pred * c_out + c_skip * noisy_latents
1213
+ if consistency_controller is not None:
1214
+ consistency_train_controller.step_callback(denoised_latents.detach())
1215
+ weighing = (1 + sigmas ** 2) * (sigmas**-2.0)
1216
+
1217
+ # MSE loss
1218
+ loss_temporal = torch.mean(
1219
+ (weighing.float() * (denoised_latents.float() -
1220
+ target.float()) ** 2).reshape(target.shape[0], -1),
1221
+ dim=1,
1222
+ )
1223
+ loss_temporal = loss_temporal.mean()
1224
+
1225
+ # beta = 1
1226
+ # alpha = (beta ** 2 + 1) ** 0.5
1227
+ # ran_idx = torch.randint(0, model_pred.shape[1], (1,)).item()
1228
+ # model_pred_decent = alpha * model_pred - beta * model_pred[:, ran_idx, :, :, :].unsqueeze(1)
1229
+ # target_decent = alpha * target - beta * target[:, ran_idx, :, :, :].unsqueeze(1)
1230
+ # loss_ad_temporal = F.mse_loss(model_pred_decent.float(), target_decent.float(), reduction="mean")
1231
+ loss_temporal = loss_temporal #+ loss_ad_temporal
1232
+
1233
+ return loss_spatial, loss_temporal, latents, noise
1234
+
1235
+ for epoch in range(first_epoch, num_train_epochs):
1236
+ train_loss_spatial = 0.0
1237
+ train_loss_temporal = 0.0
1238
+
1239
+
1240
+ for step, batch in enumerate(train_dataloader):
1241
+ #torch.cuda.empty_cache()
1242
+ # Skip steps until we reach the resumed step
1243
+ if resume_from_checkpoint and epoch == first_epoch and step < resume_step:
1244
+ if step % gradient_accumulation_steps == 0:
1245
+ progress_bar.update(1)
1246
+ continue
1247
+
1248
+ with accelerator.accumulate(unet):
1249
+
1250
+ for optimizer_spatial in optimizer_spatial_list:
1251
+ optimizer_spatial.zero_grad(set_to_none=True)
1252
+
1253
+ if optimizer_temporal is not None:
1254
+ optimizer_temporal.zero_grad(set_to_none=True)
1255
+
1256
+ if train_temporal_lora:
1257
+ mask_temporal_lora = False
1258
+ else:
1259
+ mask_temporal_lora = True
1260
+ if False:#clip_id != 0:
1261
+ mask_spatial_lora = random.uniform(0, 1) < 0.2 and not mask_temporal_lora
1262
+ else:
1263
+ mask_spatial_lora = True
1264
+
1265
+ with accelerator.autocast():
1266
+ loss_spatial, loss_temporal, latents, init_noise = finetune_unet(batch, step, mask_spatial_lora=mask_spatial_lora, mask_temporal_lora=mask_temporal_lora)
1267
+
1268
+ # Gather the losses across all processes for logging (if we use distributed training).
1269
+ if not mask_spatial_lora:
1270
+ avg_loss_spatial = accelerator.gather(loss_spatial.repeat(train_batch_size)).mean()
1271
+ train_loss_spatial += avg_loss_spatial.item() / gradient_accumulation_steps
1272
+
1273
+ if not mask_temporal_lora and train_temporal_lora:
1274
+ avg_loss_temporal = accelerator.gather(loss_temporal.repeat(train_batch_size)).mean()
1275
+ train_loss_temporal += avg_loss_temporal.item() / gradient_accumulation_steps
1276
+
1277
+ # Backpropagate
1278
+ if not mask_spatial_lora:
1279
+ accelerator.backward(loss_spatial, retain_graph=True)
1280
+ if spatial_lora_num == 1:
1281
+ optimizer_spatial_list[0].step()
1282
+ else:
1283
+ optimizer_spatial_list[step].step()
1284
+ if spatial_lora_num == 1:
1285
+ lr_scheduler_spatial_list[0].step()
1286
+ else:
1287
+ lr_scheduler_spatial_list[step].step()
1288
+
1289
+ if not mask_temporal_lora and train_temporal_lora:
1290
+ accelerator.backward(loss_temporal)
1291
+ optimizer_temporal.step()
1292
+
1293
+ if lr_scheduler_temporal is not None:
1294
+ lr_scheduler_temporal.step()
1295
+
1296
+ # Checks if the accelerator has performed an optimization step behind the scenes
1297
+ if accelerator.sync_gradients:
1298
+ progress_bar.update(1)
1299
+ global_step += 1
1300
+ accelerator.log({"train_loss": train_loss_temporal}, step=global_step)
1301
+ train_loss_temporal = 0.0
1302
+ if global_step % checkpointing_steps == 0 and global_step > 0:
1303
+ save_pipe(
1304
+ pretrained_model_path,
1305
+ global_step,
1306
+ accelerator,
1307
+ unet,
1308
+ image_encoder,
1309
+ vae,
1310
+ output_dir,
1311
+ lora_manager_spatial,
1312
+ lora_manager_temporal,
1313
+ unet_lora_modules,
1314
+ image_encoder_lora_modules,
1315
+ is_checkpoint=True,
1316
+ save_pretrained_model=save_pretrained_model
1317
+ )
1318
+
1319
+ if should_sample(global_step, validation_steps, validation_data):
1320
+ if accelerator.is_main_process:
1321
+ with accelerator.autocast():
1322
+ unet.eval()
1323
+ image_encoder.eval()
1324
+ generator = torch.Generator(device="cpu")
1325
+ generator.manual_seed(seed)
1326
+ unet_and_text_g_c(unet, image_encoder, False, False)
1327
+ loras = extract_lora_child_module(unet, target_replace_module=["BasicTransformerBlock"])
1328
+ for lora_i in loras:
1329
+ lora_i.scale = 0.0
1330
+
1331
+ if consistency_controller is not None:
1332
+ attention_util.register_attention_control(
1333
+ unet,
1334
+ None,
1335
+ consistency_train_controller,
1336
+ find_modules={},
1337
+ consistency_find_modules=consistency_find_modules,
1338
+ undo=True
1339
+ )
1340
+
1341
+ pipeline = P2PStableVideoDiffusionPipeline.from_pretrained(
1342
+ pretrained_model_path,
1343
+ image_encoder=image_encoder,
1344
+ vae=vae,
1345
+ unet=unet
1346
+ )
1347
+ if consistency_controller is not None:
1348
+ pipeline.scheduler = P2PEulerDiscreteScheduler.from_config(pipeline.scheduler.config)
1349
+
1350
+ # # recalculate inversed noise latent
1351
+ # if any([np > 0. for np in validation_data.noise_prior]):
1352
+ # pixel_values_for_inv = batch['pixel_values_for_inv'].to('cuda', dtype=torch.float16)
1353
+ # batch['inversion_noise'] = inverse_video(pipeline, batch['latents_for_inv'], 25, pixel_values_for_inv[:,0,:,:,:])
1354
+
1355
+ preset_noises = []
1356
+ for noise_prior in validation_data.noise_prior:
1357
+ if noise_prior > 0:
1358
+ assert batch['inversion_noise'] is not None, "inversion_noise should not be None when noise_prior > 0"
1359
+ preset_noise = (noise_prior) ** 0.5 * batch['inversion_noise'] + (
1360
+ 1-noise_prior) ** 0.5 * torch.randn_like(batch['inversion_noise'])
1361
+ #print("preset noise", torch.mean(preset_noise), torch.std(preset_noise))
1362
+ else:
1363
+ preset_noise = None
1364
+ preset_noises.append( preset_noise )
1365
+
1366
+ for val_img_idx in range(len(validation_images)):
1367
+ for i in range(len(preset_noises)):
1368
+
1369
+ if consistency_controller is not None:
1370
+ consistency_edit_controller = attention_util.ConsistencyAttentionControl(
1371
+ additional_attention_store=consistency_edit_controller_list[val_img_idx],
1372
+ use_inversion_attention=False,
1373
+ save_self_attention=False,
1374
+ save_latents=False,
1375
+ disk_store=True
1376
+ )
1377
+ attention_util.register_attention_control(
1378
+ pipeline.unet,
1379
+ None,
1380
+ consistency_edit_controller,
1381
+ find_modules={},
1382
+ consistency_find_modules=consistency_find_modules,
1383
+ )
1384
+ pipeline.scheduler.controller = [consistency_edit_controller]
1385
+
1386
+ preset_noise = preset_noises[i]
1387
+ save_filename = f"step_{global_step}_noise_{i}_{val_img_idx}"
1388
+
1389
+ out_file = f"{output_dir}/samples/{save_filename}.mp4"
1390
+
1391
+ val_img = validation_images[val_img_idx]
1392
+ edited_firstframe = edited_firstframes[val_img_idx]
1393
+ original_res = val_img.size
1394
+ resctrl = ResolutionControl(
1395
+ (original_res[1],original_res[0]),
1396
+ (validation_data.height, validation_data.width),
1397
+ validation_data.get("pad_to_fit", False),
1398
+ fill=0
1399
+ )
1400
+
1401
+ #val_img = Image.open("white.png").convert("RGB")
1402
+ val_img = resctrl(val_img)
1403
+ edited_firstframe = resctrl(edited_firstframe)
1404
+
1405
+ with torch.no_grad():
1406
+ video_frames = pipeline(
1407
+ val_img,
1408
+ edited_firstframe=edited_firstframe,
1409
+ image_latents=validation_images_latents[val_img_idx],
1410
+ width=validation_data.width,
1411
+ height=validation_data.height,
1412
+ num_frames=batch["pixel_values"].shape[1],
1413
+ decode_chunk_size=8,
1414
+ motion_bucket_id=127,
1415
+ fps=validation_data.get('fps', 7),
1416
+ noise_aug_strength=0.02,
1417
+ generator=generator,
1418
+ num_inference_steps=validation_data.num_inference_steps,
1419
+ latents=preset_noise
1420
+ ).frames
1421
+ export_to_video(video_frames, out_file, validation_data.get('fps', 7), resctrl)
1422
+ if consistency_controller is not None:
1423
+ attention_util.register_attention_control(
1424
+ pipeline.unet,
1425
+ None,
1426
+ consistency_edit_controller,
1427
+ find_modules={},
1428
+ consistency_find_modules=consistency_find_modules,
1429
+ undo=True
1430
+ )
1431
+ consistency_edit_controller.delete()
1432
+ del consistency_edit_controller
1433
+ logger.info(f"Saved a new sample to {out_file}")
1434
+ if consistency_controller is not None:
1435
+ attention_util.register_attention_control(
1436
+ unet,
1437
+ None,
1438
+ consistency_train_controller,
1439
+ find_modules={},
1440
+ consistency_find_modules=consistency_find_modules,
1441
+ )
1442
+ del pipeline
1443
+ torch.cuda.empty_cache()
1444
+
1445
+ unet_and_text_g_c(
1446
+ unet,
1447
+ image_encoder,
1448
+ gradient_checkpointing,
1449
+ image_encoder_gradient_checkpointing
1450
+ )
1451
+
1452
+ if loss_temporal is not None:
1453
+ accelerator.log({"loss_temporal": loss_temporal.detach().item()}, step=step)
1454
+
1455
+ if global_step >= max_train_steps:
1456
+ break
1457
+
1458
+ # Create the pipeline using the trained modules and save it.
1459
+ accelerator.wait_for_everyone()
1460
+ if accelerator.is_main_process:
1461
+ save_pipe(
1462
+ pretrained_model_path,
1463
+ global_step,
1464
+ accelerator,
1465
+ unet,
1466
+ image_encoder,
1467
+ vae,
1468
+ output_dir,
1469
+ lora_manager_spatial,
1470
+ lora_manager_temporal,
1471
+ unet_lora_modules,
1472
+ image_encoder_lora_modules,
1473
+ is_checkpoint=False,
1474
+ save_pretrained_model=save_pretrained_model
1475
+ )
1476
+ accelerator.end_training()
1477
+
1478
+ if consistency_controller is not None:
1479
+ consistency_train_controller.delete()
1480
+ del consistency_train_controller
1481
+
1482
+
1483
+ if __name__ == "__main__":
1484
+ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
1485
+ parser = argparse.ArgumentParser()
1486
+ parser.add_argument("--config", type=str, default='./configs/config_multi_videos.yaml')
1487
+ args = parser.parse_args()
1488
+ train_motion_lora(**OmegaConf.load(args.config))
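The spatial and temporal objectives in finetune_unet above share the same EDM/EulerDiscrete preconditioning: the UNet output is folded back into the noisy latents through c_out and c_skip, and the squared error against the clean latents is reweighted by (1 + sigma^2) / sigma^2. A minimal, self-contained sketch of just that step follows; the helper name edm_denoise_and_loss and the toy tensor shapes are illustrative assumptions, not code from the repository.

import torch

def edm_denoise_and_loss(model_pred, noisy_latents, target, sigmas):
    # sigmas must broadcast against the latents, e.g. shape (B, 1, 1, 1, 1)
    c_out = -sigmas / ((sigmas ** 2 + 1) ** 0.5)
    c_skip = 1 / (sigmas ** 2 + 1)
    denoised = model_pred * c_out + c_skip * noisy_latents
    # (1 + sigma^2) / sigma^2 weighting, as in the training loop above
    weighing = (1 + sigmas ** 2) * (sigmas ** -2.0)
    loss = torch.mean(
        (weighing.float() * (denoised.float() - target.float()) ** 2).reshape(target.shape[0], -1),
        dim=1,
    ).mean()
    return denoised, loss

# toy usage with illustrative shapes (batch, frames, channels, height, width)
latents = torch.randn(1, 14, 4, 8, 8)
sigmas = torch.full((1, 1, 1, 1, 1), 0.7)
noisy = latents + torch.randn_like(latents) * sigmas
_, loss = edm_denoise_and_loss(torch.randn_like(latents), noisy, latents, sigmas)
print(loss.item())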
i2vedit/utils/__init__.py ADDED
File without changes
i2vedit/utils/bucketing.py ADDED
@@ -0,0 +1,32 @@
1
+ from PIL import Image
2
+
3
+ def min_res(size, min_size): return min_size if size < min_size else size
4
+
5
+ def up_down_bucket(m_size, in_size, direction):
6
+ if direction == 'down': return abs(int(m_size - in_size))
7
+ if direction == 'up': return abs(int(m_size + in_size))
8
+
9
+ def get_bucket_sizes(size, direction, min_size):
10
+ multipliers = [64, 128]
11
+ for i, m in enumerate(multipliers):
12
+ res = up_down_bucket(m, size, direction)
13
+ multipliers[i] = min_res(res, min_size=min_size)
14
+ return multipliers
15
+
16
+ def closest_bucket(m_size, size, direction, min_size):
17
+ lst = get_bucket_sizes(m_size, direction, min_size)
18
+ return lst[min(range(len(lst)), key=lambda i: abs(lst[i]-size))]
19
+
20
+ def resolve_bucket(i,h,w): return (i / (h / w))
21
+
22
+ def sensible_buckets(m_width, m_height, w, h, min_size=192):
23
+ if h > w:
24
+ w = resolve_bucket(m_width, h, w)
25
+ w = closest_bucket(m_width, w, 'down', min_size=min_size)
26
+ return w, m_height
27
+ if h < w:
28
+ h = resolve_bucket(m_height, w, h)
29
+ h = closest_bucket(m_height, h, 'down', min_size=min_size)
30
+ return m_width, h
31
+
32
+ return m_width, m_height
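For orientation, sensible_buckets keeps the requested size on the longer side of the input and snaps the other side to a nearby bucket derived from the 64/128 multipliers, so the source aspect ratio is approximately preserved with a minimum edge of min_size. A small illustrative call, assuming the package is installed so that i2vedit.utils.bucketing is importable:

from i2vedit.utils.bucketing import sensible_buckets

# ask for 512x512 buckets given a portrait 720x1280 source clip
width, height = sensible_buckets(512, 512, w=720, h=1280)
print(width, height)  # 384 512 for this input: the width is pulled toward an aspect-preserving bucket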
i2vedit/utils/dataset.py ADDED
@@ -0,0 +1,705 @@
1
+ import os
2
+ import decord
3
+ import numpy as np
4
+ import random
5
+ import json
6
+ import torchvision
7
+ import torchvision.transforms as T
8
+ import torch
9
+ from torchvision.transforms import Resize, Pad, InterpolationMode, ToTensor
10
+
11
+ from glob import glob
12
+ from PIL import Image
13
+ from itertools import islice
14
+ from pathlib import Path
15
+ from .bucketing import sensible_buckets
16
+
17
+ decord.bridge.set_bridge('torch')
18
+
19
+ from torch.utils.data import Dataset
20
+ from einops import rearrange, repeat
21
+
22
+ def pad_with_ratio(frames, res, fill=0):
23
+ process = False
24
+ if not isinstance(frames, torch.Tensor):
25
+ frames = ToTensor()(frames).unsqueeze(0)
26
+ process = True
27
+ _, _, ih, iw = frames.shape
28
+ # print("ih, iw", ih, iw)
29
+ i_ratio = ih / iw
30
+ h, w = res
31
+ # print("h,w", h ,w)
32
+ n_ratio = h / w
33
+ if i_ratio > n_ratio:
34
+ nw = int(ih / h * w)
35
+ # print("nw", nw)
36
+ frames = Pad((nw - iw)//2, fill=fill)(frames)
37
+ frames = frames[...,(nw - iw)//2:-(nw - iw)//2,:]
38
+ else:
39
+ nh = int(iw / w * h)
40
+ frames = Pad((nh - ih)//2, fill=fill)(frames)
41
+ frames = frames[...,:,(nh - ih)//2:-(nh - ih)//2]
42
+ # print("after pad", frames.shape)
43
+ if process:
44
+ frames = (frames * 255.).type(torch.uint8).permute(0,2,3,1).squeeze().cpu().numpy()
45
+ frames = Image.fromarray(frames)
46
+ return frames
47
+
48
+ def return_to_original_res(frames, res, pad_to_fix=False):
49
+ process = False
50
+ if not isinstance(frames, torch.Tensor):
51
+ frames = ToTensor()(frames).unsqueeze(0)
52
+ process = True
53
+
54
+ # print("original res", res)
55
+ _, _, h, w = frames.shape
56
+ # print("h w", h, w)
57
+ n_ratio = h / w
58
+ ih, iw = res
59
+ i_ratio = ih / iw
60
+ if pad_to_fix:
61
+ if i_ratio > n_ratio:
62
+ nw = int(ih / h * w)
63
+ frames = Resize((ih, iw+2*(nw - iw)//2), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
64
+ frames = frames[...,:,(nw - iw)//2:-(nw - iw)//2]
65
+ else:
66
+ nh = int(iw / w * h)
67
+ frames = Resize((ih+2*(nh - ih)//2, iw), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
68
+
69
+ frames = frames[...,(nh - ih)//2:-(nh - ih)//2,:]
70
+ else:
71
+ frames = Resize((ih, iw), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
72
+
73
+ if process:
74
+ frames = (frames * 255.).type(torch.uint8).permute(0,2,3,1).squeeze().cpu().numpy()
75
+ frames = Image.fromarray(frames)
76
+
77
+ return frames
78
+
79
+ def get_prompt_ids(prompt, tokenizer):
80
+ prompt_ids = tokenizer(
81
+ prompt,
82
+ truncation=True,
83
+ padding="max_length",
84
+ max_length=tokenizer.model_max_length,
85
+ return_tensors="pt",
86
+ ).input_ids
87
+
88
+ return prompt_ids
89
+
90
+
91
+ def read_caption_file(caption_file):
92
+ with open(caption_file, 'r', encoding="utf8") as t:
93
+ return t.read()
94
+
95
+
96
+ def get_text_prompt(
97
+ text_prompt: str = '',
98
+ fallback_prompt: str= '',
99
+ file_path:str = '',
100
+ ext_types=['.mp4'],
101
+ use_caption=False
102
+ ):
103
+ try:
104
+ if use_caption:
105
+ if len(text_prompt) > 1: return text_prompt
106
+ caption_file = ''
107
+ # Use caption on per-video basis (One caption PER video)
108
+ for ext in ext_types:
109
+ maybe_file = file_path.replace(ext, '.txt')
110
+ if maybe_file.endswith(tuple(ext_types)): continue
111
+ if os.path.exists(maybe_file):
112
+ caption_file = maybe_file
113
+ break
114
+
115
+ if os.path.exists(caption_file):
116
+ return read_caption_file(caption_file)
117
+
118
+ # Return fallback prompt if no conditions are met.
119
+ return fallback_prompt
120
+
121
+ return text_prompt
122
+ except:
123
+ print(f"Couldn't read prompt caption for {file_path}. Using fallback.")
124
+ return fallback_prompt
125
+
126
+
127
+ def get_video_frames(vr, start_idx, sample_rate=1, max_frames=24):
128
+ max_range = len(vr)
129
+ frame_number = sorted((0, start_idx, max_range))[1]
130
+
131
+ frame_range = range(frame_number, max_range, sample_rate)
132
+ frame_range_indices = list(frame_range)[:max_frames]
133
+
134
+ return frame_range_indices
135
+
136
+
137
+ def process_video(vid_path, use_bucketing, w, h, get_frame_buckets, get_frame_batch, pad_to_fix=False, use_aug=False):
138
+ use_aug = False
139
+ if use_bucketing:
140
+ vr = decord.VideoReader(vid_path)
141
+ resize = get_frame_buckets(vr)
142
+ video = get_frame_batch(vr, resize=resize)
143
+
144
+ else:
145
+ if not pad_to_fix:
146
+ vr = decord.VideoReader(vid_path, width=w, height=h)
147
+ video = get_frame_batch(vr, use_aug=use_aug)
148
+ else:
149
+ vr = decord.VideoReader(vid_path)
150
+ video = get_frame_batch(vr, use_aug=use_aug)
151
+ video = pad_with_ratio(video, (h, w))
152
+ video = T.transforms.Resize((h, w), antialias=True)(video)
153
+
154
+ return video, vr
155
+
156
+
157
+ # https://github.com/ExponentialML/Video-BLIP2-Preprocessor
158
+ class VideoJsonDataset(Dataset):
159
+ def __init__(
160
+ self,
161
+ tokenizer = None,
162
+ width: int = 256,
163
+ height: int = 256,
164
+ n_sample_frames: int = 4,
165
+ sample_start_idx: int = 1,
166
+ frame_step: int = 1,
167
+ json_path: str ="",
168
+ json_data = None,
169
+ vid_data_key: str = "video_path",
170
+ preprocessed: bool = False,
171
+ use_bucketing: bool = False,
172
+ **kwargs
173
+ ):
174
+ self.vid_types = (".mp4", ".avi", ".mov", ".webm", ".flv", ".mjpeg")
175
+ self.use_bucketing = use_bucketing
176
+ self.tokenizer = tokenizer
177
+ self.preprocessed = preprocessed
178
+
179
+ self.vid_data_key = vid_data_key
180
+ self.train_data = self.load_from_json(json_path, json_data)
181
+
182
+ self.width = width
183
+ self.height = height
184
+
185
+ self.n_sample_frames = n_sample_frames
186
+ self.sample_start_idx = sample_start_idx
187
+ self.frame_step = frame_step
188
+
189
+ def build_json(self, json_data):
190
+ extended_data = []
191
+ for data in json_data['data']:
192
+ for nested_data in data['data']:
193
+ self.build_json_dict(
194
+ data,
195
+ nested_data,
196
+ extended_data
197
+ )
198
+ json_data = extended_data
199
+ return json_data
200
+
201
+ def build_json_dict(self, data, nested_data, extended_data):
202
+ clip_path = nested_data['clip_path'] if 'clip_path' in nested_data else None
203
+
204
+ extended_data.append({
205
+ self.vid_data_key: data[self.vid_data_key],
206
+ 'frame_index': nested_data['frame_index'],
207
+ 'prompt': nested_data['prompt'],
208
+ 'clip_path': clip_path
209
+ })
210
+
211
+ def load_from_json(self, path, json_data):
212
+ try:
213
+ with open(path) as jpath:
214
+ print(f"Loading JSON from {path}")
215
+ json_data = json.load(jpath)
216
+
217
+ return self.build_json(json_data)
218
+
219
+ except:
220
+ self.train_data = []
221
+ print("Non-existent JSON path. Skipping.")
222
+
223
+ def validate_json(self, base_path, path):
224
+ return os.path.exists(f"{base_path}/{path}")
225
+
226
+ def get_frame_range(self, vr):
227
+ return get_video_frames(
228
+ vr,
229
+ self.sample_start_idx,
230
+ self.frame_step,
231
+ self.n_sample_frames
232
+ )
233
+
234
+ def get_vid_idx(self, vr, vid_data=None):
235
+ frames = self.n_sample_frames
236
+
237
+ if vid_data is not None:
238
+ idx = vid_data['frame_index']
239
+ else:
240
+ idx = self.sample_start_idx
241
+
242
+ return idx
243
+
244
+ def get_frame_buckets(self, vr):
245
+ _, h, w = vr[0].shape
246
+ width, height = sensible_buckets(self.width, self.height, h, w)
247
+ # width, height = self.width, self.height
248
+ resize = T.transforms.Resize((height, width), antialias=True)
249
+
250
+ return resize
251
+
252
+ def get_frame_batch(self, vr, resize=None):
253
+ frame_range = self.get_frame_range(vr)
254
+ frames = vr.get_batch(frame_range)
255
+ video = rearrange(frames, "f h w c -> f c h w")
256
+
257
+ if resize is not None: video = resize(video)
258
+ return video
259
+
260
+ def process_video_wrapper(self, vid_path):
261
+ video, vr = process_video(
262
+ vid_path,
263
+ self.use_bucketing,
264
+ self.width,
265
+ self.height,
266
+ self.get_frame_buckets,
267
+ self.get_frame_batch
268
+ )
269
+
270
+ return video, vr
271
+
272
+ def train_data_batch(self, index):
273
+
274
+ # If we are training on individual clips.
275
+ if 'clip_path' in self.train_data[index] and \
276
+ self.train_data[index]['clip_path'] is not None:
277
+
278
+ vid_data = self.train_data[index]
279
+
280
+ clip_path = vid_data['clip_path']
281
+
282
+ # Get video prompt
283
+ prompt = vid_data['prompt']
284
+
285
+ video, _ = self.process_video_wrapper(clip_path)
286
+
287
+ prompt_ids = get_prompt_ids(prompt, self.tokenizer)
288
+
289
+ return video, prompt, prompt_ids
290
+
291
+ # Assign train data
292
+ train_data = self.train_data[index]
293
+
294
+ # Get the frame of the current index.
295
+ self.sample_start_idx = train_data['frame_index']
296
+
297
+ # Initialize resize
298
+ resize = None
299
+
300
+ video, vr = self.process_video_wrapper(train_data[self.vid_data_key])
301
+
302
+ # Get video prompt
303
+ prompt = train_data['prompt']
304
+ vr.seek(0)
305
+
306
+ prompt_ids = get_prompt_ids(prompt, self.tokenizer)
307
+
308
+ return video, prompt, prompt_ids
309
+
310
+ @staticmethod
311
+ def __getname__(): return 'json'
312
+
313
+ def __len__(self):
314
+ if self.train_data is not None:
315
+ return len(self.train_data)
316
+ else:
317
+ return 0
318
+
319
+ def __getitem__(self, index):
320
+
321
+ # Initialize variables
322
+ video = None
323
+ prompt = None
324
+ prompt_ids = None
325
+
326
+ # Use default JSON training
327
+ if self.train_data is not None:
328
+ video, prompt, prompt_ids = self.train_data_batch(index)
329
+
330
+ example = {
331
+ "pixel_values": (video / 127.5 - 1.0),
332
+ "prompt_ids": prompt_ids[0],
333
+ "text_prompt": prompt,
334
+ 'dataset': self.__getname__()
335
+ }
336
+
337
+ return example
338
+
339
+
340
+ class SingleVideoDataset(Dataset):
341
+ def __init__(
342
+ self,
343
+ width: int = 256,
344
+ height: int = 256,
345
+ inversion_width: int = 256,
346
+ inversion_height: int = 256,
347
+ start_t: float=0,
348
+ end_t: float=-1,
349
+ sample_fps: int=-1,
350
+ single_video_path: str = "",
351
+ refer_image_path: str = "",
352
+ use_caption: bool = False,
353
+ use_bucketing: bool = False,
354
+ pad_to_fix: bool = False,
355
+ use_aug: bool = False,
356
+ **kwargs
357
+ ):
358
+ self.use_bucketing = use_bucketing
359
+ self.frames = []
360
+ self.index = 1
361
+
362
+ self.vid_types = (".mp4", ".avi", ".mov", ".webm", ".flv", ".mjpeg")
363
+ self.start_t = start_t
364
+ self.end_t = end_t
365
+ self.output_fps = sample_fps
366
+
367
+ self.single_video_path = single_video_path
368
+ self.refer_image_path = refer_image_path
369
+
370
+ self.width = width
371
+ self.height = height
372
+ self.inversion_width = inversion_width
373
+ self.inversion_height = inversion_height
374
+
375
+ self.pad_to_fix = pad_to_fix
376
+
377
+ self.use_aug = use_aug
378
+ #self.data_augment = ControlNetDataAugmentation()
379
+
380
+ def create_video_chunks(self):
381
+ output_fps = self.output_fps
382
+ start_t = self.start_t
383
+ end_t = self.end_t
384
+ vr = decord.VideoReader(self.single_video_path)
385
+ initial_fps = vr.get_avg_fps()
386
+ if output_fps == -1:
387
+ output_fps = int(initial_fps)
388
+ if end_t == -1:
389
+ end_t = len(vr) / initial_fps
390
+ else:
391
+ end_t = min(len(vr) / initial_fps, end_t)
392
+ assert 0 <= start_t < end_t
393
+ assert output_fps > 0
394
+ start_f_ind = int(start_t * initial_fps)
395
+ end_f_ind = int(end_t * initial_fps)
396
+ num_f = int((end_t - start_t) * output_fps)
397
+ sample_idx = np.linspace(start_f_ind, end_f_ind, num_f, endpoint=False).astype(int)
398
+ self.frames = [sample_idx]
399
+ return self.frames
400
+
401
+ def chunk(self, it, size):
402
+ it = iter(it)
403
+ return iter(lambda: tuple(islice(it, size)), ())
404
+
405
+ def get_frame_batch(self, vr, resize=None, use_aug=False):
406
+ index = self.index
407
+ frames = vr.get_batch(self.frames[self.index])
408
+
409
+ if use_aug:
410
+ frames = self.data_augment.augment(frames)
411
+ print(frames.min(), frames.max())
412
+
413
+ video = rearrange(frames, "f h w c -> f c h w")
414
+
415
+ if resize is not None: video = resize(video)
416
+ return video
417
+
418
+ def get_frame_buckets(self, vr):
419
+ h, w, c = vr[0].shape
420
+ width, height = sensible_buckets(self.width, self.height, w, h)
421
+ resize = T.transforms.Resize((height, width), antialias=True)
422
+
423
+ return resize
424
+
425
+ def process_video_wrapper(self, vid_path):
426
+ video, vr = process_video(
427
+ vid_path,
428
+ self.use_bucketing,
429
+ self.width,
430
+ self.height,
431
+ self.get_frame_buckets,
432
+ self.get_frame_batch,
433
+ self.pad_to_fix,
434
+ self.use_aug
435
+ )
436
+ video_for_inversion, vr = process_video(
437
+ vid_path,
438
+ self.use_bucketing,
439
+ self.inversion_width,
440
+ self.inversion_height,
441
+ self.get_frame_buckets,
442
+ self.get_frame_batch,
443
+ self.pad_to_fix
444
+ )
445
+
446
+ return video, video_for_inversion, vr
447
+
448
+ def image_batch(self):
449
+ train_data = self.refer_image_path
450
+ img = train_data
451
+
452
+ try:
453
+ img = torchvision.io.read_image(img, mode=torchvision.io.ImageReadMode.RGB)
454
+ except:
455
+ img = T.transforms.PILToTensor()(Image.open(img).convert("RGB"))
456
+
457
+ width = self.width
458
+ height = self.height
459
+
460
+ if self.use_bucketing:
461
+ _, h, w = img.shape
462
+ width, height = sensible_buckets(width, height, w, h)
463
+
464
+ resize = T.transforms.Resize((height, width), antialias=True)
465
+
466
+ img = resize(img)
467
+ img = repeat(img, 'c h w -> f c h w', f=1)
468
+
469
+ return img
470
+
471
+ def single_video_batch(self, index):
472
+ train_data = self.single_video_path
473
+ self.index = index
474
+
475
+ if train_data.endswith(self.vid_types):
476
+ video, video_for_inv, _ = self.process_video_wrapper(train_data)
477
+
478
+ return video, video_for_inv
479
+ else:
480
+ raise ValueError(f"Single video is not a video type. Types: {self.vid_types}")
481
+
482
+ @staticmethod
483
+ def __getname__(): return 'single_video'
484
+
485
+ def __len__(self):
486
+
487
+ return len(self.create_video_chunks())
488
+
489
+ def __getitem__(self, index):
490
+
491
+ video, video_for_inv = self.single_video_batch(index)
492
+ image = self.image_batch()
493
+ motion_values = torch.Tensor([127.])
494
+
495
+ example = {
496
+ "pixel_values": (video / 127.5 - 1.0),
497
+ "pixel_values_for_inv": (video_for_inv / 127.5 - 1.0),
498
+ "refer_pixel_values": (image / 127.5 - 1.0),
499
+ "motion_values": motion_values,
500
+ 'dataset': self.__getname__()
501
+ }
502
+
503
+ return example
504
+
505
+
506
+ class ImageDataset(Dataset):
507
+
508
+ def __init__(
509
+ self,
510
+ tokenizer = None,
511
+ width: int = 256,
512
+ height: int = 256,
513
+ base_width: int = 256,
514
+ base_height: int = 256,
515
+ use_caption: bool = False,
516
+ image_dir: str = '',
517
+ single_img_prompt: str = '',
518
+ use_bucketing: bool = False,
519
+ fallback_prompt: str = '',
520
+ **kwargs
521
+ ):
522
+ self.tokenizer = tokenizer
523
+ self.img_types = (".png", ".jpg", ".jpeg", '.bmp')
524
+ self.use_bucketing = use_bucketing
525
+
526
+ self.image_dir = self.get_images_list(image_dir)
527
+ self.fallback_prompt = fallback_prompt
528
+
529
+ self.use_caption = use_caption
530
+ self.single_img_prompt = single_img_prompt
531
+
532
+ self.width = width
533
+ self.height = height
534
+
535
+ def get_images_list(self, image_dir):
536
+ if os.path.exists(image_dir):
537
+ imgs = [x for x in os.listdir(image_dir) if x.endswith(self.img_types)]
538
+ full_img_dir = []
539
+
540
+ for img in imgs:
541
+ full_img_dir.append(f"{image_dir}/{img}")
542
+
543
+ return sorted(full_img_dir)
544
+
545
+ return ['']
546
+
547
+ def image_batch(self, index):
548
+ train_data = self.image_dir[index]
549
+ img = train_data
550
+
551
+ try:
552
+ img = torchvision.io.read_image(img, mode=torchvision.io.ImageReadMode.RGB)
553
+ except:
554
+ img = T.transforms.PILToTensor()(Image.open(img).convert("RGB"))
555
+
556
+ width = self.width
557
+ height = self.height
558
+
559
+ if self.use_bucketing:
560
+ _, h, w = img.shape
561
+ width, height = sensible_buckets(width, height, w, h)
562
+
563
+ resize = T.transforms.Resize((height, width), antialias=True)
564
+
565
+ img = resize(img)
566
+ img = repeat(img, 'c h w -> f c h w', f=16)
567
+
568
+ prompt = get_text_prompt(
569
+ file_path=train_data,
570
+ text_prompt=self.single_img_prompt,
571
+ fallback_prompt=self.fallback_prompt,
572
+ ext_types=self.img_types,
573
+ use_caption=True
574
+ )
575
+ prompt_ids = get_prompt_ids(prompt, self.tokenizer)
576
+
577
+ return img, prompt, prompt_ids
578
+
579
+ @staticmethod
580
+ def __getname__(): return 'image'
581
+
582
+ def __len__(self):
583
+ # Image directory
584
+ if os.path.exists(self.image_dir[0]):
585
+ return len(self.image_dir)
586
+ else:
587
+ return 0
588
+
589
+ def __getitem__(self, index):
590
+ img, prompt, prompt_ids = self.image_batch(index)
591
+ example = {
592
+ "pixel_values": (img / 127.5 - 1.0),
593
+ "prompt_ids": prompt_ids[0],
594
+ "text_prompt": prompt,
595
+ 'dataset': self.__getname__()
596
+ }
597
+
598
+ return example
599
+
600
+
601
+ class VideoFolderDataset(Dataset):
602
+ def __init__(
603
+ self,
604
+ tokenizer=None,
605
+ width: int = 256,
606
+ height: int = 256,
607
+ n_sample_frames: int = 16,
608
+ fps: int = 8,
609
+ path: str = "./data",
610
+ fallback_prompt: str = "",
611
+ use_bucketing: bool = False,
612
+ **kwargs
613
+ ):
614
+ self.tokenizer = tokenizer
615
+ self.use_bucketing = use_bucketing
616
+
617
+ self.fallback_prompt = fallback_prompt
618
+
619
+ self.video_files = glob(f"{path}/*.mp4")
620
+
621
+ self.width = width
622
+ self.height = height
623
+
624
+ self.n_sample_frames = n_sample_frames
625
+ self.fps = fps
626
+
627
+ def get_frame_buckets(self, vr):
628
+ h, w, c = vr[0].shape
629
+ width, height = sensible_buckets(self.width, self.height, w, h)
630
+ resize = T.transforms.Resize((height, width), antialias=True)
631
+
632
+ return resize
633
+
634
+ def get_frame_batch(self, vr, resize=None):
635
+ n_sample_frames = self.n_sample_frames
636
+ native_fps = vr.get_avg_fps()
637
+
638
+ every_nth_frame = max(1, round(native_fps / self.fps))
639
+ every_nth_frame = min(len(vr), every_nth_frame)
640
+
641
+ effective_length = len(vr) // every_nth_frame
642
+ if effective_length < n_sample_frames:
643
+ n_sample_frames = effective_length
644
+
645
+ effective_idx = random.randint(0, (effective_length - n_sample_frames))
646
+ idxs = every_nth_frame * np.arange(effective_idx, effective_idx + n_sample_frames)
647
+
648
+ video = vr.get_batch(idxs)
649
+ video = rearrange(video, "f h w c -> f c h w")
650
+
651
+ if resize is not None: video = resize(video)
652
+ return video, vr
653
+
654
+ def process_video_wrapper(self, vid_path):
655
+ video, vr = process_video(
656
+ vid_path,
657
+ self.use_bucketing,
658
+ self.width,
659
+ self.height,
660
+ self.get_frame_buckets,
661
+ self.get_frame_batch
662
+ )
663
+ return video, vr
664
+
665
+ def get_prompt_ids(self, prompt):
666
+ return self.tokenizer(
667
+ prompt,
668
+ truncation=True,
669
+ padding="max_length",
670
+ max_length=self.tokenizer.model_max_length,
671
+ return_tensors="pt",
672
+ ).input_ids
673
+
674
+ @staticmethod
675
+ def __getname__(): return 'folder'
676
+
677
+ def __len__(self):
678
+ return len(self.video_files)
679
+
680
+ def __getitem__(self, index):
681
+
682
+ video, _ = self.process_video_wrapper(self.video_files[index])
683
+
684
+ prompt = self.fallback_prompt
685
+
686
+ prompt_ids = self.get_prompt_ids(prompt)
687
+
688
+ return {"pixel_values": (video[0] / 127.5 - 1.0), "prompt_ids": prompt_ids[0], "text_prompt": prompt, 'dataset': self.__getname__()}
689
+
690
+
691
+ class CachedDataset(Dataset):
692
+ def __init__(self,cache_dir: str = ''):
693
+ self.cache_dir = cache_dir
694
+ self.cached_data_list = self.get_files_list()
695
+
696
+ def get_files_list(self):
697
+ tensors_list = [f"{self.cache_dir}/{x}" for x in os.listdir(self.cache_dir) if x.endswith('.pt')]
698
+ return sorted(tensors_list)
699
+
700
+ def __len__(self):
701
+ return len(self.cached_data_list)
702
+
703
+ def __getitem__(self, index):
704
+ cached_latent = torch.load(self.cached_data_list[index], map_location='cuda:0')
705
+ return cached_latent
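A minimal sketch of how SingleVideoDataset might be consumed during LoRA training; the paths and resolutions below are placeholders (in practice they come from the YAML config), and decord must be installed for the video reader to work.

from torch.utils.data import DataLoader
from i2vedit.utils.dataset import SingleVideoDataset

dataset = SingleVideoDataset(
    single_video_path="path/to/source.mp4",   # placeholder path
    refer_image_path="path/to/ref.jpg",       # placeholder path
    width=512, height=512,
    inversion_width=512, inversion_height=512,
    start_t=0, end_t=-1, sample_fps=7,
)
loader = DataLoader(dataset, batch_size=1, shuffle=False)
batch = next(iter(loader))
# pixel_values: (1, num_frames, 3, 512, 512), scaled to roughly [-1, 1]
print(batch["pixel_values"].shape, batch["refer_pixel_values"].shape)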
i2vedit/utils/euler_utils.py ADDED
@@ -0,0 +1,226 @@
1
+ import os
2
+ import numpy as np
3
+ from PIL import Image
4
+ from typing import Union
5
+ import copy
6
+ from scipy.stats import anderson
7
+
8
+ import torch
9
+
10
+ from tqdm import tqdm
11
+ from diffusers import StableVideoDiffusionPipeline
12
+ from i2vedit.prompt_attention import attention_util
13
+
14
+ # Euler Inversion
15
+ @torch.no_grad()
16
+ def init_image(image, firstframe, pipeline):
17
+ if isinstance(image, torch.Tensor):
18
+ height, width = image.shape[-2:]
19
+ image = (image + 1) / 2. * 255.
20
+ image = image.type(torch.uint8).squeeze().permute(1,2,0).cpu().numpy()
21
+ image = Image.fromarray(image)
22
+ if isinstance(firstframe, torch.Tensor):
23
+ firstframe = (firstframe + 1) / 2. * 255.
24
+ firstframe = firstframe.type(torch.uint8).squeeze().permute(1,2,0).cpu().numpy()
25
+ firstframe = Image.fromarray(firstframe)
26
+
27
+ device = pipeline._execution_device
28
+ image_embeddings = pipeline._encode_image(firstframe, device, 1, False)
29
+ image = pipeline.image_processor.preprocess(image, height=height, width=width)
30
+ firstframe = pipeline.image_processor.preprocess(firstframe, height=height, width=width)
31
+ #print(image.dtype)
32
+ noise = torch.randn(image.shape, device=image.device, dtype=image.dtype)
33
+ image = image + 0.02 * noise
34
+ firstframe = firstframe + 0.02 * noise
35
+ #print(image.dtype)
36
+ image_latents = pipeline._encode_vae_image(image, device, 1, False)
37
+ firstframe_latents = pipeline._encode_vae_image(firstframe, device, 1, False)
38
+ image_latents = image_latents.to(image_embeddings.dtype)
39
+ firstframe_latents = firstframe_latents.to(image_embeddings.dtype)
40
+
41
+ return image_embeddings, image_latents, firstframe_latents
42
+
43
+
44
+ def next_step(model_output: Union[torch.FloatTensor, np.ndarray], sigma, sigma_next,
45
+ sample: Union[torch.FloatTensor, np.ndarray], euler_scheduler, controller=None, consistency_controller=None):
46
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
47
+ if controller is not None:
48
+ pred_original_sample = controller.step_callback(pred_original_sample)
49
+ if consistency_controller is not None:
50
+ pred_original_sample = consistency_controller.step_callback(pred_original_sample)
51
+ #print("sample", sample.mean(), sample.std(), "pred_original_sample", pred_original_sample.mean())
52
+ #pred_original_sample = sample.mean() - pred_original_sample.mean() + pred_original_sample
53
+ next_sample = sample + (sigma_next - sigma) * (sample - pred_original_sample) / sigma
54
+ #print(sigma, sigma_next)
55
+ #print("next sample", torch.mean(next_sample), torch.std(next_sample))
56
+ return next_sample
57
+
58
+
59
+ def get_model_pred_single(latents, t, image_embeddings, added_time_ids, unet):
60
+ noise_pred = unet(
61
+ latents,
62
+ t,
63
+ encoder_hidden_states=image_embeddings,
64
+ added_time_ids=added_time_ids,
65
+ return_dict=False,
66
+ )[0]
67
+ return noise_pred
68
+
69
+ @torch.no_grad()
70
+ def euler_loop(pipeline, euler_scheduler, latents, num_inv_steps, image, firstframe, controller=None, consistency_controller=None):
71
+ device = pipeline._execution_device
72
+
73
+ # prepare image conditions
74
+ image_embeddings, image_latents, firstframe_latents = init_image(image, firstframe, pipeline)
75
+ skip = 1#latents.shape[1]
76
+ image_latents = torch.cat(
77
+ [
78
+ image_latents.unsqueeze(1).repeat(1, skip, 1, 1, 1),
79
+ firstframe_latents.unsqueeze(1).repeat(1, latents.shape[1]-skip, 1, 1, 1)
80
+ ],
81
+ dim=1
82
+ )
83
+ #image_latents = image_latents.unsqueeze(1).repeat(1, latents.shape[1], 1, 1, 1)
84
+
85
+ # Get Added Time IDs
86
+ added_time_ids = pipeline._get_add_time_ids(
87
+ 8,
88
+ 127,
89
+ 0.02,
90
+ image_embeddings.dtype,
91
+ 1,
92
+ 1,
93
+ False
94
+ )
95
+ added_time_ids = added_time_ids.to(device)
96
+
97
+ # Prepare timesteps
98
+ euler_scheduler.set_timesteps(num_inv_steps, device=device)
99
+ sigmas_0 = euler_scheduler.sigmas[-2] * euler_scheduler.sigmas[-2] / euler_scheduler.sigmas[-3]
100
+ timesteps = torch.cat([euler_scheduler.timesteps[1:],torch.Tensor([0.25 * sigmas_0.log()]).to(device)])
101
+ sigmas = copy.deepcopy(euler_scheduler.sigmas)
102
+ sigmas[-1] = sigmas_0
103
+ #print(sigmas)
104
+
105
+ # prepare latents
106
+ all_latent = [latents]
107
+ latents = latents.clone().detach()
108
+
109
+ for i in tqdm(range(num_inv_steps)):
110
+ t = timesteps[len(timesteps) -i -1]
111
+ sigma = sigmas[len(sigmas) -i -1]
112
+ sigma_next = sigmas[len(sigmas) -i -2]
113
+ latent_model_input = latents
114
+ latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)
115
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
116
+ model_pred = get_model_pred_single(latent_model_input, t, image_embeddings, added_time_ids, pipeline.unet)
117
+ latents = next_step(model_pred, sigma, sigma_next, latents, euler_scheduler, controller=controller, consistency_controller=consistency_controller)
118
+ all_latent.append(latents)
119
+ all_latent[-1] = all_latent[-1] / ((sigmas[0]**2 + 1)**0.5)
120
+ return all_latent
121
+
122
+
123
+ @torch.no_grad()
124
+ def euler_inversion(pipeline, euler_scheduler, video_latent, num_inv_steps, image, firstframe, controller=None, consistency_controller=None):
125
+ euler_latents = euler_loop(pipeline, euler_scheduler, video_latent, num_inv_steps, image, firstframe, controller=controller, consistency_controller=consistency_controller)
126
+ return euler_latents
127
+
128
+ from diffusers import EulerDiscreteScheduler
129
+ from .model_utils import tensor_to_vae_latent, load_primary_models, handle_memory_attention
130
+
131
+ def inverse_video(
132
+ pretrained_model_path,
133
+ video,
134
+ keyframe,
135
+ firstframe,
136
+ num_steps,
137
+ resctrl=None,
138
+ sard=None,
139
+ enable_xformers_memory_efficient_attention=True,
140
+ enable_torch_2_attn=False,
141
+ store_controller = None,
142
+ consistency_store_controller = None,
143
+ find_modules={},
144
+ consistency_find_modules={},
145
+ sarp_noise_scale=0.002,
146
+ ):
147
+ dtype = torch.float32
148
+
149
+ # check if inverted latents exists
150
+ for _controller in [store_controller, consistency_store_controller]:
151
+ if _controller is not None:
152
+ if os.path.exists(os.path.join(_controller.store_dir, "inverted_latents.pt")):
153
+ euler_inv_latent = torch.load(os.path.join(_controller.store_dir, "inverted_latents.pt")).to("cuda", dtype)
154
+ print(f"Successfully load inverted latents from {os.path.join(_controller.store_dir, 'inverted_latents.pt')}")
155
+ return euler_inv_latent
156
+
157
+ # prepare model, Load scheduler, tokenizer and models.
158
+ noise_scheduler, feature_extractor, image_encoder, vae, unet = load_primary_models(pretrained_model_path)
159
+
160
+ # Enable xformers if available
161
+ handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet)
162
+
163
+ vae.to('cuda', dtype=dtype)
164
+ unet.to('cuda')
165
+ pipe = StableVideoDiffusionPipeline.from_pretrained(
166
+ pretrained_model_path,
167
+ feature_extractor=feature_extractor,
168
+ image_encoder=image_encoder,
169
+ vae=vae,
170
+ unet=unet
171
+ )
172
+ pipe.image_encoder.to('cuda')
173
+
174
+ attention_util.register_attention_control(
175
+ pipe.unet,
176
+ store_controller,
177
+ consistency_store_controller,
178
+ find_modules=find_modules,
179
+ consistency_find_modules=consistency_find_modules
180
+ )
181
+ if store_controller is not None:
182
+ store_controller.LOW_RESOURCE = True
183
+
184
+ video_for_inv = torch.cat([firstframe,keyframe,video],dim=1).to(dtype)
185
+ #print(video_for_inv.shape)
186
+ if resctrl is not None:
187
+ video_for_inv = resctrl(video_for_inv)
188
+ if sard is not None:
189
+ indx = sard.detection(video_for_inv, 0.001)
190
+ #import cv2
191
+ #cv2.imwrite("indx.png", indx[0,0,:,:,:].permute(1,2,0).type(torch.uint8).cpu().numpy()*255)
192
+ noise = torch.randn(video_for_inv.shape, device=video.device, dtype=video.dtype)
193
+ video_for_inv[indx] = video_for_inv[indx] + noise[indx] * sarp_noise_scale
194
+ video_for_inv = video_for_inv.clamp(-1,1)
195
+
196
+ firstframe, keyframe, video_for_inv = video_for_inv.tensor_split([1,2],dim=1)
197
+
198
+ #print("video for inv", video_for_inv.mean(), video_for_inv.std())
199
+ latents_for_inv = tensor_to_vae_latent(video_for_inv, vae)
200
+ #noise = torch.randn(latents_for_inv.shape, device=video.device, dtype=video.dtype)
201
+ #latents_for_inv = latents_for_inv + noise * sarp_noise_scale
202
+ #print("video latent for inv", latents_for_inv.mean(), latents_for_inv.std(), latents_for_inv.shape)
203
+
204
+ euler_inv_scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
205
+ euler_inv_scheduler.set_timesteps(num_steps)
206
+
207
+ euler_inv_latent = euler_inversion(
208
+ pipe, euler_inv_scheduler, video_latent=latents_for_inv.to(pipe.device),
209
+ num_inv_steps=num_steps, image=keyframe[:,0,:,:,:], firstframe=firstframe[:,0,:,:,:], controller=store_controller, consistency_controller=consistency_store_controller)[-1]
210
+
211
+ torch.cuda.empty_cache()
212
+ del pipe
213
+
214
+ #res = anderson(euler_inv_latent.cpu().view(-1).numpy())
215
+ #print(euler_inv_latent.mean(), euler_inv_latent.std())
216
+ #print(res.statistic)
217
+ #print(res.critical_values)
218
+ #print(res.significance_level)
219
+
220
+ # save inverted latents
221
+ for _controller in [store_controller, consistency_store_controller]:
222
+ if _controller is not None:
223
+ torch.save(euler_inv_latent, os.path.join(_controller.store_dir, "inverted_latents.pt"))
224
+ break
225
+
226
+ return euler_inv_latent
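The inversion above walks the EulerDiscrete probability-flow ODE backwards: at each step the denoised estimate is recovered with the same c_skip/c_out preconditioning used during training, and the latents are then extrapolated from the current sigma to the next, larger sigma. A stripped-down sketch of a single inversion step with toy values; the helper name euler_inversion_step and the shapes are illustrative only.

import torch

def euler_inversion_step(sample, model_output, sigma, sigma_next):
    # denoised estimate under the EDM parameterisation
    pred_original = model_output * (-sigma / (sigma ** 2 + 1) ** 0.5) + sample / (sigma ** 2 + 1)
    # Euler step of the probability-flow ODE toward the larger sigma
    derivative = (sample - pred_original) / sigma
    return sample + (sigma_next - sigma) * derivative

x = torch.randn(1, 14, 4, 8, 8)   # toy latents
eps = torch.randn_like(x)         # stand-in for the UNet output
x_next = euler_inversion_step(x, eps, sigma=0.5, sigma_next=0.8)
print(x_next.shape)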
i2vedit/utils/lora.py ADDED
@@ -0,0 +1,1493 @@
1
+ import json
2
+ import math
3
+ from itertools import groupby
4
+ import os
5
+ from typing import Callable, Dict, List, Optional, Set, Tuple, Type, Union
6
+
7
+ import numpy as np
8
+ import PIL
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+ try:
14
+ from safetensors.torch import safe_open
15
+ from safetensors.torch import save_file as safe_save
16
+
17
+ safetensors_available = True
18
+ except ImportError:
19
+ from .safe_open import safe_open
20
+
21
+ def safe_save(
22
+ tensors: Dict[str, torch.Tensor],
23
+ filename: str,
24
+ metadata: Optional[Dict[str, str]] = None,
25
+ ) -> None:
26
+ raise EnvironmentError(
27
+ "Saving safetensors requires the safetensors library. Please install with pip or similar."
28
+ )
29
+
30
+ safetensors_available = False
31
+
32
+
33
+ class LoraInjectedLinear(nn.Module):
34
+ def __init__(
35
+ self, in_features, out_features, bias=False, r=4, dropout_p=0.1, scale=1.0
36
+ ):
37
+ super().__init__()
38
+
39
+ if r > min(in_features, out_features):
40
+ #raise ValueError(
41
+ # f"LoRA rank {r} must be less or equal than {min(in_features, out_features)}"
42
+ #)
43
+ print(f"LoRA rank {r} is too large. setting to: {min(in_features, out_features)}")
44
+ r = min(in_features, out_features)
45
+
46
+ self.r = r
47
+ self.linear = nn.Linear(in_features, out_features, bias)
48
+ self.lora_down = nn.Linear(in_features, r, bias=False)
49
+ self.dropout = nn.Dropout(dropout_p)
50
+ self.lora_up = nn.Linear(r, out_features, bias=False)
51
+ self.scale = scale
52
+ self.selector = nn.Identity()
53
+
54
+ nn.init.normal_(self.lora_down.weight, std=1 / r)
55
+ nn.init.zeros_(self.lora_up.weight)
56
+
57
+ def forward(self, input):
58
+ return (
59
+ self.linear(input)
60
+ + self.dropout(self.lora_up(self.selector(self.lora_down(input))))
61
+ * self.scale
62
+ )
63
+
64
+ def realize_as_lora(self):
65
+ return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
66
+
67
+ def set_selector_from_diag(self, diag: torch.Tensor):
68
+ # diag is a 1D tensor of size (r,)
69
+ assert diag.shape == (self.r,)
70
+ self.selector = nn.Linear(self.r, self.r, bias=False)
71
+ self.selector.weight.data = torch.diag(diag)
72
+ self.selector.weight.data = self.selector.weight.data.to(
73
+ self.lora_up.weight.device
74
+ ).to(self.lora_up.weight.dtype)
75
+
76
+
77
+ class MultiLoraInjectedLinear(nn.Module):
78
+ def __init__(
79
+ self, in_features, out_features, bias=False, r=4, dropout_p=0.1, lora_num=1, scales=[1.0]
80
+ ):
81
+ super().__init__()
82
+
83
+ if r > min(in_features, out_features):
84
+ #raise ValueError(
85
+ # f"LoRA rank {r} must be less or equal than {min(in_features, out_features)}"
86
+ #)
87
+ print(f"LoRA rank {r} is too large. setting to: {min(in_features, out_features)}")
88
+ r = min(in_features, out_features)
89
+
90
+ self.r = r
91
+ self.linear = nn.Linear(in_features, out_features, bias)
92
+
93
+ for i in range(lora_num):
94
+ if i==0:
95
+ self.lora_down =[nn.Linear(in_features, r, bias=False)]
96
+ self.dropout = [nn.Dropout(dropout_p)]
97
+ self.lora_up = [nn.Linear(r, out_features, bias=False)]
98
+ self.scale = scales[i]
99
+ self.selector = [nn.Identity()]
100
+ else:
101
+ self.lora_down.append(nn.Linear(in_features, r, bias=False))
102
+ self.dropout.append( nn.Dropout(dropout_p))
103
+ self.lora_up.append( nn.Linear(r, out_features, bias=False))
104
+ self.scale.append(scales[i])
105
+
106
+ nn.init.normal_(self.lora_down.weight, std=1 / r)
107
+ nn.init.zeros_(self.lora_up.weight)
108
+
109
+ def forward(self, input):
110
+ return (
111
+ self.linear(input)
112
+ + self.dropout(self.lora_up(self.selector(self.lora_down(input))))
113
+ * self.scale
114
+ )
115
+
116
+ def realize_as_lora(self):
117
+ return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
118
+
119
+ def set_selector_from_diag(self, diag: torch.Tensor):
120
+ # diag is a 1D tensor of size (r,)
121
+ assert diag.shape == (self.r,)
122
+ self.selector = nn.Linear(self.r, self.r, bias=False)
123
+ self.selector.weight.data = torch.diag(diag)
124
+ self.selector.weight.data = self.selector.weight.data.to(
125
+ self.lora_up.weight.device
126
+ ).to(self.lora_up.weight.dtype)
127
+
128
+
129
+ class LoraInjectedConv2d(nn.Module):
130
+ def __init__(
131
+ self,
132
+ in_channels: int,
133
+ out_channels: int,
134
+ kernel_size,
135
+ stride=1,
136
+ padding=0,
137
+ dilation=1,
138
+ groups: int = 1,
139
+ bias: bool = True,
140
+ r: int = 4,
141
+ dropout_p: float = 0.1,
142
+ scale: float = 1.0,
143
+ ):
144
+ super().__init__()
145
+ if r > min(in_channels, out_channels):
146
+ print(f"LoRA rank {r} is too large. setting to: {min(in_channels, out_channels)}")
147
+ r = min(in_channels, out_channels)
148
+
149
+ self.r = r
150
+ self.conv = nn.Conv2d(
151
+ in_channels=in_channels,
152
+ out_channels=out_channels,
153
+ kernel_size=kernel_size,
154
+ stride=stride,
155
+ padding=padding,
156
+ dilation=dilation,
157
+ groups=groups,
158
+ bias=bias,
159
+ )
160
+
161
+ self.lora_down = nn.Conv2d(
162
+ in_channels=in_channels,
163
+ out_channels=r,
164
+ kernel_size=kernel_size,
165
+ stride=stride,
166
+ padding=padding,
167
+ dilation=dilation,
168
+ groups=groups,
169
+ bias=False,
170
+ )
171
+ self.dropout = nn.Dropout(dropout_p)
172
+ self.lora_up = nn.Conv2d(
173
+ in_channels=r,
174
+ out_channels=out_channels,
175
+ kernel_size=1,
176
+ stride=1,
177
+ padding=0,
178
+ bias=False,
179
+ )
180
+ self.selector = nn.Identity()
181
+ self.scale = scale
182
+
183
+ nn.init.normal_(self.lora_down.weight, std=1 / r)
184
+ nn.init.zeros_(self.lora_up.weight)
185
+
186
+ def forward(self, input):
187
+ return (
188
+ self.conv(input)
189
+ + self.dropout(self.lora_up(self.selector(self.lora_down(input))))
190
+ * self.scale
191
+ )
192
+
193
+ def realize_as_lora(self):
194
+ return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
195
+
196
+ def set_selector_from_diag(self, diag: torch.Tensor):
197
+ # diag is a 1D tensor of size (r,)
198
+ assert diag.shape == (self.r,)
199
+ self.selector = nn.Conv2d(
200
+ in_channels=self.r,
201
+ out_channels=self.r,
202
+ kernel_size=1,
203
+ stride=1,
204
+ padding=0,
205
+ bias=False,
206
+ )
207
+ self.selector.weight.data = torch.diag(diag)
208
+
209
+ # same device + dtype as lora_up
210
+ self.selector.weight.data = self.selector.weight.data.to(
211
+ self.lora_up.weight.device
212
+ ).to(self.lora_up.weight.dtype)
213
+
214
+ class LoraInjectedConv3d(nn.Module):
215
+ def __init__(
216
+ self,
217
+ in_channels: int,
218
+ out_channels: int,
219
+ kernel_size: (3, 1, 1),
220
+ padding: (1, 0, 0),
221
+ bias: bool = False,
222
+ r: int = 4,
223
+ dropout_p: float = 0,
224
+ scale: float = 1.0,
225
+ ):
226
+ super().__init__()
227
+ if r > min(in_channels, out_channels):
228
+ print(f"LoRA rank {r} is too large. setting to: {min(in_channels, out_channels)}")
229
+ r = min(in_channels, out_channels)
230
+
231
+ self.r = r
232
+ self.kernel_size = kernel_size
233
+ self.padding = padding
234
+ self.conv = nn.Conv3d(
235
+ in_channels=in_channels,
236
+ out_channels=out_channels,
237
+ kernel_size=kernel_size,
238
+ padding=padding,
239
+ )
240
+
241
+ self.lora_down = nn.Conv3d(
242
+ in_channels=in_channels,
243
+ out_channels=r,
244
+ kernel_size=kernel_size,
245
+ bias=False,
246
+ padding=padding
247
+ )
248
+ self.dropout = nn.Dropout(dropout_p)
249
+ self.lora_up = nn.Conv3d(
250
+ in_channels=r,
251
+ out_channels=out_channels,
252
+ kernel_size=1,
253
+ stride=1,
254
+ padding=0,
255
+ bias=False,
256
+ )
257
+ self.selector = nn.Identity()
258
+ self.scale = scale
259
+
260
+ nn.init.normal_(self.lora_down.weight, std=1 / r)
261
+ nn.init.zeros_(self.lora_up.weight)
262
+
263
+ def forward(self, input):
264
+ return (
265
+ self.conv(input)
266
+ + self.dropout(self.lora_up(self.selector(self.lora_down(input))))
267
+ * self.scale
268
+ )
269
+
270
+ def realize_as_lora(self):
271
+ return self.lora_up.weight.data * self.scale, self.lora_down.weight.data
272
+
273
+ def set_selector_from_diag(self, diag: torch.Tensor):
274
+ # diag is a 1D tensor of size (r,)
275
+ assert diag.shape == (self.r,)
276
+ self.selector = nn.Conv3d(
277
+ in_channels=self.r,
278
+ out_channels=self.r,
279
+ kernel_size=1,
280
+ stride=1,
281
+ padding=0,
282
+ bias=False,
283
+ )
284
+ self.selector.weight.data = torch.diag(diag)
285
+
286
+ # same device + dtype as lora_up
287
+ self.selector.weight.data = self.selector.weight.data.to(
288
+ self.lora_up.weight.device
289
+ ).to(self.lora_up.weight.dtype)
290
+
291
+ UNET_DEFAULT_TARGET_REPLACE = {"CrossAttention", "Attention", "GEGLU"}
292
+
293
+ UNET_EXTENDED_TARGET_REPLACE = {"ResnetBlock2D", "CrossAttention", "Attention", "GEGLU"}
294
+
295
+ TEXT_ENCODER_DEFAULT_TARGET_REPLACE = {"CLIPAttention"}
296
+
297
+ TEXT_ENCODER_EXTENDED_TARGET_REPLACE = {"CLIPAttention"}
298
+
299
+ DEFAULT_TARGET_REPLACE = UNET_DEFAULT_TARGET_REPLACE
300
+
301
+ EMBED_FLAG = "<embed>"
302
+
303
+
304
+ def _find_children(
305
+ model,
306
+ search_class: List[Type[nn.Module]] = [nn.Linear],
307
+ ):
308
+ """
309
+ Find all modules of a certain class (or union of classes).
310
+
311
+ Returns all matching modules, along with the parent of those modules and the
312
+ names they are referenced by.
313
+ """
314
+ # For each target find every linear_class module that isn't a child of a LoraInjectedLinear
315
+ for parent in model.modules():
316
+ for name, module in parent.named_children():
317
+ if any([isinstance(module, _class) for _class in search_class]):
318
+ yield parent, name, module
319
+
320
+
321
+ def _find_modules_v2(
322
+ model,
323
+ ancestor_class: Optional[Set[str]] = None,
324
+ search_class: List[Type[nn.Module]] = [nn.Linear],
325
+ exclude_children_of: Optional[List[Type[nn.Module]]] = None,
326
+ # [
327
+ # LoraInjectedLinear,
328
+ # LoraInjectedConv2d,
329
+ # LoraInjectedConv3d
330
+ # ],
331
+ ):
332
+ """
333
+ Find all modules of a certain class (or union of classes) that are direct or
334
+ indirect descendants of other modules of a certain class (or union of classes).
335
+
336
+ Returns all matching modules, along with the parent of those modules and the
337
+ names they are referenced by.
338
+ """
339
+
340
+ # Get the targets we should replace all linears under
341
+ if ancestor_class is not None:
342
+ ancestors = (
343
+ module
344
+ for name, module in model.named_modules()
345
+ if module.__class__.__name__ in ancestor_class # and ('transformer_in' not in name)
346
+ )
347
+ else:
348
+ # this, in case you want to naively iterate over all modules.
349
+ ancestors = [module for module in model.modules()]
350
+
351
+ # For each target find every linear_class module that isn't a child of a LoraInjectedLinear
352
+ for ancestor in ancestors:
353
+ for fullname, module in ancestor.named_modules():
354
+ if any([isinstance(module, _class) for _class in search_class]):
355
+ continue_flag = True
356
+ if 'Transformer2DModel' in ancestor_class and ('attn1' in fullname or 'ff' in fullname):
357
+ continue_flag = False
358
+ if 'TransformerTemporalModel' in ancestor_class and ('attn1' in fullname or 'attn2' in fullname or 'ff' in fullname):
359
+ continue_flag = False
360
+ if 'TemporalBasicTransformerBlock' in ancestor_class and ('attn1' in fullname or 'attn2' in fullname or 'ff' in fullname):
361
+ continue_flag = False
362
+ #if 'TemporalBasicTransformerBlock' in ancestor_class and ('attn1' in fullname or 'ff' in fullname):
363
+ # continue_flag = False
364
+ if 'BasicTransformerBlock' in ancestor_class and ('attn2' in fullname or 'ff' in fullname):
365
+ continue_flag = False
366
+ if continue_flag:
367
+ continue
368
+ # Find the direct parent if this is a descendant, not a child, of target
369
+ *path, name = fullname.split(".")
370
+ parent = ancestor
371
+ while path:
372
+ parent = parent.get_submodule(path.pop(0))
373
+ # Skip this linear if it's a child of a LoraInjectedLinear
374
+ if exclude_children_of and any(
375
+ [isinstance(parent, _class) for _class in exclude_children_of]
376
+ ):
377
+ continue
378
+ if name in ['lora_up', 'dropout', 'lora_down']:
379
+ continue
380
+ # Otherwise, yield it
381
+ yield parent, name, module
382
+
383
+
384
+ def _find_modules_old(
385
+ model,
386
+ ancestor_class: Set[str] = DEFAULT_TARGET_REPLACE,
387
+ search_class: List[Type[nn.Module]] = [nn.Linear],
388
+ exclude_children_of: Optional[List[Type[nn.Module]]] = [LoraInjectedLinear],
389
+ ):
390
+ ret = []
391
+ for _module in model.modules():
392
+ if _module.__class__.__name__ in ancestor_class:
393
+
394
+ for name, _child_module in _module.named_modules():
395
+ if _child_module.__class__ in search_class:
396
+ ret.append((_module, name, _child_module))
397
+ print(ret)
398
+ return ret
399
+
400
+
401
+ _find_modules = _find_modules_v2
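To make the search helpers above concrete, a hedged sketch of how `_find_modules` is typically consumed; `unet` is an assumed, already-loaded UNet, and note that the name filters in `_find_modules_v2` restrict matches to the attn/ff sub-layers of each ancestor class.

import torch.nn as nn
from i2vedit.utils.lora import _find_modules

# Collect the (parent, attribute name, module) triples that the injection
# functions below would wrap when targeting temporal transformer blocks.
targets = list(
    _find_modules(
        unet,  # assumption: a loaded UNetSpatioTemporalConditionModel
        ancestor_class=["TemporalBasicTransformerBlock"],
        search_class=[nn.Linear],
    )
)
print(f"{len(targets)} linear layers would receive LoRA")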
402
+
403
+
404
+ def inject_trainable_lora(
405
+ model: nn.Module,
406
+ target_replace_module: Set[str] = DEFAULT_TARGET_REPLACE,
407
+ r: int = 4,
408
+ loras=None, # path to lora .pt
409
+ verbose: bool = False,
410
+ dropout_p: float = 0.0,
411
+ scale: float = 1.0,
412
+ ):
413
+ """
414
+ Inject LoRA into the model and return the LoRA parameter groups.
415
+ """
416
+
417
+ require_grad_params = []
418
+ names = []
419
+
420
+ if loras != None:
421
+ loras = torch.load(loras)
422
+
423
+ for _module, name, _child_module in _find_modules(
424
+ model, target_replace_module, search_class=[nn.Linear]
425
+ ):
426
+ weight = _child_module.weight
427
+ bias = _child_module.bias
428
+ if verbose:
429
+ print("LoRA Injection : injecting lora into ", name)
430
+ print("LoRA Injection : weight shape", weight.shape)
431
+ _tmp = LoraInjectedLinear(
432
+ _child_module.in_features,
433
+ _child_module.out_features,
434
+ _child_module.bias is not None,
435
+ r=r,
436
+ dropout_p=dropout_p,
437
+ scale=scale,
438
+ )
439
+ _tmp.linear.weight = weight
440
+ if bias is not None:
441
+ _tmp.linear.bias = bias
442
+
443
+ # switch the module
444
+ _tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
445
+ _module._modules[name] = _tmp
446
+
447
+ require_grad_params.append(_module._modules[name].lora_up.parameters())
448
+ require_grad_params.append(_module._modules[name].lora_down.parameters())
449
+
450
+ if loras != None:
451
+ _module._modules[name].lora_up.weight = loras.pop(0)
452
+ _module._modules[name].lora_down.weight = loras.pop(0)
453
+
454
+ _module._modules[name].lora_up.weight.requires_grad = True
455
+ _module._modules[name].lora_down.weight.requires_grad = True
456
+ names.append(name)
457
+
458
+ return require_grad_params, names
459
+
460
+
461
+ def inject_trainable_lora_extended(
462
+ model: nn.Module,
463
+ target_replace_module: Set[str] = UNET_EXTENDED_TARGET_REPLACE,
464
+ r: int = 4,
465
+ loras=None, # path to lora .pt
466
+ dropout_p: float = 0.0,
467
+ scale: float = 1.0,
468
+ ):
469
+ """
470
+ Inject LoRA into the model and return the LoRA parameter groups.
471
+ """
472
+
473
+ require_grad_params = []
474
+ names = []
475
+
476
+ if loras != None:
477
+ print(f"Load from lora: {loras} ...")
478
+ loras = torch.load(loras)
479
+ if True:
480
+ for target_replace_module_i in target_replace_module:
481
+ for _module, name, _child_module in _find_modules(
482
+ model, [target_replace_module_i], search_class=[nn.Linear, nn.Conv2d, nn.Conv3d]
483
+ ):
484
+ # if name == 'to_q':
485
+ # continue
486
+ if _child_module.__class__ == nn.Linear:
487
+ weight = _child_module.weight
488
+ bias = _child_module.bias
489
+ _tmp = LoraInjectedLinear(
490
+ _child_module.in_features,
491
+ _child_module.out_features,
492
+ _child_module.bias is not None,
493
+ r=r,
494
+ dropout_p=dropout_p,
495
+ scale=scale,
496
+ )
497
+ _tmp.linear.weight = weight
498
+ if bias is not None:
499
+ _tmp.linear.bias = bias
500
+ elif _child_module.__class__ == nn.Conv2d:
501
+ weight = _child_module.weight
502
+ bias = _child_module.bias
503
+ _tmp = LoraInjectedConv2d(
504
+ _child_module.in_channels,
505
+ _child_module.out_channels,
506
+ _child_module.kernel_size,
507
+ _child_module.stride,
508
+ _child_module.padding,
509
+ _child_module.dilation,
510
+ _child_module.groups,
511
+ _child_module.bias is not None,
512
+ r=r,
513
+ dropout_p=dropout_p,
514
+ scale=scale,
515
+ )
516
+
517
+ _tmp.conv.weight = weight
518
+ if bias is not None:
519
+ _tmp.conv.bias = bias
520
+
521
+ elif _child_module.__class__ == nn.Conv3d:
522
+ weight = _child_module.weight
523
+ bias = _child_module.bias
524
+ _tmp = LoraInjectedConv3d(
525
+ _child_module.in_channels,
526
+ _child_module.out_channels,
527
+ bias=_child_module.bias is not None,
528
+ kernel_size=_child_module.kernel_size,
529
+ padding=_child_module.padding,
530
+ r=r,
531
+ dropout_p=dropout_p,
532
+ scale=scale,
533
+ )
534
+
535
+ _tmp.conv.weight = weight
536
+ if bias is not None:
537
+ _tmp.conv.bias = bias
538
+ # LoRA layer
539
+ else:
540
+ _tmp = _child_module
541
+ # switch the module
542
+ _tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
543
+ try:
544
+ if bias is not None:
545
+ _tmp.to(_child_module.bias.device).to(_child_module.bias.dtype)
546
+ except:
547
+ pass
548
+
549
+ _module._modules[name] = _tmp
550
+ require_grad_params.append(_module._modules[name].lora_up.parameters())
551
+ require_grad_params.append(_module._modules[name].lora_down.parameters())
552
+
553
+ if loras != None:
554
+ _module._modules[name].lora_up.weight = loras.pop(0)
555
+ _module._modules[name].lora_down.weight = loras.pop(0)
556
+
557
+ _module._modules[name].lora_up.weight.requires_grad = True
558
+ _module._modules[name].lora_down.weight.requires_grad = True
559
+ names.append(name)
560
+ else:
561
+ for _module, name, _child_module in _find_modules(
562
+ model, target_replace_module, search_class=[nn.Linear, nn.Conv2d, nn.Conv3d]
563
+ ):
564
+ if _child_module.__class__ == nn.Linear:
565
+ weight = _child_module.weight
566
+ bias = _child_module.bias
567
+ _tmp = LoraInjectedLinear(
568
+ _child_module.in_features,
569
+ _child_module.out_features,
570
+ _child_module.bias is not None,
571
+ r=r,
572
+ dropout_p=dropout_p,
573
+ scale=scale,
574
+ )
575
+ _tmp.linear.weight = weight
576
+ if bias is not None:
577
+ _tmp.linear.bias = bias
578
+ elif _child_module.__class__ == nn.Conv2d:
579
+ weight = _child_module.weight
580
+ bias = _child_module.bias
581
+ _tmp = LoraInjectedConv2d(
582
+ _child_module.in_channels,
583
+ _child_module.out_channels,
584
+ _child_module.kernel_size,
585
+ _child_module.stride,
586
+ _child_module.padding,
587
+ _child_module.dilation,
588
+ _child_module.groups,
589
+ _child_module.bias is not None,
590
+ r=r,
591
+ dropout_p=dropout_p,
592
+ scale=scale,
593
+ )
594
+
595
+ _tmp.conv.weight = weight
596
+ if bias is not None:
597
+ _tmp.conv.bias = bias
598
+
599
+ elif _child_module.__class__ == nn.Conv3d:
600
+ weight = _child_module.weight
601
+ bias = _child_module.bias
602
+ _tmp = LoraInjectedConv3d(
603
+ _child_module.in_channels,
604
+ _child_module.out_channels,
605
+ bias=_child_module.bias is not None,
606
+ kernel_size=_child_module.kernel_size,
607
+ padding=_child_module.padding,
608
+ r=r,
609
+ dropout_p=dropout_p,
610
+ scale=scale,
611
+ )
612
+
613
+ _tmp.conv.weight = weight
614
+ if bias is not None:
615
+ _tmp.conv.bias = bias
616
+ # switch the module
617
+ _tmp.to(_child_module.weight.device).to(_child_module.weight.dtype)
618
+ if bias is not None:
619
+ _tmp.to(_child_module.bias.device).to(_child_module.bias.dtype)
620
+
621
+ _module._modules[name] = _tmp
622
+ require_grad_params.append(_module._modules[name].lora_up.parameters())
623
+ require_grad_params.append(_module._modules[name].lora_down.parameters())
624
+
625
+ if loras != None:
626
+ _module._modules[name].lora_up.weight = loras.pop(0)
627
+ _module._modules[name].lora_down.weight = loras.pop(0)
628
+
629
+ _module._modules[name].lora_up.weight.requires_grad = True
630
+ _module._modules[name].lora_down.weight.requires_grad = True
631
+ names.append(name)
632
+
633
+ return require_grad_params, names
634
+
635
+
636
+ def inject_inferable_lora(
637
+ model,
638
+ lora_path='',
639
+ unet_replace_modules=["UNet3DConditionModel"],
640
+ text_encoder_replace_modules=["CLIPEncoderLayer"],
641
+ is_extended=False,
642
+ r=16
643
+ ):
644
+ from transformers.models.clip import CLIPTextModel
645
+ from diffusers import UNet3DConditionModel
646
+
647
+ def is_text_model(f): return 'text_encoder' in f and isinstance(model.text_encoder, CLIPTextModel)
648
+ def is_unet(f): return 'unet' in f and model.unet.__class__.__name__ == "UNet3DConditionModel"
649
+
650
+ if os.path.exists(lora_path):
651
+ try:
652
+ for f in os.listdir(lora_path):
653
+ if f.endswith('.pt'):
654
+ lora_file = os.path.join(lora_path, f)
655
+
656
+ if is_text_model(f):
657
+ monkeypatch_or_replace_lora(
658
+ model.text_encoder,
659
+ torch.load(lora_file),
660
+ target_replace_module=text_encoder_replace_modules,
661
+ r=r
662
+ )
663
+ print("Successfully loaded Text Encoder LoRa.")
664
+ continue
665
+
666
+ if is_unet(f):
667
+ monkeypatch_or_replace_lora_extended(
668
+ model.unet,
669
+ torch.load(lora_file),
670
+ target_replace_module=unet_replace_modules,
671
+ r=r
672
+ )
673
+ print("Successfully loaded UNET LoRa.")
674
+ continue
675
+
676
+ print("Found a .pt file, but doesn't have the correct name format. (unet.pt, text_encoder.pt)")
677
+
678
+ except Exception as e:
679
+ print(e)
680
+ print("Couldn't inject LoRA's due to an error.")
681
+
682
+ def extract_lora_ups_down(model, target_replace_module=DEFAULT_TARGET_REPLACE):
683
+
684
+ loras = []
685
+
686
+ for target_replace_module_i in target_replace_module:
687
+
688
+ for _m, _n, _child_module in _find_modules(
689
+ model,
690
+ [target_replace_module_i],
691
+ search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
692
+ ):
693
+ loras.append((_child_module.lora_up, _child_module.lora_down))
694
+
695
+ if len(loras) == 0:
696
+ raise ValueError("No lora injected.")
697
+
698
+ return loras
699
+
700
+
701
+ def extract_lora_child_module(model, target_replace_module=DEFAULT_TARGET_REPLACE):
702
+
703
+ loras = []
704
+
705
+ for target_replace_module_i in target_replace_module:
706
+
707
+ for _m, _n, _child_module in _find_modules(
708
+ model,
709
+ [target_replace_module_i],
710
+ search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
711
+ ):
712
+ loras.append(_child_module)
713
+
714
+ return loras
715
+
716
+ def extract_lora_as_tensor(
717
+ model, target_replace_module=DEFAULT_TARGET_REPLACE, as_fp16=True
718
+ ):
719
+
720
+ loras = []
721
+
722
+ for _m, _n, _child_module in _find_modules(
723
+ model,
724
+ target_replace_module,
725
+ search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
726
+ ):
727
+ up, down = _child_module.realize_as_lora()
728
+ if as_fp16:
729
+ up = up.to(torch.float16)
730
+ down = down.to(torch.float16)
731
+
732
+ loras.append((up, down))
733
+
734
+ if len(loras) == 0:
735
+ raise ValueError("No lora injected.")
736
+
737
+ return loras
738
+
739
+
740
+ def save_lora_weight(
741
+ model,
742
+ path="./lora.pt",
743
+ target_replace_module=DEFAULT_TARGET_REPLACE,
744
+ flag=None
745
+ ):
746
+ weights = []
747
+ for _up, _down in extract_lora_ups_down(
748
+ model, target_replace_module=target_replace_module
749
+ ):
750
+ weights.append(_up.weight.to("cpu").to(torch.float32))
751
+ weights.append(_down.weight.to("cpu").to(torch.float32))
752
+ if not flag:
753
+ torch.save(weights, path)
754
+ else:
755
+ weights_new=[]
756
+ for i in range(0, len(weights), 4):
757
+ subset = weights[i+(flag-1)*2:i+(flag-1)*2+2]
758
+ weights_new.extend(subset)
759
+ torch.save(weights_new, path)
760
+
761
+ def save_lora_as_json(model, path="./lora.json"):
762
+ weights = []
763
+ for _up, _down in extract_lora_ups_down(model):
764
+ weights.append(_up.weight.detach().cpu().numpy().tolist())
765
+ weights.append(_down.weight.detach().cpu().numpy().tolist())
766
+
767
+ import json
768
+
769
+ with open(path, "w") as f:
770
+ json.dump(weights, f)
771
+
772
+
773
+ def save_safeloras_with_embeds(
774
+ modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
775
+ embeds: Dict[str, torch.Tensor] = {},
776
+ outpath="./lora.safetensors",
777
+ ):
778
+ """
779
+ Saves the Lora from multiple modules in a single safetensor file.
780
+
781
+ modelmap is a dictionary of {
782
+ "module name": (module, target_replace_module)
783
+ }
784
+ """
785
+ weights = {}
786
+ metadata = {}
787
+
788
+ for name, (model, target_replace_module) in modelmap.items():
789
+ metadata[name] = json.dumps(list(target_replace_module))
790
+
791
+ for i, (_up, _down) in enumerate(
792
+ extract_lora_as_tensor(model, target_replace_module)
793
+ ):
794
+ rank = _down.shape[0]
795
+
796
+ metadata[f"{name}:{i}:rank"] = str(rank)
797
+ weights[f"{name}:{i}:up"] = _up
798
+ weights[f"{name}:{i}:down"] = _down
799
+
800
+ for token, tensor in embeds.items():
801
+ metadata[token] = EMBED_FLAG
802
+ weights[token] = tensor
803
+
804
+ print(f"Saving weights to {outpath}")
805
+ safe_save(weights, outpath, metadata)
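A short sketch of the `modelmap` format described in the docstring above. It assumes LoRA layers were already injected into `unet` (otherwise `extract_lora_as_tensor` raises) and that the output path is writable; both names are assumptions.

modelmap = {
    # "module name": (module, set of target class names)
    "unet": (unet, {"TemporalBasicTransformerBlock"}),
}
save_safeloras_with_embeds(modelmap, embeds={}, outpath="./lora.safetensors")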
806
+
807
+
808
+ def save_safeloras(
809
+ modelmap: Dict[str, Tuple[nn.Module, Set[str]]] = {},
810
+ outpath="./lora.safetensors",
811
+ ):
812
+ return save_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
813
+
814
+
815
+ def convert_loras_to_safeloras_with_embeds(
816
+ modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
817
+ embeds: Dict[str, torch.Tensor] = {},
818
+ outpath="./lora.safetensors",
819
+ ):
820
+ """
821
+ Converts the Lora from multiple pytorch .pt files into a single safetensor file.
822
+
823
+ modelmap is a dictionary of {
824
+ "module name": (pytorch_model_path, target_replace_module, rank)
825
+ }
826
+ """
827
+
828
+ weights = {}
829
+ metadata = {}
830
+
831
+ for name, (path, target_replace_module, r) in modelmap.items():
832
+ metadata[name] = json.dumps(list(target_replace_module))
833
+
834
+ lora = torch.load(path)
835
+ for i, weight in enumerate(lora):
836
+ is_up = i % 2 == 0
837
+ i = i // 2
838
+
839
+ if is_up:
840
+ metadata[f"{name}:{i}:rank"] = str(r)
841
+ weights[f"{name}:{i}:up"] = weight
842
+ else:
843
+ weights[f"{name}:{i}:down"] = weight
844
+
845
+ for token, tensor in embeds.items():
846
+ metadata[token] = EMBED_FLAG
847
+ weights[token] = tensor
848
+
849
+ print(f"Saving weights to {outpath}")
850
+ safe_save(weights, outpath, metadata)
851
+
852
+
853
+ def convert_loras_to_safeloras(
854
+ modelmap: Dict[str, Tuple[str, Set[str], int]] = {},
855
+ outpath="./lora.safetensors",
856
+ ):
857
+ convert_loras_to_safeloras_with_embeds(modelmap=modelmap, outpath=outpath)
858
+
859
+
860
+ def parse_safeloras(
861
+ safeloras,
862
+ ) -> Dict[str, Tuple[List[nn.parameter.Parameter], List[int], List[str]]]:
863
+ """
864
+ Converts a loaded safetensor file that contains a set of module Loras
865
+ into Parameters and other information
866
+
867
+ Output is a dictionary of {
868
+ "module name": (
869
+ [list of weights],
870
+ [list of ranks],
871
+ target_replacement_modules
872
+ )
873
+ }
874
+ """
875
+ loras = {}
876
+ metadata = safeloras.metadata()
877
+
878
+ get_name = lambda k: k.split(":")[0]
879
+
880
+ keys = list(safeloras.keys())
881
+ keys.sort(key=get_name)
882
+
883
+ for name, module_keys in groupby(keys, get_name):
884
+ info = metadata.get(name)
885
+
886
+ if not info:
887
+ raise ValueError(
888
+ f"Tensor {name} has no metadata - is this a Lora safetensor?"
889
+ )
890
+
891
+ # Skip Textual Inversion embeds
892
+ if info == EMBED_FLAG:
893
+ continue
894
+
895
+ # Handle Loras
896
+ # Extract the targets
897
+ target = json.loads(info)
898
+
899
+ # Build the result lists - Python needs us to preallocate lists to insert into them
900
+ module_keys = list(module_keys)
901
+ ranks = [4] * (len(module_keys) // 2)
902
+ weights = [None] * len(module_keys)
903
+
904
+ for key in module_keys:
905
+ # Split the model name and index out of the key
906
+ _, idx, direction = key.split(":")
907
+ idx = int(idx)
908
+
909
+ # Add the rank
910
+ ranks[idx] = int(metadata[f"{name}:{idx}:rank"])
911
+
912
+ # Insert the weight into the list
913
+ idx = idx * 2 + (1 if direction == "down" else 0)
914
+ weights[idx] = nn.parameter.Parameter(safeloras.get_tensor(key))
915
+
916
+ loras[name] = (weights, ranks, target)
917
+
918
+ return loras
919
+
920
+
921
+ def parse_safeloras_embeds(
922
+ safeloras,
923
+ ) -> Dict[str, torch.Tensor]:
924
+ """
925
+ Converts a loaded safetensor file that contains Textual Inversion embeds into
926
+ a dictionary of embed_token: Tensor
927
+ """
928
+ embeds = {}
929
+ metadata = safeloras.metadata()
930
+
931
+ for key in safeloras.keys():
932
+ # Only handle Textual Inversion embeds
933
+ meta = metadata.get(key)
934
+ if not meta or meta != EMBED_FLAG:
935
+ continue
936
+
937
+ embeds[key] = safeloras.get_tensor(key)
938
+
939
+ return embeds
940
+
941
+
942
+ def load_safeloras(path, device="cpu"):
943
+ safeloras = safe_open(path, framework="pt", device=device)
944
+ return parse_safeloras(safeloras)
945
+
946
+
947
+ def load_safeloras_embeds(path, device="cpu"):
948
+ safeloras = safe_open(path, framework="pt", device=device)
949
+ return parse_safeloras_embeds(safeloras)
950
+
951
+
952
+ def load_safeloras_both(path, device="cpu"):
953
+ safeloras = safe_open(path, framework="pt", device=device)
954
+ return parse_safeloras(safeloras), parse_safeloras_embeds(safeloras)
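And the reverse direction, reading such a file back with the loaders above (the path is an assumption):

loras = load_safeloras("./lora.safetensors", device="cpu")
for name, (weights, ranks, target) in loras.items():
    # `weights` alternates up/down parameters, `ranks` holds one rank per pair,
    # and `target` lists the module class names the LoRA was extracted from.
    print(name, len(weights) // 2, ranks[0], target)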
955
+
956
+
957
+ def collapse_lora(model, alpha=1.0):
958
+
959
+ for _module, name, _child_module in _find_modules(
960
+ model,
961
+ UNET_EXTENDED_TARGET_REPLACE | TEXT_ENCODER_EXTENDED_TARGET_REPLACE,
962
+ search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d],
963
+ ):
964
+
965
+ if isinstance(_child_module, LoraInjectedLinear):
966
+ print("Collapsing Lin Lora in", name)
967
+
968
+ _child_module.linear.weight = nn.Parameter(
969
+ _child_module.linear.weight.data
970
+ + alpha
971
+ * (
972
+ _child_module.lora_up.weight.data
973
+ @ _child_module.lora_down.weight.data
974
+ )
975
+ .type(_child_module.linear.weight.dtype)
976
+ .to(_child_module.linear.weight.device)
977
+ )
978
+
979
+ else:
980
+ print("Collapsing Conv Lora in", name)
981
+ _child_module.conv.weight = nn.Parameter(
982
+ _child_module.conv.weight.data
983
+ + alpha
984
+ * (
985
+ _child_module.lora_up.weight.data.flatten(start_dim=1)
986
+ @ _child_module.lora_down.weight.data.flatten(start_dim=1)
987
+ )
988
+ .reshape(_child_module.conv.weight.data.shape)
989
+ .type(_child_module.conv.weight.dtype)
990
+ .to(_child_module.conv.weight.device)
991
+ )
992
+
993
+
994
+ def monkeypatch_or_replace_lora(
995
+ model,
996
+ loras,
997
+ target_replace_module=DEFAULT_TARGET_REPLACE,
998
+ r: Union[int, List[int]] = 4,
999
+ ):
1000
+ for _module, name, _child_module in _find_modules(
1001
+ model, target_replace_module, search_class=[nn.Linear, LoraInjectedLinear]
1002
+ ):
1003
+ _source = (
1004
+ _child_module.linear
1005
+ if isinstance(_child_module, LoraInjectedLinear)
1006
+ else _child_module
1007
+ )
1008
+
1009
+ weight = _source.weight
1010
+ bias = _source.bias
1011
+ _tmp = LoraInjectedLinear(
1012
+ _source.in_features,
1013
+ _source.out_features,
1014
+ _source.bias is not None,
1015
+ r=r.pop(0) if isinstance(r, list) else r,
1016
+ )
1017
+ _tmp.linear.weight = weight
1018
+
1019
+ if bias is not None:
1020
+ _tmp.linear.bias = bias
1021
+
1022
+ # switch the module
1023
+ _module._modules[name] = _tmp
1024
+
1025
+ up_weight = loras.pop(0)
1026
+ down_weight = loras.pop(0)
1027
+
1028
+ _module._modules[name].lora_up.weight = nn.Parameter(
1029
+ up_weight.type(weight.dtype)
1030
+ )
1031
+ _module._modules[name].lora_down.weight = nn.Parameter(
1032
+ down_weight.type(weight.dtype)
1033
+ )
1034
+
1035
+ _module._modules[name].to(weight.device)
1036
+
1037
+
1038
+ def monkeypatch_or_replace_lora_extended(
1039
+ model,
1040
+ loras,
1041
+ target_replace_module=DEFAULT_TARGET_REPLACE,
1042
+ r: Union[int, List[int]] = 4,
1043
+ ):
1044
+ for _module, name, _child_module in _find_modules(
1045
+ model,
1046
+ target_replace_module,
1047
+ search_class=[
1048
+ nn.Linear,
1049
+ nn.Conv2d,
1050
+ nn.Conv3d,
1051
+ LoraInjectedLinear,
1052
+ LoraInjectedConv2d,
1053
+ LoraInjectedConv3d,
1054
+ ],
1055
+ ):
1056
+
1057
+ if (_child_module.__class__ == nn.Linear) or (
1058
+ _child_module.__class__ == LoraInjectedLinear
1059
+ ):
1060
+ if len(loras[0].shape) != 2:
1061
+ continue
1062
+
1063
+ _source = (
1064
+ _child_module.linear
1065
+ if isinstance(_child_module, LoraInjectedLinear)
1066
+ else _child_module
1067
+ )
1068
+
1069
+ weight = _source.weight
1070
+ bias = _source.bias
1071
+ _tmp = LoraInjectedLinear(
1072
+ _source.in_features,
1073
+ _source.out_features,
1074
+ _source.bias is not None,
1075
+ r=r.pop(0) if isinstance(r, list) else r,
1076
+ )
1077
+ _tmp.linear.weight = weight
1078
+
1079
+ if bias is not None:
1080
+ _tmp.linear.bias = bias
1081
+
1082
+ elif (_child_module.__class__ == nn.Conv2d) or (
1083
+ _child_module.__class__ == LoraInjectedConv2d
1084
+ ):
1085
+ if len(loras[0].shape) != 4:
1086
+ continue
1087
+ _source = (
1088
+ _child_module.conv
1089
+ if isinstance(_child_module, LoraInjectedConv2d)
1090
+ else _child_module
1091
+ )
1092
+
1093
+ weight = _source.weight
1094
+ bias = _source.bias
1095
+ _tmp = LoraInjectedConv2d(
1096
+ _source.in_channels,
1097
+ _source.out_channels,
1098
+ _source.kernel_size,
1099
+ _source.stride,
1100
+ _source.padding,
1101
+ _source.dilation,
1102
+ _source.groups,
1103
+ _source.bias is not None,
1104
+ r=r.pop(0) if isinstance(r, list) else r,
1105
+ )
1106
+
1107
+ _tmp.conv.weight = weight
1108
+
1109
+ if bias is not None:
1110
+ _tmp.conv.bias = bias
1111
+
1112
+ elif _child_module.__class__ == nn.Conv3d or(
1113
+ _child_module.__class__ == LoraInjectedConv3d
1114
+ ):
1115
+
1116
+ if len(loras[0].shape) != 5:
1117
+ continue
1118
+
1119
+ _source = (
1120
+ _child_module.conv
1121
+ if isinstance(_child_module, LoraInjectedConv3d)
1122
+ else _child_module
1123
+ )
1124
+
1125
+ weight = _source.weight
1126
+ bias = _source.bias
1127
+ _tmp = LoraInjectedConv3d(
1128
+ _source.in_channels,
1129
+ _source.out_channels,
1130
+ bias=_source.bias is not None,
1131
+ kernel_size=_source.kernel_size,
1132
+ padding=_source.padding,
1133
+ r=r.pop(0) if isinstance(r, list) else r,
1134
+ )
1135
+
1136
+ _tmp.conv.weight = weight
1137
+
1138
+ if bias is not None:
1139
+ _tmp.conv.bias = bias
1140
+
1141
+ # switch the module
1142
+ _module._modules[name] = _tmp
1143
+
1144
+ up_weight = loras.pop(0)
1145
+ down_weight = loras.pop(0)
1146
+
1147
+ _module._modules[name].lora_up.weight = nn.Parameter(
1148
+ up_weight.type(weight.dtype)
1149
+ )
1150
+ _module._modules[name].lora_down.weight = nn.Parameter(
1151
+ down_weight.type(weight.dtype)
1152
+ )
1153
+
1154
+ _module._modules[name].to(weight.device)
1155
+
1156
+
1157
+ def monkeypatch_or_replace_safeloras(models, safeloras):
1158
+ loras = parse_safeloras(safeloras)
1159
+
1160
+ for name, (lora, ranks, target) in loras.items():
1161
+ model = getattr(models, name, None)
1162
+
1163
+ if not model:
1164
+ print(f"No model provided for {name}, contained in Lora")
1165
+ continue
1166
+
1167
+ monkeypatch_or_replace_lora_extended(model, lora, target, ranks)
1168
+
1169
+
1170
+ def monkeypatch_remove_lora(model):
1171
+ for _module, name, _child_module in _find_modules(
1172
+ model, search_class=[LoraInjectedLinear, LoraInjectedConv2d, LoraInjectedConv3d]
1173
+ ):
1174
+ if isinstance(_child_module, LoraInjectedLinear):
1175
+ _source = _child_module.linear
1176
+ weight, bias = _source.weight, _source.bias
1177
+
1178
+ _tmp = nn.Linear(
1179
+ _source.in_features, _source.out_features, bias is not None
1180
+ )
1181
+
1182
+ _tmp.weight = weight
1183
+ if bias is not None:
1184
+ _tmp.bias = bias
1185
+
1186
+ else:
1187
+ _source = _child_module.conv
1188
+ weight, bias = _source.weight, _source.bias
1189
+
1190
+ if isinstance(_source, nn.Conv2d):
1191
+ _tmp = nn.Conv2d(
1192
+ in_channels=_source.in_channels,
1193
+ out_channels=_source.out_channels,
1194
+ kernel_size=_source.kernel_size,
1195
+ stride=_source.stride,
1196
+ padding=_source.padding,
1197
+ dilation=_source.dilation,
1198
+ groups=_source.groups,
1199
+ bias=bias is not None,
1200
+ )
1201
+
1202
+ _tmp.weight = weight
1203
+ if bias is not None:
1204
+ _tmp.bias = bias
1205
+
1206
+ if isinstance(_source, nn.Conv3d):
1207
+ _tmp = nn.Conv3d(
1208
+ _source.in_channels,
1209
+ _source.out_channels,
1210
+ bias=_source.bias is not None,
1211
+ kernel_size=_source.kernel_size,
1212
+ padding=_source.padding,
1213
+ )
1214
+
1215
+ _tmp.weight = weight
1216
+ if bias is not None:
1217
+ _tmp.bias = bias
1218
+
1219
+ _module._modules[name] = _tmp
1220
+
1221
+
1222
+ def monkeypatch_add_lora(
1223
+ model,
1224
+ loras,
1225
+ target_replace_module=DEFAULT_TARGET_REPLACE,
1226
+ alpha: float = 1.0,
1227
+ beta: float = 1.0,
1228
+ ):
1229
+ for _module, name, _child_module in _find_modules(
1230
+ model, target_replace_module, search_class=[LoraInjectedLinear]
1231
+ ):
1232
+ weight = _child_module.linear.weight
1233
+
1234
+ up_weight = loras.pop(0)
1235
+ down_weight = loras.pop(0)
1236
+
1237
+ _module._modules[name].lora_up.weight = nn.Parameter(
1238
+ up_weight.type(weight.dtype).to(weight.device) * alpha
1239
+ + _module._modules[name].lora_up.weight.to(weight.device) * beta
1240
+ )
1241
+ _module._modules[name].lora_down.weight = nn.Parameter(
1242
+ down_weight.type(weight.dtype).to(weight.device) * alpha
1243
+ + _module._modules[name].lora_down.weight.to(weight.device) * beta
1244
+ )
1245
+
1246
+ _module._modules[name].to(weight.device)
1247
+
1248
+
1249
+ def tune_lora_scale(model, alpha: float = 1.0):
1250
+ for _module in model.modules():
1251
+ if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
1252
+ _module.scale = alpha
1253
+
1254
+
1255
+ def set_lora_diag(model, diag: torch.Tensor):
1256
+ for _module in model.modules():
1257
+ if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
1258
+ _module.set_selector_from_diag(diag)
1259
+
1260
+
1261
+ def _text_lora_path(path: str) -> str:
1262
+ assert path.endswith(".pt"), "Only .pt files are supported"
1263
+ return ".".join(path.split(".")[:-1] + ["text_encoder", "pt"])
1264
+
1265
+
1266
+ def _ti_lora_path(path: str) -> str:
1267
+ assert path.endswith(".pt"), "Only .pt files are supported"
1268
+ return ".".join(path.split(".")[:-1] + ["ti", "pt"])
1269
+
1270
+
1271
+ def apply_learned_embed_in_clip(
1272
+ learned_embeds,
1273
+ text_encoder,
1274
+ tokenizer,
1275
+ token: Optional[Union[str, List[str]]] = None,
1276
+ idempotent=False,
1277
+ ):
1278
+ if isinstance(token, str):
1279
+ trained_tokens = [token]
1280
+ elif isinstance(token, list):
1281
+ assert len(learned_embeds.keys()) == len(
1282
+ token
1283
+ ), "The number of tokens and the number of embeds should be the same"
1284
+ trained_tokens = token
1285
+ else:
1286
+ trained_tokens = list(learned_embeds.keys())
1287
+
1288
+ for token in trained_tokens:
1289
+ print(token)
1290
+ embeds = learned_embeds[token]
1291
+
1292
+ # cast to dtype of text_encoder
1293
+ dtype = text_encoder.get_input_embeddings().weight.dtype
1294
+ num_added_tokens = tokenizer.add_tokens(token)
1295
+
1296
+ i = 1
1297
+ if not idempotent:
1298
+ while num_added_tokens == 0:
1299
+ print(f"The tokenizer already contains the token {token}.")
1300
+ token = f"{token[:-1]}-{i}>"
1301
+ print(f"Attempting to add the token {token}.")
1302
+ num_added_tokens = tokenizer.add_tokens(token)
1303
+ i += 1
1304
+ elif num_added_tokens == 0 and idempotent:
1305
+ print(f"The tokenizer already contains the token {token}.")
1306
+ print(f"Replacing {token} embedding.")
1307
+
1308
+ # resize the token embeddings
1309
+ text_encoder.resize_token_embeddings(len(tokenizer))
1310
+
1311
+ # get the id for the token and assign the embeds
1312
+ token_id = tokenizer.convert_tokens_to_ids(token)
1313
+ text_encoder.get_input_embeddings().weight.data[token_id] = embeds
1314
+ return token
1315
+
1316
+
1317
+ def load_learned_embed_in_clip(
1318
+ learned_embeds_path,
1319
+ text_encoder,
1320
+ tokenizer,
1321
+ token: Optional[Union[str, List[str]]] = None,
1322
+ idempotent=False,
1323
+ ):
1324
+ learned_embeds = torch.load(learned_embeds_path)
1325
+ apply_learned_embed_in_clip(
1326
+ learned_embeds, text_encoder, tokenizer, token, idempotent
1327
+ )
1328
+
1329
+
1330
+ def patch_pipe(
1331
+ pipe,
1332
+ maybe_unet_path,
1333
+ token: Optional[str] = None,
1334
+ r: int = 4,
1335
+ patch_unet=True,
1336
+ patch_text=True,
1337
+ patch_ti=True,
1338
+ idempotent_token=True,
1339
+ unet_target_replace_module=DEFAULT_TARGET_REPLACE,
1340
+ text_target_replace_module=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
1341
+ ):
1342
+ if maybe_unet_path.endswith(".pt"):
1343
+ # torch format
1344
+
1345
+ if maybe_unet_path.endswith(".ti.pt"):
1346
+ unet_path = maybe_unet_path[:-6] + ".pt"
1347
+ elif maybe_unet_path.endswith(".text_encoder.pt"):
1348
+ unet_path = maybe_unet_path[:-16] + ".pt"
1349
+ else:
1350
+ unet_path = maybe_unet_path
1351
+
1352
+ ti_path = _ti_lora_path(unet_path)
1353
+ text_path = _text_lora_path(unet_path)
1354
+
1355
+ if patch_unet:
1356
+ print("LoRA : Patching Unet")
1357
+ monkeypatch_or_replace_lora(
1358
+ pipe.unet,
1359
+ torch.load(unet_path),
1360
+ r=r,
1361
+ target_replace_module=unet_target_replace_module,
1362
+ )
1363
+
1364
+ if patch_text:
1365
+ print("LoRA : Patching text encoder")
1366
+ monkeypatch_or_replace_lora(
1367
+ pipe.text_encoder,
1368
+ torch.load(text_path),
1369
+ target_replace_module=text_target_replace_module,
1370
+ r=r,
1371
+ )
1372
+ if patch_ti:
1373
+ print("LoRA : Patching token input")
1374
+ token = load_learned_embed_in_clip(
1375
+ ti_path,
1376
+ pipe.text_encoder,
1377
+ pipe.tokenizer,
1378
+ token=token,
1379
+ idempotent=idempotent_token,
1380
+ )
1381
+
1382
+ elif maybe_unet_path.endswith(".safetensors"):
1383
+ safeloras = safe_open(maybe_unet_path, framework="pt", device="cpu")
1384
+ monkeypatch_or_replace_safeloras(pipe, safeloras)
1385
+ tok_dict = parse_safeloras_embeds(safeloras)
1386
+ if patch_ti:
1387
+ apply_learned_embed_in_clip(
1388
+ tok_dict,
1389
+ pipe.text_encoder,
1390
+ pipe.tokenizer,
1391
+ token=token,
1392
+ idempotent=idempotent_token,
1393
+ )
1394
+ return tok_dict
1395
+
1396
+
1397
+ def train_patch_pipe(pipe, patch_unet, patch_text):
1398
+ if patch_unet:
1399
+ print("LoRA : Patching Unet")
1400
+ collapse_lora(pipe.unet)
1401
+ monkeypatch_remove_lora(pipe.unet)
1402
+
1403
+ if patch_text:
1404
+ print("LoRA : Patching text encoder")
1405
+
1406
+ collapse_lora(pipe.text_encoder)
1407
+ monkeypatch_remove_lora(pipe.text_encoder)
1408
+
1409
+ @torch.no_grad()
1410
+ def inspect_lora(model):
1411
+ moved = {}
1412
+
1413
+ for name, _module in model.named_modules():
1414
+ if _module.__class__.__name__ in ["LoraInjectedLinear", "LoraInjectedConv2d", "LoraInjectedConv3d"]:
1415
+ ups = _module.lora_up.weight.data.clone()
1416
+ downs = _module.lora_down.weight.data.clone()
1417
+
1418
+ wght: torch.Tensor = ups.flatten(1) @ downs.flatten(1)
1419
+
1420
+ dist = wght.flatten().abs().mean().item()
1421
+ if name in moved:
1422
+ moved[name].append(dist)
1423
+ else:
1424
+ moved[name] = [dist]
1425
+
1426
+ return moved
1427
+
1428
+
1429
+ def save_all(
1430
+ unet,
1431
+ text_encoder,
1432
+ save_path,
1433
+ placeholder_token_ids=None,
1434
+ placeholder_tokens=None,
1435
+ save_lora=True,
1436
+ save_ti=True,
1437
+ target_replace_module_text=TEXT_ENCODER_DEFAULT_TARGET_REPLACE,
1438
+ target_replace_module_unet=DEFAULT_TARGET_REPLACE,
1439
+ safe_form=True,
1440
+ ):
1441
+ if not safe_form:
1442
+ # save ti
1443
+ if save_ti:
1444
+ ti_path = _ti_lora_path(save_path)
1445
+ learned_embeds_dict = {}
1446
+ for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
1447
+ learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
1448
+ print(
1449
+ f"Current Learned Embeddings for {tok}:, id {tok_id} ",
1450
+ learned_embeds[:4],
1451
+ )
1452
+ learned_embeds_dict[tok] = learned_embeds.detach().cpu()
1453
+
1454
+ torch.save(learned_embeds_dict, ti_path)
1455
+ print("Ti saved to ", ti_path)
1456
+
1457
+ # save text encoder
1458
+ if save_lora:
1459
+ save_lora_weight(
1460
+ unet, save_path, target_replace_module=target_replace_module_unet
1461
+ )
1462
+ print("Unet saved to ", save_path)
1463
+
1464
+ save_lora_weight(
1465
+ text_encoder,
1466
+ _text_lora_path(save_path),
1467
+ target_replace_module=target_replace_module_text,
1468
+ )
1469
+ print("Text Encoder saved to ", _text_lora_path(save_path))
1470
+
1471
+ else:
1472
+ assert save_path.endswith(
1473
+ ".safetensors"
1474
+ ), f"Save path : {save_path} should end with .safetensors"
1475
+
1476
+ loras = {}
1477
+ embeds = {}
1478
+
1479
+ if save_lora:
1480
+
1481
+ loras["unet"] = (unet, target_replace_module_unet)
1482
+ loras["text_encoder"] = (text_encoder, target_replace_module_text)
1483
+
1484
+ if save_ti:
1485
+ for tok, tok_id in zip(placeholder_tokens, placeholder_token_ids):
1486
+ learned_embeds = text_encoder.get_input_embeddings().weight[tok_id]
1487
+ print(
1488
+ f"Current Learned Embeddings for {tok}:, id {tok_id} ",
1489
+ learned_embeds[:4],
1490
+ )
1491
+ embeds[tok] = learned_embeds.detach().cpu()
1492
+
1493
+ save_safeloras_with_embeds(loras, embeds, save_path)
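Taken together, a typical round trip with the helpers in this file might look like the sketch below. The `unet` variable and the target module name are assumptions consistent with how the rest of the repository calls these functions.

from i2vedit.utils.lora import inject_trainable_lora_extended, save_lora_weight

# 1. Inject trainable LoRA into the temporal transformer blocks.
lora_params, lora_names = inject_trainable_lora_extended(
    unet,  # assumption: a loaded UNet
    target_replace_module={"TemporalBasicTransformerBlock"},
    r=16,
    scale=1.0,
)

# 2. ...optimise the parameter groups returned in `lora_params`...

# 3. Dump the trained low-rank weights to a plain .pt file.
save_lora_weight(
    unet,
    path="./unet_lora.pt",
    target_replace_module={"TemporalBasicTransformerBlock"},
)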
i2vedit/utils/lora_handler.py ADDED
@@ -0,0 +1,270 @@
1
+ import os
2
+ import warnings
3
+ import torch
4
+ from typing import Union
5
+ from types import SimpleNamespace
6
+ from diffusers import UNetSpatioTemporalConditionModel
7
+ from transformers import CLIPTextModel
8
+
9
+ from .lora import (
10
+ extract_lora_ups_down,
11
+ inject_trainable_lora_extended,
12
+ save_lora_weight,
13
+ train_patch_pipe,
14
+ monkeypatch_or_replace_lora,
15
+ monkeypatch_or_replace_lora_extended
16
+ )
17
+
18
+
19
+ FILE_BASENAMES = ['unet', 'text_encoder']
20
+ LORA_FILE_TYPES = ['.pt', '.safetensors']
21
+ CLONE_OF_SIMO_KEYS = ['model', 'loras', 'target_replace_module', 'r']
22
+ STABLE_LORA_KEYS = ['model', 'target_module', 'search_class', 'r', 'dropout', 'lora_bias']
23
+
24
+ lora_versions = dict(
25
+ stable_lora = "stable_lora",
26
+ cloneofsimo = "cloneofsimo"
27
+ )
28
+
29
+ lora_func_types = dict(
30
+ loader = "loader",
31
+ injector = "injector"
32
+ )
33
+
34
+ lora_args = dict(
35
+ model = None,
36
+ loras = None,
37
+ target_replace_module = [],
38
+ target_module = [],
39
+ r = 4,
40
+ search_class = [torch.nn.Linear],
41
+ dropout = 0,
42
+ lora_bias = 'none'
43
+ )
44
+
45
+ LoraVersions = SimpleNamespace(**lora_versions)
46
+ LoraFuncTypes = SimpleNamespace(**lora_func_types)
47
+
48
+ LORA_VERSIONS = [LoraVersions.stable_lora, LoraVersions.cloneofsimo]
49
+ LORA_FUNC_TYPES = [LoraFuncTypes.loader, LoraFuncTypes.injector]
50
+
51
+ def filter_dict(_dict, keys=[]):
52
+ if len(keys) == 0:
53
+ assert "Keys cannot empty for filtering return dict."
54
+
55
+ for k in keys:
56
+ if k not in lora_args.keys():
57
+ assert f"{k} does not exist in available LoRA arguments"
58
+
59
+ return {k: v for k, v in _dict.items() if k in keys}
60
+
61
+ class LoraHandler(object):
62
+ def __init__(
63
+ self,
64
+ version: LORA_VERSIONS = LoraVersions.cloneofsimo,
65
+ use_unet_lora: bool = False,
66
+ use_image_lora: bool = False,
67
+ save_for_webui: bool = False,
68
+ only_for_webui: bool = False,
69
+ lora_bias: str = 'none',
70
+ unet_replace_modules: list = None,
71
+ image_encoder_replace_modules: list = None
72
+ ):
73
+ self.version = version
74
+ self.lora_loader = self.get_lora_func(func_type=LoraFuncTypes.loader)
75
+ self.lora_injector = self.get_lora_func(func_type=LoraFuncTypes.injector)
76
+ self.lora_bias = lora_bias
77
+ self.use_unet_lora = use_unet_lora
78
+ self.use_image_lora = use_image_lora
79
+ self.save_for_webui = save_for_webui
80
+ self.only_for_webui = only_for_webui
81
+ self.unet_replace_modules = unet_replace_modules
82
+ self.image_encoder_replace_modules = image_encoder_replace_modules
83
+ self.use_lora = any([use_image_lora, use_unet_lora])
84
+
85
+ def is_cloneofsimo_lora(self):
86
+ return self.version == LoraVersions.cloneofsimo
87
+
88
+
89
+ def get_lora_func(self, func_type: LORA_FUNC_TYPES = LoraFuncTypes.loader):
90
+
91
+ if self.is_cloneofsimo_lora():
92
+
93
+ if func_type == LoraFuncTypes.loader:
94
+ return monkeypatch_or_replace_lora_extended
95
+
96
+ if func_type == LoraFuncTypes.injector:
97
+ return inject_trainable_lora_extended
98
+
99
+ assert "LoRA Version does not exist."
100
+
101
+ def check_lora_ext(self, lora_file: str):
102
+ return lora_file.endswith(tuple(LORA_FILE_TYPES))
103
+
104
+ def get_lora_file_path(
105
+ self,
106
+ lora_path: str,
107
+ model: Union[UNetSpatioTemporalConditionModel, CLIPTextModel]
108
+ ):
109
+ if os.path.exists(lora_path):
110
+ lora_filenames = [fns for fns in os.listdir(lora_path)]
111
+ is_lora = self.check_lora_ext(lora_path)
112
+
113
+ is_unet = isinstance(model, UNetSpatioTemporalConditionModel)
114
+ #is_text = isinstance(model, CLIPTextModel)
115
+ idx = 0 if is_unet else 1
116
+
117
+ base_name = FILE_BASENAMES[idx]
118
+
119
+ for lora_filename in lora_filenames:
120
+ is_lora = self.check_lora_ext(lora_filename)
121
+ if not is_lora:
122
+ continue
123
+
124
+ if base_name in lora_filename:
125
+ return os.path.join(lora_path, lora_filename)
126
+ else:
127
+ print(f"lora_path: {lora_path} does not exist. Inject without pretrained loras...")
128
+
129
+ return None
130
+
131
+ def handle_lora_load(self, file_name:str, lora_loader_args: dict = None):
132
+ self.lora_loader(**lora_loader_args)
133
+ print(f"Successfully loaded LoRA from: {file_name}")
134
+
135
+ def load_lora(self, model, lora_path: str = '', lora_loader_args: dict = None,):
136
+ try:
137
+ lora_file = self.get_lora_file_path(lora_path, model)
138
+
139
+ if lora_file is not None:
140
+ lora_loader_args.update({"lora_path": lora_file})
141
+ self.handle_lora_load(lora_file, lora_loader_args)
142
+
143
+ else:
144
+ print(f"Could not load LoRAs for {model.__class__.__name__}. Injecting new ones instead...")
145
+
146
+ except Exception as e:
147
+ print(f"An error occurred while loading a LoRA file: {e}")
148
+
149
+ def get_lora_func_args(self, lora_path, use_lora, model, replace_modules, r, dropout, lora_bias, scale):
150
+ return_dict = lora_args.copy()
151
+
152
+ if self.is_cloneofsimo_lora():
153
+ return_dict = filter_dict(return_dict, keys=CLONE_OF_SIMO_KEYS)
154
+ return_dict.update({
155
+ "model": model,
156
+ "loras": self.get_lora_file_path(lora_path, model),
157
+ "target_replace_module": replace_modules,
158
+ "r": r,
159
+ "scale": scale,
160
+ "dropout_p": dropout,
161
+ })
162
+
163
+ return return_dict
164
+
165
+ def do_lora_injection(
166
+ self,
167
+ model,
168
+ replace_modules,
169
+ bias='none',
170
+ dropout=0,
171
+ r=4,
172
+ lora_loader_args=None,
173
+ ):
174
+ REPLACE_MODULES = replace_modules
175
+
176
+ params = None
177
+ negation = None
178
+ is_injection_hybrid = False
179
+
180
+ if self.is_cloneofsimo_lora():
181
+ is_injection_hybrid = True
182
+ injector_args = lora_loader_args
183
+
184
+ params, negation = self.lora_injector(**injector_args) # inject_trainable_lora_extended
185
+ for _up, _down in extract_lora_ups_down(
186
+ model,
187
+ target_replace_module=REPLACE_MODULES):
188
+
189
+ if all(x is not None for x in [_up, _down]):
190
+ print(f"Lora successfully injected into {model.__class__.__name__}.")
191
+
192
+ break
193
+
194
+ return params, negation, is_injection_hybrid
195
+
196
+ return params, negation, is_injection_hybrid
197
+
198
+ def add_lora_to_model(self, use_lora, model, replace_modules, dropout=0.0, lora_path='', r=16, scale=1.0):
199
+
200
+ params = None
201
+ negation = None
202
+
203
+ lora_loader_args = self.get_lora_func_args(
204
+ lora_path,
205
+ use_lora,
206
+ model,
207
+ replace_modules,
208
+ r,
209
+ dropout,
210
+ self.lora_bias,
211
+ scale
212
+ )
213
+
214
+ if use_lora:
215
+ params, negation, is_injection_hybrid = self.do_lora_injection(
216
+ model,
217
+ replace_modules,
218
+ bias=self.lora_bias,
219
+ lora_loader_args=lora_loader_args,
220
+ dropout=dropout,
221
+ r=r
222
+ )
223
+
224
+ if not is_injection_hybrid:
225
+ self.load_lora(model, lora_path=lora_path, lora_loader_args=lora_loader_args)
226
+
227
+ params = model if params is None else params
228
+ return params, negation
229
+
230
+ def save_cloneofsimo_lora(self, model, save_path, step, flag):
231
+
232
+ def save_lora(model, name, condition, replace_modules, step, save_path, flag=None):
233
+ if condition and replace_modules is not None:
234
+ save_path = f"{save_path}/{step}_{name}.pt"
235
+ save_lora_weight(model, save_path, replace_modules, flag)
236
+
237
+ save_lora(
238
+ model.unet,
239
+ FILE_BASENAMES[0],
240
+ self.use_unet_lora,
241
+ self.unet_replace_modules,
242
+ step,
243
+ save_path,
244
+ flag
245
+ )
246
+ save_lora(
247
+ model.image_encoder,
248
+ FILE_BASENAMES[1],
249
+ self.use_image_lora,
250
+ self.image_encoder_replace_modules,
251
+ step,
252
+ save_path,
253
+ flag
254
+ )
255
+
256
+ # train_patch_pipe(model, self.use_unet_lora, self.use_text_lora)
257
+
258
+ def save_lora_weights(self, model, save_path: str = '', step: str = '', flag=None):
259
+ save_path = f"{save_path}/lora"
260
+ os.makedirs(save_path, exist_ok=True)
261
+
262
+ if self.is_cloneofsimo_lora():
263
+ if any([self.save_for_webui, self.only_for_webui]):
264
+ warnings.warn(
265
+ """
266
+ You have 'save_for_webui' enabled, but are using cloneofsimo's LoRA implementation.
267
+ Only 'stable_lora' is supported for saving to a compatible webui file.
268
+ """
269
+ )
270
+ self.save_cloneofsimo_lora(model, save_path, step, flag)
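For context, a minimal sketch of how this handler is driven. The `unet` and `pipeline` objects, the step counter, the rank, and the module list are assumptions consistent with the defaults in this file; `pipeline` must expose `.unet` and `.image_encoder` for checkpointing.

from i2vedit.utils.lora_handler import LoraHandler

lora_manager = LoraHandler(
    version="cloneofsimo",
    use_unet_lora=True,
    unet_replace_modules=["TemporalBasicTransformerBlock"],
)

# Inject fresh LoRA (empty lora_path) and collect the trainable parameter groups.
unet_lora_params, _ = lora_manager.add_lora_to_model(
    use_lora=True,
    model=unet,
    replace_modules=lora_manager.unet_replace_modules,
    dropout=0.0,
    lora_path="",
    r=16,
)

# Periodically checkpoint the LoRA weights, e.g. every N steps.
lora_manager.save_lora_weights(model=pipeline, save_path="./outputs", step=str(global_step))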
i2vedit/utils/model_utils.py ADDED
@@ -0,0 +1,588 @@
1
+ import argparse
2
+ import datetime
3
+ import logging
4
+ import inspect
5
+ import math
6
+ import os
7
+ import random
8
+ import gc
9
+ import copy
10
+ import imageio
11
+ import numpy as np
12
+ import PIL
13
+ from PIL import Image
14
+ from scipy.stats import anderson
15
+ from typing import Dict, Optional, Tuple, Callable, List, Union
16
+ from omegaconf import OmegaConf
17
+ from einops import rearrange, repeat
18
+ from dataclasses import dataclass
19
+
20
+ import torch
21
+ import torch.nn.functional as F
22
+ import torch.utils.checkpoint
23
+ from torchvision import transforms
24
+ from tqdm.auto import tqdm
25
+
26
+ from accelerate import Accelerator
27
+ from accelerate.logging import get_logger
28
+ from accelerate.utils import set_seed
29
+
30
+ import transformers
31
+ from transformers import CLIPTextModel, CLIPTokenizer
32
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
33
+ from transformers.models.clip.modeling_clip import CLIPEncoder
34
+
35
+ import diffusers
36
+ from diffusers.models import AutoencoderKL
37
+ from diffusers import DDIMScheduler, TextToVideoSDPipeline
38
+ from diffusers.optimization import get_scheduler
39
+ from diffusers.utils.import_utils import is_xformers_available
40
+ from diffusers.models.attention_processor import AttnProcessor2_0, Attention
41
+ from diffusers.models.attention import BasicTransformerBlock
42
+ from diffusers import StableVideoDiffusionPipeline
43
+ from diffusers.models.lora import LoRALinearLayer
44
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler, UNetSpatioTemporalConditionModel
45
+ from diffusers.image_processor import VaeImageProcessor
46
+ from diffusers.optimization import get_scheduler
47
+ from diffusers.training_utils import EMAModel
48
+ from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image, BaseOutput
49
+ from diffusers.utils.import_utils import is_xformers_available
50
+ from diffusers.utils.torch_utils import randn_tensor
51
+ from diffusers.models.unet_3d_blocks import \
52
+ (CrossAttnDownBlockSpatioTemporal,
53
+ DownBlockSpatioTemporal,
54
+ CrossAttnUpBlockSpatioTemporal,
55
+ UpBlockSpatioTemporal)
56
+ from diffusers.schedulers.scheduling_euler_discrete import EulerDiscreteSchedulerOutput, EulerDiscreteScheduler
57
+
58
+
59
+ def _append_dims(x, target_dims):
60
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
61
+ dims_to_append = target_dims - x.ndim
62
+ if dims_to_append < 0:
63
+ raise ValueError(f"input has {x.ndim} dims but target_dims is {target_dims}, which is less")
64
+ return x[(...,) + (None,) * dims_to_append]
65
+
66
+ def tensor2vid(video: torch.Tensor, processor, output_type="np"):
67
+ # Based on:
68
+ # https://github.com/modelscope/modelscope/blob/1509fdb973e5871f37148a4b5e5964cafd43e64d/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py#L78
69
+
70
+ batch_size, channels, num_frames, height, width = video.shape
71
+ outputs = []
72
+ for batch_idx in range(batch_size):
73
+ batch_vid = video[batch_idx].permute(1, 0, 2, 3)
74
+ batch_output = processor.postprocess(batch_vid, output_type)
75
+
76
+ outputs.append(batch_output)
77
+
78
+ return outputs
79
+
80
+ @torch.no_grad()
81
+ def tensor_to_vae_latent(t, vae):
82
+ video_length = t.shape[1]
83
+
84
+ t = rearrange(t, "b f c h w -> (b f) c h w")
85
+ latents = vae.encode(t).latent_dist.sample()
86
+ latents = rearrange(latents, "(b f) c h w -> b f c h w", f=video_length)
87
+ latents = latents * vae.config.scaling_factor
88
+
89
+ return latents
90
+
91
+ def load_primary_models(pretrained_model_path):
92
+ noise_scheduler = EulerDiscreteScheduler.from_pretrained(
93
+ pretrained_model_path, subfolder="scheduler")
94
+ feature_extractor = CLIPImageProcessor.from_pretrained(
95
+ pretrained_model_path, subfolder="feature_extractor", revision=None
96
+ )
97
+ image_encoder = CLIPVisionModelWithProjection.from_pretrained(
98
+ pretrained_model_path, subfolder="image_encoder", revision=None, variant="fp16"
99
+ )
100
+ vae = AutoencoderKLTemporalDecoder.from_pretrained(
101
+ pretrained_model_path, subfolder="vae", revision=None, variant="fp16")
102
+ unet = UNetSpatioTemporalConditionModel.from_pretrained(
103
+ pretrained_model_path,
104
+ subfolder="unet",
105
+ low_cpu_mem_usage=True,
106
+ variant="fp16",
107
+ )
108
+
109
+ return noise_scheduler, feature_extractor, image_encoder, vae, unet
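A hedged sketch of loading these components; the checkpoint identifier is an assumption (any local or Hub copy of an SVD image-to-video model that ships fp16 variants should work, since the loaders above request variant="fp16").

from i2vedit.utils.model_utils import load_primary_models

pretrained_model_path = "stabilityai/stable-video-diffusion-img2vid-xt"  # assumed checkpoint
noise_scheduler, feature_extractor, image_encoder, vae, unet = load_primary_models(
    pretrained_model_path
)

# Freeze everything; only LoRA parameters injected later are trained.
vae.requires_grad_(False)
image_encoder.requires_grad_(False)
unet.requires_grad_(False)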
110
+
111
+ def set_processors(attentions):
112
+ for attn in attentions: attn.set_processor(AttnProcessor2_0())
113
+
114
+ def is_attn(name):
115
+ return ('attn1' or 'attn2' == name.split('.')[-1])
116
+
117
+ def set_torch_2_attn(unet):
118
+ optim_count = 0
119
+
120
+ for name, module in unet.named_modules():
121
+ if is_attn(name):
122
+ if isinstance(module, torch.nn.ModuleList):
123
+ for m in module:
124
+ if isinstance(m, BasicTransformerBlock):
125
+ set_processors([m.attn1, m.attn2])
126
+ optim_count += 1
127
+ if optim_count > 0:
128
+ print(f"{optim_count} Attention layers using Scaled Dot Product Attention.")
129
+
130
+ def handle_memory_attention(enable_xformers_memory_efficient_attention, enable_torch_2_attn, unet):
131
+ try:
132
+ is_torch_2 = hasattr(F, 'scaled_dot_product_attention')
133
+ enable_torch_2 = is_torch_2 and enable_torch_2_attn
134
+
135
+ if enable_xformers_memory_efficient_attention and not enable_torch_2:
136
+ if is_xformers_available():
137
+ from xformers.ops import MemoryEfficientAttentionFlashAttentionOp
138
+ unet.enable_xformers_memory_efficient_attention(attention_op=MemoryEfficientAttentionFlashAttentionOp)
139
+ else:
140
+ raise ValueError("xformers is not available. Make sure it is installed correctly")
141
+
142
+ if enable_torch_2:
143
+ set_torch_2_attn(unet)
144
+
145
+ except Exception as e:
146
+ print(e)
147
+ print("Could not enable memory efficient attention for xformers or Torch 2.0.")
148
+
149
+
150
+ class P2PEulerDiscreteScheduler(EulerDiscreteScheduler):
151
+
152
+ def step(
153
+ self,
154
+ model_output: torch.FloatTensor,
155
+ timestep: Union[float, torch.FloatTensor],
156
+ sample: torch.FloatTensor,
157
+ conditional_latents: torch.FloatTensor = None,
158
+ s_churn: float = 0.0,
159
+ s_tmin: float = 0.0,
160
+ s_tmax: float = float("inf"),
161
+ s_noise: float = 1.0,
162
+ generator: Optional[torch.Generator] = None,
163
+ return_dict: bool = True,
164
+ ) -> Union[EulerDiscreteSchedulerOutput, Tuple]:
165
+ """
166
+ Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
167
+ process from the learned model outputs (most often the predicted noise).
168
+
169
+ Args:
170
+ model_output (`torch.FloatTensor`):
171
+ The direct output from learned diffusion model.
172
+ timestep (`float`):
173
+ The current discrete timestep in the diffusion chain.
174
+ sample (`torch.FloatTensor`):
175
+ A current instance of a sample created by the diffusion process.
176
+ s_churn (`float`):
177
+ s_tmin (`float`):
178
+ s_tmax (`float`):
179
+ s_noise (`float`, defaults to 1.0):
180
+ Scaling factor for noise added to the sample.
181
+ generator (`torch.Generator`, *optional*):
182
+ A random number generator.
183
+ return_dict (`bool`):
184
+ Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
185
+ tuple.
186
+
187
+ Returns:
188
+ [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
189
+ If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
190
+ returned, otherwise a tuple is returned where the first element is the sample tensor.
191
+ """
192
+
193
+ if (
194
+ isinstance(timestep, int)
195
+ or isinstance(timestep, torch.IntTensor)
196
+ or isinstance(timestep, torch.LongTensor)
197
+ ):
198
+ raise ValueError(
199
+ (
200
+ "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
201
+ " `EulerDiscreteScheduler.step()` is not supported. Make sure to pass"
202
+ " one of the `scheduler.timesteps` as a timestep."
203
+ ),
204
+ )
205
+
206
+ if not self.is_scale_input_called:
207
+ logger.warning(
208
+ "The `scale_model_input` function should be called before `step` to ensure correct denoising. "
209
+ "See `StableDiffusionPipeline` for a usage example."
210
+ )
211
+
212
+ if self.step_index is None:
213
+ self._init_step_index(timestep)
214
+
215
+ # Upcast to avoid precision issues when computing prev_sample
216
+ sample = sample.to(torch.float32)
217
+
218
+ sigma = self.sigmas[self.step_index]
219
+
220
+ gamma = min(s_churn / (len(self.sigmas) - 1), 2**0.5 - 1) if s_tmin <= sigma <= s_tmax else 0.0
221
+
222
+ noise = randn_tensor(
223
+ model_output.shape, dtype=model_output.dtype, device=model_output.device, generator=generator
224
+ )
225
+
226
+ eps = noise * s_noise
227
+ sigma_hat = sigma * (gamma + 1)
228
+
229
+ if gamma > 0:
230
+ sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5
231
+
232
+ # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
233
+ # NOTE: "original_sample" should not be an expected prediction_type but is left in for
234
+ # backwards compatibility
235
+ if self.config.prediction_type == "original_sample" or self.config.prediction_type == "sample":
236
+ pred_original_sample = model_output
237
+ elif self.config.prediction_type == "epsilon":
238
+ pred_original_sample = sample - sigma_hat * model_output
239
+ elif self.config.prediction_type == "v_prediction":
240
+ # denoised = model_output * c_out + input * c_skip
241
+ pred_original_sample = model_output * (-sigma / (sigma**2 + 1) ** 0.5) + (sample / (sigma**2 + 1))
242
+ else:
243
+ raise ValueError(
244
+ f"prediction_type given as {self.config.prediction_type} must be one of `epsilon`, or `v_prediction`"
245
+ )
246
+
247
+ for controller in self.controller:
248
+ pred_original_sample = controller.step_callback(pred_original_sample)
249
+ # color preservation
250
+ #color_delta = torch.mean(conditional_latents[:,0:1,:,:,:]) - torch.mean(pred_original_sample[:,0:1,:,:,:])
251
+ #print("color_delta", color_delta)
252
+ #pred_original_sample = pred_original_sample + color_delta
253
+
254
+
255
+
256
+ # 2. Convert to an ODE derivative
257
+ derivative = (sample - pred_original_sample) / sigma_hat
258
+
259
+ dt = self.sigmas[self.step_index + 1] - sigma_hat
260
+
261
+ prev_sample = sample + derivative * dt
262
+
263
+ # Cast sample back to model compatible dtype
264
+ prev_sample = prev_sample.to(model_output.dtype)
265
+
266
+ # upon completion increase step index by one
267
+ self._step_index += 1
268
+
269
+ if not return_dict:
270
+ return (prev_sample,)
271
+
272
+ return EulerDiscreteSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
273
+
274
+ @dataclass
275
+ class StableVideoDiffusionPipelineOutput(BaseOutput):
276
+ r"""
277
+ Output class for the Stable Video Diffusion image-to-video pipeline.
278
+
279
+ Args:
280
+ frames (`[List[PIL.Image.Image]`, `np.ndarray`]):
281
+ List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
282
+ num_channels)`.
283
+ """
284
+
285
+ frames: Union[List[PIL.Image.Image], np.ndarray]
286
+ latents: torch.Tensor
287
+
288
+ class P2PStableVideoDiffusionPipeline(StableVideoDiffusionPipeline):
289
+
290
+ def _encode_vae_image(
291
+ self,
292
+ image: torch.Tensor,
293
+ device,
294
+ num_videos_per_prompt,
295
+ do_classifier_free_guidance,
296
+ image_latents: torch.Tensor = None
297
+ ):
298
+ if image_latents is None:
299
+ image = image.to(device=device)
300
+ image_latents = self.vae.encode(image).latent_dist.mode()
301
+ else:
302
+ image_latents = rearrange(image_latents, "b f c h w -> (b f) c h w")
303
+
304
+ if do_classifier_free_guidance:
305
+ negative_image_latents = torch.zeros_like(image_latents)
306
+
307
+ # For classifier free guidance, we need to do two forward passes.
308
+ # Here we concatenate the unconditional and text embeddings into a single batch
309
+ # to avoid doing two forward passes
310
+ image_latents = torch.cat([negative_image_latents, image_latents])
311
+
312
+ # duplicate image_latents for each generation per prompt, using mps friendly method
313
+ image_latents = image_latents.repeat(num_videos_per_prompt, 1, 1, 1)
314
+
315
+ return image_latents
316
+
317
+ @torch.no_grad()
318
+ def __call__(
319
+ self,
320
+ image: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
321
+ edited_firstframe: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor] = None,
322
+ image_latents: torch.FloatTensor = None,
323
+ height: int = 576,
324
+ width: int = 1024,
325
+ num_frames: Optional[int] = None,
326
+ num_inference_steps: int = 25,
327
+ min_guidance_scale: float = 1.0,
328
+ max_guidance_scale: float = 2.5,
329
+ fps: int = 7,
330
+ motion_bucket_id: int = 127,
331
+ noise_aug_strength: float = 0.02,
332
+ decode_chunk_size: Optional[int] = None,
333
+ num_videos_per_prompt: Optional[int] = 1,
334
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
335
+ latents: Optional[torch.FloatTensor] = None,
336
+ output_type: Optional[str] = "pil",
337
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
338
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
339
+ return_dict: bool = True,
340
+ ):
341
+ r"""
342
+ The call function to the pipeline for generation.
343
+
344
+ Args:
345
+ image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
346
+ Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
347
+ [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
348
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
349
+ The height in pixels of the generated image.
350
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
351
+ The width in pixels of the generated image.
352
+ num_frames (`int`, *optional*):
353
+ The number of video frames to generate. Defaults to 14 for `stable-video-diffusion-img2vid` and to 25 for `stable-video-diffusion-img2vid-xt`
354
+ num_inference_steps (`int`, *optional*, defaults to 25):
355
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
356
+ expense of slower inference. This parameter is modulated by `strength`.
357
+ min_guidance_scale (`float`, *optional*, defaults to 1.0):
358
+ The minimum guidance scale. Used for the classifier free guidance with first frame.
359
+ max_guidance_scale (`float`, *optional*, defaults to 2.5):
360
+ The maximum guidance scale. Used for the classifier free guidance with last frame.
361
+ fps (`int`, *optional*, defaults to 7):
362
+ Frames per second. The rate at which the generated images shall be exported to a video after generation.
363
+ Note that Stable Diffusion Video's UNet was micro-conditioned on fps-1 during training.
364
+ motion_bucket_id (`int`, *optional*, defaults to 127):
365
+ The motion bucket ID. Used as conditioning for the generation. The higher the number the more motion will be in the video.
366
+ noise_aug_strength (`float`, *optional*, defaults to 0.02):
367
+ The amount of noise added to the init image, the higher it is the less the video will look like the init image. Increase it for more motion.
368
+ decode_chunk_size (`int`, *optional*):
369
+ The number of frames to decode at a time. The higher the chunk size, the higher the temporal consistency
370
+ between frames, but also the higher the memory consumption. By default, the decoder will decode all frames at once
371
+ for maximal quality. Reduce `decode_chunk_size` to reduce memory usage.
372
+ num_videos_per_prompt (`int`, *optional*, defaults to 1):
373
+ The number of images to generate per prompt.
374
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
375
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
376
+ generation deterministic.
377
+ latents (`torch.FloatTensor`, *optional*):
378
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
379
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
380
+ tensor is generated by sampling using the supplied random `generator`.
381
+ output_type (`str`, *optional*, defaults to `"pil"`):
382
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
383
+ callback_on_step_end (`Callable`, *optional*):
384
+ A function that calls at the end of each denoising steps during the inference. The function is called
385
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
386
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
387
+ `callback_on_step_end_tensor_inputs`.
388
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
389
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
390
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
391
+ `._callback_tensor_inputs` attribute of your pipeline class.
392
+ return_dict (`bool`, *optional*, defaults to `True`):
393
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
394
+ plain tuple.
395
+
396
+ Returns:
397
+ [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] or `tuple`:
398
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableVideoDiffusionPipelineOutput`] is returned,
399
+ otherwise a `tuple` is returned where the first element is a list of list with the generated frames.
400
+
401
+ Examples:
402
+
403
+ ```py
404
+ from diffusers import StableVideoDiffusionPipeline
405
+ from diffusers.utils import load_image, export_to_video
406
+
407
+ pipe = StableVideoDiffusionPipeline.from_pretrained("stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16")
408
+ pipe.to("cuda")
409
+
410
+ image = load_image("https://lh3.googleusercontent.com/y-iFOHfLTwkuQSUegpwDdgKmOjRSTvPxat63dQLB25xkTs4lhIbRUFeNBWZzYf370g=s1200")
411
+ image = image.resize((1024, 576))
412
+
413
+ frames = pipe(image, num_frames=25, decode_chunk_size=8).frames[0]
414
+ export_to_video(frames, "generated.mp4", fps=7)
415
+ ```
416
+ """
417
+ # 0. Default height and width to unet
418
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
419
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
420
+
421
+ num_frames = num_frames if num_frames is not None else self.unet.config.num_frames
422
+ decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else num_frames
423
+
424
+ # 1. Check inputs. Raise error if not correct
425
+ self.check_inputs(image, height, width)
426
+
427
+ # 2. Define call parameters
428
+ if isinstance(image, PIL.Image.Image):
429
+ batch_size = 1
430
+ elif isinstance(image, list):
431
+ batch_size = len(image)
432
+ else:
433
+ batch_size = image.shape[0]
434
+ device = self._execution_device
435
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
436
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
437
+ # corresponds to doing no classifier free guidance.
438
+ self._guidance_scale = max_guidance_scale
439
+
440
+ # 3. Encode input image
441
+ edited_firstframe = image if edited_firstframe is None else edited_firstframe
442
+ image_embeddings = self._encode_image(edited_firstframe, device, num_videos_per_prompt, self.do_classifier_free_guidance)
443
+
444
+ # NOTE: Stable Diffusion Video was conditioned on fps - 1, which
445
+ # is why it is reduced here.
446
+ # See: https://github.com/Stability-AI/generative-models/blob/ed0997173f98eaf8f4edf7ba5fe8f15c6b877fd3/scripts/sampling/simple_video_sample.py#L188
447
+ fps = fps - 1
448
+
449
+ # 4. Encode input image using VAE
450
+ image = self.image_processor.preprocess(image, height=height, width=width)
451
+ edited_firstframe = self.image_processor.preprocess(edited_firstframe, height=height, width=width)
452
+ #print("before vae", image.min(), image.max())
453
+ #noise = randn_tensor(image.shape, generator=generator, device=image.device, dtype=image.dtype)
454
+ #image = image + noise_aug_strength * noise
455
+ #edited_firstframe = edited_firstframe + noise_aug_strength * noise
456
+ if image_latents is not None:
457
+ #noise_tmp = randn_tensor(image_latents.shape, generator=generator, device=image_latents.device, dtype=image_latents.dtype)
458
+ image_latents = image_latents / self.vae.config.scaling_factor
459
+ #image_latents = image_latents + noise_aug_strength * noise_tmp
460
+
461
+ needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
462
+ if needs_upcasting:
463
+ self.vae.to(dtype=torch.float32)
464
+
465
+ #print("before vae", image.min(), image.max())
466
+ image_latents = self._encode_vae_image(image, device, num_videos_per_prompt, self.do_classifier_free_guidance, image_latents = image_latents)
467
+ firstframe_latents = self._encode_vae_image(edited_firstframe, device, num_videos_per_prompt, self.do_classifier_free_guidance)
468
+ noise = randn_tensor(image_latents.shape, generator=generator, device=image_latents.device, dtype=image_latents.dtype)[1:]
469
+ image_latents[1:] = image_latents[1:] + noise_aug_strength * noise #/ self.vae.config.scaling_factor
470
+ #firstframe_latents = firstframe_latents + noise_aug_strength * noise / self.vae.config.scaling_factor
471
+
472
+ image_latents = image_latents.to(image_embeddings.dtype)
473
+ firstframe_latents = firstframe_latents.to(image_embeddings.dtype)
474
+
475
+ # cast back to fp16 if needed
476
+ if needs_upcasting:
477
+ self.vae.to(dtype=torch.float16)
478
+
479
+ # Repeat the image latents for each frame so we can concatenate them with the noise
480
+ # image_latents [batch, channels, height, width] ->[batch, num_frames, channels, height, width]
481
+
482
+ skip=num_frames
483
+ image_latents = torch.cat(
484
+ [
485
+ image_latents.unsqueeze(1).repeat(1, skip, 1, 1, 1),
486
+ firstframe_latents.unsqueeze(1).repeat(1, num_frames-skip, 1, 1, 1)
487
+ ],
488
+ dim=1
489
+ )
490
+
491
+ #image_latents = image_latents.unsqueeze(1).repeat(1, num_frames, 1, 1, 1)
492
+ #print("image latents", image_latents.min(), image_latents.max())
493
+
494
+ # 5. Get Added Time IDs
495
+ added_time_ids = self._get_add_time_ids(
496
+ fps,
497
+ motion_bucket_id,
498
+ noise_aug_strength,
499
+ image_embeddings.dtype,
500
+ batch_size,
501
+ num_videos_per_prompt,
502
+ self.do_classifier_free_guidance,
503
+ )
504
+ added_time_ids = added_time_ids.to(device)
505
+
506
+ # 4. Prepare timesteps
507
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
508
+ timesteps = self.scheduler.timesteps
509
+
510
+ # 5. Prepare latent variables
511
+ num_channels_latents = self.unet.config.in_channels
512
+ latents = self.prepare_latents(
513
+ batch_size * num_videos_per_prompt,
514
+ num_frames,
515
+ num_channels_latents,
516
+ height,
517
+ width,
518
+ image_embeddings.dtype,
519
+ device,
520
+ generator,
521
+ latents,
522
+ )
523
+
524
+ # 7. Prepare guidance scale
525
+ guidance_scale = torch.linspace(min_guidance_scale, max_guidance_scale, num_frames).unsqueeze(0)
526
+ guidance_scale = guidance_scale.to(device, latents.dtype)
527
+ guidance_scale = guidance_scale.repeat(batch_size * num_videos_per_prompt, 1)
528
+ guidance_scale = _append_dims(guidance_scale, latents.ndim)
529
+
530
+ self._guidance_scale = guidance_scale
531
+
532
+ # 8. Denoising loop
533
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
534
+ self._num_timesteps = len(timesteps)
535
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
536
+ for i, t in enumerate(timesteps):
537
+ # expand the latents if we are doing classifier free guidance
538
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
539
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
540
+
541
+ # Concatenate image_latents over the channels dimension
542
+ latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
543
+
544
+ # predict the noise residual
545
+ noise_pred = self.unet(
546
+ latent_model_input,
547
+ t,
548
+ encoder_hidden_states=image_embeddings,
549
+ added_time_ids=added_time_ids,
550
+ return_dict=False,
551
+ )[0]
552
+
553
+ # perform guidance
554
+ if self.do_classifier_free_guidance:
555
+ noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
556
+ noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
557
+
558
+ # compute the previous noisy sample x_t -> x_t-1
559
+ #conditional_latents = image_latents.chunk(2)[1] if self.do_classifier_free_guidance else image_latents
560
+ #latents = self.scheduler.step(noise_pred, t, latents, conditional_latents=conditional_latents*self.vae.config.scaling_factor).prev_sample
561
+ latents = self.scheduler.step(noise_pred, t, latents).prev_sample
562
+
563
+ if callback_on_step_end is not None:
564
+ callback_kwargs = {}
565
+ for k in callback_on_step_end_tensor_inputs:
566
+ callback_kwargs[k] = locals()[k]
567
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
568
+
569
+ latents = callback_outputs.pop("latents", latents)
570
+
571
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
572
+ progress_bar.update()
573
+
574
+ if not output_type == "latent":
575
+ # cast back to fp16 if needed
576
+ if needs_upcasting:
577
+ self.vae.to(dtype=torch.float16)
578
+ frames = self.decode_latents(latents, num_frames, decode_chunk_size)
579
+ frames = tensor2vid(frames, self.image_processor, output_type=output_type)
580
+ else:
581
+ frames = latents
582
+
583
+ self.maybe_free_model_hooks()
584
+
585
+ if not return_dict:
586
+ return frames
587
+
588
+ return StableVideoDiffusionPipelineOutput(frames=frames, latents=latents)
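For orientation, a minimal sketch of how `P2PStableVideoDiffusionPipeline` and `P2PEulerDiscreteScheduler` defined above might be driven with an edited first frame; the checkpoint id, image paths, and parameter values are illustrative assumptions rather than values taken from this commit:
```py
import torch
from diffusers.utils import load_image, export_to_video

# Hypothetical usage sketch: checkpoint id, file paths, and settings are placeholders.
pipe = P2PStableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
).to("cuda")
pipe.scheduler = P2PEulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.scheduler.controller = []  # attention controllers (e.g. AttentionControlEdit) can be appended here

source_frame = load_image("source_firstframe.png").resize((1024, 576))
edited_frame = load_image("edited_firstframe.png").resize((1024, 576))

output = pipe(
    source_frame,
    edited_firstframe=edited_frame,
    width=1024,
    height=576,
    num_frames=14,
    num_inference_steps=25,
    decode_chunk_size=8,
    max_guidance_scale=2.5,
    generator=torch.Generator("cpu").manual_seed(23),
)
export_to_video(output.frames[0], "edited_clip.mp4", fps=7)
# output.latents keeps the final denoised latents, which main.py reuses across clips.
```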
i2vedit/utils/svd_util.py ADDED
@@ -0,0 +1,397 @@
1
+ import functools
2
+ import importlib
3
+ import os
4
+ from functools import partial
5
+ from inspect import isfunction
6
+
7
+ import fsspec
8
+ import numpy as np
9
+ import torch
10
+ import torch.nn.functional as F
11
+ from PIL import Image, ImageDraw, ImageFont
12
+ from safetensors.torch import load_file as load_safetensors
13
+
14
+ import decord
15
+ from einops import rearrange, repeat
16
+ from torchvision.transforms import Resize, Pad, InterpolationMode
17
+ from torch import nn
18
+
19
+
20
+ def disabled_train(self, mode=True):
21
+ """Overwrite model.train with this function to make sure train/eval mode
22
+ does not change anymore."""
23
+ return self
24
+
25
+
26
+ def get_string_from_tuple(s):
27
+ try:
28
+ # Check if the string starts and ends with parentheses
29
+ if s[0] == "(" and s[-1] == ")":
30
+ # Convert the string to a tuple
31
+ t = eval(s)
32
+ # Check if the type of t is tuple
33
+ if type(t) == tuple:
34
+ return t[0]
35
+ else:
36
+ pass
37
+ except:
38
+ pass
39
+ return s
40
+
41
+
42
+ def is_power_of_two(n):
43
+ """
44
+ chat.openai.com/chat
45
+ Return True if n is a power of 2, otherwise return False.
46
+
47
+ The function is_power_of_two takes an integer n as input and returns True if n is a power of 2, otherwise it returns False.
48
+ The function works by first checking if n is less than or equal to 0. If n is less than or equal to 0, it can't be a power of 2, so the function returns False.
49
+ If n is greater than 0, the function checks whether n is a power of 2 by using a bitwise AND operation between n and n-1. If n is a power of 2, then it will have only one bit set to 1 in its binary representation. When we subtract 1 from a power of 2, all the bits to the right of that bit become 1, and the bit itself becomes 0. So, when we perform a bitwise AND between n and n-1, we get 0 if n is a power of 2, and a non-zero value otherwise.
50
+ Thus, if the result of the bitwise AND operation is 0, then n is a power of 2 and the function returns True. Otherwise, the function returns False.
51
+
52
+ """
53
+ if n <= 0:
54
+ return False
55
+ return (n & (n - 1)) == 0
56
+
57
+
58
+ def autocast(f, enabled=True):
59
+ def do_autocast(*args, **kwargs):
60
+ with torch.cuda.amp.autocast(
61
+ enabled=enabled,
62
+ dtype=torch.get_autocast_gpu_dtype(),
63
+ cache_enabled=torch.is_autocast_cache_enabled(),
64
+ ):
65
+ return f(*args, **kwargs)
66
+
67
+ return do_autocast
68
+
69
+
70
+ def load_partial_from_config(config):
71
+ return partial(get_obj_from_str(config["target"]), **config.get("params", dict()))
72
+
73
+
74
+ def log_txt_as_img(wh, xc, size=10):
75
+ # wh a tuple of (width, height)
76
+ # xc a list of captions to plot
77
+ b = len(xc)
78
+ txts = list()
79
+ for bi in range(b):
80
+ txt = Image.new("RGB", wh, color="white")
81
+ draw = ImageDraw.Draw(txt)
82
+ font = ImageFont.truetype("data/DejaVuSans.ttf", size=size)
83
+ nc = int(40 * (wh[0] / 256))
84
+ if isinstance(xc[bi], list):
85
+ text_seq = xc[bi][0]
86
+ else:
87
+ text_seq = xc[bi]
88
+ lines = "\n".join(
89
+ text_seq[start : start + nc] for start in range(0, len(text_seq), nc)
90
+ )
91
+
92
+ try:
93
+ draw.text((0, 0), lines, fill="black", font=font)
94
+ except UnicodeEncodeError:
95
+ print("Cant encode string for logging. Skipping.")
96
+
97
+ txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
98
+ txts.append(txt)
99
+ txts = np.stack(txts)
100
+ txts = torch.tensor(txts)
101
+ return txts
102
+
103
+
104
+ def partialclass(cls, *args, **kwargs):
105
+ class NewCls(cls):
106
+ __init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
107
+
108
+ return NewCls
109
+
110
+
111
+ def make_path_absolute(path):
112
+ fs, p = fsspec.core.url_to_fs(path)
113
+ if fs.protocol == "file":
114
+ return os.path.abspath(p)
115
+ return path
116
+
117
+
118
+ def ismap(x):
119
+ if not isinstance(x, torch.Tensor):
120
+ return False
121
+ return (len(x.shape) == 4) and (x.shape[1] > 3)
122
+
123
+
124
+ def isimage(x):
125
+ if not isinstance(x, torch.Tensor):
126
+ return False
127
+ return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
128
+
129
+
130
+ def isheatmap(x):
131
+ if not isinstance(x, torch.Tensor):
132
+ return False
133
+
134
+ return x.ndim == 2
135
+
136
+
137
+ def isneighbors(x):
138
+ if not isinstance(x, torch.Tensor):
139
+ return False
140
+ return x.ndim == 5 and (x.shape[2] == 3 or x.shape[2] == 1)
141
+
142
+
143
+ def exists(x):
144
+ return x is not None
145
+
146
+
147
+ def expand_dims_like(x, y):
148
+ while x.dim() != y.dim():
149
+ x = x.unsqueeze(-1)
150
+ return x
151
+
152
+
153
+ def default(val, d):
154
+ if exists(val):
155
+ return val
156
+ return d() if isfunction(d) else d
157
+
158
+
159
+ def mean_flat(tensor):
160
+ """
161
+ https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
162
+ Take the mean over all non-batch dimensions.
163
+ """
164
+ return tensor.mean(dim=list(range(1, len(tensor.shape))))
165
+
166
+
167
+ def count_params(model, verbose=False):
168
+ total_params = sum(p.numel() for p in model.parameters())
169
+ if verbose:
170
+ print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
171
+ return total_params
172
+
173
+
174
+ def instantiate_from_config(config):
175
+ if not "target" in config:
176
+ if config == "__is_first_stage__":
177
+ return None
178
+ elif config == "__is_unconditional__":
179
+ return None
180
+ raise KeyError("Expected key `target` to instantiate.")
181
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
182
+
183
+
184
+ def get_obj_from_str(string, reload=False, invalidate_cache=True):
185
+ module, cls = string.rsplit(".", 1)
186
+ if invalidate_cache:
187
+ importlib.invalidate_caches()
188
+ if reload:
189
+ module_imp = importlib.import_module(module)
190
+ importlib.reload(module_imp)
191
+ return getattr(importlib.import_module(module, package=None), cls)
192
+
193
+
194
+ def append_zero(x):
195
+ return torch.cat([x, x.new_zeros([1])])
196
+
197
+
198
+ def append_dims(x, target_dims):
199
+ """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
200
+ dims_to_append = target_dims - x.ndim
201
+ if dims_to_append < 0:
202
+ raise ValueError(
203
+ f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
204
+ )
205
+ return x[(...,) + (None,) * dims_to_append]
206
+
207
+
208
+ def load_model_from_config(config, ckpt, verbose=True, freeze=True):
209
+ print(f"Loading model from {ckpt}")
210
+ if ckpt.endswith("ckpt"):
211
+ pl_sd = torch.load(ckpt, map_location="cpu")
212
+ if "global_step" in pl_sd:
213
+ print(f"Global Step: {pl_sd['global_step']}")
214
+ sd = pl_sd["state_dict"]
215
+ elif ckpt.endswith("safetensors"):
216
+ sd = load_safetensors(ckpt)
217
+ else:
218
+ raise NotImplementedError
219
+
220
+ model = instantiate_from_config(config.model)
221
+
222
+ m, u = model.load_state_dict(sd, strict=False)
223
+
224
+ if len(m) > 0 and verbose:
225
+ print("missing keys:")
226
+ print(m)
227
+ if len(u) > 0 and verbose:
228
+ print("unexpected keys:")
229
+ print(u)
230
+
231
+ if freeze:
232
+ for param in model.parameters():
233
+ param.requires_grad = False
234
+
235
+ model.eval()
236
+ return model
237
+
238
+
239
+ def get_configs_path() -> str:
240
+ """
241
+ Get the `configs` directory.
242
+ For a working copy, this is the one in the root of the repository,
243
+ but for an installed copy, it's in the `sgm` package (see pyproject.toml).
244
+ """
245
+ this_dir = os.path.dirname(__file__)
246
+ candidates = (
247
+ os.path.join(this_dir, "configs"),
248
+ os.path.join(this_dir, "..", "configs"),
249
+ )
250
+ for candidate in candidates:
251
+ candidate = os.path.abspath(candidate)
252
+ if os.path.isdir(candidate):
253
+ return candidate
254
+ raise FileNotFoundError(f"Could not find SGM configs in {candidates}")
255
+
256
+
257
+ def get_nested_attribute(obj, attribute_path, depth=None, return_key=False):
258
+ """
259
+ Will return the result of a recursive get attribute call.
260
+ E.g.:
261
+ a.b.c
262
+ = getattr(getattr(a, "b"), "c")
263
+ = get_nested_attribute(a, "b.c")
264
+ If any part of the attribute call is an integer x with current obj a, will
265
+ try to call a[x] instead of a.x first.
266
+ """
267
+ attributes = attribute_path.split(".")
268
+ if depth is not None and depth > 0:
269
+ attributes = attributes[:depth]
270
+ assert len(attributes) > 0, "At least one attribute should be selected"
271
+ current_attribute = obj
272
+ current_key = None
273
+ for level, attribute in enumerate(attributes):
274
+ current_key = ".".join(attributes[: level + 1])
275
+ try:
276
+ id_ = int(attribute)
277
+ current_attribute = current_attribute[id_]
278
+ except ValueError:
279
+ current_attribute = getattr(current_attribute, attribute)
280
+
281
+ return (current_attribute, current_key) if return_key else current_attribute
282
+
283
+ def pad_with_ratio(frames, res):
284
+ _, _, ih, iw = frames.shape
285
+ #print("ih, iw", ih, iw)
286
+ i_ratio = ih / iw
287
+ h, w = res
288
+ #print("h,w", h ,w)
289
+ n_ratio = h / w
290
+ if i_ratio > n_ratio:
291
+ nw = int(ih / h * w)
292
+ #print("nw", nw)
293
+ frames = Pad((nw - iw)//2)(frames)
294
+ frames = frames[...,(nw - iw)//2:-(nw - iw)//2,:]
295
+ else:
296
+ nh = int(iw / w * h)
297
+ frames = Pad((nh - ih)//2)(frames)
298
+ frames = frames[...,:,(nh - ih)//2:-(nh - ih)//2]
299
+ #print("after pad", frames.shape)
300
+ return frames
301
+
302
+ def prepare_video(video_path:str, resolution, device, dtype, normalize=True, start_t:float=0, end_t:float=-1, output_fps:int=-1, pad_to_fix=False):
303
+ vr = decord.VideoReader(video_path)
304
+ initial_fps = vr.get_avg_fps()
305
+ if output_fps == -1:
306
+ output_fps = int(initial_fps)
307
+ if end_t == -1:
308
+ end_t = len(vr) / initial_fps
309
+ else:
310
+ end_t = min(len(vr) / initial_fps, end_t)
311
+ assert 0 <= start_t < end_t
312
+ assert output_fps > 0
313
+ start_f_ind = int(start_t * initial_fps)
314
+ end_f_ind = int(end_t * initial_fps)
315
+ num_f = int((end_t - start_t) * output_fps)
316
+ sample_idx = np.linspace(start_f_ind, end_f_ind, num_f, endpoint=False).astype(int)
317
+ video = vr.get_batch(sample_idx)
318
+ if torch.is_tensor(video):
319
+ video = video.detach().cpu().numpy()
320
+ else:
321
+ video = video.asnumpy()
322
+ _, h, w, _ = video.shape
323
+ video = rearrange(video, "f h w c -> f c h w")
324
+ video = torch.Tensor(video).to(device).to(dtype)
325
+
326
+ # Use max if you want the larger side to be equal to resolution (e.g. 512)
327
+ # k = float(resolution) / min(h, w)
328
+ if pad_to_fix and resolution is not None:
329
+ video = pad_with_ratio(video, resolution)
330
+ if isinstance(resolution, tuple):
331
+ #video = Resize(resolution, interpolation=InterpolationMode.BICUBIC, antialias=True)(video)
332
+ video = nn.functional.interpolate(video, size=resolution, mode='bilinear')
333
+ else:
334
+ k = float(resolution) / max(h, w)
335
+ h *= k
336
+ w *= k
337
+ h = int(np.round(h / 64.0)) * 64
338
+ w = int(np.round(w / 64.0)) * 64
339
+ video = Resize((h, w), interpolation=InterpolationMode.BICUBIC, antialias=True)(video)
340
+
341
+ if normalize:
342
+ video = video / 127.5 - 1.0
343
+
344
+ return video, output_fps
345
+
346
+ def return_to_original_res(frames, res, pad_to_fix=False):
347
+ #print("original res", res)
348
+ _, _, h, w = frames.shape
349
+ #print("h w", h, w)
350
+ n_ratio = h / w
351
+ ih, iw = res
352
+ i_ratio = ih / iw
353
+ if pad_to_fix:
354
+ if i_ratio > n_ratio:
355
+ nw = int(ih / h * w)
356
+ frames = Resize((ih, iw+2*(nw - iw)//2), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
357
+ frames = frames[...,:,(nw - iw)//2:-(nw - iw)//2]
358
+ else:
359
+ nh = int(iw / w * h)
360
+ frames = Resize((ih+2*(nh - ih)//2, iw), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
361
+
362
+ frames = frames[...,(nh - ih)//2:-(nh - ih)//2,:]
363
+ else:
364
+ frames = Resize((ih, iw), interpolation=InterpolationMode.BICUBIC, antialias=True)(frames)
365
+
366
+ return frames
367
+
368
+ class SmoothAreaRandomDetection(object):
369
+
370
+ def __init__(self, device="cuda", dtype=torch.float16):
371
+
372
+ kernel_x = torch.zeros(3,3,3,3)
373
+ for i in range(3):
374
+ kernel_x[i,i,:,:] = torch.Tensor([[-1., 0., 1.], [-2., 0., 2.], [-1., 0., 1.]])
375
+ kernel_y = torch.zeros(3,3,3,3)
376
+ for i in range(3):
377
+ kernel_y[i,i,:,:] = torch.Tensor([[-1., -2., -1.], [0., 0., 0.], [1., 2., 1.]])
378
+ kernel_x = kernel_x.to(device, dtype)
379
+ kernel_y = kernel_y.to(device, dtype)
380
+ self.weight_x = kernel_x
381
+ self.weight_y = kernel_y
382
+
383
+ self.eps = 1/256.
384
+
385
+ def detection(self, x, thr=0.0):
386
+ original_dim = x.ndim
387
+ if x.ndim > 4:
388
+ b, f, c, h, w = x.shape
389
+ x = rearrange(x, "b f c h w -> (b f) c h w")
390
+ grad_xx = F.conv2d(x, self.weight_x, stride=1, padding=1)
391
+ grad_yx = F.conv2d(x, self.weight_y, stride=1, padding=1)
392
+ gradient_x = torch.abs(grad_xx) + torch.abs(grad_yx)
393
+ gradient_x = torch.mean(gradient_x, dim=1, keepdim=True)
394
+ gradient_x = repeat(gradient_x, "b 1 ... -> b 3 ...")
395
+ if original_dim > 4:
396
+ gradient_x = rearrange(gradient_x, "(b f) c h w -> b f c h w", b=b)
397
+ return gradient_x <= thr
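As a rough usage sketch (the video path, target resolution, and gradient threshold below are illustrative placeholders), `prepare_video` loads and resizes a clip into a normalized `(f, c, h, w)` tensor, and `SmoothAreaRandomDetection.detection` returns a boolean mask of low-gradient "smooth" pixels computed with the Sobel kernels above:
```py
import torch

# Placeholder path / resolution / threshold, shown only to illustrate the call pattern.
video, fps = prepare_video(
    "mydata/source_and_edits/source.mp4",
    resolution=(576, 1024),   # (height, width); passing a single int resizes the longer side instead
    device="cuda",
    dtype=torch.float16,
    normalize=True,           # pixels scaled to [-1, 1]
    output_fps=7,
)

sard = SmoothAreaRandomDetection(device="cuda", dtype=torch.float32)
# detection expects 3-channel frames; add a batch dim and match the detector's dtype.
smooth_mask = sard.detection(video.unsqueeze(0).float(), thr=0.1)  # bool tensor, shape (1, f, 3, h, w)
```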
i2vedit/version.py ADDED
@@ -0,0 +1,5 @@
1
+ # GENERATED VERSION FILE
2
+ # TIME: Thu May 29 22:38:49 2025
3
+ __version__ = '0.1.0-dev'
4
+ __gitsha__ = 'unknown'
5
+ version_info = (0, 1, "0-dev")
main.py ADDED
@@ -0,0 +1,595 @@
1
+ import argparse
2
+ import datetime
3
+ import logging
4
+ import inspect
5
+ import math
6
+ import os
7
+ import random
8
+ import gc
9
+ import copy
10
+ import imageio
11
+ import numpy as np
12
+ from PIL import Image
13
+ from scipy.stats import anderson
14
+ from typing import Dict, Optional, Tuple, List
15
+ from omegaconf import OmegaConf
16
+
17
+ import torch
18
+ import torch.nn.functional as F
19
+ import torch.utils.checkpoint
20
+ from torchvision import transforms
21
+ from torchvision.transforms import ToTensor
22
+ from tqdm.auto import tqdm
23
+
24
+ from accelerate import Accelerator
25
+ from accelerate.logging import get_logger
26
+ from accelerate.utils import set_seed
27
+
28
+ import transformers
29
+ from transformers import CLIPTextModel, CLIPTokenizer
30
+ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
31
+ from transformers.models.clip.modeling_clip import CLIPEncoder
32
+
33
+ import diffusers
34
+ from diffusers.models import AutoencoderKL
35
+ from diffusers import DDIMScheduler, TextToVideoSDPipeline
36
+ from diffusers.optimization import get_scheduler
37
+ from diffusers.utils.import_utils import is_xformers_available
38
+ from diffusers.models.attention_processor import AttnProcessor2_0, Attention
39
+ from diffusers.models.attention import BasicTransformerBlock
40
+ from diffusers import StableVideoDiffusionPipeline
41
+ from diffusers.models.lora import LoRALinearLayer
42
+ from diffusers import AutoencoderKLTemporalDecoder, EulerDiscreteScheduler, UNetSpatioTemporalConditionModel
43
+ from diffusers.image_processor import VaeImageProcessor
44
+ from diffusers.optimization import get_scheduler
45
+ from diffusers.training_utils import EMAModel
46
+ from diffusers.utils import check_min_version, deprecate, is_wandb_available, load_image
47
+ from diffusers.utils.import_utils import is_xformers_available
48
+ from diffusers.models.unet_3d_blocks import \
49
+ (CrossAttnDownBlockSpatioTemporal,
50
+ DownBlockSpatioTemporal,
51
+ CrossAttnUpBlockSpatioTemporal,
52
+ UpBlockSpatioTemporal)
53
+ from i2vedit.utils.dataset import VideoJsonDataset, SingleVideoDataset, \
54
+ ImageDataset, VideoFolderDataset, CachedDataset, \
55
+ pad_with_ratio, return_to_original_res
56
+ from einops import rearrange, repeat
57
+ from i2vedit.utils.lora_handler import LoraHandler
58
+ from i2vedit.utils.lora import extract_lora_child_module
59
+ from i2vedit.utils.euler_utils import euler_inversion
60
+ from i2vedit.utils.svd_util import SmoothAreaRandomDetection
61
+
62
+ from i2vedit.data import VideoIO, SingleClipDataset, ResolutionControl
63
+ #from utils.model_utils import load_primary_models
64
+ from i2vedit.utils.euler_utils import inverse_video
65
+ from i2vedit.train import train_motion_lora, load_images_from_list
66
+ from i2vedit.inference import initialize_pipeline
67
+ from i2vedit.utils.model_utils import P2PEulerDiscreteScheduler, P2PStableVideoDiffusionPipeline
68
+ from i2vedit.prompt_attention import attention_util
69
+
70
+ def create_output_folders(output_dir, config):
71
+ os.makedirs(output_dir, exist_ok=True)
72
+ OmegaConf.save(config, os.path.join(output_dir, 'config.yaml'))
73
+ return output_dir
74
+
75
+
76
+
77
+ def main(
78
+ pretrained_model_path: str,
79
+ data_params: Dict,
80
+ train_motion_lora_params: Dict,
81
+ sarp_params: Dict,
82
+ attention_matching_params: Dict,
83
+ long_video_params: Dict = {"mode": "skip-interval"},
84
+ use_sarp: bool = True,
85
+ use_motion_lora: bool = True,
86
+ train_motion_lora_only: bool = False,
87
+ retrain_motion_lora: bool = True,
88
+ use_inversed_latents: bool = True,
89
+ use_attention_matching: bool = True,
90
+ use_consistency_attention_control: bool = False,
91
+ output_dir: str = "./outputs",
92
+ num_steps: int = 25,
93
+ device: str = "cuda",
94
+ seed: int = 23,
95
+ enable_xformers_memory_efficient_attention: bool = True,
96
+ enable_torch_2_attn: bool = False,
97
+ dtype: str = 'fp16',
98
+ load_from_last_frames_latents: List[str] = None,
99
+ save_last_frames: bool = True,
100
+ visualize_attention_store: bool = False,
101
+ visualize_attention_store_steps: List[int] = None,
102
+ use_latent_blend: bool = False,
103
+ use_previous_latent_for_train: bool = False,
104
+ use_latent_noise: bool = True,
105
+ load_from_previous_consistency_store_controller: str = None,
106
+ load_from_previous_consistency_edit_controller: List[str] = None
107
+ ):
108
+ *_, config = inspect.getargvalues(inspect.currentframe())
109
+
110
+ if dtype == "fp16":
111
+ dtype = torch.float16
112
+ elif dtype == "fp32":
113
+ dtype = torch.float32
114
+
115
+ # create folder
116
+ output_dir = create_output_folders(output_dir, config)
117
+
118
+ # prepare video data
119
+ data_params["output_dir"] = output_dir
120
+ data_params["device"] = device
121
+
122
+ videoio = VideoIO(**data_params, dtype=dtype)
123
+
124
+ # smooth area random perturbation
125
+ if use_sarp:
126
+ sard = SmoothAreaRandomDetection(device, dtype=torch.float32)
127
+ else:
128
+ sard = None
129
+
130
+ keyframe = None
131
+ previous_last_frames = load_images_from_list(data_params.keyframe_paths)
132
+ consistency_train_controller = None
133
+
134
+ if load_from_last_frames_latents is not None:
135
+ previous_last_frames_latents = [torch.load(thpath).to(device) for thpath in load_from_last_frames_latents]
136
+ else:
137
+ previous_last_frames_latents = [None,] * len(previous_last_frames)
138
+
139
+ if use_consistency_attention_control and load_from_previous_consistency_store_controller is not None:
140
+ previous_consistency_store_controller = attention_util.ConsistencyAttentionControl(
141
+ additional_attention_store=None,
142
+ use_inversion_attention=False,
143
+ save_self_attention=True,
144
+ save_latents=False,
145
+ disk_store=True,
146
+ load_attention_store=os.path.join(load_from_previous_consistency_store_controller, "clip_0")
147
+ )
148
+ else:
149
+ previous_consistency_store_controller = None
150
+
151
+ previous_consistency_edit_controller_list = [None,] * len(previous_last_frames)
152
+ if use_consistency_attention_control and load_from_previous_consistency_edit_controller is not None:
153
+ for i in range(len(load_from_previous_consistency_edit_controller)):
154
+ previous_consistency_edit_controller_list[i] = attention_util.ConsistencyAttentionControl(
155
+ additional_attention_store=None,
156
+ use_inversion_attention=False,
157
+ save_self_attention=True,
158
+ save_latents=False,
159
+ disk_store=True,
160
+ load_attention_store=os.path.join(load_from_previous_consistency_edit_controller[i], "clip_0")
161
+ )
162
+
163
+
164
+ # read data and process
165
+ for clip_id, video in enumerate(videoio.read_video_iter()):
166
+ if clip_id >= data_params.get("end_clip_id", 9):
167
+ break
168
+ if clip_id < data_params.get("begin_clip_id", 0):
169
+ continue
170
+ video = video.unsqueeze(0)
171
+
172
+ resctrl = ResolutionControl(video.shape[-2:], data_params.output_res, data_params.pad_to_fit, fill=-1)
173
+
174
+ # update keyframe and edited keyframe
175
+ if long_video_params.mode == "skip-interval":
176
+ assert data_params.overlay_size > 0
177
+ # save the first frame as the keyframe for cross-attention
178
+ #if clip_id == 0:
179
+ firstframe = video[:,0:1,:,:,:]
180
+ keyframe = video[:,0:1,:,:,:]
181
+ edited_keyframes = copy.deepcopy(previous_last_frames)
182
+ edited_firstframes = edited_keyframes
183
+ #edited_firstframes = load_images_from_list(data_params.keyframe_paths)
184
+
185
+ elif long_video_params.mode == "auto-regressive":
186
+ assert data_params.overlay_size == 1
187
+ firstframe = video[:,0:1,:,:,:]
188
+ keyframe = video[:,0:1,:,:,:]
189
+ edited_keyframes = copy.deepcopy(previous_last_frames)
190
+ edited_firstframes = edited_keyframes
191
+
192
+ # register for unet, perform inversion
193
+ load_attention_store = None
194
+ if use_attention_matching:
195
+ assert use_inversed_latents, "inversion is disabled."
196
+ if attention_matching_params.get("load_attention_store") is not None:
197
+ load_attention_store = os.path.join(attention_matching_params.get("load_attention_store"), f"clip_{clip_id}")
198
+ if not os.path.exists(load_attention_store):
199
+ print(f"Load {load_attention_store} failed, folder doesn't exists.")
200
+ load_attention_store = None
201
+
202
+ store_controller = attention_util.AttentionStore(
203
+ disk_store=attention_matching_params.disk_store,
204
+ save_latents = use_latent_blend,
205
+ save_self_attention=True,
206
+ load_attention_store=load_attention_store,
207
+ store_path=os.path.join(output_dir, "attention_store", f"clip_{clip_id}")
208
+ )
209
+ print("store_controller.store_dir:", store_controller.store_dir)
210
+ else:
211
+ store_controller = None
212
+
213
+ load_consistency_attention_store = None
214
+ if use_consistency_attention_control:
215
+ if clip_id==0 and attention_matching_params.get("load_consistency_attention_store") is not None:
216
+ load_consistency_attention_store = os.path.join(attention_matching_params.get("load_consistency_attention_store"), f"clip_{clip_id}")
217
+ if not os.path.exists(load_consistency_attention_store):
218
+ print(f"Load {load_consistency_attention_store} failed, folder doesn't exists.")
219
+ load_consistency_attention_store = None
220
+
221
+ consistency_store_controller = attention_util.ConsistencyAttentionControl(
222
+ additional_attention_store=previous_consistency_store_controller,
223
+ use_inversion_attention=False,
224
+ save_self_attention=(clip_id==0),
225
+ load_attention_store=load_consistency_attention_store,
226
+ save_latents=False,
227
+ disk_store=True,
228
+ store_path=os.path.join(output_dir, "consistency_attention_store", f"clip_{clip_id}")
229
+ )
230
+ print("consistency_store_controller.store_dir:", consistency_store_controller.store_dir)
231
+ else:
232
+ consistency_store_controller = None
233
+
234
+ if train_motion_lora_only:
235
+ assert use_motion_lora and retrain_motion_lora, "use_motion_lora/retrain_motion_lora should be enabled to train motion lora only."
236
+
237
+ # perform smooth area random perturbation
238
+ if use_inversed_latents:
239
+ print("begin inversion sampling for inference...")
240
+ inversion_noise = inverse_video(
241
+ pretrained_model_path,
242
+ video,
243
+ keyframe,
244
+ firstframe,
245
+ num_steps,
246
+ resctrl,
247
+ sard,
248
+ enable_xformers_memory_efficient_attention,
249
+ enable_torch_2_attn,
250
+ store_controller = store_controller,
251
+ consistency_store_controller = consistency_store_controller,
252
+ find_modules=attention_matching_params.registered_modules if load_attention_store is None else {},
253
+ consistency_find_modules=long_video_params.registered_modules if load_consistency_attention_store is None else {},
254
+ # dtype=dtype,
255
+ **sarp_params,
256
+ )
257
+ else:
258
+ if use_motion_lora and retrain_motion_lora:
259
+ assert not any([np > 0 for np in train_motion_lora_params.validation_data.noise_prior]), "inversion noise is not calculated but validation during motion lora training aims to use inversion noise as input latents."
260
+ inversion_noise = None
261
+
262
+
263
+ if use_motion_lora:
264
+ if retrain_motion_lora:
265
+ if use_consistency_attention_control:
266
+ if data_params.output_res[0] != train_motion_lora_params.train_data.height or \
267
+ data_params.output_res[1] != train_motion_lora_params.train_data.width:
268
+ if consistency_train_controller is None:
269
+ load_consistency_train_attention_store = None
270
+ if attention_matching_params.get("load_consistency_train_attention_store") is not None:
271
+ load_consistency_train_attention_store = os.path.join(attention_matching_params.get("load_consistency_train_attention_store"), f"clip_0")
272
+ if not os.path.exists(load_consistency_train_attention_store):
273
+ print(f"Load {load_consistency_train_attention_store} failed, folder doesn't exists.")
274
+ load_consistency_train_attention_store = None
275
+ if load_consistency_train_attention_store is None and clip_id > 0:
276
+ raise IOError(f"load_consistency_train_attention_store can't be None for clip {clip_id}.")
277
+ consistency_train_controller = attention_util.ConsistencyAttentionControl(
278
+ additional_attention_store=None,
279
+ use_inversion_attention=False,
280
+ save_self_attention=True,
281
+ load_attention_store=load_consistency_train_attention_store,
282
+ save_latents=False,
283
+ disk_store=True,
284
+ store_path=os.path.join(output_dir, "consistency_train_attention_store", "clip_0")
285
+ )
286
+ print("consistency_train_controller.store_dir:", consistency_train_controller.store_dir)
287
+ resctrl_train = ResolutionControl(
288
+ video.shape[-2:],
289
+ (train_motion_lora_params.train_data.height,train_motion_lora_params.train_data.width),
290
+ data_params.pad_to_fit, fill=-1
291
+ )
292
+ print("begin inversion sampling for training...")
293
+ inversion_noise_train = inverse_video(
294
+ pretrained_model_path,
295
+ video,
296
+ keyframe,
297
+ firstframe,
298
+ num_steps,
299
+ resctrl_train,
300
+ sard,
301
+ enable_xformers_memory_efficient_attention,
302
+ enable_torch_2_attn,
303
+ store_controller = None,
304
+ consistency_store_controller = consistency_train_controller,
305
+ find_modules={},
306
+ consistency_find_modules=long_video_params.registered_modules if long_video_params.get("load_attention_store") is None else {},
307
+ # dtype=dtype,
308
+ **sarp_params,
309
+ )
310
+ else:
311
+ if consistency_train_controller is None:
312
+ consistency_train_controller = consistency_store_controller
313
+ else:
314
+ consistency_train_controller = None
315
+
316
+ if retrain_motion_lora:
317
+ train_dataset = SingleClipDataset(
318
+ inversion_noise=inversion_noise,
319
+ video_clip=video,
320
+ keyframe=((ToTensor()(previous_last_frames[0])-0.5)/0.5).unsqueeze(0).unsqueeze(0) if use_previous_latent_for_train else keyframe,
321
+ keyframe_latent=previous_last_frames_latents[0] if use_previous_latent_for_train else None,
322
+ firstframe=firstframe,
323
+ height=train_motion_lora_params.train_data.height,
324
+ width=train_motion_lora_params.train_data.width,
325
+ use_data_aug=train_motion_lora_params.train_data.get("use_data_aug"),
326
+ pad_to_fit=train_motion_lora_params.train_data.get("pad_to_fit", False)
327
+ )
328
+ train_motion_lora_params.validation_data.num_inference_steps = num_steps
329
+ train_motion_lora(
330
+ pretrained_model_path,
331
+ output_dir,
332
+ train_dataset,
333
+ edited_firstframes=edited_firstframes,
334
+ validation_images=edited_keyframes,
335
+ validation_images_latents=previous_last_frames_latents,
336
+ seed=seed,
337
+ clip_id=clip_id,
338
+ consistency_edit_controller_list=previous_consistency_edit_controller_list,
339
+ consistency_controller=consistency_train_controller if clip_id!=0 else None,
340
+ consistency_find_modules=long_video_params.registered_modules,
341
+ enable_xformers_memory_efficient_attention=enable_xformers_memory_efficient_attention,
342
+ enable_torch_2_attn=enable_torch_2_attn,
343
+ **train_motion_lora_params
344
+ )
345
+
346
+ if train_motion_lora_only:
347
+ if not use_consistency_attention_control:
348
+ continue
349
+
350
+ # choose and load motion lora
351
+ best_checkpoint_index = attention_matching_params.get("best_checkpoint_index", 250)
352
+ if retrain_motion_lora:
353
+ lora_dir = f"{os.path.join(output_dir,'train_motion_lora')}/clip_{clip_id}"
354
+ lora_path = f"{lora_dir}/checkpoint-{best_checkpoint_index}/temporal/lora"
355
+ else:
356
+ lora_path = f"/homw/user/app/upload/lora"
357
+ assert os.path.exists(lora_path), f"lora path: {lora_path} doesn't exist!"
358
+
359
+ lora_rank = train_motion_lora_params.lora_rank
360
+ lora_scale = attention_matching_params.get("lora_scale", 1.0)
361
+
362
+ # prepare models
363
+ pipe = initialize_pipeline(
364
+ pretrained_model_path,
365
+ device,
366
+ enable_xformers_memory_efficient_attention,
367
+ enable_torch_2_attn,
368
+ lora_path,
369
+ lora_rank,
370
+ lora_scale,
371
+ load_spatial_lora = False #(clip_id != 0)
372
+ ).to(device, dtype=dtype)
373
+ else:
374
+ pipe = P2PStableVideoDiffusionPipeline.from_pretrained(
375
+ pretrained_model_path
376
+ ).to(device, dtype=dtype)
377
+
378
+ if use_attention_matching or use_consistency_attention_control:
379
+ pipe.scheduler = P2PEulerDiscreteScheduler.from_config(pipe.scheduler.config)
380
+
381
+ generator = torch.Generator(device="cpu")
382
+ generator.manual_seed(seed)
383
+
384
+ previous_last_frames = []
385
+
386
+ editing_params = [item for name, item in attention_matching_params.params.items()]
387
+ with torch.no_grad():
388
+ with torch.autocast(device, dtype=dtype):
389
+ for kf_id, (edited_keyframe, editing_param) in enumerate(zip(edited_keyframes, editing_params)):
390
+ print(kf_id, editing_param)
391
+
392
+ # control resolution
393
+ iw, ih = edited_keyframe.size
394
+ resctrl = ResolutionControl(
395
+ (ih, iw),
396
+ data_params.output_res,
397
+ data_params.pad_to_fit,
398
+ fill=0
399
+ )
400
+ edited_keyframe = resctrl(edited_keyframe)
401
+ edited_firstframe = resctrl(edited_firstframes[kf_id])
402
+
403
+ # control attention
404
+ pipe.scheduler.controller = []
405
+ if use_attention_matching:
406
+ edit_controller = attention_util.AttentionControlEdit(
407
+ num_steps = num_steps,
408
+ cross_replace_steps = attention_matching_params.cross_replace_steps,
409
+ temporal_self_replace_steps = attention_matching_params.temporal_self_replace_steps,
410
+ spatial_self_replace_steps = attention_matching_params.spatial_self_replace_steps,
411
+ mask_thr = editing_param.get("mask_thr", 0.35),
412
+ temporal_step_thr = editing_param.get("temporal_step_thr", [0.5,0.8]),
413
+ control_mode = attention_matching_params.control_mode,
414
+ spatial_attention_chunk_size = attention_matching_params.get("spatial_attention_chunk_size", 1),
415
+ additional_attention_store = store_controller,
416
+ use_inversion_attention = True,
417
+ save_self_attention = False,
418
+ save_latents = False,
419
+ latent_blend = use_latent_blend,
420
+ disk_store = attention_matching_params.disk_store
421
+ )
422
+ pipe.scheduler.controller.append(edit_controller)
423
+ else:
424
+ edit_controller = None
425
+
426
+ if use_consistency_attention_control:
427
+ consistency_edit_controller = attention_util.ConsistencyAttentionControl(
428
+ additional_attention_store=previous_consistency_edit_controller_list[kf_id],
429
+ use_inversion_attention=False,
430
+ save_self_attention=(clip_id==0),
431
+ save_latents=False,
432
+ disk_store=True,
433
+ store_path=os.path.join(output_dir, f"consistency_edit{kf_id}_attention_store", f"clip_{clip_id}")
434
+ )
435
+ pipe.scheduler.controller.append(consistency_edit_controller)
436
+ else:
437
+ consistency_edit_controller = None
438
+
439
+ if use_attention_matching or use_consistency_attention_control:
440
+ attention_util.register_attention_control(
441
+ pipe.unet,
442
+ edit_controller,
443
+ consistency_edit_controller,
444
+ find_modules=attention_matching_params.registered_modules,
445
+ consistency_find_modules=long_video_params.registered_modules
446
+ )
447
+
448
+ # should be reorganized to perform attention control
449
+ edited_output = pipe(
450
+ edited_keyframe,
451
+ edited_firstframe=edited_firstframe,
452
+ image_latents=previous_last_frames_latents[kf_id],
453
+ width=data_params.output_res[1],
454
+ height=data_params.output_res[0],
455
+ num_frames=video.shape[1],
456
+ num_inference_steps=num_steps,
457
+ decode_chunk_size=8,
458
+ motion_bucket_id=127,
459
+ fps=data_params.output_fps,
460
+ noise_aug_strength=0.02,
461
+ max_guidance_scale=attention_matching_params.get("max_guidance_scale", 2.5),
462
+ generator=generator,
463
+ latents=inversion_noise
464
+ )
465
+ edited_video = [img for sublist in edited_output.frames for img in sublist]
466
+ edited_video_latents = edited_output.latents
467
+
468
+ # callback to replace frames
469
+ videoio.write_video(edited_video, kf_id, resctrl)
470
+
471
+ # save previous frames
472
+ if long_video_params.mode == "skip-interval":
473
+ #previous_latents[kf_id] = edit_controller.get_all_last_latents(data_params.overlay_size)
474
+ previous_last_frames.append( resctrl.callback(edited_video[-1]) )
475
+ if use_latent_noise:
476
+ previous_last_frames_latents[kf_id] = edited_video_latents[:,-1:,:,:,:]
477
+ else:
478
+ previous_last_frames_latents[kf_id] = None
479
+ elif long_video_params.mode == "auto-regressive":
480
+ previous_last_frames.append( resctrl.callback(edited_video[-1]) )
481
+ if use_latent_noise:
482
+ previous_last_frames_latents[kf_id] = edited_video_latents[:,-1:,:,:,:]
483
+ else:
484
+ previous_last_frames_latents[kf_id] = None
485
+
486
+ # save last frames for convenience
487
+ if save_last_frames:
488
+ try:
489
+ fname = os.path.join(output_dir, f"clip_{clip_id}_lastframe_{kf_id}")
490
+ previous_last_frames[kf_id].save(fname+".png")
491
+ if use_latent_noise:
492
+ torch.save(previous_last_frames_latents[kf_id], fname+".pt")
493
+ except Exception as e:
494
+ print(f"failed to save last frame for clip {clip_id}: {e}")
495
+
496
+ if use_attention_matching or use_consistency_attention_control:
497
+ attention_util.register_attention_control(
498
+ pipe.unet,
499
+ edit_controller,
500
+ consistency_edit_controller,
501
+ find_modules=attention_matching_params.registered_modules,
502
+ consistency_find_modules=long_video_params.registered_modules,
503
+ undo=True
504
+ )
505
+ if edit_controller is not None:
506
+ if visualize_attention_store:
507
+ vis_save_path = os.path.join(output_dir, "visualization", f"{kf_id}", f"clip_{clip_id}")
508
+ os.makedirs(vis_save_path, exist_ok=True)
509
+ attention_util.show_avg_difference_maps(
510
+ edit_controller,
511
+ save_path = vis_save_path
512
+ )
513
+ assert visualize_attention_store_steps is not None
514
+ attention_util.show_self_attention(
515
+ edit_controller,
516
+ steps = visualize_attention_store_steps,
517
+ save_path = vis_save_path,
518
+ inversed = False
519
+ )
520
+ edit_controller.delete()
521
+ del edit_controller
522
+ if use_consistency_attention_control:
523
+ if clip_id == 0:
524
+ previous_consistency_edit_controller_list[kf_id] = consistency_edit_controller
525
+ else:
526
+ consistency_edit_controller.delete()
527
+ del consistency_edit_controller
528
+ print(f"previous_consistency_edit_controller_list[{kf_id}]", previous_consistency_edit_controller_list[kf_id].store_dir)
529
+
530
+
531
+ if use_attention_matching:
532
+ del store_controller
533
+
534
+ if use_consistency_attention_control and clip_id == 0:
535
+ previous_consistency_store_controller = consistency_store_controller
536
+
537
+ videoio.close()
538
+
539
+ if use_consistency_attention_control:
540
+ print("consistency_store_controller for clip 0:", previous_consistency_store_controller.store_dir)
541
+ if retrain_motion_lora:
542
+ print("consistency_train_controller for clip 0:", consistency_train_controller.store_dir)
543
+ for kf_id in range(len(previous_consistency_edit_controller_list)):
544
+ print(f"previous_consistency_edit_controller_list[{kf_id}]:", previous_consistency_edit_controller_list[kf_id].store_dir)
545
+
546
+
547
+ if __name__ == "__main__":
548
+ parser = argparse.ArgumentParser()
549
+ parser.add_argument("--config", type=str, default='./configs/svdedit/item2_2.yaml')
550
+ args = parser.parse_args()
551
+ main(**OmegaConf.load(args.config))
552
+
553
+
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
+
570
+
571
+
572
+
573
+
574
+
575
+
576
+
577
+
578
+
579
+
580
+
581
+
582
+
583
+
584
+
585
+
586
+
587
+
588
+
589
+
590
+
591
+
592
+
593
+
594
+
595
+
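The entry point expands the YAML config directly into `main(**...)`, so the config's top-level keys mirror `main()`'s keyword parameters. A minimal driver sketch follows; the config path and the keys listed in the comment are illustrative, not the configuration shipped in this commit:
```py
from omegaconf import OmegaConf
from main import main

# Placeholder config path; equivalent to running `python main.py --config=<path>`.
cfg = OmegaConf.load("upload/config/customize_train.yaml")
# Expected top-level keys include, e.g.:
#   pretrained_model_path, data_params, train_motion_lora_params,
#   sarp_params, attention_matching_params, long_video_params, output_dir, ...
main(**cfg)
```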
mydata/source_and_edits/source.mp4 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5e206a8e21c8e3b51779ad09352fa96c93e1bd2ec8ad05e2042bd554b733d15
3
+ size 1127699
mydata/source_and_edits/white.jpg ADDED
Git LFS Details
  • SHA256: 64bfe81d18ce7936cbac6684e9160852c613e65d70b89375592e25edae119489
  • Pointer size: 130 Bytes
  • Size of remote file: 78.2 kB
req.txt ADDED
@@ -0,0 +1,165 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.30.1
3
+ addict==2.4.0
4
+ aiofiles==23.2.1
5
+ aiohttp==3.9.5
6
+ aiosignal==1.3.1
7
+ albucore==0.0.13
8
+ albumentations==1.4.13
9
+ annotated-types==0.7.0
10
+ antlr4-python3-runtime==4.9.3
11
+ anyio==4.4.0
12
+ appdirs==1.4.4
13
+ async-timeout==4.0.3
14
+ attrs==23.2.0
15
+ av==12.1.0
16
+ black==24.8.0
17
+ brotli==1.1.0
18
+ certifi==2024.6.2
19
+ charset-normalizer==3.3.2
20
+ click==8.1.7
21
+ cloudpickle==3.0.0
22
+ coloredlogs==15.0.1
23
+ contourpy==1.2.1
24
+ cycler==0.12.1
25
+ cython==3.0.11
26
+ decorator==4.4.2
27
+ decord==0.6.0
28
+ defusedxml==0.7.1
29
+ diffusers==0.25.1
30
+ easydict==1.13
31
+ einops==0.8.0
32
+ et-xmlfile==1.1.0
33
+ eval-type-backport==0.2.0
34
+ exceptiongroup==1.2.2
35
+ fairscale==0.4.13
36
+ fastapi==0.112.0
37
+ ffmpy==0.3.2
38
+ filelock==3.14.0
39
+ fire==0.6.0
40
+ flatbuffers==24.3.25
41
+ fonttools==4.53.1
42
+ frozenlist==1.4.1
43
+ fsspec==2024.6.0
44
+ ftfy==6.2.0
45
+ fvcore==0.1.5.post20221221
46
+ gradio==4.41.0
47
+ gradio-client==1.3.0
48
+ grpcio==1.64.1
49
+ h11==0.14.0
50
+ httpcore==1.0.5
51
+ httpx==0.27.0
52
+ huggingface-hub==0.23.3
53
+ humanfriendly==10.0
54
+ hydra-core==1.3.2
55
+ idna==3.7
56
+ imageio==2.34.1
57
+ imageio-ffmpeg==0.5.1
58
+ importlib-metadata==7.1.0
59
+ importlib-resources==6.4.2
60
+ insightface==0.7.3
61
+ iopath==0.1.9
62
+ jinja2==3.1.4
63
+ joblib==1.4.2
64
+ kiwisolver==1.4.5
65
+ kornia==0.7.2
66
+ kornia-rs==0.1.3
67
+ lazy-loader==0.4
68
+ lightning-utilities==0.3.0
69
+ markdown==3.6
70
+ markdown-it-py==3.0.0
71
+ markupsafe==2.1.5
72
+ matplotlib==3.9.2
73
+ mdurl==0.1.2
74
+ moviepy==1.0.3
75
+ mpmath==1.3.0
76
+ multidict==6.0.5
77
+ mutagen==1.47.0
78
+ mypy-extensions==1.0.0
79
+ networkx==3.2.1
80
+ ninja==1.11.1.1
81
+ numpy==1.26.4
82
+ omegaconf==2.3.0
83
+ onnx==1.16.2
84
+ onnxruntime==1.18.1
85
+ open-clip-torch==2.24.0
86
+ opencv-python==4.10.0.84
87
+ opencv-python-headless==4.10.0.84
88
+ openpyxl==3.1.5
89
+ orjson==3.10.7
90
+ packaging==24.0
91
+ pandas==2.2.2
92
+ pathspec==0.12.1
93
+ peft==0.11.1
94
+ pillow==10.3.0
95
+ platformdirs==4.2.2
96
+ portalocker==2.10.1
97
+ prettytable==3.11.0
98
+ proglog==0.1.10
99
+ protobuf==4.25.3
100
+ psutil==5.9.8
101
+ pyasn1==0.6.0
102
+ av==12.1.0
103
+ pycocotools==2.0.8
104
+ pycryptodomex==3.20.0
105
+ pydantic==2.8.2
106
+ pydantic-core==2.20.1
107
+ pydub==0.25.1
108
+ pygments==2.18.0
109
+ pyparsing==3.1.2
110
+ pysocks==1.7.1
111
+ python-dateutil==2.9.0.post0
112
+ python-multipart==0.0.9
113
+ pytz==2024.1
114
+ pyyaml==6.0.1
115
+ regex==2023.12.25
116
+ requests==2.32.3
117
+ rich==13.7.1
118
+ rpds-py==0.18.1
119
+ ruff==0.6.0
120
+ safetensors==0.4.3
121
+ scenedetect==0.6.4
122
+ scikit-image==0.24.0
123
+ scikit-learn==1.5.1
124
+ scipy==1.13.1
125
+ segment-anything==1.0
126
+ semantic-version==2.10.0
127
+ sentencepiece==0.1.99
128
+ sentry-sdk==2.5.1
129
+ shellingham==1.5.4
130
+ six==1.16.0
131
+ smmap==5.0.1
132
+ sniffio==1.3.1
133
+ soupsieve==2.5
134
+ starlette==0.37.2
135
+ supervision==0.22.0
136
+ sympy==1.12.1
137
+ tabulate==0.9.0
138
+ tensorboard==2.17.0
139
+ tensorboard-data-server==0.7.2
140
+ tensorboardx==2.6.2.2
141
+ termcolor==2.4.0
142
+ threadpoolctl==3.5.0
143
+ tifffile==2024.8.10
144
+ timm==1.0.8
145
+ tokenizers==0.15.2
146
+ tomli==2.0.1
147
+ tomlkit==0.12.0
148
+ toolz==0.12.1
149
+ torchmetrics==0.11.4
150
+ tqdm==4.66.4
151
+ transformers==4.36.2
152
+ typer==0.12.3
153
+ typing-extensions==4.12.2
154
+ tzdata==2024.1
155
+ urllib3==2.2.1
156
+ uvicorn==0.30.6
157
+ wcwidth==0.2.13
158
+ websockets==12.0
159
+ werkzeug==3.0.3
160
+ xformers==0.0.26.post1
161
+ yacs==0.1.8
162
+ yapf==0.40.2
163
+ yarl==1.9.4
164
+ yt-dlp==2024.8.6
165
+ zipp==3.19.2
requirements.txt ADDED
@@ -0,0 +1,176 @@
+ --extra-index-url https://download.pytorch.org/whl/cu121
+
+ absl-py==2.1.0
+ accelerate==0.30.1
+ addict==2.4.0
+ aiofiles==23.2.1
+ aiohttp==3.9.5
+ aiosignal==1.3.1
+ albucore==0.0.13
+ albumentations==1.4.13
+ annotated-types==0.7.0
+ antlr4-python3-runtime==4.9.3
+ anyio==4.4.0
+ appdirs==1.4.4
+ async-timeout==4.0.3
+ attrs==23.2.0
+ av==12.1.0
+ black==24.8.0
+ Brotli==1.1.0
+ certifi==2024.6.2
+ charset-normalizer==3.3.2
+ click==8.1.7
+ cloudpickle==3.0.0
+ colorama==0.4.6
+ coloredlogs==15.0.1
+ contourpy==1.2.1
+ cycler==0.12.1
+ Cython==3.0.11
+ decorator==4.4.2
+ decord==0.6.0
+ defusedxml==0.7.1
+ diffusers>=0.27.0
+ easydict==1.13
+ einops==0.8.0
+ et-xmlfile==1.1.0
+ eval_type_backport==0.2.0
+ exceptiongroup==1.2.2
+ fairscale==0.4.13
+ fastapi==0.112.0
+ ffmpy==0.3.2
+ filelock==3.14.0
+ fire==0.6.0
+ flatbuffers==24.3.25
+ fonttools==4.53.1
+ frozenlist==1.4.1
+ fsspec==2024.6.0
+ ftfy==6.2.0
+ fvcore==0.1.5.post20221221
+ gradio==4.41.0
+ gradio_client==1.3.0
+ grpcio==1.64.1
+ h11==0.14.0
+ httpcore==1.0.5
+ httpx==0.27.0
+ huggingface-hub==0.23.3
+ humanfriendly==10.0
+ hydra-core==1.3.2
+ idna==3.7
+ imageio==2.34.1
+ imageio-ffmpeg==0.5.1
+ importlib_metadata==7.1.0
+ importlib_resources==6.4.2
+ insightface==0.7.3
+ intel-openmp==2021.4.0
+ iopath==0.1.9
+ Jinja2==3.1.4
+ joblib==1.4.2
+ kiwisolver==1.4.5
+ kornia==0.7.2
+ kornia_rs==0.1.3
+ lazy_loader==0.4
+ lightning-utilities==0.3.0
+ Markdown==3.6
+ markdown-it-py==3.0.0
+ MarkupSafe==2.1.5
+ matplotlib==3.9.2
+ mdurl==0.1.2
+ mkl==2021.4.0
+ moviepy==1.0.3
+ mpmath==1.3.0
+ multidict==6.0.5
+ mutagen==1.47.0
+ mypy-extensions==1.0.0
+ networkx==3.2.1
+ ninja==1.11.1.1
+ numpy==1.26.4
+ omegaconf==2.3.0
+ onnx==1.16.2
+ onnxruntime==1.18.1
+ open-clip-torch==2.24.0
+ opencv-python==4.10.0.84
+ opencv-python-headless==4.10.0.84
+ openpyxl==3.1.5
+ orjson==3.10.7
+ packaging==24.0
+ pandas==2.2.2
+ pathspec==0.12.1
+ peft==0.11.1
+ pillow==10.3.0
+ platformdirs==4.2.2
+ portalocker==2.10.1
+ prettytable==3.11.0
+ proglog==0.1.10
+ protobuf==4.25.3
+ psutil==5.9.8
+ pyasn1==0.6.0
+ pycocotools==2.0.8
+ pycryptodomex==3.20.0
+ pydantic==2.8.2
+ pydantic_core==2.20.1
+ pydub==0.25.1
+ Pygments==2.18.0
+ pyparsing==3.1.2
+ pyreadline3==3.5.4
+ PySocks==1.7.1
+ python-dateutil==2.9.0.post0
+ python-multipart==0.0.9
+ pytz==2024.1
+ PyYAML==6.0.1
+ regex==2023.12.25
+ requests==2.32.3
+ rich==13.7.1
+ rpds-py==0.18.1
+ ruff==0.6.0
+ safetensors==0.4.3
+ scenedetect==0.6.4
+ scikit-image==0.24.0
+ scikit-learn==1.5.1
+ scipy==1.13.1
+ segment-anything==1.0
+ semantic-version==2.10.0
+ sentencepiece==0.1.99
+ sentry-sdk==2.5.1
+ setuptools==69.5.1
+ shellingham==1.5.4
+ six==1.16.0
+ smmap==5.0.1
+ sniffio==1.3.1
+ soupsieve==2.5
+ starlette==0.37.2
+ supervision==0.22.0
+ sympy==1.12.1
+ tabulate==0.9.0
+ tbb==2021.13.1
+ tensorboard==2.17.0
+ tensorboard-data-server==0.7.2
+ tensorboardX==2.6.2.2
+ termcolor==2.4.0
+ threadpoolctl==3.5.0
+ tifffile==2024.8.10
+ timm==1.0.8
+ tokenizers==0.15.2
+ tomli==2.0.1
+ tomlkit==0.12.0
+ toolz==0.12.1
+ torch==2.3.0+cu121
+ torchaudio==2.3.0+cu121
+ torchmetrics==0.11.4
+ torchvision==0.18.0+cu121
+ tqdm==4.66.4
+ transformers==4.36.2
+ typer==0.12.3
+ typing_extensions==4.12.2
+ tzdata==2024.1
+ urllib3==2.2.1
+ uvicorn==0.30.6
+ wcwidth==0.2.13
+ websockets==12.0
+ Werkzeug==3.0.3
+ wheel==0.43.0
+ xformers==0.0.26.post1
+ yacs==0.1.8
+ yapf==0.40.2
+ yarl==1.9.4
+ yt-dlp==2024.8.6
+ zipp==3.19.2
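
Compared with req.txt, requirements.txt adds the PyTorch cu121 extra index on its first line and pins torch==2.3.0+cu121, torchvision==0.18.0+cu121, and torchaudio==2.3.0+cu121, along with a few platform helpers (colorama, pyreadline3, mkl, tbb). A small sanity-check sketch, assuming the environment was created with pip install -r requirements.txt, to confirm the cu121 wheels were actually resolved:

import torch

# torch==2.3.0+cu121 should report CUDA 12.1 as its build toolkit
assert torch.__version__.startswith("2.3.0"), torch.__version__
assert torch.version.cuda == "12.1", torch.version.cuda
print("CUDA runtime available:", torch.cuda.is_available())
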
setup.py ADDED
@@ -0,0 +1,73 @@
+ import time
+ from setuptools import setup, find_packages
+ import sys
+ import os
+ import os.path as osp
+
+ WORK_DIR = "i2vedit"
+ NAME = "i2vedit"
+ author = "wenqi.oywq"
+ author_email = '[email protected]'
+
+ version_file = 'i2vedit/version.py'
+
+ def get_hash():
+     if False:#os.path.exists('.git'):
+         sha = get_git_hash()[:7]
+     # currently ignore this
+     # elif os.path.exists(version_file):
+     #     try:
+     #         from basicsr.version import __version__
+     #         sha = __version__.split('+')[-1]
+     #     except ImportError:
+     #         raise ImportError('Unable to get git version')
+     else:
+         sha = 'unknown'
+
+     return sha
+
+ def get_version():
+     with open(version_file, 'r') as f:
+         exec(compile(f.read(), version_file, 'exec'))
+     return locals()['__version__']
+
+ def write_version_py():
+     content = """# GENERATED VERSION FILE
+ # TIME: {}
+ __version__ = '{}'
+ __gitsha__ = '{}'
+ version_info = ({})
+ """
+     sha = get_hash()
+     with open('VERSION', 'r') as f:
+         SHORT_VERSION = f.read().strip()
+     VERSION_INFO = ', '.join([x if x.isdigit() else f'"{x}"' for x in SHORT_VERSION.split('.')])
+
+     version_file_str = content.format(time.asctime(), SHORT_VERSION, sha, VERSION_INFO)
+     with open(version_file, 'w') as f:
+         f.write(version_file_str)
+
+ REQUIRE = [
+ ]
+
+ def install_requires(REQUIRE):
+     for item in REQUIRE:
+         os.system(f'pip install {item}')
+
+ write_version_py()
+ install_requires(REQUIRE)
+ setup(
+     name=NAME,
+     packages=find_packages(),
+     version=get_version(),
+     description="image-to-video editing",
+     author=author,
+     author_email=author_email,
+     keywords=["image-to-video editing"],
+     install_requires=[],
+     include_package_data=False,
+     exclude_package_data={'':['.gitignore','README.md','./configs','./outputs'],
+                           },
+     entry_points={'console_scripts': ['pyinstrument = pyinstrument.__main__:main']},
+     zip_safe=False,
+ )
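
For reference, write_version_py() renders i2vedit/version.py from the top-level VERSION file before setup() runs, and because the .git check is hard-coded to False the git sha is always recorded as 'unknown'. Assuming VERSION holds 0.0.1 (an illustrative value, as is the timestamp), the generated file would look roughly like:

# GENERATED VERSION FILE
# TIME: Mon Jan  1 00:00:00 2024
__version__ = '0.0.1'
__gitsha__ = 'unknown'
version_info = (0, 0, 1)
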