FLUX.MF-Lightning-Fast-Upscaler

Running on Zero

App Files Files Community

LPX55 committed on Mar 7

Commit

a102a01

verified ·

1 Parent(s): 1b616c4

Update optimized.py

Browse files

Files changed (1) hide show

optimized.py +38 -17

optimized.py CHANGED Viewed

@@ -11,26 +11,38 @@ from accelerate import dispatch_model, infer_auto_device_map
 def self_attention_slicing(module, slice_size=3):
     """Modified from Diffusers' original for Flux compatibility"""
     def sliced_attention(*args, **kwargs):
-        return module(*args, **kwargs)  # Remove dummy implementation (see pipeline_flux_controlnet.py)
-huggingface_token = os.getenv("HUGGINFACE_TOKEN")
-# good_vae = AutoencoderKL.from_pretrained(
-#     "black-forest-labs/FLUX.1-dev",
-#     subfolder="vae",
-#     torch_dtype=torch.bfloat16,
-#     use_safetensors=True,
-#     device_map=None,  # Disable automatic mapping
-#     token=huggingface_token
-# )
 good_vae = AutoencoderKL.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="vae",
     torch_dtype=torch.bfloat16,
     use_safetensors=True,
-    token=huggingface_token  # Fix typo in variable name
 )
 # 2. Main Pipeline Initialization WITH VAE SCOPE
 pipe = FluxControlNetPipeline.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
@@ -47,15 +59,24 @@ pipe = FluxControlNetPipeline.from_pretrained(
 # 3. Strict Order for Optimization Steps
 # A. Apply CPU Offloading FIRST
-pipe.enable_sequential_cpu_offload()
 # B. Enable Memory Optimizations
-pipe.enable_vae_tiling()
-pipe.enable_xformers_memory_efficient_attention()
 # C. Unified Precision Handling
-for comp in [pipe.unet, pipe.vae, pipe.controlnet]:
-    comp.to(dtype=torch.bfloat16)
 print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
 @spaces.GPU

 def self_attention_slicing(module, slice_size=3):
     """Modified from Diffusers' original for Flux compatibility"""
     def sliced_attention(*args, **kwargs):
+        if "dim" in kwargs:
+            dim = kwargs["dim"]
+        else:
+            dim = 1
+        if slice_size == "auto":
+            # Automatic slicing based on Flux architecture
+            return module(*args, **kwargs)
+        output = torch.cat([
+            module(
+                *[arg[:, :, i:i+slice_size] if i == dim else arg
+                for arg in args],
+                **{k: v[:, :, i:i+slice_size] if k == dim else v
+                   for k,v in kwargs.items()}
+            )
+            for i in range(0, args[0].shape[dim], slice_size)
+        ], dim=dim)
+        return output
+    return sliced_attention
+huggingface_token = os.getenv("HUGGINFACE_TOKEN")
 good_vae = AutoencoderKL.from_pretrained(
     "black-forest-labs/FLUX.1-dev",
     subfolder="vae",
     torch_dtype=torch.bfloat16,
     use_safetensors=True,
+    device_map=None,  # Disable automatic mapping
+    token=huggingface_token
 )
 # 2. Main Pipeline Initialization WITH VAE SCOPE
 pipe = FluxControlNetPipeline.from_pretrained(
     "LPX55/FLUX.1-merged_uncensored",
 # 3. Strict Order for Optimization Steps
 # A. Apply CPU Offloading FIRST
+pipe.enable_sequential_cpu_offload()  # No arguments for new API
+# 2. Then apply custom VAE slicing
+if getattr(pipe, "vae", None) is not None:
+    # Method 1: Use official implementation if available
+    try:
+        pipe.vae.enable_slicing()
+    except AttributeError:
+        # Method 2: Apply manual slicing for Flux compatibility (see pipeline_flux_controlnet.py)
+        pipe.vae.decode = self_attention_slicing(pipe.vae.decode, 2)
+pipe.enable_attention_slicing(1)
 # B. Enable Memory Optimizations
+# pipe.enable_vae_tiling()
+# pipe.enable_xformers_memory_efficient_attention()
 # C. Unified Precision Handling
+# for comp in [pipe.unet, pipe.vae, pipe.controlnet]:
+#     comp.to(dtype=torch.bfloat16)
 print(f"VRAM used: {torch.cuda.memory_allocated()/1e9:.2f}GB")
 @spaces.GPU