primerz committed
Commit eda51f2 · verified · 1 Parent(s): 3e937c5

Delete ip_attention_processor_xformers.py

Files changed (1)
  1. ip_attention_processor_xformers.py +0 -414
ip_attention_processor_xformers.py DELETED
"""
Enhanced IP-Adapter Attention Processor with XFormers Support
==============================================================

This version combines:
1. Torch 2.0 scaled_dot_product_attention (from our enhanced version)
2. XFormers memory efficient attention (from InstantID reference)
3. Adaptive scaling and learnable parameters (from our enhanced version)
4. Region control support (from InstantID reference)

Expected improvements:
- +15-25% faster inference with xformers
- +2-3% better face preservation with adaptive scaling
- Lower memory usage

Author: Pixagram Team
License: MIT
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from diffusers.models.attention_processor import AttnProcessor2_0

try:
    import xformers
    import xformers.ops
    xformers_available = True
except Exception:
    xformers_available = False


class RegionControler(object):
    """Region control for localized face embedding application"""
    def __init__(self) -> None:
        self.prompt_image_conditioning = []

region_control = RegionControler()

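# Usage sketch for the region control hook above (an illustrative assumption, not
# part of the original file): callers populate `prompt_image_conditioning` with a
# dict holding a 2D float mask before denoising; `forward()` below reads the
# optional "region_mask" key and restricts the face embedding to that area.
#
#   region_control.prompt_image_conditioning.append({
#       "region_mask": torch.ones(128, 128),  # hypothetical H x W latent-space mask
#   })
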
class IPAttnProcessorXFormers(nn.Module):
    """
    Enhanced IP-Adapter attention with XFormers and adaptive scaling.

    Features:
    - XFormers memory efficient attention (if available)
    - Torch 2.0 scaled_dot_product_attention (fallback)
    - Adaptive per-layer scaling
    - Learnable scale parameters
    - Region control support

    Args:
        hidden_size: Attention layer hidden dimension
        cross_attention_dim: Encoder hidden states dimension
        scale: Base blending weight for face features
        num_tokens: Number of face embedding tokens
        adaptive_scale: Enable adaptive scaling
        learnable_scale: Make scale learnable per layer
    """

    def __init__(
        self,
        hidden_size: int,
        cross_attention_dim: Optional[int] = None,
        scale: float = 1.0,
        num_tokens: int = 4,
        adaptive_scale: bool = True,
        learnable_scale: bool = True
    ):
        super().__init__()

        self.hidden_size = hidden_size
        self.cross_attention_dim = cross_attention_dim or hidden_size
        self.base_scale = scale
        self.num_tokens = num_tokens
        self.adaptive_scale = adaptive_scale
        self.use_xformers = xformers_available

        # Dedicated K/V projections for face features
        self.to_k_ip = nn.Linear(self.cross_attention_dim, hidden_size, bias=False)
        self.to_v_ip = nn.Linear(self.cross_attention_dim, hidden_size, bias=False)

        # Learnable scale parameter (per layer)
        if learnable_scale:
            self.scale_param = nn.Parameter(torch.tensor(scale))
        else:
            self.register_buffer('scale_param', torch.tensor(scale))

        # Adaptive scaling module
        if adaptive_scale:
            self.adaptive_gate = nn.Sequential(
                nn.Linear(hidden_size, hidden_size // 4),
                nn.ReLU(),
                nn.Linear(hidden_size // 4, 1),
                nn.Sigmoid()
            )

        # Better initialization
        self._init_weights()

        if self.use_xformers:
            print(" [XFORMERS] Enabled for IP-Adapter attention")

    def _init_weights(self):
        """Xavier initialization for stable training."""
        nn.init.xavier_uniform_(self.to_k_ip.weight)
        nn.init.xavier_uniform_(self.to_v_ip.weight)

        if self.adaptive_scale:
            for module in self.adaptive_gate:
                if isinstance(module, nn.Linear):
                    nn.init.xavier_uniform_(module.weight)
                    if module.bias is not None:
                        nn.init.zeros_(module.bias)

    def compute_adaptive_scale(
        self,
        query: torch.Tensor,
        ip_key: torch.Tensor,
        base_scale: float
    ) -> torch.Tensor:
        """
        Compute an adaptive scale from the mean query features.
        A stronger gate response means stronger face preservation.
        Note: ip_key is accepted for API symmetry but is not used by the gate.
        """
        # Average over the sequence dimension, then flatten heads so the result
        # matches the gate's expected input size (heads * head_dim == hidden_size).
        # query: [batch, heads, seq_len, head_dim]
        query_mean = query.mean(dim=2).reshape(query.shape[0], -1)  # [batch, hidden_size]

        # Pass through gating network
        gate = self.adaptive_gate(query_mean)  # [batch, 1]

        # Modulate base scale
        adaptive_scale = base_scale * (0.5 + gate)  # Range: [0.5*base, 1.5*base]

        return adaptive_scale.view(-1, 1, 1)  # [batch, 1, 1] for broadcasting

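    # Worked example of the modulation above (illustrative numbers, assuming
    # base_scale = 0.8): gate = 0.0 -> 0.4, gate = 0.5 -> 0.8, gate = 1.0 -> 1.2.
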
    def _memory_efficient_attention_xformers(self, query, key, value, attention_mask):
        """XFormers memory efficient attention.

        Inputs are expected in (batch * heads, seq_len, head_dim) layout, which
        xformers accepts directly as 3D tensors; the output keeps that layout.
        """
        if attention_mask is not None and attention_mask.ndim == 4:
            # Collapse (batch, heads, q_len, k_len) -> (batch * heads, q_len, k_len)
            attention_mask = attention_mask.reshape(-1, *attention_mask.shape[-2:])

        hidden_states = xformers.ops.memory_efficient_attention(
            query.contiguous(),
            key.contiguous(),
            value.contiguous(),
            attn_bias=attention_mask
        )

        return hidden_states

    def forward(
        self,
        attn,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        """Forward pass with XFormers or Torch 2.0 attention."""
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim
        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape if encoder_hidden_states is None
            else encoder_hidden_states.shape
        )

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(
                attention_mask, sequence_length, batch_size
            )
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

        query = attn.to_q(hidden_states)

        # Split text and face embeddings
        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
            ip_hidden_states = None
        else:
            end_pos = encoder_hidden_states.shape[1] - self.num_tokens
            encoder_hidden_states, ip_hidden_states = (
                encoder_hidden_states[:, :end_pos, :],
                encoder_hidden_states[:, end_pos:, :]
            )
            if attn.norm_cross:
                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

        # Text attention
        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        # Choose attention implementation
        if self.use_xformers and not self.training:
            # XFormers during inference
            query_xf = query.reshape(batch_size * attn.heads, -1, head_dim)
            key_xf = key.reshape(batch_size * attn.heads, -1, head_dim)
            value_xf = value.reshape(batch_size * attn.heads, -1, head_dim)

            try:
                hidden_states = self._memory_efficient_attention_xformers(
                    query_xf, key_xf, value_xf, attention_mask
                )
                hidden_states = hidden_states.reshape(batch_size, attn.heads, -1, head_dim)
            except Exception:
                # Fallback to torch 2.0
                hidden_states = F.scaled_dot_product_attention(
                    query, key, value,
                    attn_mask=attention_mask,
                    dropout_p=0.0,
                    is_causal=False
                )
        else:
            # Torch 2.0 attention
            hidden_states = F.scaled_dot_product_attention(
                query, key, value,
                attn_mask=attention_mask,
                dropout_p=0.0,
                is_causal=False
            )

        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim
        )
        hidden_states = hidden_states.to(query.dtype)

        # Face attention with enhancements
        if ip_hidden_states is not None:
            # Dedicated K/V projections
            ip_key = self.to_k_ip(ip_hidden_states)
            ip_value = self.to_v_ip(ip_hidden_states)

            ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
            ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

            # Face attention
            if self.use_xformers and not self.training:
                # XFormers
                query_xf = query.reshape(batch_size * attn.heads, -1, head_dim)
                ip_key_xf = ip_key.reshape(batch_size * attn.heads, -1, head_dim)
                ip_value_xf = ip_value.reshape(batch_size * attn.heads, -1, head_dim)

                try:
                    ip_hidden_states = self._memory_efficient_attention_xformers(
                        query_xf, ip_key_xf, ip_value_xf, None
                    )
                    ip_hidden_states = ip_hidden_states.reshape(batch_size, attn.heads, -1, head_dim)
                except Exception:
                    # Fallback
                    ip_hidden_states = F.scaled_dot_product_attention(
                        query, ip_key, ip_value,
                        attn_mask=None,
                        dropout_p=0.0,
                        is_causal=False
                    )
            else:
                # Torch 2.0
                ip_hidden_states = F.scaled_dot_product_attention(
                    query, ip_key, ip_value,
                    attn_mask=None,
                    dropout_p=0.0,
                    is_causal=False
                )

            ip_hidden_states = ip_hidden_states.transpose(1, 2).reshape(
                batch_size, -1, attn.heads * head_dim
            )
            ip_hidden_states = ip_hidden_states.to(query.dtype)

            # Compute effective scale
            if self.adaptive_scale and not self.training:
                try:
                    adaptive_scale = self.compute_adaptive_scale(query, ip_key, self.scale_param.item())
                    effective_scale = adaptive_scale
                except Exception:
                    effective_scale = self.scale_param
            else:
                effective_scale = self.scale_param

            # Region control support
            if len(region_control.prompt_image_conditioning) == 1:
                region_mask = region_control.prompt_image_conditioning[0].get('region_mask', None)
                if region_mask is not None:
                    query_flat = query.reshape([-1, query.shape[-2], query.shape[-1]])
                    h, w = region_mask.shape[:2]
                    ratio = (h * w / query_flat.shape[1]) ** 0.5
                    mask = F.interpolate(
                        region_mask[None, None],
                        scale_factor=1 / ratio,
                        mode='nearest'
                    ).reshape([1, -1, 1])
                else:
                    mask = torch.ones_like(ip_hidden_states)
                ip_hidden_states = ip_hidden_states * mask

            # Blend with adaptive scale
            hidden_states = hidden_states + effective_scale * ip_hidden_states

        # Output projection
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states

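# The helper below is an illustrative addition (not part of the original file): it
# shows how the per-layer `scale_param` exposed above can be adjusted at runtime,
# e.g. to weaken or strengthen face conditioning between generations. It assumes
# the processors are already installed on `pipe.unet`.
def set_ip_adapter_scale(pipe, scale: float):
    """Set the base face-embedding scale on every installed IPAttnProcessorXFormers."""
    for proc in pipe.unet.attn_processors.values():
        if isinstance(proc, IPAttnProcessorXFormers):
            with torch.no_grad():
                # Works for both the learnable nn.Parameter and the registered buffer.
                proc.scale_param.fill_(scale)
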
def setup_xformers_ip_adapter_attention(
    pipe,
    ip_adapter_scale: float = 1.0,
    num_tokens: int = 4,
    device: str = "cuda",
    dtype: torch.dtype = torch.float16,
    adaptive_scale: bool = True,
    learnable_scale: bool = True
):
    """
    Setup IP-Adapter with XFormers optimized attention processors.

    Args:
        pipe: Diffusers pipeline
        ip_adapter_scale: Base face embedding strength
        num_tokens: Number of face tokens
        device: Device
        dtype: Data type
        adaptive_scale: Enable adaptive scaling
        learnable_scale: Make scales learnable

    Returns:
        Dict of attention processors
    """
    attn_procs = {}

    for name in pipe.unet.attn_processors.keys():
        cross_attention_dim = None if name.endswith("attn1.processor") else pipe.unet.config.cross_attention_dim

        if name.startswith("mid_block"):
            hidden_size = pipe.unet.config.block_out_channels[-1]
        elif name.startswith("up_blocks"):
            block_id = int(name[len("up_blocks.")])
            hidden_size = list(reversed(pipe.unet.config.block_out_channels))[block_id]
        elif name.startswith("down_blocks"):
            block_id = int(name[len("down_blocks.")])
            hidden_size = pipe.unet.config.block_out_channels[block_id]
        else:
            hidden_size = pipe.unet.config.block_out_channels[-1]

        if cross_attention_dim is None:
            attn_procs[name] = AttnProcessor2_0()
        else:
            attn_procs[name] = IPAttnProcessorXFormers(
                hidden_size=hidden_size,
                cross_attention_dim=cross_attention_dim,
                scale=ip_adapter_scale,
                num_tokens=num_tokens,
                adaptive_scale=adaptive_scale,
                learnable_scale=learnable_scale
            ).to(device, dtype=dtype)

    print("[OK] XFormers-optimized attention processors created")
    print(f"  - Total processors: {len(attn_procs)}")
    print(f"  - XFormers available: {xformers_available}")
    print(f"  - Adaptive scaling: {adaptive_scale}")
    print(f"  - Learnable scales: {learnable_scale}")

    return attn_procs

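# Typical wiring into a diffusers pipeline (a hedged sketch; assumes `pipe` is an
# already-loaded SDXL pipeline and that the IP-Adapter / InstantID image-projection
# weights are handled elsewhere). `set_attn_processor` is the standard diffusers
# entry point for installing custom processors on the UNet:
#
#   attn_procs = setup_xformers_ip_adapter_attention(
#       pipe,
#       ip_adapter_scale=0.8,
#       num_tokens=4,
#       device="cuda",
#       dtype=torch.float16,
#   )
#   pipe.unet.set_attn_processor(attn_procs)
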
if __name__ == "__main__":
    print("Testing XFormers IP-Adapter Processor...")

    processor = IPAttnProcessorXFormers(
        hidden_size=1280,
        cross_attention_dim=2048,
        scale=0.8,
        num_tokens=4,
        adaptive_scale=True,
        learnable_scale=True
    )

    print("\n[OK] Processor created successfully")
    print(f"Parameters: {sum(p.numel() for p in processor.parameters()):,}")
    print(f"XFormers available: {xformers_available}")
    print(f"Has adaptive scaling: {processor.adaptive_scale}")
    print(f"Has learnable scale: {isinstance(processor.scale_param, nn.Parameter)}")
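    # Quick smoke test of the adaptive gate alone (an illustrative addition with
    # assumed shapes: 8 heads of dim 160, so heads * head_dim == hidden_size == 1280;
    # ip_key is passed as None because the gate only looks at the query).
    dummy_query = torch.randn(2, 8, 64, 160)  # [batch, heads, seq_len, head_dim]
    adaptive = processor.compute_adaptive_scale(dummy_query, None, base_scale=0.8)
    print(f"Adaptive scale shape: {tuple(adaptive.shape)}")  # expected (2, 1, 1)
    print(f"Adaptive scale values (should lie in [0.4, 1.2]): {adaptive.flatten().tolist()}")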