Add model directory

- models/ACMDM.py +437 -0
- models/ACMDM_ControlNet.py +314 -0
- models/ACMDM_NoisyPrefix_AR.py +556 -0
- models/ACMDM_Prefix_AR.py +434 -0
- models/AE_2D_Causal.py +245 -0
- models/AE_2D_NonCausal.py +228 -0
- models/AE_Mesh.py +601 -0
- models/LengthEstimator.py +40 -0
- models/ROPE.py +91 -0
- models/__pycache__/ACMDM.cpython-310.pyc +0 -0
- models/__pycache__/ACMDM.cpython-313.pyc +0 -0
- models/__pycache__/AE_2D_Causal.cpython-310.pyc +0 -0
- models/__pycache__/AE_2D_Causal.cpython-313.pyc +0 -0
- models/__pycache__/LengthEstimator.cpython-310.pyc +0 -0
- models/__pycache__/ROPE.cpython-310.pyc +0 -0
models/ACMDM.py
ADDED
@@ -0,0 +1,437 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import math
from functools import partial
from timm.models.vision_transformer import Attention
from models.ROPE import RopeND
from utils.eval_utils import eval_decorator
from utils.train_utils import lengths_to_mask
from diffusions.diffusion import create_diffusion
from diffusions.transport import create_transport, Sampler

#################################################################################
#                                     ACMDM                                     #
#################################################################################
class ACMDM(nn.Module):
    def __init__(self, input_dim, cond_mode, latent_dim=256, ff_size=1024, num_layers=8,
                 num_heads=4, dropout=0, clip_dim=512,
                 diff_model='Flow', cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 22), stride_size=(1, 22), num_joint=22,
                 clip_version='ViT-B/32', **kargs):
        super(ACMDM, self).__init__()

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.clip_dim = clip_dim
        self.dropout = dropout

        self.cond_mode = cond_mode
        self.cond_drop_prob = cond_drop_prob

        if self.cond_mode == 'action':
            assert 'num_actions' in kargs
            self.num_actions = kargs.get('num_actions', 1)
            self.encode_action = partial(F.one_hot, num_classes=self.num_actions)
        # --------------------------------------------------------------------------
        # Diffusion
        self.diff_model = diff_model
        if self.diff_model == 'Flow':
            self.train_diffusion = create_transport()  # default to linear, velocity prediction
            self.gen_diffusion = Sampler(self.train_diffusion)
        else:
            self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
            self.gen_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
        # --------------------------------------------------------------------------
        # ACMDM
        print('Loading ACMDM...')
        self.t_embedder = TimestepEmbedder(self.latent_dim)
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.patches_per_frame = (num_joint - patch_size[1]) // stride_size[1] + 1

        # Patchification
        self.x_embedder = nn.Conv2d(self.input_dim, self.latent_dim, kernel_size=self.patch_size, stride=self.stride_size, bias=True)

        # Positional Encoding
        max_length = max_length * self.patches_per_frame
        self.max_lens = [max_length]
        self.rope = RopeND(nd=1, nd_split=[1], max_lens=self.max_lens)
        self.position_ids_precompute = torch.arange(max_length).unsqueeze(0)

        self.ACMDMTransformer = nn.ModuleList([
            ACMDMTransBlock(self.latent_dim, num_heads, mlp_size=ff_size, rope=self.rope, qk_norm=True) for _ in range(num_layers)
        ])

        if self.cond_mode == 'text':
            self.y_embedder = nn.Linear(self.clip_dim, self.latent_dim)
        elif self.cond_mode == 'action':
            self.y_embedder = nn.Linear(self.num_actions, self.latent_dim)
        elif self.cond_mode == 'uncond':
            self.y_embedder = nn.Identity()
        else:
            raise KeyError("Unsupported condition mode!!!")

        self.final_layer = FinalLayer(self.latent_dim, self.input_dim, patch_size=patch_size, stride_size=stride_size, patches=self.patches_per_frame, joint=num_joint)

        self.initialize_weights()

        if self.cond_mode == 'text':
            print('Loading CLIP...')
            self.clip_version = clip_version
            self.clip_model = self.load_and_freeze_clip(clip_version)

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in ACMDM blocks:
        for block in self.ACMDMTransformer:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def load_and_freeze_clip(self, clip_version):
        clip_model, clip_preprocess = clip.load(clip_version, device='cpu', jit=False)
        assert torch.cuda.is_available()
        clip.model.convert_weights(clip_model)

        clip_model.eval()
        for p in clip_model.parameters():
            p.requires_grad = False
        return clip_model

    def encode_text(self, raw_text):
        device = next(self.parameters()).device
        text = clip.tokenize(raw_text, truncate=True).to(device)
        feat_clip_text = self.clip_model.encode_text(text).float()
        return feat_clip_text

    def mask_cond(self, cond, force_mask=False):
        bs, d = cond.shape
        if force_mask:
            return torch.zeros_like(cond)
        elif self.training and self.cond_drop_prob > 0.:
            mask = torch.bernoulli(torch.ones(bs, device=cond.device) * self.cond_drop_prob).view(bs, 1)
            return cond * (1. - mask)
        else:
            return cond

    def forward(self, x, t, conds, attention_mask, force_mask=False):
        # x: (b, input_dim, l, num_joint) -> patch tokens (b, l * patches_per_frame, latent_dim)
        t = self.t_embedder(t, dtype=x.dtype)
        conds = self.mask_cond(conds, force_mask=force_mask)
        x = self.x_embedder(x)
        x = x.flatten(2).transpose(1, 2)
        conds = self.y_embedder(conds)
        y = t.unsqueeze(1) + conds.unsqueeze(1)
        position_ids = self.position_ids_precompute[:, :x.shape[1]]
        for block in self.ACMDMTransformer:
            x = block(x, y, attention_mask, position_ids=position_ids)
        x = self.final_layer(x, y)
        return x

    def forward_with_CFG(self, x, t, conds, attention_mask, cfg=1.0):
        if not cfg == 1.0:
            half = x[: len(x) // 2]
            x = torch.cat([half, half], dim=0)
        x = self.forward(x, t, conds, attention_mask)
        if not cfg == 1.0:
            cond_eps, uncond_eps = torch.split(x, len(x) // 2, dim=0)
            half_eps = uncond_eps + cfg * (cond_eps - uncond_eps)
            x = torch.cat([half_eps, half_eps], dim=0)
        return x

    def forward_loss(self, latents, y, m_lens):
        latents = latents.permute(0, 2, 3, 1)
        b, l, j, d = latents.shape
        device = latents.device

        non_pad_mask = lengths_to_mask(m_lens, l)
        latents = torch.where(non_pad_mask.unsqueeze(-1).unsqueeze(-1), latents, torch.zeros_like(latents))

        target = latents.clone().permute(0, 3, 1, 2).detach()

        force_mask = False
        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(y)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(y).to(device).float()
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
            force_mask = True
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        attention_mask = non_pad_mask.unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(1).unsqueeze(1)

        model_kwargs = dict(conds=cond_vector, force_mask=force_mask, attention_mask=attention_mask)
        if self.diff_model == "Flow":
            loss_dict = self.train_diffusion.training_losses(self.forward, target, model_kwargs)
        else:
            t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
            loss_dict = self.train_diffusion.training_losses(self.forward, target, t, model_kwargs)
        loss = loss_dict["loss"]
        loss = (loss * non_pad_mask).sum() / non_pad_mask.sum()

        return loss

    @torch.no_grad()
    @eval_decorator
    def generate(self,
                 conds,
                 m_lens,
                 cond_scale: int,
                 temperature=1,
                 j=22,
                 ):
        device = next(self.parameters()).device
        l = max(m_lens)
        b = len(m_lens)

        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(conds)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(conds).to(device)
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        padding_mask = ~lengths_to_mask(m_lens, l)

        noise = torch.randn(b, self.input_dim, l, j).to(device)
        if not cond_scale == 1.0:
            cond_vector = torch.cat([cond_vector, torch.zeros_like(cond_vector)], dim=0)
            noise = torch.cat([noise, noise], dim=0)

        attention_mask = (~padding_mask).unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(1).unsqueeze(1)
        model_kwargs = dict(conds=cond_vector, attention_mask=attention_mask, cfg=cond_scale)
        sample_fn = self.forward_with_CFG

        if not cond_scale == 1:
            model_kwargs["attention_mask"] = attention_mask.repeat(2, 1, 1, 1)

        if self.diff_model == "Flow":
            model_fn = self.gen_diffusion.sample_ode()  # default to ode sampling
            sampled_token_latent = model_fn(noise, sample_fn, **model_kwargs)[-1]
        else:
            sampled_token_latent = self.gen_diffusion.p_sample_loop(
                sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs,
                progress=False,
                temperature=temperature
            )
        if not cond_scale == 1:
            sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0)
        sampled_token_latent = sampled_token_latent.permute(0, 2, 3, 1)

        latents = torch.where(padding_mask.unsqueeze(-1).unsqueeze(-1), torch.zeros_like(sampled_token_latent), sampled_token_latent)
        return latents.permute(0, 3, 1, 2)

#################################################################################
#                                  ACMDM Zoos                                   #
#################################################################################
def acmdm_raw_flow_s_ps22(**kwargs):
    layer = 8
    return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                 diff_model="Flow", cond_drop_prob=0.1, max_length=196,
                 patch_size=(1, 22), stride_size=(1, 22), **kwargs)

def acmdm_flow_s_ps22(**kwargs):
    layer = 8
    return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                 diff_model="Flow", cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 22), stride_size=(1, 22), **kwargs)

def acmdm_flow_xl_ps2(**kwargs):
    layer = 20
    return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                 diff_model="Flow", cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 2), stride_size=(1, 2), **kwargs)

def acmdm_mesh_flow_s_ps28(**kwargs):
    layer = 8
    return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                 diff_model="Flow", cond_drop_prob=0.1, max_length=196, num_joint=28,
                 patch_size=(1, 28), stride_size=(1, 28), **kwargs)

ACMDM_models = {
    'ACMDM-Raw-Flow-S-PatchSize22': acmdm_raw_flow_s_ps22, 'ACMDM-Flow-S-PatchSize22': acmdm_flow_s_ps22,
    'ACMDM-Flow-XL-PatchSize2': acmdm_flow_xl_ps2, 'ACMDM-Mesh-Flow-S-PatchSize28': acmdm_mesh_flow_s_ps28,
}

#################################################################################
#                             Inner Architectures                               #
#################################################################################
def modulate(x, shift, scale):
    return x * (1 + scale) + shift


class ACMDMAttention(Attention):
    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            rope=None,
            qk_norm=True,
            **block_kwargs,
    ):
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm, **block_kwargs)
        self.rope = rope

    def forward(self, x, position_ids=None, attention_mask=None):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.rope is not None:
            q, k = self.rope(q, k, position_ids)

        x = torch.nn.functional.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attention_mask,
            dropout_p=self.attn_drop.p
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwiGLUFFN(nn.Module):
    def __init__(
            self,
            in_features: int,
            hidden_features,
            bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x):
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


class ACMDMTransBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, mlp_size=1024, rope=None, qk_norm=True):
        super().__init__()
        self.norm1 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.attn = ACMDMAttention(hidden_size, num_heads=num_heads, qkv_bias=True, norm_layer=LlamaRMSNorm,
                                   qk_norm=qk_norm, rope=rope)
        self.norm2 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.mlp = SwiGLUFFN(hidden_size, int(2 / 3 * mlp_size))
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c, attention_mask=None, position_ids=None):
        dtype = x.dtype
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
        norm_x1 = self.norm1(x.to(torch.float32)).to(dtype)
        attn_input_x = modulate(norm_x1, shift_msa, scale_msa)
        attn_output_x = self.attn(attn_input_x, attention_mask=attention_mask, position_ids=position_ids)
        x = x + gate_msa * attn_output_x

        norm_x2 = self.norm2(x.to(torch.float32)).to(dtype)
        gate_input_x = modulate(norm_x2, shift_mlp, scale_mlp)
        gate_output_x = self.mlp(gate_input_x)
        x = x + gate_mlp * gate_output_x
        return x


class FinalLayer(nn.Module):
    def __init__(self, hidden_size, output_size, patch_size=(1, 22), stride_size=(1, 22), patches=1, joint=22):
        super().__init__()
        self.norm_final = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.patches = patches
        self.joint = joint
        self.linear = nn.Linear(hidden_size, output_size * patch_size[0] * patch_size[1], bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        norm_x = self.norm_final(x.to(torch.float32)).to(x.dtype)
        x = modulate(norm_x, shift, scale)
        x = self.linear(x)
        # un-patchify tokens back to a (b, c, l, joint) map
        x = x.reshape(shape=(x.shape[0], x.shape[1] // self.patches, self.patches, self.patch_size[0], self.patch_size[1], x.shape[-1] // self.patch_size[1]))
        x = torch.einsum('nljpqc->nclpjq', x)
        x = x.reshape(shape=(x.shape[0], x.shape[1], -1, self.joint))
        return x


class TimestepEmbedder(nn.Module):
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000, dtype=torch.float32):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=dtype) / half
        ).to(device=t.device, dtype=dtype)
        args = t[:, None] * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t, dtype=torch.bfloat16):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size, dtype=dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)
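
For orientation, a minimal usage sketch of the registry above. It is not part of the commit: the input_dim=4 latent width and the cond_scale value are placeholder assumptions (the real values come from the AE and evaluation configs), and a CUDA device is required because load_and_freeze_clip asserts one.

import torch
from models.ACMDM import ACMDM_models

model = ACMDM_models['ACMDM-Flow-S-PatchSize22'](input_dim=4, cond_mode='text').cuda()
m_lens = torch.tensor([49], device='cuda')                      # per-sample latent lengths
latents = model.generate(['a person walks forward and waves'],  # text conditions
                         m_lens, cond_scale=4.5)                # CFG scale; 1.0 disables CFG
# latents: (1, input_dim, 49, 22), zeroed beyond each sample's length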
models/ACMDM_ControlNet.py
ADDED
@@ -0,0 +1,314 @@
import torch
import torch.nn as nn
from models.ACMDM import ACMDM
from models.ACMDM import TimestepEmbedder, ACMDMTransBlock, LlamaRMSNorm
from models.ROPE import RopeND
from utils.eval_utils import eval_decorator
from utils.train_utils import lengths_to_mask


#################################################################################
#                               ACMDM+ControlNet                                #
#################################################################################
class ACMDM_ControlNet(ACMDM):
    def __init__(self, input_dim, cond_mode, base_checkpoint, latent_dim=256, ff_size=1024, num_layers=8,
                 num_heads=4, dropout=0.2, clip_dim=512,
                 diff_model='Flow', cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 22), stride_size=(1, 22),
                 clip_version='ViT-B/32', freeze_base=True, need_base=True, **kargs):
        # --------------------------------------------------------------------------
        # ACMDM
        super().__init__(input_dim, cond_mode, latent_dim=latent_dim, ff_size=ff_size, num_layers=num_layers,
                         num_heads=num_heads, dropout=dropout, clip_dim=clip_dim,
                         diff_model=diff_model, cond_drop_prob=cond_drop_prob, max_length=max_length,
                         patch_size=patch_size, stride_size=stride_size,
                         clip_version=clip_version, **kargs)

        # --------------------------------------------------------------------------
        # ControlNet
        self.c_t_embedder = TimestepEmbedder(self.latent_dim)
        self.c_control_embedder = c_control_embedder(3, self.latent_dim, patch_size=self.patch_size,
                                                     stride_size=self.stride_size)
        self.c_x_embedder = nn.Conv2d(self.input_dim, self.latent_dim, kernel_size=self.patch_size,
                                      stride=self.stride_size, bias=True)
        self.c_y_embedder = nn.Linear(self.clip_dim, self.latent_dim)
        self.c_rope = RopeND(nd=1, nd_split=[1], max_lens=self.max_lens)
        self.ControlNet = nn.ModuleList([
            ACMDMTransBlock(self.latent_dim, num_heads, mlp_size=ff_size, rope=self.c_rope, qk_norm=True) for _ in
            range(num_layers)
        ])
        self.zero_Linear = nn.ModuleList([
            nn.Linear(self.latent_dim, self.latent_dim) for _ in range(num_layers)
        ])
        self.initialize_weights_control()
        if need_base:
            # Initialize the ControlNet branch from the base model's transformer weights
            for key, value in list(base_checkpoint['ema_acmdm'].items()):
                if key.startswith('ACMDMTransformer.'):
                    new_key = key.replace('ACMDMTransformer.', 'ControlNet.')
                    base_checkpoint['ema_acmdm'][new_key] = value.clone()
            missing_keys, unexpected_keys = self.load_state_dict(base_checkpoint['ema_acmdm'], strict=False)
            assert len(unexpected_keys) == 0

        if self.cond_mode == 'text':
            print('ReLoading CLIP...')
            self.clip_version = clip_version
            self.clip_model = self.load_and_freeze_clip(clip_version)

        if freeze_base:
            for param in self.t_embedder.parameters():
                param.requires_grad = False
            for param in self.x_embedder.parameters():
                param.requires_grad = False
            for param in self.y_embedder.parameters():
                param.requires_grad = False
            for param in self.final_layer.parameters():
                param.requires_grad = False
            for param in self.ACMDMTransformer.parameters():
                param.requires_grad = False

    def initialize_weights_control(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.ACMDMTransformer:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.c_t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.c_t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.ControlNet:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        nn.init.constant_(self.c_control_embedder.zero_linear.weight, 0)
        nn.init.constant_(self.c_control_embedder.zero_linear.bias, 0)

        for block in self.zero_Linear:
            nn.init.constant_(block.weight, 0)
            nn.init.constant_(block.bias, 0)

    def forward_with_control(self, x, t, conds, attention_mask, cfg1=1.0, cfg2=1.0, control=None, index=None,
                             force_mask=False):
        if not (cfg1 == 1.0 and cfg2 == 1.0):
            half = x[: len(x) // 3]
            x = torch.cat([half, half, half], dim=0)
        # controlnet
        c_t = self.c_t_embedder(t, dtype=x.dtype)
        conds = self.mask_cond(conds, force_mask=force_mask)
        c_control = self.c_control_embedder(control * index)
        if self.training and self.cond_drop_prob > 0.:
            mask = torch.bernoulli(torch.ones(c_control.shape[0], device=c_control.device) * self.cond_drop_prob).view(c_control.shape[0], 1, 1)
            c_control = c_control * (1. - mask)
        if not (cfg1 == 1.0 and cfg2 == 1.0):
            c_control = torch.cat([c_control, c_control, torch.zeros_like(c_control)], dim=0)
        c_x = self.c_x_embedder(x).flatten(2).transpose(1, 2)
        c_y = self.c_y_embedder(conds)
        c_y = c_t.unsqueeze(1) + c_y.unsqueeze(1)
        c_x = c_x + c_control
        c_position_ids = self.position_ids_precompute[:, :c_x.shape[1]]
        c_out = []
        for c_block, c_linear in zip(self.ControlNet, self.zero_Linear):
            c_x = c_block(c_x, c_y, attention_mask, position_ids=c_position_ids)
            c_out.append(c_linear(c_x))
        # main branch
        tt = self.t_embedder(t, dtype=x.dtype)
        x = self.x_embedder(x)
        x = x.flatten(2).transpose(1, 2)
        conds = self.y_embedder(conds)
        y = tt.unsqueeze(1) + conds.unsqueeze(1)
        position_ids = self.position_ids_precompute[:, :x.shape[1]]
        # merging
        for block, c in zip(self.ACMDMTransformer, c_out):
            x = block(x, y, attention_mask, position_ids=position_ids)
            x = x + c
        x = self.final_layer(x, y)
        if not (cfg1 == 1.0 and cfg2 == 1.0):
            cond_eps, uncond_eps1, uncond_eps2 = torch.split(x, len(x) // 3, dim=0)
            half_eps = cond_eps + (cfg1 - 1) * (cond_eps - uncond_eps1) + (cfg2 - 1) * (cond_eps - uncond_eps2)
            x = torch.cat([half_eps, half_eps, half_eps], dim=0)
        return x

    def forward_control_loss(self, latents, y, m_lens, original, index, ae, mean_std):
        latents = latents.permute(0, 2, 3, 1)
        b, l, j, d = latents.shape
        device = latents.device

        non_pad_mask = lengths_to_mask(m_lens, l)
        latents = torch.where(non_pad_mask.unsqueeze(-1).unsqueeze(-1), latents, torch.zeros_like(latents))

        target = latents.clone().permute(0, 3, 1, 2).detach()
        original = original.clone().detach()

        force_mask = False
        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(y)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(y).to(device).float()
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
            force_mask = True
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        attention_mask = non_pad_mask.unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(1).unsqueeze(1)

        # Randomly pick controlled joints and a random subset of frames per sample
        random_indices = torch.randint(0, len(index), (b,)).to(device)
        indexx = torch.tensor(index, device=device)[random_indices]
        mask_seq = torch.zeros((b, 3, l * 4, j), device=device)
        for i in range(b):
            seq_num = torch.randint(1, m_lens[i] * 4, (1,))
            choose_seq = torch.sort(torch.randperm(m_lens[i] * 4)[:seq_num.item()]).values
            mask_seq[i, :, choose_seq, indexx[i]] = 1.0

        model_kwargs = dict(conds=cond_vector, attention_mask=attention_mask, control=original, index=mask_seq,
                            force_mask=force_mask, mean_std=mean_std)
        if self.diff_model == "Flow":
            loss_dict = self.train_diffusion.training_losses(self.forward_with_control, target, ae=ae,
                                                             model_kwargs=model_kwargs)
        else:
            t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
            loss_dict = self.train_diffusion.training_losses(self.forward_with_control, target, t, model_kwargs)
        loss = loss_dict["loss"]
        loss = (loss * non_pad_mask).sum() / non_pad_mask.sum()

        return loss, loss_dict["loss_control"]

    @torch.no_grad()
    @eval_decorator
    def generate_control(self,
                         conds,
                         m_lens,
                         control,
                         index,
                         density,
                         cond_scale,
                         temperature=1,
                         j=22
                         ):
        device = next(self.parameters()).device
        l = control.shape[2] // 4
        b = len(m_lens)

        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(conds)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(conds).to(device)
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        padding_mask = ~lengths_to_mask(m_lens, l)

        noise = torch.randn(b, self.input_dim, l, j).to(device)
        control = control.clone()
        cfg1 = cond_scale[0]
        cfg2 = cond_scale[1]
        if not (cfg1 == 1.0 and cfg2 == 1.0):
            # (1) with text and with control (2) no text and with control (3) with text and no control
            cond_vector = torch.cat([cond_vector, torch.zeros_like(cond_vector), cond_vector], dim=0)

        random_indices = torch.tensor(0, device=device).repeat(b)  # no random in inference
        indexx = torch.tensor(index, device=device)[random_indices]
        mask_seq = torch.zeros((b, 3, l * 4, j), device=device)
        for i in range(b):
            if density in [1, 2, 5]:
                seq_num = density
            else:
                seq_num = int(m_lens[i] * 4 * density / 100)
            choose_seq = torch.sort(torch.randperm(m_lens[i] * 4)[:seq_num]).values
            mask_seq[i, :, choose_seq, indexx[i]] = 1.0

        attention_mask = (~padding_mask).unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(1).unsqueeze(1)
        model_kwargs = dict(conds=cond_vector, attention_mask=attention_mask, cfg1=cfg1, cfg2=cfg2, index=mask_seq,
                            control=control)
        sample_fn = self.forward_with_control

        if not (cfg1 == 1.0 and cfg2 == 1.0):
            model_kwargs["attention_mask"] = attention_mask.repeat(3, 1, 1, 1)
            noise = torch.cat([noise, noise, noise], dim=0)

        if self.diff_model == "Flow":
            model_fn = self.gen_diffusion.sample_ode()  # default to ode sampling
            sampled_token_latent = model_fn(noise, sample_fn, **model_kwargs)[-1]
        else:
            sampled_token_latent = self.gen_diffusion.p_sample_loop(
                sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs,
                progress=False,
                temperature=temperature
            )
        if not (cfg1 == 1.0 and cfg2 == 1.0):
            sampled_token_latent, _, _ = sampled_token_latent.chunk(3, dim=0)
        sampled_token_latent = sampled_token_latent.permute(0, 2, 3, 1)

        latents = torch.where(padding_mask.unsqueeze(-1).unsqueeze(-1), torch.zeros_like(sampled_token_latent),
                              sampled_token_latent)
        return latents.permute(0, 3, 1, 2), mask_seq

#################################################################################
#                                  ACMDM Zoos                                   #
#################################################################################
def acmdm_raw_flow_s_ps22_control(**kwargs):
    layer = 8
    return ACMDM_ControlNet(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                            diff_model="Flow", cond_drop_prob=0.1, max_length=49,
                            patch_size=(1, 22), stride_size=(1, 22), freeze_base=True, **kwargs)


ACMDM_ControlNet_Models = {
    'ACMDM-Flow-S-PatchSize22-ControlNet': acmdm_raw_flow_s_ps22_control,
}

#################################################################################
#                             Inner Architectures                               #
#################################################################################
def modulate(x, shift, scale):
    return x * (1 + scale) + shift


def zero_module(module):
    for p in module.parameters():
        p.detach().zero_()
    return module


class c_control_embedder(nn.Module):
    def __init__(
            self,
            in_features: int,
            hidden_features,
            patch_size,
            stride_size,
    ) -> None:
        super().__init__()
        self.patch_embed = nn.Conv2d(in_features, hidden_features, kernel_size=(4, patch_size[1]), stride=(4, stride_size[1]), bias=True)
        self.norm = LlamaRMSNorm(hidden_features, eps=1e-6)
        self.zero_linear = nn.Linear(hidden_features, hidden_features)

    def forward(self, x):
        x = self.patch_embed(x).flatten(2).transpose(1, 2)
        x = self.norm(x)
        x = self.zero_linear(x)
        return x
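
The zero_Linear list and the zero_linear inside c_control_embedder follow the usual ControlNet recipe: both are zero-initialized in initialize_weights_control, so at the start of fine-tuning the control branch adds exactly nothing and the model reproduces the frozen base. A standalone sketch of that property, with toy shapes independent of this repo:

import torch
import torch.nn as nn

hidden = 512
zero_proj = nn.Linear(hidden, hidden)      # per-layer merge projection
nn.init.constant_(zero_proj.weight, 0)
nn.init.constant_(zero_proj.bias, 0)

x = torch.randn(2, 49, hidden)             # main-branch activations
c = torch.randn(2, 49, hidden)             # control-branch activations
assert torch.equal(x + zero_proj(c), x)    # control is a no-op at init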
models/ACMDM_NoisyPrefix_AR.py
ADDED
@@ -0,0 +1,556 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import math
from functools import partial
from timm.models.vision_transformer import Attention
from models.ROPE import RopeND
from utils.eval_utils import eval_decorator
from utils.train_utils import lengths_to_mask
from diffusions.diffusion import create_diffusion
from diffusions.transport import create_transport, Sampler

#################################################################################
#                                     ACMDM                                     #
#################################################################################
class ACMDM(nn.Module):
    def __init__(self, input_dim, cond_mode, latent_dim=256, ff_size=1024, num_layers=8,
                 num_heads=4, dropout=0, clip_dim=512,
                 diff_model='Flow', cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 22), stride_size=(1, 22), num_joint=22, cluster=5,
                 clip_version='ViT-B/32', **kargs):
        super(ACMDM, self).__init__()

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.clip_dim = clip_dim
        self.dropout = dropout
        self.cluster = cluster

        self.cond_mode = cond_mode
        self.cond_drop_prob = cond_drop_prob

        if self.cond_mode == 'action':
            assert 'num_actions' in kargs
            self.num_actions = kargs.get('num_actions', 1)
            self.encode_action = partial(F.one_hot, num_classes=self.num_actions)
        # --------------------------------------------------------------------------
        # Diffusion
        self.diff_model = diff_model
        if self.diff_model == 'Flow':
            self.train_diffusion = create_transport()  # default to linear, velocity prediction
            self.gen_diffusion = Sampler(self.train_diffusion)
        else:
            self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
            self.gen_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
        # --------------------------------------------------------------------------
        # ACMDM
        print('Loading ACMDM...')
        self.t_embedder = TimestepEmbedder(self.latent_dim)
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.patches_per_frame = (num_joint - patch_size[1]) // stride_size[1] + 1

        # Patchification
        self.x_embedder = nn.Linear(self.input_dim * self.patch_size[0] * self.patch_size[1], self.latent_dim, bias=True)

        # Positional Encoding
        max_length = max_length * self.patches_per_frame
        self.max_lens = [max_length]
        self.rope = RopeND(nd=1, nd_split=[1], max_lens=self.max_lens)
        self.position_ids_precompute = torch.arange(max_length).unsqueeze(0)
        self.cluster_patches = max_length // self.cluster

        self.ACMDMTransformer = nn.ModuleList([
            ACMDMTransBlock(self.latent_dim, num_heads, mlp_size=ff_size, rope=self.rope, qk_norm=True) for _ in range(num_layers)
        ])

        if self.cond_mode == 'text':
            self.y_embedder = nn.Linear(self.clip_dim, self.latent_dim)
        elif self.cond_mode == 'action':
            self.y_embedder = nn.Linear(self.num_actions, self.latent_dim)
        elif self.cond_mode == 'uncond':
            self.y_embedder = nn.Identity()
        else:
            raise KeyError("Unsupported condition mode!!!")

        self.final_layer = FinalLayer(self.latent_dim, self.input_dim * self.patch_size[0] * self.patch_size[1])

        self.initialize_weights()

        if self.cond_mode == 'text':
            print('Loading CLIP...')
            self.clip_version = clip_version
            self.clip_model = self.load_and_freeze_clip(clip_version)

        # Blockwise-causal additive attention mask over the cluster blocks
        attention_mask = []
        start = 0
        total_length = max_length
        for idx in range(max_length):
            if idx in [self.cluster_patches * i for i in range(self.cluster)]:
                start += self.cluster_patches * self.patches_per_frame
            attention_mask.append(torch.cat([torch.ones((1, start)),
                                             torch.zeros((1, total_length - start))], dim=-1))
        attention_mask = torch.cat(attention_mask, dim=0)
        attention_mask = torch.where(attention_mask == 0, -torch.inf, attention_mask)
        attention_mask = torch.where(attention_mask == 1, 0, attention_mask)
        attention_mask = attention_mask.unsqueeze(0).unsqueeze(0)
        self.register_buffer('attention_mask', attention_mask.contiguous())

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in ACMDM blocks:
        for block in self.ACMDMTransformer:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def load_and_freeze_clip(self, clip_version):
        clip_model, clip_preprocess = clip.load(clip_version, device='cpu', jit=False)
        assert torch.cuda.is_available()
        clip.model.convert_weights(clip_model)

        clip_model.eval()
        for p in clip_model.parameters():
            p.requires_grad = False
        return clip_model

    def encode_text(self, raw_text):
        device = next(self.parameters()).device
        text = clip.tokenize(raw_text, truncate=True).to(device)
        feat_clip_text = self.clip_model.encode_text(text).float()
        return feat_clip_text

    def mask_cond(self, cond, force_mask=False):
        bs, d = cond.shape
        if force_mask:
            return torch.zeros_like(cond)
        elif self.training and self.cond_drop_prob > 0.:
            mask = torch.bernoulli(torch.ones(bs, device=cond.device) * self.cond_drop_prob).view(bs, 1)
            return cond * (1. - mask)
        else:
            return cond

    def patchify(self, x):
        b, c, l, j = x.shape
        p = self.patch_size[0]
        q = self.patch_size[1]
        l_, j_ = l // p, j // q

        x = x.reshape(b, c, l_, p, j_, q)
        x = torch.einsum('nclpjq->nljcpq', x)
        x = x.reshape(b, l_ * j_, c * p * q)
        return x

    def patchify_mask(self, mask):
        b, l = mask.shape
        p = self.patch_size[0]
        l_ = l // self.patch_size[0]
        q = self.patch_size[1]
        j_ = self.patches_per_frame
        mask = mask.unsqueeze(1).unsqueeze(-1).expand(-1, self.input_dim, -1, j_ * q)
        mask = mask.reshape(b, self.input_dim, l_, p, j_, q)
        mask = torch.einsum('nclpjq->nljcpq', mask)
        mask = mask.reshape(b, l_ * j_, self.input_dim * p * q)
        mask = mask.any(dim=-1)
        return mask

    def unpatchify(self, x):
        b = x.shape[0]
        p = self.patch_size[0]
        q = self.patch_size[1]
        c = self.input_dim
        l_, j_ = x.shape[1] // self.patches_per_frame, self.patches_per_frame

        x = x.reshape(b, l_, j_, c, p, q)
        x = torch.einsum('nljcpq->nclpjq', x)
        x = x.reshape(b, c, l_ * p, j_ * q)
        return x

    def forward(self, x, t, conds, attention_mask, force_mask=False, ids=None, block_size=None, cache=False):
        t = self.t_embedder(t, dtype=x.dtype).unsqueeze(1).repeat(1, self.cluster_patches * self.patches_per_frame, 1)
        t = t.chunk(self.cluster, dim=0)
        t = torch.cat(t, dim=1)
        conds = self.mask_cond(conds, force_mask=force_mask)
        x = x.chunk(self.cluster, dim=0)
        x = torch.cat(x, dim=1)
        x = self.x_embedder(x)
        conds = self.y_embedder(conds)
        y = t + conds.unsqueeze(1)
        if ids is not None:
            position_ids = ids
        else:
            position_ids = self.position_ids_precompute[:, :x.shape[1]]
        for block in self.ACMDMTransformer:
            x = block(x, y, attention_mask, position_ids=position_ids, block_size=block_size, cache=cache)
        x = self.final_layer(x, y)
        x = x.chunk(self.cluster, dim=1)
        x = torch.cat(x, dim=0)
        return x

    def forward_with_CFG(self, x, t, conds, attention_mask, cfg=1.0, context=None, cache=True, block_id=0):
        if cache:
            if self.ACMDMTransformer[0].attn.cached_k is None:
                cache = True
            elif block_id * self.cluster_patches == self.ACMDMTransformer[0].attn.cached_k.shape[2]:
                cache = False
        if not cfg == 1.0:
            half = x[: len(x) // 2]
            x = torch.cat([half, half], dim=0)
        if context is not None and cache:
            ids = self.position_ids_precompute[:, (block_id - 1) * self.cluster_patches * self.patches_per_frame:(block_id + 1) * self.cluster_patches * self.patches_per_frame]
            x = torch.cat([context, x], dim=1)
            t = torch.cat([torch.ones_like(t).unsqueeze(-1).repeat(1, self.patches_per_frame * self.cluster_patches),
                           t.unsqueeze(-1).repeat(1, self.patches_per_frame * self.cluster_patches)], dim=1)
            am_idx = block_id if block_id == 0 else block_id - 1
            attention_mask = attention_mask[:, :, am_idx * self.cluster_patches * self.patches_per_frame: (block_id + 1) * self.cluster_patches * self.patches_per_frame,
                             :(block_id + 1) * self.cluster_patches * self.patches_per_frame]
        else:
            ids = self.position_ids_precompute[:,
                  (block_id) * self.cluster_patches * self.patches_per_frame:(block_id + 1) * self.cluster_patches * self.patches_per_frame]
            t = t.unsqueeze(-1).repeat(1, self.patches_per_frame * self.cluster_patches)
            attention_mask = attention_mask[:, :, :(block_id + 1) * self.cluster_patches * self.patches_per_frame,
                             :(block_id + 1) * self.cluster_patches * self.patches_per_frame]
            attention_mask = attention_mask[:, :, -self.patches_per_frame * self.cluster_patches:, :]
        t = t.reshape(-1)
        t = self.t_embedder(t, dtype=x.dtype)
        t = t.reshape(x.shape[0], x.shape[1], -1)
        conds = self.mask_cond(conds)
        x = self.x_embedder(x)
        conds = self.y_embedder(conds)
        y = t + conds.unsqueeze(1)
        position_ids = ids
        for block in self.ACMDMTransformer:
            x = block(x, y, attention_mask, position_ids=position_ids, block_size=self.patches_per_frame * self.cluster_patches,
                      cache=cache)
        x = self.final_layer(x, y)
        x = x[:, -self.patches_per_frame * self.cluster_patches:, :]
        if not cfg == 1.0:
            cond_eps, uncond_eps = torch.split(x, len(x) // 2, dim=0)
            half_eps = uncond_eps + cfg * (cond_eps - uncond_eps)
            x = torch.cat([half_eps, half_eps], dim=0)
        return x

    def forward_loss(self, latents, y, m_lens):
        b, d, l, j = latents.shape
        device = latents.device

        non_pad_mask = lengths_to_mask(m_lens, l)
        non_pad_mask = self.patchify_mask(non_pad_mask)
        latents = self.patchify(latents)
        b, l, d = latents.shape
        latents = torch.where(non_pad_mask.unsqueeze(-1), latents, torch.zeros_like(latents))

        target = latents.clone().detach().chunk(self.cluster, dim=1)
        target = torch.cat(target, dim=0)

        force_mask = False
        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(y)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(y).to(device).float()
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
            force_mask = True
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        attention_mask = []
        for i in range(b):
            a_mask = self.attention_mask.clone()
            a_mask[:, :, :, m_lens[i] * self.patches_per_frame:] = -torch.inf
            attention_mask.append(a_mask)
        attention_mask = torch.cat(attention_mask)

        model_kwargs = dict(conds=cond_vector, force_mask=force_mask, attention_mask=attention_mask)
        if self.diff_model == "Flow":
            loss_dict = self.train_diffusion.training_losses(self.forward, target, model_kwargs, dim=2)
        else:
            t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
            loss_dict = self.train_diffusion.training_losses(self.forward, target, t, model_kwargs)
        loss = loss_dict["loss"]
        loss = loss.chunk(self.cluster, dim=0)
        loss = torch.cat(loss, dim=1)
        loss = (loss * non_pad_mask).sum() / non_pad_mask.sum()

        return loss

    @torch.no_grad()
    @eval_decorator
    def generate(self,
                 conds,
                 m_lens,
                 cond_scale: int,
                 temperature=1,
                 ):
        device = next(self.parameters()).device
        l = max(m_lens)
        b = len(m_lens)

        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(conds)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(conds).to(device)
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        padding_mask = ~lengths_to_mask(m_lens, l)
        if not cond_scale == 1.0:
            cond_vector = torch.cat([cond_vector, torch.zeros_like(cond_vector)], dim=0)
        for block in self.ACMDMTransformer:
            block.set_caching(True)

        output = []
        attention_mask = []
        # [diff truncated here: lines 327-556 of models/ACMDM_NoisyPrefix_AR.py were not captured]
| 327 |
+
for i in range(b):
|
| 328 |
+
a_mask = self.attention_mask.clone()
|
| 329 |
+
a_mask[:, :, :, m_lens[i] * self.patches_per_frame:] = -torch.inf
|
| 330 |
+
attention_mask.append(a_mask)
|
| 331 |
+
attention_mask = torch.cat(attention_mask)
|
| 332 |
+
if not cond_scale == 1.0:
|
| 333 |
+
attention_mask = torch.cat([attention_mask, attention_mask], dim=0)
|
| 334 |
+
for step in range(self.cluster):
|
| 335 |
+
clean_x = output[-1] if len(output) > 0 else None
|
| 336 |
+
cache_flag = step > 0
|
| 337 |
+
noise = torch.randn(b, self.cluster_patches * self.patches_per_frame,
|
| 338 |
+
self.input_dim * self.patch_size[0] * self.patch_size[1]).to(device)
|
| 339 |
+
if not cond_scale == 1.0:
|
| 340 |
+
noise = torch.cat([noise, noise], dim=0)
|
| 341 |
+
if clean_x is not None:
|
| 342 |
+
clean_x = torch.cat([clean_x, clean_x], dim=0)
|
| 343 |
+
# cfg scale
|
| 344 |
+
# cond_scale2 = (cond_scale - 1) * (step+1) / (m_lens//self.cluster_patches + 1) + 1
|
| 345 |
+
model_kwargs = dict(conds=cond_vector, context=clean_x, block_id=step, cache=cache_flag,
|
| 346 |
+
attention_mask=attention_mask, cfg=cond_scale)
|
| 347 |
+
sample_fn = self.forward_with_CFG
|
| 348 |
+
|
| 349 |
+
if self.diff_model == "Flow":
|
| 350 |
+
model_fn = self.gen_diffusion.sample_ode() # default to ode sampling
|
| 351 |
+
sampled_token_latent = model_fn(noise, sample_fn, **model_kwargs)[-1]
|
| 352 |
+
else:
|
| 353 |
+
sampled_token_latent = self.gen_diffusion.p_sample_loop(
|
| 354 |
+
sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs,
|
| 355 |
+
progress=False,
|
| 356 |
+
temperature=temperature
|
| 357 |
+
)
|
| 358 |
+
if not cond_scale == 1:
|
| 359 |
+
sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0)
|
| 360 |
+
output.append(sampled_token_latent.detach().clone())
|
| 361 |
+
|
| 362 |
+
latents = torch.cat(output, dim=1)
|
| 363 |
+
latents = self.unpatchify(latents[:, :l * self.patches_per_frame, :])
|
| 364 |
+
latents = torch.where(padding_mask.unsqueeze(1).unsqueeze(-1), torch.zeros_like(latents), latents)
|
| 365 |
+
for block in self.ACMDMTransformer:
|
| 366 |
+
block.set_caching(False)
|
| 367 |
+
return latents
|
| 368 |
+
|
| 369 |
+
#################################################################################
|
| 370 |
+
# ACMDM Zoos #
|
| 371 |
+
#################################################################################
|
| 372 |
+
def acmdm_noisyprefixar_flow_s_ps22(**kwargs):
|
| 373 |
+
layer = 8
|
| 374 |
+
return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
|
| 375 |
+
diff_model="Flow", cond_drop_prob=0.1, max_length=50,
|
| 376 |
+
patch_size=(1, 22), stride_size=(1, 22), **kwargs)
|
| 377 |
+
ACMDM_models = {
|
| 378 |
+
'ACMDM-NoisyPrefixAR-Flow-S-PatchSize22': acmdm_noisyprefixar_flow_s_ps22,
|
| 379 |
+
}
|
| 380 |
+
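
A minimal usage sketch for the zoo entry above (not part of the commit). The constructor keywords mirror the sibling ACMDM_Prefix_AR.py file and are assumptions here, since this file's __init__ (with its cluster/block settings) appears earlier in the diff; generate() is the method defined above, and cond_scale > 1 enables classifier-free guidance.

# Hypothetical instantiation; input_dim=4 (AE latent channels) and the prompts
# are illustrative assumptions, not values taken from this commit.
import torch

model = ACMDM_models['ACMDM-NoisyPrefixAR-Flow-S-PatchSize22'](
    input_dim=4, cond_mode='text').cuda().eval()
m_lens = torch.tensor([49, 40], device='cuda')           # target lengths in latent frames
latents = model.generate(["a person walks forward", "a person waves"],
                         m_lens, cond_scale=4.5)         # cfg > 1 doubles the batch internally
# latents: (B, input_dim, L, joints); decode with the 2D AE to recover motion
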

#################################################################################
#                             Inner Architectures                               #
#################################################################################
def modulate(x, shift, scale):
    return x * (1 + scale) + shift


class ACMDMAttention(Attention):
    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            rope=None,
            qk_norm=True,
            **block_kwargs,
    ):
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm, **block_kwargs)
        self.caching, self.cached_k, self.cached_v = False, None, None
        self.rope = rope

    def set_caching(self, flag):
        self.caching, self.cached_k, self.cached_v = flag, None, None

    def forward(self, x, position_ids=None, attention_mask=None, block_size=None, cache=False):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.rope is not None:
            q, k = self.rope(q, k, position_ids)

        if self.caching:
            if cache:
                if self.cached_k is None:
                    self.cached_k = k[:, :, :block_size, :]
                    self.cached_v = v[:, :, :block_size, :]
                    self.cached_x = x
                else:
                    self.cached_k = torch.cat((self.cached_k, k[:, :, :block_size, :]), dim=2)
                    self.cached_v = torch.cat((self.cached_v, v[:, :, :block_size, :]), dim=2)

            if self.cached_k is not None:
                k = torch.cat((self.cached_k, k[:, :, -block_size:, :]), dim=2)
                v = torch.cat((self.cached_v, v[:, :, -block_size:, :]), dim=2)

        x = torch.nn.functional.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attention_mask,
            dropout_p=self.attn_drop.p
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwiGLUFFN(nn.Module):
    def __init__(
            self,
            in_features: int,
            hidden_features,
            bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x):
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


class ACMDMTransBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, mlp_size=1024, rope=None, qk_norm=True):
        super().__init__()
        self.norm1 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.attn = ACMDMAttention(hidden_size, num_heads=num_heads, qkv_bias=True, norm_layer=LlamaRMSNorm,
                                   qk_norm=qk_norm, rope=rope)
        self.norm2 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.mlp = SwiGLUFFN(hidden_size, int(2 / 3 * mlp_size))
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def set_caching(self, flag):
        self.attn.set_caching(flag)

    def forward(self, x, c, attention_mask=None, position_ids=None, block_size=None, cache=False):
        dtype = x.dtype
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
        norm_x1 = self.norm1(x.to(torch.float32)).to(dtype)
        attn_input_x = modulate(norm_x1, shift_msa, scale_msa)
        attn_output_x = self.attn(attn_input_x, attention_mask=attention_mask, position_ids=position_ids, block_size=block_size, cache=cache)
        x = x + gate_msa * attn_output_x

        norm_x2 = self.norm2(x.to(torch.float32)).to(dtype)
        gate_input_x = modulate(norm_x2, shift_mlp, scale_mlp)
        gate_output_x = self.mlp(gate_input_x)
        x = x + gate_mlp * gate_output_x
        return x


class FinalLayer(nn.Module):
    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.norm_final = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.linear = nn.Linear(hidden_size, output_size, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        norm_x = self.norm_final(x.to(torch.float32)).to(x.dtype)
        x = modulate(norm_x, shift, scale)
        x = self.linear(x)
        return x


class TimestepEmbedder(nn.Module):
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000, dtype=torch.float32):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=dtype) / half
        ).to(device=t.device, dtype=dtype)
        args = t[:, None] * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t, dtype=torch.bfloat16):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size, dtype=dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)
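
The classifier-free guidance path in forward_with_CFG above keeps the conditional and unconditional copies in one doubled batch and recombines them as uncond + cfg * (cond - uncond). A standalone sketch of just that bookkeeping, on stand-in tensors:

import torch

cfg = 4.5
x = torch.randn(8, 16)                                   # model output; rows 0-3 conditional, 4-7 unconditional
cond_eps, uncond_eps = torch.split(x, len(x) // 2, dim=0)
half_eps = uncond_eps + cfg * (cond_eps - uncond_eps)    # guided estimate
x = torch.cat([half_eps, half_eps], dim=0)               # duplicated so the doubled batch shape is preserved
assert x.shape == (8, 16)
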
models/ACMDM_Prefix_AR.py
ADDED
@@ -0,0 +1,434 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import clip
import math
from functools import partial
from timm.models.vision_transformer import Attention
from models.ROPE import RopeND
from utils.eval_utils import eval_decorator
from utils.train_utils import lengths_to_mask
from diffusions.diffusion import create_diffusion
from diffusions.transport import create_transport, Sampler

#################################################################################
#                                     ACMDM                                     #
#################################################################################
class ACMDM(nn.Module):
    def __init__(self, input_dim, cond_mode, latent_dim=256, ff_size=1024, num_layers=8,
                 num_heads=4, dropout=0, clip_dim=512,
                 diff_model='Flow', cond_drop_prob=0.1, max_length=49,
                 patch_size=(1, 22), stride_size=(1, 22), num_joint=22,
                 clip_version='ViT-B/32', **kargs):
        super(ACMDM, self).__init__()

        self.input_dim = input_dim
        self.latent_dim = latent_dim
        self.clip_dim = clip_dim
        self.dropout = dropout

        self.cond_mode = cond_mode
        self.cond_drop_prob = cond_drop_prob

        if self.cond_mode == 'action':
            assert 'num_actions' in kargs
            self.num_actions = kargs.get('num_actions', 1)
            self.encode_action = partial(F.one_hot, num_classes=self.num_actions)
        # --------------------------------------------------------------------------
        # Diffusion
        self.diff_model = diff_model
        if self.diff_model == 'Flow':
            self.train_diffusion = create_transport()  # default to linear, velocity prediction
            self.gen_diffusion = Sampler(self.train_diffusion)
        else:
            self.train_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
            self.gen_diffusion = create_diffusion(timestep_respacing="", noise_schedule="linear")
        # --------------------------------------------------------------------------
        # ACMDM
        print('Loading ACMDM...')
        self.t_embedder = TimestepEmbedder(self.latent_dim)
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.patches_per_frame = (num_joint - patch_size[1]) // stride_size[1] + 1

        # Patchification
        self.x_embedder = nn.Conv2d(self.input_dim, self.latent_dim, kernel_size=self.patch_size, stride=self.stride_size, bias=True)

        # Positional Encoding
        max_length = max_length * self.patches_per_frame
        self.max_lens = [max_length]
        self.rope = RopeND(nd=1, nd_split=[1], max_lens=self.max_lens)
        self.position_ids_precompute = torch.arange(max_length).unsqueeze(0)

        self.ACMDMTransformer = nn.ModuleList([
            ACMDMTransBlock(self.latent_dim, num_heads, mlp_size=ff_size, rope=self.rope, qk_norm=True) for _ in range(num_layers)
        ])

        if self.cond_mode == 'text':
            self.y_embedder = nn.Linear(self.clip_dim, self.latent_dim)
        elif self.cond_mode == 'action':
            self.y_embedder = nn.Linear(self.num_actions, self.latent_dim)
        elif self.cond_mode == 'uncond':
            self.y_embedder = nn.Identity()
        else:
            raise KeyError("Unsupported condition mode!!!")

        self.final_layer = FinalLayer(self.latent_dim, self.input_dim, patch_size=patch_size, stride_size=stride_size, patches=self.patches_per_frame)

        self.initialize_weights()

        if self.cond_mode == 'text':
            print('Loading CLIP...')
            self.clip_version = clip_version
            self.clip_model = self.load_and_freeze_clip(clip_version)

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in ACMDM blocks:
        for block in self.ACMDMTransformer:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def load_and_freeze_clip(self, clip_version):
        clip_model, clip_preprocess = clip.load(clip_version, device='cpu', jit=False)
        assert torch.cuda.is_available()
        clip.model.convert_weights(clip_model)

        clip_model.eval()
        for p in clip_model.parameters():
            p.requires_grad = False
        return clip_model

    def encode_text(self, raw_text):
        device = next(self.parameters()).device
        text = clip.tokenize(raw_text, truncate=True).to(device)
        feat_clip_text = self.clip_model.encode_text(text).float()
        return feat_clip_text

    def mask_cond(self, cond, force_mask=False):
        bs, d = cond.shape
        if force_mask:
            return torch.zeros_like(cond)
        elif self.training and self.cond_drop_prob > 0.:
            mask = torch.bernoulli(torch.ones(bs, device=cond.device) * self.cond_drop_prob).view(bs, 1)
            return cond * (1. - mask)
        else:
            return cond

    def forward(self, x, t, conds, attention_mask, context, force_mask=False):
        t = self.t_embedder(t, dtype=x.dtype)
        conds = self.mask_cond(conds, force_mask=force_mask)
        x = torch.cat([context, x], dim=2)
        x = self.x_embedder(x)
        x = x.flatten(2).transpose(1, 2)
        conds = self.y_embedder(conds)
        y = t.unsqueeze(1) + conds.unsqueeze(1)
        position_ids = self.position_ids_precompute[:, :x.shape[1]]
        for block in self.ACMDMTransformer:
            x = block(x, y, attention_mask, position_ids=position_ids)
        x = self.final_layer(x, y)[:, :, 5:, :]
        return x

    def forward_with_CFG(self, x, t, conds, attention_mask, context, cfg=1.0):
        if not cfg == 1.0:
            half = x[: len(x) // 2]
            x = torch.cat([half, half], dim=0)
            context = torch.cat([context, context], dim=0)
        x = self.forward(x, t, conds, attention_mask, context)
        if not cfg == 1.0:
            cond_eps, uncond_eps = torch.split(x, len(x) // 2, dim=0)
            half_eps = uncond_eps + cfg * (cond_eps - uncond_eps)
            x = torch.cat([half_eps, half_eps], dim=0)
        return x

    def forward_loss(self, latents, y, m_lens):
        latents = latents.permute(0, 2, 3, 1)
        b, l, j, d = latents.shape
        device = latents.device

        non_pad_mask = lengths_to_mask(m_lens, l)
        latents = torch.where(non_pad_mask.unsqueeze(-1).unsqueeze(-1), latents, torch.zeros_like(latents))

        # prefix 20, prediction 40 style
        target = latents.clone().permute(0, 3, 1, 2).detach()[:, :, 5:, :]
        context = latents.clone().permute(0, 3, 1, 2).detach()[:, :, :5, :]

        force_mask = False
        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(y)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(y).to(device).float()
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
            force_mask = True
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        attention_mask = non_pad_mask.unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(
            1).unsqueeze(1)

        model_kwargs = dict(conds=cond_vector, force_mask=force_mask, attention_mask=attention_mask, context=context)
        if self.diff_model == "Flow":
            loss_dict = self.train_diffusion.training_losses(self.forward, target, model_kwargs)
        else:
            t = torch.randint(0, self.train_diffusion.num_timesteps, (target.shape[0],), device=target.device)
            loss_dict = self.train_diffusion.training_losses(self.forward, target, t, model_kwargs)
        loss = loss_dict["loss"]
        non_pad_mask = non_pad_mask[:, 5:]
        loss = (loss * non_pad_mask).sum() / non_pad_mask.sum()

        return loss

    @torch.no_grad()
    @eval_decorator
    def generate(self,
                 conds,
                 m_lens,
                 cond_scale: int,
                 context,
                 temperature=1,
                 j=22,
                 ):
        device = next(self.parameters()).device
        l = max(m_lens)
        b = len(m_lens)

        if self.cond_mode == 'text':
            with torch.no_grad():
                cond_vector = self.encode_text(conds)
        elif self.cond_mode == 'action':
            cond_vector = self.encode_action(conds).to(device)
        elif self.cond_mode == 'uncond':
            cond_vector = torch.zeros(b, self.latent_dim).float().to(device)
        else:
            raise NotImplementedError("Unsupported condition mode!!!")

        padding_mask = ~lengths_to_mask(m_lens, l)
        if not cond_scale == 1.0:
            cond_vector = torch.cat([cond_vector, torch.zeros_like(cond_vector)], dim=0)

        # really naive way to write the PrefixAR inference loop, to be improved
        iter = [(0, 15), (10, 25), (20, 35), (30, 45), (40, l.item())]
        out = [context.clone().detach()]
        for i in range(len(iter)):
            noise = torch.randn(b, self.input_dim, iter[i][1] - iter[i][0] - 5, j).to(device)
            if not cond_scale == 1.0:
                noise = torch.cat([noise, noise], dim=0)

            attention_mask = ((~padding_mask)[:, iter[i][0]:iter[i][1]]).unsqueeze(-1).repeat(1, 1, self.patches_per_frame).flatten(1).unsqueeze(1).unsqueeze(1)
            model_kwargs = dict(conds=cond_vector, attention_mask=attention_mask, context=context, cfg=cond_scale)
            sample_fn = self.forward_with_CFG

            if not cond_scale == 1:
                model_kwargs["attention_mask"] = attention_mask.repeat(2, 1, 1, 1)

            if self.diff_model == "Flow":
                model_fn = self.gen_diffusion.sample_ode(sampling_method="euler")  # default to ode sampling; use euler to prevent underflow, as the current iter can contain paddings
                sampled_token_latent = model_fn(noise, sample_fn, **model_kwargs)[-1]
            else:
                sampled_token_latent = self.gen_diffusion.p_sample_loop(
                    sample_fn, noise.shape, noise, clip_denoised=False, model_kwargs=model_kwargs,
                    progress=False,
                    temperature=temperature
                )
            if not cond_scale == 1:
                sampled_token_latent, _ = sampled_token_latent.chunk(2, dim=0)
            out.append(sampled_token_latent.clone().detach())
            context = sampled_token_latent[:, :, 5:, :].clone().detach()
        sampled_token_latent = torch.cat(out, dim=2).permute(0, 2, 3, 1)

        latents = torch.where(padding_mask.unsqueeze(-1).unsqueeze(-1), torch.zeros_like(sampled_token_latent), sampled_token_latent)
        return latents.permute(0, 3, 1, 2)

#################################################################################
#                                  ACMDM Zoos                                   #
#################################################################################
def acmdm_prefixar_flow_s_ps22(**kwargs):
    layer = 8
    return ACMDM(latent_dim=layer*64, ff_size=layer*64*4, num_layers=layer, num_heads=layer, dropout=0, clip_dim=512,
                 diff_model="Flow", cond_drop_prob=0.1, max_length=15,
                 patch_size=(1, 22), stride_size=(1, 22), **kwargs)
ACMDM_models = {
    'ACMDM-PrefixAR-Flow-S-PatchSize22': acmdm_prefixar_flow_s_ps22,
}

#################################################################################
#                             Inner Architectures                               #
#################################################################################
def modulate(x, shift, scale):
    return x * (1 + scale) + shift


class ACMDMAttention(Attention):
    def __init__(
            self,
            dim,
            num_heads=8,
            qkv_bias=True,
            rope=None,
            qk_norm=True,
            **block_kwargs,
    ):
        super().__init__(dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_norm=qk_norm, **block_kwargs)
        self.rope = rope

    def forward(self, x, position_ids=None, attention_mask=None):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.rope is not None:
            q, k = self.rope(q, k, position_ids)

        x = torch.nn.functional.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attention_mask,
            dropout_p=self.attn_drop.p
        )
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwiGLUFFN(nn.Module):
    def __init__(
            self,
            in_features: int,
            hidden_features,
            bias: bool = True,
    ) -> None:
        super().__init__()
        out_features = in_features
        self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
        self.w3 = nn.Linear(hidden_features, out_features, bias=bias)

    def forward(self, x):
        x12 = self.w12(x)
        x1, x2 = x12.chunk(2, dim=-1)
        hidden = F.silu(x1) * x2
        return self.w3(hidden)


class ACMDMTransBlock(nn.Module):
    def __init__(self, hidden_size, num_heads, mlp_size=1024, rope=None, qk_norm=True):
        super().__init__()
        self.norm1 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.attn = ACMDMAttention(hidden_size, num_heads=num_heads, qkv_bias=True, norm_layer=LlamaRMSNorm,
                                   qk_norm=qk_norm, rope=rope)
        self.norm2 = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.mlp = SwiGLUFFN(hidden_size, int(2 / 3 * mlp_size))
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True)
        )

    def forward(self, x, c, attention_mask=None, position_ids=None):
        dtype = x.dtype
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=-1)
        norm_x1 = self.norm1(x.to(torch.float32)).to(dtype)
        attn_input_x = modulate(norm_x1, shift_msa, scale_msa)
        attn_output_x = self.attn(attn_input_x, attention_mask=attention_mask, position_ids=position_ids)
        x = x + gate_msa * attn_output_x

        norm_x2 = self.norm2(x.to(torch.float32)).to(dtype)
        gate_input_x = modulate(norm_x2, shift_mlp, scale_mlp)
        gate_output_x = self.mlp(gate_input_x)
        x = x + gate_mlp * gate_output_x
        return x


class FinalLayer(nn.Module):
    def __init__(self, hidden_size, output_size, patch_size=(1, 22), stride_size=(1, 22), patches=1):
        super().__init__()
        self.norm_final = LlamaRMSNorm(hidden_size, eps=1e-6)
        self.patch_size = patch_size
        self.stride_size = stride_size
        self.patches = patches
        self.linear = nn.Linear(hidden_size, output_size*patch_size[0]*patch_size[1], bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True)
        )

    def forward(self, x, c):
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=-1)
        norm_x = self.norm_final(x.to(torch.float32)).to(x.dtype)
        x = modulate(norm_x, shift, scale)
        x = self.linear(x)
        x = x.reshape(shape=(x.shape[0], x.shape[1]//self.patches, self.patches, self.patch_size[0], self.patch_size[1], x.shape[-1] // self.patch_size[1]))
        x = torch.einsum('nljpqc->nclpjq', x)
        x = x.reshape(shape=(x.shape[0], x.shape[1], -1, 22))
        return x


class TimestepEmbedder(nn.Module):
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, hidden_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000, dtype=torch.float32):
        """
        Create sinusoidal timestep embeddings.
        :param t: a 1-D Tensor of N indices, one per batch element.
                  These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor of positional embeddings.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period) * torch.arange(start=0, end=half, dtype=dtype) / half
        ).to(device=t.device, dtype=dtype)
        args = t[:, None] * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
        return embedding

    def forward(self, t, dtype=torch.bfloat16):
        t_freq = self.timestep_embedding(t, self.frequency_embedding_size, dtype=dtype)
        t_emb = self.mlp(t_freq)
        return t_emb


class LlamaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return (self.weight * hidden_states).to(input_dtype)
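
A quick shape check (a sketch, not repo code) for FinalLayer's un-patchify above, with the default patch_size=(1, 22) and patches=1: each of the N tokens is projected to output_size * 22 values and folded back into one frame of 22 joints. The sizes below are illustrative assumptions.

import torch

B, N, hidden, out_ch = 2, 49, 512, 4                     # illustrative sizes
layer = FinalLayer(hidden, out_ch)                       # defaults: patch_size=(1, 22), patches=1
x = torch.randn(B, N, hidden)                            # token features
c = torch.randn(B, 1, hidden)                            # adaLN conditioning, broadcast over tokens
y = layer(x, c)
assert y.shape == (B, out_ch, N, 22)                     # (B, channels, frames, joints)
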
models/AE_2D_Causal.py
ADDED
@@ -0,0 +1,245 @@
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

#################################################################################
#                                      AE                                       #
#################################################################################
class AE(nn.Module):
    def __init__(self, input_width=3, output_emb_width=4, width=512, depth=3, ch_mult=(1, 1, 1)):
        super().__init__()
        self.output_emb_width = output_emb_width
        self.encoder = Encoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[:-1], ch_mult=ch_mult[1:])
        self.decoder = Decoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[::-1][1:], ch_mult=ch_mult[::-1][:-1])

    def preprocess(self, x):
        x = x.permute(0, 3, 1, 2).float()
        return x

    def encode(self, x):
        x_in = self.preprocess(x)
        x_encoder = self.encoder(x_in)
        return x_encoder

    def forward(self, x):
        x_in = self.preprocess(x)
        x_encoder = self.encoder(x_in)
        x_out = self.decoder(x_encoder)
        return x_out

    def decode(self, x):
        x_out = self.decoder(x)
        return x_out

#################################################################################
#                                      VAE                                      #
#################################################################################
class VAE(nn.Module):
    def __init__(self, input_width=3, output_emb_width=4, width=512, depth=3, ch_mult=(1, 1, 1)):
        super().__init__()
        self.output_emb_width = output_emb_width
        self.encoder = Encoder(input_width, output_emb_width*2, width, depth, in_ch_mult=ch_mult[:-1], ch_mult=ch_mult[1:])
        self.decoder = Decoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[::-1][1:], ch_mult=ch_mult[::-1][:-1])

    def preprocess(self, x):
        x = x.permute(0, 3, 1, 2).float()
        return x

    def encode(self, x):
        x_in = self.preprocess(x)
        x_encoder = self.encoder(x_in)
        x_encoder = DiagonalGaussianDistribution(x_encoder)
        x_encoder = x_encoder.sample()
        return x_encoder

    def forward(self, x, need_loss=False):
        x_in = self.preprocess(x)
        x_encoder = self.encoder(x_in)
        x_encoder = DiagonalGaussianDistribution(x_encoder)
        kl_loss = x_encoder.kl()
        x_encoder = x_encoder.sample()
        x_out = self.decoder(x_encoder)
        if need_loss:
            # sigma vae for better quality
            log_sigma = ((x - x_out) ** 2).mean([1, 2, 3], keepdim=True).sqrt().log()
            log_sigma = -6 + F.softplus(log_sigma - (-6))
            rec = 0.5 * torch.pow((x - x_out) / log_sigma.exp(), 2) + log_sigma
            rec = rec.sum(dim=(1, 2, 3))
            loss = {
                "rec": rec.mean(),
                "kl": kl_loss.mean()}
            return x_out, loss
        else:
            return x_out

    def decode(self, x):
        x_out = self.decoder(x)
        return x_out

#################################################################################
#                                    AE Zoos                                    #
#################################################################################
def ae(**kwargs):
    return AE(output_emb_width=4, width=512, depth=3, ch_mult=(1, 1, 1), **kwargs)
def vae(**kwargs):
    return VAE(output_emb_width=4, width=512, depth=3, ch_mult=(1, 1, 1), **kwargs)
AE_models = {
    'AE_Model': ae, 'VAE_Model': vae
}
#################################################################################
#                             Inner Architectures                               #
#################################################################################
class Encoder(nn.Module):
    def __init__(self, input_emb_width=3, output_emb_width=4, width=512, depth=3, in_ch_mult=(1, 1), ch_mult=(1, 1)):
        super().__init__()
        self.model = nn.ModuleList()
        self.conv_in = nn.Conv2d(input_emb_width, width, (3, 1), (1, 1), (0, 0))

        block_in = width * in_ch_mult[0]
        for i in range(len(in_ch_mult)):
            block_in = width * in_ch_mult[i]
            block_out = width * ch_mult[i]
            self.model.append(CausalPad2d((0, 0, 2, 0)))
            self.model.append(nn.Conv2d(width, width, (4, 1), (2, 1), (0, 0)))
            for j in range(depth):
                self.model.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dil=2-j))
                block_in = block_out

        self.conv_out = torch.nn.Conv2d(block_in, output_emb_width, (3, 1), (1, 1), (0, 0))

    def forward(self, x):
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv_in(x)
        for layer in self.model:
            x = layer(x)
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv_out(x)
        return x


class Decoder(nn.Module):
    def __init__(self, input_emb_width=3, output_emb_width=4, width=512, depth=3, in_ch_mult=(1, 1), ch_mult=(1, 1)):
        super().__init__()
        self.model = nn.ModuleList()
        block_in = width * ch_mult[0]
        self.conv_in = nn.Conv2d(output_emb_width, block_in, (3, 1), (1, 1), (0, 0))

        for i in range(len(in_ch_mult)):
            block_in = width * ch_mult[i]
            block_out = width * in_ch_mult[i]
            for j in range(depth):
                self.model.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dil=2-j))
                block_in = block_out
            self.model.append(Upsample(block_in))

        self.conv_out1 = torch.nn.Conv2d(block_in, block_in, (3, 1), (1, 1), (0, 0))
        self.conv_out2 = torch.nn.Conv2d(block_in, input_emb_width, (3, 1), (1, 1), (0, 0))

    def forward(self, x):
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv_in(x)
        for layer in self.model:
            x = layer(x)
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv_out1(x)
        x = x * torch.sigmoid(x)
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv_out2(x)
        return x.permute(0, 2, 3, 1)


class Upsample(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels, in_channels, (3, 1), (1, 1), (0, 0))

    def forward(self, x):
        x = torch.nn.functional.interpolate(x, scale_factor=(2.0, 1.0), mode="nearest")
        x = F.pad(x, (0, 0, 2, 0))
        x = self.conv(x)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, dil=0, conv_shortcut=False, dropout=0.2):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut
        self.padd = CausalPad2d((0, 0, 2*(3 ** dil), 0))

        self.conv1 = torch.nn.Conv2d(in_channels,
                                     out_channels,
                                     kernel_size=(3, 1),
                                     stride=(1, 1),
                                     padding=(0, 0),
                                     dilation=(3 ** dil, 1),
                                     )
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels,
                                     out_channels,
                                     kernel_size=(1, 1),
                                     stride=(1, 1),
                                     padding=(0, 0),
                                     )

    def forward(self, x):
        h = x
        h = h*torch.sigmoid(h)
        h = self.padd(h)
        h = self.conv1(h)

        h = h*torch.sigmoid(h)
        h = self.conv2(h)
        h = self.dropout(h)
        return x+h


class DiagonalGaussianDistribution(object):
    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def sample(self):
        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
        return x

    def kl(self, other=None):
        if self.deterministic:
            return torch.Tensor([0.])
        else:
            if other is None:
                return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                       + self.var - 1.0 - self.logvar,
                                       dim=[1, 2, 3])
            else:
                return 0.5 * torch.sum(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
                    dim=[1, 2, 3])

    def nll(self, sample, dims=[1, 2, 3]):
        if self.deterministic:
            return torch.Tensor([0.])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        return self.mean


class CausalPad2d(nn.Module):
    def __init__(self, pad):
        super().__init__()
        self.pad = pad

    def forward(self, x):
        return F.pad(x, self.pad)
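
Because every temporal pad above is left-only (F.pad(x, (0, 0, 2, 0)) and CausalPad2d), encoder outputs at a given latent frame depend only on current and past input frames. A sketch of that property, using this file's AE_models registry (input_width=3 and the 64-frame clip are assumptions for illustration):

import torch

ae = AE_models['AE_Model'](input_width=3).eval()
x = torch.randn(1, 64, 22, 3)                            # (B, frames, joints, channels), as preprocess() expects
x2 = x.clone()
x2[:, 32:] = torch.randn_like(x2[:, 32:])                # perturb only the future frames
with torch.no_grad():
    z, z2 = ae.encode(x), ae.encode(x2)                  # two stride-2 convs: (B, 4, 16, 22)
# latent frame i sees input frames <= 4*i + 3, so the first 8 latent frames match
assert torch.allclose(z[:, :, :8], z2[:, :, :8])
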
models/AE_2D_NonCausal.py
ADDED
@@ -0,0 +1,228 @@
| 1 |
+
import torch
|
| 2 |
+
import numpy as np
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
|
| 6 |
+
#################################################################################
|
| 7 |
+
# AE #
|
| 8 |
+
#################################################################################
|
| 9 |
+
class AE(nn.Module):
|
| 10 |
+
def __init__(self, input_width=3, output_emb_width=4, width=512, depth=3, ch_mult=(1,1,1)):
|
| 11 |
+
super().__init__()
|
| 12 |
+
self.output_emb_width = output_emb_width
|
| 13 |
+
self.encoder = Encoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[:-1], ch_mult=ch_mult[1:])
|
| 14 |
+
self.decoder = Decoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[::-1][1:], ch_mult=ch_mult[::-1][:-1])
|
| 15 |
+
|
| 16 |
+
def preprocess(self, x):
|
| 17 |
+
x = x.permute(0, 3, 1, 2).float()
|
| 18 |
+
return x
|
| 19 |
+
|
| 20 |
+
def encode(self, x):
|
| 21 |
+
x_in = self.preprocess(x)
|
| 22 |
+
x_encoder = self.encoder(x_in)
|
| 23 |
+
return x_encoder
|
| 24 |
+
|
| 25 |
+
def forward(self, x):
|
| 26 |
+
x_in = self.preprocess(x)
|
| 27 |
+
x_encoder = self.encoder(x_in)
|
| 28 |
+
x_out = self.decoder(x_encoder)
|
| 29 |
+
return x_out
|
| 30 |
+
|
| 31 |
+
def decode(self, x):
|
| 32 |
+
x_out = self.decoder(x)
|
| 33 |
+
return x_out
|
| 34 |
+
|
| 35 |
+
#################################################################################
|
| 36 |
+
# VAE #
|
| 37 |
+
#################################################################################
|
| 38 |
+
class VAE(nn.Module):
|
| 39 |
+
def __init__(self, input_width=3, output_emb_width=4, width=512, depth=3, ch_mult=(1,1,1)):
|
| 40 |
+
super().__init__()
|
| 41 |
+
self.output_emb_width = output_emb_width
|
| 42 |
+
self.encoder = Encoder(input_width, output_emb_width*2, width, depth, in_ch_mult=ch_mult[:-1], ch_mult=ch_mult[1:])
|
| 43 |
+
self.decoder = Decoder(input_width, output_emb_width, width, depth, in_ch_mult=ch_mult[::-1][1:], ch_mult=ch_mult[::-1][:-1])
|
| 44 |
+
|
| 45 |
+
def preprocess(self, x):
|
| 46 |
+
x = x.permute(0, 3, 1, 2).float()
|
| 47 |
+
return x
|
| 48 |
+
|
| 49 |
+
def encode(self, x):
|
| 50 |
+
x_in = self.preprocess(x)
|
| 51 |
+
x_encoder = self.encoder(x_in)
|
| 52 |
+
x_encoder = DiagonalGaussianDistribution(x_encoder)
|
| 53 |
+
x_encoder = x_encoder.sample()
|
| 54 |
+
return x_encoder
|
| 55 |
+
|
| 56 |
+
def forward(self, x, need_loss=False):
|
| 57 |
+
x_in = self.preprocess(x)
|
| 58 |
+
x_encoder = self.encoder(x_in)
|
| 59 |
+
x_encoder = DiagonalGaussianDistribution(x_encoder)
|
| 60 |
+
kl_loss = x_encoder.kl()
|
| 61 |
+
x_encoder = x_encoder.sample()
|
| 62 |
+
x_out = self.decoder(x_encoder)
|
| 63 |
+
if need_loss:
|
| 64 |
+
# sigma vae for better quality
|
| 65 |
+
log_sigma = ((x - x_out) ** 2).mean([1,2,3], keepdim=True).sqrt().log()
|
| 66 |
+
            log_sigma = -6 + F.softplus(log_sigma - (-6))  # soft lower bound on log sigma at -6
            rec = 0.5 * torch.pow((x - x_out) / log_sigma.exp(), 2) + log_sigma
            rec = rec.sum(dim=(1, 2, 3))
            loss = {
                "rec": rec.mean(),
                "kl": kl_loss.mean()}
            return x_out, loss
        else:
            return x_out

    def decode(self, x):
        x_out = self.decoder(x)
        return x_out

#################################################################################
#                                   AE Zoos                                     #
#################################################################################
def ae(**kwargs):
    return AE(output_emb_width=4, width=512, depth=3, ch_mult=(1,1,1), **kwargs)

def vae(**kwargs):
    return VAE(output_emb_width=4, width=512, depth=3, ch_mult=(1,1,1), **kwargs)

AE_models = {
    'AE_Model': ae, 'VAE_Model': vae
}

#################################################################################
#                             Inner Architectures                               #
#################################################################################
class Encoder(nn.Module):
    def __init__(self, input_emb_width=3, output_emb_width=4, width=512, depth=3, in_ch_mult=(1,1), ch_mult=(1,1)):
        super().__init__()
        self.model = nn.ModuleList()
        self.conv_in = nn.Conv2d(input_emb_width, width, (3, 1), (1, 1), (1, 1))

        block_in = width * in_ch_mult[0]
        for i in range(len(in_ch_mult)):
            block_in = width * in_ch_mult[i]
            block_out = width * ch_mult[i]
            self.model.append(nn.Conv2d(width, width, (4, 1), (2, 1), (1, 1)))
            for j in range(depth):
                self.model.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dil=2 - j))
                block_in = block_out

        self.conv_out = torch.nn.Conv2d(block_in, output_emb_width, (3, 1), (1, 1), (1, 1))

    def forward(self, x):
        x = self.conv_in(x)
        for layer in self.model:
            x = layer(x)
        x = self.conv_out(x)
        return x


class Decoder(nn.Module):
    def __init__(self, input_emb_width=3, output_emb_width=4, width=512, depth=3, in_ch_mult=(1,1), ch_mult=(1,1)):
        super().__init__()
        self.model = nn.ModuleList()
        block_in = width * ch_mult[0]
        self.conv_in = nn.Conv2d(output_emb_width, block_in, (3, 1), (1, 1), (1, 1))

        for i in range(len(in_ch_mult)):
            block_in = width * ch_mult[i]
            block_out = width * in_ch_mult[i]
            for j in range(depth):
                self.model.append(ResnetBlock(in_channels=block_in, out_channels=block_out, dil=2 - j))
                block_in = block_out
            self.model.append(Upsample(block_in))

        self.conv_out1 = torch.nn.Conv2d(block_in, block_in, (3, 1), (1, 1), (1, 1))
        self.conv_out2 = torch.nn.Conv2d(block_in, input_emb_width, (3, 1), (1, 1), (1, 1))

    def forward(self, x):
        x = self.conv_in(x)
        for layer in self.model:
            x = layer(x)
        x = self.conv_out1(x)
        x = x * torch.sigmoid(x)  # SiLU / swish activation
        x = self.conv_out2(x)
        return x.permute(0, 2, 3, 1)


class Upsample(nn.Module):
    def __init__(self, in_channels):
        super().__init__()
        self.conv = torch.nn.Conv2d(in_channels, in_channels, (3, 1), (1, 1), (1, 1))

    def forward(self, x):
        x = torch.nn.functional.interpolate(x, scale_factor=(2.0, 1.0), mode="nearest")
        x = self.conv(x)
        return x


class ResnetBlock(nn.Module):
    def __init__(self, *, in_channels, out_channels=None, dil=0, conv_shortcut=False, dropout=0.2):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
        self.use_conv_shortcut = conv_shortcut

        self.conv1 = torch.nn.Conv2d(in_channels,
                                     out_channels,
                                     kernel_size=(3, 1),
                                     stride=(1, 1),
                                     padding=(3 ** dil, 0),
                                     dilation=(3 ** dil, 1),
                                     )
        self.dropout = torch.nn.Dropout(dropout)
        self.conv2 = torch.nn.Conv2d(out_channels,
                                     out_channels,
                                     kernel_size=(1, 1),
                                     stride=(1, 1),
                                     padding=(0, 0),
                                     )

    def forward(self, x):
        h = x
        h = h * torch.sigmoid(h)
        h = self.conv1(h)

        h = h * torch.sigmoid(h)
        h = self.conv2(h)
        h = self.dropout(h)
        return x + h


class DiagonalGaussianDistribution(object):
    def __init__(self, parameters, deterministic=False):
        self.parameters = parameters
        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
        self.deterministic = deterministic
        self.std = torch.exp(0.5 * self.logvar)
        self.var = torch.exp(self.logvar)
        if self.deterministic:
            self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device)

    def sample(self):
        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
        return x

    def kl(self, other=None):
        if self.deterministic:
            return torch.Tensor([0.])
        else:
            if other is None:
                return 0.5 * torch.sum(torch.pow(self.mean, 2)
                                       + self.var - 1.0 - self.logvar,
                                       dim=[1, 2, 3])
            else:
                return 0.5 * torch.sum(
                    torch.pow(self.mean - other.mean, 2) / other.var
                    + self.var / other.var - 1.0 - self.logvar + other.logvar,
                    dim=[1, 2, 3])

    def nll(self, sample, dims=[1, 2, 3]):
        # NOTE: relies on numpy being imported as np at the top of this module
        if self.deterministic:
            return torch.Tensor([0.])
        logtwopi = np.log(2.0 * np.pi)
        return 0.5 * torch.sum(
            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
            dim=dims)

    def mode(self):
        return self.mean
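For reference, the two terms assembled into the loss dict above have standard closed forms: the reconstruction term is a per-element Gaussian negative log-likelihood with a learned, softplus-floored log-sigma (additive constants dropped), and kl() with other=None is the closed-form KL divergence from the diagonal Gaussian to a standard normal:

\mathcal{L}_{\mathrm{rec}} = \sum \Big[ \tfrac{1}{2}\big((x - \hat{x})\, e^{-\log\sigma}\big)^{2} + \log\sigma \Big],
\qquad
\mathrm{KL}\big(\mathcal{N}(\mu,\sigma^{2}) \,\|\, \mathcal{N}(0,1)\big) = \tfrac{1}{2} \sum \big(\mu^{2} + \sigma^{2} - 1 - \log\sigma^{2}\big),

which matches the code once you substitute logvar = log(sigma^2).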
models/AE_Mesh.py ADDED
@@ -0,0 +1,601 @@
# A modified version of "Fully Convolutional Mesh Autoencoder using Efficient Spatially Varying Kernels"
# https://arxiv.org/abs/2006.04325
# and thanks to this more modern implementation as well
# https://github.com/g-fiche/Mesh-VQ-VAE
# https://arxiv.org/abs/2312.08291
import torch
import torch.nn as nn
import numpy as np
import os

#################################################################################
#                                      AE                                       #
#################################################################################
class AE(nn.Module):
    def __init__(self, model, bs=16, num_vertices=6890):
        super().__init__()
        # currently the only supported setup is SMPL-H
        self.num_vertices = num_vertices
        self.bs = bs
        self.encoder = Encoder(model)
        self.decoder = Decoder(model)

    def encode(self, x):
        B, L = x.shape[0], x.shape[1]
        x = x.view(B * L, self.num_vertices, 3)
        x_encoder = self.encoder(x)
        return x_encoder

    def forward(self, x):
        B, L = x.shape[0], x.shape[1]
        x = x.view(B * L, self.num_vertices, 3)
        x_encoder = self.encoder(x)
        x_out = self.decoder(x_encoder)
        x_out = x_out.view(B, L, self.num_vertices, 3)
        return x_out

    def decode(self, x):
        # pad the sequence up to a multiple of self.bs so the decoder always sees
        # fixed-size chunks, then trim the output back to the true length T
        T = x.shape[1]
        if x.shape[1] % self.bs != 0:
            x = torch.cat([x, torch.zeros_like(x[:, :self.bs - x.shape[1] % self.bs])], dim=1)
        outputs = []
        for i in range(x.shape[0]):
            outputss = []
            for j in range(0, x.shape[1], self.bs):
                chunk = x[i, j:j + self.bs]
                out = self.decoder(chunk)
                outputss.append(out)
            outputs.append(torch.cat(outputss, dim=0)[:T])
        x_out = torch.stack(outputs, dim=0)

        return x_out

#################################################################################
#                                   AE Zoos                                     #
#################################################################################
def ae(**kwargs):
    config_model = {"batch": 16,
                    "connection_folder": "body_models/ConnectionMatrices/",
                    "initial_connection_fn": "body_models/ConnectionMatrices/_pool0.npy",
                    "connection_layer_lst": ["pool0", "pool1", "pool2", "pool3", "pool4", "pool5", "pool6", "pool7_28",
                                             "unpool7_28", "unpool6", "unpool5", "unpool4", "unpool3", "unpool2",
                                             "unpool1", "unpool0"],
                    "channel_lst": [64, 64, 128, 128, 256, 256, 512, 12, 512, 256, 256, 128, 128, 64, 64, 3],
                    "weight_num_lst": [9, 0, 9, 0, 9, 0, 9, 0, 0, 9, 0, 9, 0, 9, 0, 9],
                    "residual_rate_lst": [0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0],
                    }
    return AE(FullyConvAE(config_model, **kwargs), bs=config_model["batch"])

AE_models = {
    'AE_Model': ae
}


class Encoder(nn.Module):
    def __init__(self, model):
        super(Encoder, self).__init__()
        self.model = model

    def forward(self, x):
        out = self.model.forward_till_layer_n(x, len(self.model.channel_lst) // 2)
        return out


class Decoder(nn.Module):
    def __init__(self, model):
        super(Decoder, self).__init__()
        self.model = model

    def forward(self, x):
        out = self.model.forward_from_layer_n(x, len(self.model.channel_lst) // 2)
        return out

class FullyConvAE(nn.Module):
    def __init__(
        self, config_model=None, test_mode=False
    ):  # layer_info_lst = [(point_num, feature_dim)]
        super(FullyConvAE, self).__init__()

        self.test_mode = test_mode

        self.channel_lst = config_model["channel_lst"]

        self.residual_rate_lst = config_model["residual_rate_lst"]

        self.weight_num_lst = config_model["weight_num_lst"]

        self.initial_connection_fn = config_model["initial_connection_fn"]

        data = np.load(self.initial_connection_fn)
        neighbor_id_dist_lstlst = data[:, 1:]  # point_num*(1+2*neighbor_num)
        self.point_num = data.shape[0]
        self.neighbor_id_lstlst = neighbor_id_dist_lstlst.reshape(
            (self.point_num, -1, 2)
        )[
            :, :, 0
        ]  # point_num*neighbor_num
        self.neighbor_num_lst = np.array(data[:, 0])  # point_num

        self.relu = nn.ELU()

        self.batch = config_model["batch"]

        ##### For Laplace computation ######
        self.initial_neighbor_id_lstlst = torch.LongTensor(
            self.neighbor_id_lstlst
        ).cuda()  # point_num*max_neighbor_num
        self.initial_neighbor_num_lst = torch.FloatTensor(
            self.neighbor_num_lst
        ).cuda()  # point_num

        self.connection_folder = config_model["connection_folder"]
        self.connection_layer_fn_lst = []
        fn_lst = os.listdir(self.connection_folder)
        self.connection_layer_lst = config_model["connection_layer_lst"]
        for layer_name in self.connection_layer_lst:
            layer_name = "_" + layer_name + "."

            find_fn = False
            for fn in fn_lst:
                if (layer_name in fn) and ((".npy" in fn) or (".npz" in fn)):
                    self.connection_layer_fn_lst += [self.connection_folder + fn]
                    find_fn = True
                    break
            if find_fn == False:
                print("!!!ERROR: cannot find the connection layer fn")

        self.init_layers(self.batch)

        self.initial_max_neighbor_num = self.initial_neighbor_id_lstlst.shape[1]

    def init_layers(self, batch):
        # each entry: [in_channel, out_channel, in_pn, out_pn, weight_num,
        # max_neighbor_num, neighbor_num_lst, neighbor_id_lstlst, conv_layer,
        # residual_layer, residual_rate, neighbor_mask_lst, zeros_batch_outpn_outchannel]
        self.layer_lst = []

        self.layer_num = len(self.channel_lst)

        in_point_num = self.point_num
        in_channel = 3

        for l in range(self.layer_num):
            out_channel = self.channel_lst[l]
            weight_num = self.weight_num_lst[l]
            residual_rate = self.residual_rate_lst[l]

            connection_info = np.load(self.connection_layer_fn_lst[l])
            out_point_num = connection_info.shape[0]
            neighbor_num_lst = torch.FloatTensor(
                connection_info[:, 0].astype(float)
            ).cuda()  # out_point_num*1
            neighbor_id_dist_lstlst = connection_info[
                :, 1:
            ]  # out_point_num*(max_neighbor_num*2)
            print(self.connection_layer_fn_lst[l])
            print()
            neighbor_id_lstlst = neighbor_id_dist_lstlst.reshape(
                (out_point_num, -1, 2)
            )[
                :, :, 0
            ]  # out_point_num*max_neighbor_num
            neighbor_id_lstlst = torch.LongTensor(neighbor_id_lstlst).cuda()
            max_neighbor_num = neighbor_id_lstlst.shape[1]
            avg_neighbor_num = round(neighbor_num_lst.mean().item())
            effective_w_weights_rate = neighbor_num_lst.sum() / float(
                max_neighbor_num * out_point_num
            )
            effective_w_weights_rate = round(effective_w_weights_rate.item(), 3)

            pc_mask = torch.ones(in_point_num + 1).cuda()
            pc_mask[in_point_num] = 0
            neighbor_mask_lst = pc_mask[
                neighbor_id_lstlst
            ].contiguous()  # out_pn*max_neighbor_num, 1 for a real neighbor, otherwise 0

            zeros_batch_outpn_outchannel = torch.zeros(
                (batch, out_point_num, out_channel)
            ).cuda()

            if (residual_rate < 0) or (residual_rate > 1):
                print("Invalid residual rate", residual_rate)

            #### parameters for the convolution ####
            conv_layer = ""

            if residual_rate < 1:
                weights = torch.randn(weight_num, out_channel * in_channel).cuda()
                weights = nn.Parameter(weights).cuda()
                self.register_parameter("weights" + str(l), weights)

                bias = nn.Parameter(torch.zeros(out_channel).cuda())
                self.register_parameter("bias" + str(l), bias)

                w_weights = torch.randn(out_point_num, max_neighbor_num, weight_num) / (
                    avg_neighbor_num * weight_num
                )
                w_weights = nn.Parameter(w_weights.cuda())
                self.register_parameter("w_weights" + str(l), w_weights)

                conv_layer = (weights, bias, w_weights)

            #### parameters for the residual branch ####
            ## a residual layer with out_point_num == in_point_num and residual_rate == 1
            ## is a pooling or unpooling layer
            residual_layer = ""

            if residual_rate > 0:
                p_neighbors = ""
                weight_res = ""

                if out_point_num != in_point_num:
                    p_neighbors = nn.Parameter(
                        (
                            torch.randn(out_point_num, max_neighbor_num)
                            / (avg_neighbor_num)
                        ).cuda()
                    )
                    self.register_parameter("p_neighbors" + str(l), p_neighbors)

                if out_channel != in_channel:
                    weight_res = torch.randn(out_channel, in_channel)
                    # self.normalize_weights(weight_res)
                    weight_res = weight_res / out_channel
                    weight_res = nn.Parameter(weight_res.cuda())
                    self.register_parameter("weight_res" + str(l), weight_res)

                residual_layer = (weight_res, p_neighbors)

            ##### put everything together
            layer = (
                in_channel,
                out_channel,
                in_point_num,
                out_point_num,
                weight_num,
                max_neighbor_num,
                neighbor_num_lst,
                neighbor_id_lstlst,
                conv_layer,
                residual_layer,
                residual_rate,
                neighbor_mask_lst,
                zeros_batch_outpn_outchannel,
            )

            self.layer_lst += [layer]

            in_point_num = out_point_num
            in_channel = out_channel

    # precompute the parameters so as to accelerate forwarding in testing mode
    def init_test_mode(self):
        for l in range(len(self.layer_lst)):
            layer_info = self.layer_lst[l]

            (in_channel, out_channel, in_pn, out_pn, weight_num, max_neighbor_num,
             neighbor_num_lst, neighbor_id_lstlst, conv_layer, residual_layer,
             residual_rate, neighbor_mask_lst, zeros_batch_outpn_outchannel) = layer_info

            if len(conv_layer) != 0:
                (
                    weights,
                    bias,
                    raw_w_weights,
                ) = conv_layer  # weight_num*(out_channel*in_channel), out_point_num*max_neighbor_num*weight_num

                w_weights = raw_w_weights * neighbor_mask_lst.view(
                    out_pn, max_neighbor_num, 1
                ).repeat(
                    1, 1, weight_num
                )  # out_pn*max_neighbor_num*weight_num

                weights = torch.einsum(
                    "pmw,wc->pmc", [w_weights, weights]
                )  # out_pn*max_neighbor_num*(out_channel*in_channel)
                weights = weights.view(
                    out_pn, max_neighbor_num, out_channel, in_channel
                )

                conv_layer = weights, bias

            #### precompute the residual layer ####
            if len(residual_layer) != 0:
                (
                    weight_res,
                    p_neighbors_raw,
                ) = residual_layer  # out_channel*in_channel, out_pn*max_neighbor_num
                if in_pn != out_pn:
                    p_neighbors = torch.abs(p_neighbors_raw) * neighbor_mask_lst
                    p_neighbors_sum = p_neighbors.sum(1) + 1e-8  # out_pn
                    p_neighbors = p_neighbors / p_neighbors_sum.view(out_pn, 1).repeat(
                        1, max_neighbor_num
                    )

                    residual_layer = weight_res, p_neighbors

            self.layer_lst[l] = (
                in_channel, out_channel, in_pn, out_pn, weight_num, max_neighbor_num,
                neighbor_num_lst, neighbor_id_lstlst, conv_layer, residual_layer,
                residual_rate, neighbor_mask_lst, zeros_batch_outpn_outchannel,
            )

    # a faster mode for testing
    # input_pc: batch*in_pn*in_channel
    # out_pc: batch*out_pn*out_channel
    def forward_one_conv_layer_batch_during_test(
        self, in_pc, layer_info, is_final_layer=False
    ):
        batch = in_pc.shape[0]

        (in_channel, out_channel, in_pn, out_pn, weight_num, max_neighbor_num,
         neighbor_num_lst, neighbor_id_lstlst, conv_layer, residual_layer,
         residual_rate, neighbor_mask_lst, zeros_batch_outpn_outchannel) = layer_info

        device = in_pc.get_device()
        if device < 0:
            device = "cpu"

        in_pc_pad = torch.cat(
            (in_pc, torch.zeros(batch, 1, in_channel).to(device)), 1
        )  # batch*(in_pn+1)*in_channel

        in_neighbors = in_pc_pad[
            :, neighbor_id_lstlst.to(device)
        ]  # batch*out_pn*max_neighbor_num*in_channel

        #### compute the output of the convolution layer ####
        out_pc_conv = zeros_batch_outpn_outchannel.clone()

        if len(conv_layer) != 0:
            (
                weights,
                bias,
            ) = conv_layer  # precomputed in init_test_mode

            out_neighbors = torch.einsum(
                "pmoi,bpmi->bpmo", [weights.to(device), in_neighbors]
            )  # batch*out_pn*max_neighbor_num*out_channel

            out_pc_conv = out_neighbors.sum(2)

            out_pc_conv = out_pc_conv + bias

            if is_final_layer == False:
                out_pc_conv = self.relu(
                    out_pc_conv
                )  ## self.relu is defined in the init function

        # if(self.residual_rate==0):
        #     return out_pc
        #### compute the output of the residual layer ####
        out_pc_res = zeros_batch_outpn_outchannel.clone()

        if len(residual_layer) != 0:
            (
                weight_res,
                p_neighbors,
            ) = residual_layer  # out_channel*in_channel, out_pn*max_neighbor_num

            if in_channel != out_channel:
                in_pc_pad = torch.einsum("oi,bpi->bpo", [weight_res, in_pc_pad])

            out_pc_res = []
            if in_pn == out_pn:
                out_pc_res = in_pc_pad[:, 0:in_pn].clone()
            else:
                in_neighbors = in_pc_pad[
                    :, neighbor_id_lstlst.to(device)
                ]  # batch*out_pn*max_neighbor_num*out_channel
                out_pc_res = torch.einsum(
                    "pm,bpmo->bpo", [p_neighbors.to(device), in_neighbors]
                )

        out_pc = out_pc_conv.to(device) * np.sqrt(1 - residual_rate) + out_pc_res.to(
            device
        ) * np.sqrt(residual_rate)

        return out_pc

    # used in train mode; slower than test mode
    # input_pc: batch*in_pn*in_channel
    # out_pc: batch*out_pn*out_channel
    def forward_one_conv_layer_batch(self, in_pc, layer_info, is_final_layer=False):
        batch = in_pc.shape[0]

        (in_channel, out_channel, in_pn, out_pn, weight_num, max_neighbor_num,
         neighbor_num_lst, neighbor_id_lstlst, conv_layer, residual_layer,
         residual_rate, neighbor_mask_lst, zeros_batch_outpn_outchannel) = layer_info

        in_pc_pad = torch.cat(
            (in_pc, torch.zeros(batch, 1, in_channel).cuda()), 1
        )  # batch*(in_pn+1)*in_channel

        in_neighbors = in_pc_pad[
            :, neighbor_id_lstlst
        ]  # batch*out_pn*max_neighbor_num*in_channel

        #### compute the output of the convolution layer ####
        out_pc_conv = zeros_batch_outpn_outchannel.clone()

        if len(conv_layer) != 0:
            (
                weights,
                bias,
                raw_w_weights,
            ) = conv_layer  # weight_num*(out_channel*in_channel), out_point_num*max_neighbor_num*weight_num

            w_weights = raw_w_weights * neighbor_mask_lst.view(
                out_pn, max_neighbor_num, 1
            ).repeat(
                1, 1, weight_num
            )  # out_pn*max_neighbor_num*weight_num

            weights = torch.einsum(
                "pmw,wc->pmc", [w_weights, weights]
            )  # out_pn*max_neighbor_num*(out_channel*in_channel)
            weights = weights.view(out_pn, max_neighbor_num, out_channel, in_channel)

            out_neighbors = torch.einsum(
                "pmoi,bpmi->bpmo", [weights, in_neighbors]
            )  # batch*out_pn*max_neighbor_num*out_channel

            out_pc_conv = out_neighbors.sum(2)

            out_pc_conv = out_pc_conv + bias

            if is_final_layer == False:
                out_pc_conv = self.relu(
                    out_pc_conv
                )  ## self.relu is defined in the init function

        #### compute the output of the residual layer ####
        out_pc_res = zeros_batch_outpn_outchannel.clone()

        if len(residual_layer) != 0:
            (
                weight_res,
                p_neighbors_raw,
            ) = residual_layer  # out_channel*in_channel, out_pn*max_neighbor_num

            if in_channel != out_channel:
                in_pc_pad = torch.einsum("oi,bpi->bpo", [weight_res, in_pc_pad])

            out_pc_res = []
            if in_pn == out_pn:
                out_pc_res = in_pc_pad[:, 0:in_pn].clone()
            else:
                in_neighbors = in_pc_pad[
                    :, neighbor_id_lstlst
                ]  # batch*out_pn*max_neighbor_num*out_channel

                p_neighbors = torch.abs(p_neighbors_raw) * neighbor_mask_lst
                p_neighbors_sum = p_neighbors.sum(1) + 1e-8  # out_pn
                p_neighbors = p_neighbors / p_neighbors_sum.view(out_pn, 1).repeat(
                    1, max_neighbor_num
                )

                out_pc_res = torch.einsum("pm,bpmo->bpo", [p_neighbors, in_neighbors])

        # print(out_pc_conv.shape, out_pc_res.shape)
        out_pc = out_pc_conv * np.sqrt(1 - residual_rate) + out_pc_res * np.sqrt(
            residual_rate
        )

        return out_pc

    def forward_till_layer_n(self, in_pc, layer_n):
        out_pc = in_pc.clone()

        for i in range(layer_n):
            if self.test_mode == False:
                out_pc = self.forward_one_conv_layer_batch(out_pc, self.layer_lst[i])
            else:
                out_pc = self.forward_one_conv_layer_batch_during_test(
                    out_pc, self.layer_lst[i]
                )

        # out_pc = self.final_linear(out_pc.transpose(1,2)).transpose(1,2)  # batch*3*point_num

        return out_pc

    def forward_from_layer_n(self, in_pc, layer_n):
        out_pc = in_pc.clone()

        for i in range(layer_n, self.layer_num):
            if i < (self.layer_num - 1):
                if self.test_mode == False:
                    out_pc = self.forward_one_conv_layer_batch(
                        out_pc, self.layer_lst[i]
                    )
                else:
                    out_pc = self.forward_one_conv_layer_batch_during_test(
                        out_pc, self.layer_lst[i]
                    )
            else:
                if self.test_mode == False:
                    out_pc = self.forward_one_conv_layer_batch(
                        out_pc, self.layer_lst[i], is_final_layer=True
                    )
                else:
                    out_pc = self.forward_one_conv_layer_batch_during_test(
                        out_pc, self.layer_lst[i], is_final_layer=True
                    )

        return out_pc

    def forward_layer_n(self, in_pc, layer_n):
        out_pc = in_pc.clone()

        if layer_n < (self.layer_num - 1):
            if self.test_mode == False:
                out_pc = self.forward_one_conv_layer_batch(
                    out_pc, self.layer_lst[layer_n]
                )
            else:
                out_pc = self.forward_one_conv_layer_batch_during_test(
                    out_pc, self.layer_lst[layer_n]
                )
        else:
            if self.test_mode == False:
                out_pc = self.forward_one_conv_layer_batch(
                    out_pc, self.layer_lst[layer_n], is_final_layer=True
                )
            else:
                out_pc = self.forward_one_conv_layer_batch_during_test(
                    out_pc, self.layer_lst[layer_n], is_final_layer=True
                )

        return out_pc
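As a quick orientation, here is a hedged usage sketch of the mesh autoencoder. It assumes the precomputed connection matrices exist under body_models/ConnectionMatrices/ and that a CUDA device is available (FullyConvAE moves its buffers to CUDA in __init__). Note the per-layer zero buffers are sized to the configured batch of 16, so each encoder/decoder call should see exactly B*L == 16 frames; AE.decode chunks longer sequences for this reason. Shapes in the comments are inferred from the config above, not guaranteed:

import torch
from models.AE_Mesh import AE_models

mesh_ae = AE_models['AE_Model']().eval()          # builds FullyConvAE from the config above (requires CUDA)
verts = torch.randn(1, 16, 6890, 3).cuda()        # (batch, frames, SMPL-H vertices, xyz); 1*16 matches the chunk size
latents = mesh_ae.encode(verts)                   # per-frame latents at the pool7_28 / 12-channel bottleneck
recon = mesh_ae(verts)                            # (1, 16, 6890, 3) reconstruction
decoded = mesh_ae.decode(latents.unsqueeze(0))    # chunked decode back to vertices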
models/LengthEstimator.py ADDED
@@ -0,0 +1,40 @@
import torch.nn as nn

#################################################################################
#                               Length Estimator                                #
#################################################################################
class LengthEstimator(nn.Module):
    def __init__(self, input_size, output_size):
        super(LengthEstimator, self).__init__()
        nd = 512
        self.output = nn.Sequential(
            nn.Linear(input_size, nd),
            nn.LayerNorm(nd),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Dropout(0.2),
            nn.Linear(nd, nd // 2),
            nn.LayerNorm(nd // 2),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Dropout(0.2),
            nn.Linear(nd // 2, nd // 4),
            nn.LayerNorm(nd // 4),
            nn.LeakyReLU(0.2, inplace=True),

            nn.Linear(nd // 4, output_size)
        )

        self.output.apply(self.__init_weights)

    def __init_weights(self, module):
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=0.02)
            if isinstance(module, nn.Linear) and module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, text_emb):
        return self.output(text_emb)
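A minimal sketch of how this head is typically driven. The 512-d input (matching a CLIP text embedding) and the 50-way output over candidate lengths are illustrative assumptions, not values fixed by this file:

import torch
from models.LengthEstimator import LengthEstimator

est = LengthEstimator(input_size=512, output_size=50)  # sizes are illustrative
text_emb = torch.randn(8, 512)                         # e.g. CLIP text features
length_logits = est(text_emb)                          # (8, 50) logits over candidate lengths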
models/ROPE.py ADDED
@@ -0,0 +1,91 @@
import torch
import math

class RopeND:
    def __init__(self, head_dim=64, nd=3, max_lens=[1024, 64, 64], nd_split=[2, 1, 1], bases=[1000, 1000, 1000],
                 auto_base=True, cache_longer=1):
        self.nd = nd
        self.head_dim = head_dim
        self.max_lens = max_lens
        self.nd_split = nd_split
        self.split_dims = [2 * i * (head_dim // 2 // sum(nd_split)) for i in nd_split]
        assert sum(self.split_dims) == head_dim
        self.auto_base = auto_base
        if auto_base:
            # empirical: make cos(theta) = -1 when the length is kL, i.e. base = kL/pi,
            # and at L=1 the phase difference (1/base)**(1/32) ~ 0.7-0.8 ~ pi/4.
            # For the traditional L = 4096, 8L/pi = 10.4k, so base is conventionally set to ~10k.
            self.bases = [(int(8 * l / math.pi) // 100 + 1) * 100 for l in self.max_lens]
            print(f"Bases for rope: {self.bases}")
        else:
            self.bases = bases
        self.cache_longer = cache_longer

    def generated_cos_sin_mix2d(self, max_len, dim, device, base=1000):
        inv_freq = 1.0 / (base ** \
                          (torch.linspace(start=0, end=self.head_dim, steps=dim // 2,
                                          device=device).float() / self.head_dim))
        assert inv_freq.size(0) * 2 == dim, f"inv_freq.size(0) = {inv_freq.size(0)}, required dim = {dim}"

        t = torch.arange(max_len * self.cache_longer, device=device).type_as(inv_freq)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        freqs = torch.cat([freqs, freqs], dim=1)
        return freqs.cos().to(torch.float), freqs.sin().to(torch.float)

    def generate_pos_embs_mix2d(self, position_ids, device=None):
        if device is None:
            device = position_ids.device

        if position_ids.dim() == 1:
            position_ids = position_ids.unsqueeze(0)

        cos_emb_all, sin_emb_all = [], []
        for i in range(self.nd):
            dim_i = self.split_dims[i]
            base_i = self.bases[i]
            max_len_i = self.max_lens[i]
            if not hasattr(self, f"cos_{i}"):
                _cos, _sin = self.generated_cos_sin_mix2d(max_len=max_len_i, dim=dim_i, device=device, base=base_i)
                setattr(self, f"cos_{i}", _cos)
                setattr(self, f"sin_{i}", _sin)
            cos_emb_all.append(getattr(self, f'cos_{i}')[position_ids[i, :], :])
            sin_emb_all.append(getattr(self, f'sin_{i}')[position_ids[i, :], :])
        cos_emb = torch.cat(cos_emb_all, dim=-1)
        sin_emb = torch.cat(sin_emb_all, dim=-1)
        return cos_emb, sin_emb

    def __call__(self, q, k, position_ids):
        '''q: N N_head L C
        '''
        cos_emb, sin_emb = self.generate_pos_embs_mix2d(position_ids, device=q.device)

        def rotate_half(x):
            """Rotates half the hidden dims of the input."""
            x1 = x[..., : x.shape[-1] // 2]
            x2 = x[..., x.shape[-1] // 2:]
            return torch.cat((-x2, x1), dim=-1)

        def apply_rotary_pos_emb(q, k, cos, sin):
            """Applies Rotary Position Embedding to the query and key tensors.

            Args:
                q (`torch.Tensor`): The query tensor.
                k (`torch.Tensor`): The key tensor.
                cos (`torch.Tensor`): The cosine part of the rotary embedding.
                sin (`torch.Tensor`): The sine part of the rotary embedding.
            Returns:
                `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
            """
            cos = cos.unsqueeze(0).unsqueeze(0)
            sin = sin.unsqueeze(0).unsqueeze(0)
            dtype = q.dtype
            q = q.to(torch.float)
            k = k.to(torch.float)
            q_embed = (q * cos) + (rotate_half(q) * sin)
            k_embed = (k * cos) + (rotate_half(k) * sin)
            q_embed = q_embed.to(dtype)
            k_embed = k_embed.to(dtype)
            return q_embed, k_embed

        q, k = apply_rotary_pos_emb(q, k, cos_emb, sin_emb)
        return q, k
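A small, self-contained sketch of calling RopeND inside an attention block; the 7x7 spatial grid and the position layout are illustrative assumptions. Since generate_pos_embs_mix2d indexes position_ids per axis, the ids must carry one row per dimension (nd rows):

import torch
from models.ROPE import RopeND

rope = RopeND(head_dim=64, nd=3, max_lens=[1024, 64, 64], nd_split=[2, 1, 1])
B, H, L, C = 2, 8, 49, 64                       # batch, heads, tokens, head_dim
q = torch.randn(B, H, L, C)
k = torch.randn(B, H, L, C)
t = torch.arange(L)
position_ids = torch.stack([t, t % 7, t // 7])  # (nd, L): a time axis plus a 7x7 grid
q, k = rope(q, k, position_ids)                 # rotated q/k, same shapes as the inputs

With nd_split=[2, 1, 1] the 64-dim head is split 32/16/16 across the three axes, and the cos/sin tables are cached lazily on first use (printing the auto-chosen bases once).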
models/__pycache__/ACMDM.cpython-310.pyc ADDED
Binary file (14.9 kB)

models/__pycache__/ACMDM.cpython-313.pyc ADDED
Binary file (28.7 kB)

models/__pycache__/AE_2D_Causal.cpython-310.pyc ADDED
Binary file (8.63 kB)

models/__pycache__/AE_2D_Causal.cpython-313.pyc ADDED
Binary file (15.7 kB)

models/__pycache__/LengthEstimator.cpython-310.pyc ADDED
Binary file (1.44 kB)

models/__pycache__/ROPE.cpython-310.pyc ADDED
Binary file (3.85 kB)