ifwi committed 3c454af
Parent(s): 63a2e47

add more

Changed files:
- ldm/models/diffusion/ddpm.py (+30 -25)
- ldm/modules/attention.py (+37 -29)
ldm/models/diffusion/ddpm.py CHANGED

@@ -47,6 +47,7 @@ def disabled_train(self, mode=True):
 def uniform_on_device(r1, r2, shape, device):
     return (r1 - r2) * torch.rand(*shape, device=device) + r2
 
+
 class DDPM(pl.LightningModule):
     # classic DDPM with Gaussian diffusion, in image space
     def __init__(self,
@@ -124,7 +125,8 @@ class DDPM(pl.LightningModule):
             self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
             if reset_ema:
                 assert self.use_ema
-                print(f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
+                print(
+                    f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.")
                 self.model_ema = LitEma(self.model)
         if reset_num_ema_updates:
             print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
@@ -573,7 +575,7 @@ class LatentDiffusion(DDPM):
             self.scale_factor = scale_factor
         else:
             self.register_buffer('scale_factor', torch.tensor(scale_factor))
-
+
         self.instantiate_first_stage(first_stage_config)
         self.instantiate_cond_stage(cond_stage_config)
         self.cond_stage_forward = cond_stage_forward
@@ -586,7 +588,7 @@ class LatentDiffusion(DDPM):
             self.proj_out = None
         if self.use_pbe_weight:
             print("learnable vector gene")
-            self.learnable_vector = nn.Parameter(torch.randn((1,1,768)), requires_grad=True)
+            self.learnable_vector = nn.Parameter(torch.randn((1, 1, 768)), requires_grad=True)
         else:
             self.learnable_vector = None
 
@@ -608,7 +610,7 @@ class LatentDiffusion(DDPM):
            print(" +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ")
            assert self.use_ema
            self.model_ema.reset_num_updates()
-
+
    def make_cond_schedule(self, ):
        self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
        ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
@@ -646,7 +648,7 @@ class LatentDiffusion(DDPM):
         self.first_stage_model.train = disabled_train
         for param in self.first_stage_model.parameters():
             param.requires_grad = False
-
+
     def instantiate_cond_stage(self, config):
         if not self.cond_stage_trainable:
             if config == "__is_first_stage__":
@@ -791,14 +793,15 @@ class LatentDiffusion(DDPM):
 
     @torch.no_grad()
     def get_input(self, batch, k, return_first_stage_outputs=False, force_c_encode=False,
-                  cond_key=None, return_original_cond=False, bs=None, return_x=False, no_latent=False, is_controlnet=False):
+                  cond_key=None, return_original_cond=False, bs=None, return_x=False, no_latent=False,
+                  is_controlnet=False):
         x = super().get_input(batch, k)
         if bs is not None:
             x = x[:bs]
         x = x.to(self.device)
         if no_latent:
-            _,_,h,w = x.shape
-            x = resize(x, (h//8, w//8))
+            _, _, h, w = x.shape
+            x = resize(x, (h // 8, w // 8))
             return [x, None]
         encoder_posterior = self.encode_first_stage(x)
         z = self.get_first_stage_encoding(encoder_posterior).detach()
@@ -815,12 +818,12 @@ class LatentDiffusion(DDPM):
                     xc = batch
                 else:
                     xc = super().get_input(batch, cond_key).to(self.device)
-            else:
+            else:
                 xc = x
             if not self.cond_stage_trainable or force_c_encode:
                 if self.kwargs["use_imageCLIP"]:
-                    xc = resize(xc, (224,224))
-                    xc = self.imagenet_norm((xc+1)/2)
+                    xc = resize(xc, (224, 224))
+                    xc = self.imagenet_norm((xc + 1) / 2)
                     c = xc
                 else:
                     if isinstance(xc, dict) or isinstance(xc, list):
@@ -830,8 +833,8 @@ class LatentDiffusion(DDPM):
                         c = c.float()
             else:
                 if self.kwargs["use_imageCLIP"]:
-                    xc = resize(xc, (224,224))
-                    xc = self.imagenet_norm((xc+1)/2)
+                    xc = resize(xc, (224, 224))
+                    xc = self.imagenet_norm((xc + 1) / 2)
                 c = xc
         if bs is not None:
             c = c[:bs]
@@ -847,7 +850,7 @@ class LatentDiffusion(DDPM):
         if self.use_positional_encodings:
             pos_x, pos_y = self.compute_latent_shifts(batch)
             c = {'pos_x': pos_x, 'pos_y': pos_y}
-
+
         out = [z, c]
         if return_first_stage_outputs:
             xrec = self.decode_first_stage(z)
@@ -872,6 +875,7 @@ class LatentDiffusion(DDPM):
                 return output
             else:
                 return output.sample
+
     def decode_first_stage_train(self, z, predict_cids=False, force_not_quantize=False):
         if predict_cids:
             if z.dim() == 4:
@@ -905,12 +909,11 @@ class LatentDiffusion(DDPM):
         # pbe negative condition
         else:
             t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long()
-            self.u_cond_prop=random.uniform(0, 1)
+            self.u_cond_prop = random.uniform(0, 1)
             c["c_crossattn"] = [self.get_learned_conditioning(c["c_crossattn"])]
             if self.u_cond_prop < self.u_cond_percent:
-                c["c_crossattn"] = [self.learnable_vector.repeat(x.shape[0],1,1)]
+                c["c_crossattn"] = [self.learnable_vector.repeat(x.shape[0], 1, 1)]
             return self.p_losses(x, c, t, *args, **kwargs)
-
 
     def apply_model(self, x_noisy, t, cond, return_ids=False):
         if isinstance(cond, dict):
@@ -931,7 +934,7 @@ class LatentDiffusion(DDPM):
 
     def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
         return (extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - pred_xstart) / \
-            extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
+               extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
 
     def _prior_bpd(self, x_start):
         """
@@ -946,6 +949,7 @@ class LatentDiffusion(DDPM):
         qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
         kl_prior = normal_kl(mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0)
         return mean_flat(kl_prior) / np.log(2.0)
+
     def p_losses(self, x_start, cond, t, noise=None):
         loss_dict = {}
         noise = default(noise, lambda: torch.randn_like(x_start))
@@ -969,11 +973,11 @@ class LatentDiffusion(DDPM):
         if self.only_agn_simple_loss:
             _, _, l_h, l_w = model_output.shape
             m_agn = F.interpolate(super().get_input(self.batch, "agn_mask"), (l_h, l_w))
-            loss_simple = self.get_loss(model_output * (1-m_agn), target * (1-m_agn), mean=False).mean([1, 2, 3])
+            loss_simple = self.get_loss(model_output * (1 - m_agn), target * (1 - m_agn), mean=False).mean([1, 2, 3])
         else:
             loss_simple = self.get_loss(model_output, target, mean=False).mean([1, 2, 3])
         loss_dict.update({f'simple': loss_simple.mean()})
-
+
         logvar_t = self.logvar[t].to(self.device)
         loss = loss_simple / torch.exp(logvar_t) + logvar_t
         # loss = loss_simple / torch.exp(self.logvar) + self.logvar
@@ -981,7 +985,7 @@ class LatentDiffusion(DDPM):
             loss_dict.update({f'gamma': loss.mean()})
             loss_dict.update({'logvar': self.logvar.data.mean()})
         loss = self.l_simple_weight * loss.mean()
-
+
         loss_vlb = self.get_loss(model_output, target, mean=False).mean(dim=(1, 2, 3))
         loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean()
         if self.original_elbo_weight != 0:
@@ -990,7 +994,7 @@ class LatentDiffusion(DDPM):
 
         if model_loss is not None:
             loss += model_loss
-            loss_dict.update({f"model loss":model_loss})
+            loss_dict.update({f"model loss": model_loss})
         loss_dict.update({f'{prefix}_loss': loss})
 
         return loss, loss_dict
@@ -1540,7 +1544,7 @@ class LatentUpscaleDiffusion(LatentDiffusion):
                     uc[k] = [uc_tmp]
                 elif k == "c_adm":  # todo: only run with text-based guidance?
                     assert isinstance(c[k], torch.Tensor)
-                    #uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
+                    # uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level
                     uc[k] = c[k]
                 elif isinstance(c[k], list):
                     uc[k] = [c[k][i] for i in range(len(c[k]))]
@@ -1807,7 +1811,7 @@ class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion):
         log = super().log_images(*args, **kwargs)
         depth = self.depth_model(args[0][self.depth_stage_key])
         depth_min, depth_max = torch.amin(depth, dim=[1, 2, 3], keepdim=True), \
-                               torch.amax(depth, dim=[1, 2, 3], keepdim=True)
+            torch.amax(depth, dim=[1, 2, 3], keepdim=True)
         log["depth"] = 2. * (depth - depth_min) / (depth_max - depth_min) - 1.
         return log
 
@@ -1816,6 +1820,7 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
     """
     condition on low-res image (and optionally on some spatial noise augmentation)
     """
+
     def __init__(self, concat_keys=("lr",), reshuffle_patch_size=None,
                  low_scale_config=None, low_scale_key=None, *args, **kwargs):
         super().__init__(concat_keys=concat_keys, *args, **kwargs)
@@ -1872,4 +1877,4 @@ class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion):
     def log_images(self, *args, **kwargs):
         log = super().log_images(*args, **kwargs)
         log["lr"] = rearrange(args[0]["lr"], 'b h w c -> b c h w')
-        return log
+        return log
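Note on the forward() hunk above: with probability u_cond_percent the real cross-attention conditioning is replaced by the learnable_vector registered in LatentDiffusion.__init__, i.e. classifier-free-guidance-style training against a learned null embedding. A minimal, self-contained sketch of that pattern follows; it is not the repository's class, and the module name CondDropout and the dimensions are illustrative assumptions.

import random

import torch
import torch.nn as nn


class CondDropout(nn.Module):
    """Swap the cross-attention context for a learned null embedding with probability p."""

    def __init__(self, u_cond_percent: float = 0.2, embed_dim: int = 768):
        super().__init__()
        self.u_cond_percent = u_cond_percent
        # Analogous to self.learnable_vector of shape (1, 1, 768) in the diff above.
        self.learnable_vector = nn.Parameter(torch.randn(1, 1, embed_dim))

    def forward(self, cond: torch.Tensor) -> torch.Tensor:
        # cond: (batch, tokens, embed_dim) conditioning for cross-attention.
        if random.uniform(0, 1) < self.u_cond_percent:
            # One draw per training step, applied to the whole batch, as in the diff.
            return self.learnable_vector.repeat(cond.shape[0], 1, 1)
        return cond


if __name__ == "__main__":
    drop = CondDropout(u_cond_percent=0.5)
    ctx = torch.randn(4, 77, 768)
    out = drop(ctx)
    print(out.shape)  # (4, 77, 768), or (4, 1, 768) when the condition was dropped

Dropping the condition for the whole batch (a single random draw per step) mirrors the diff; per-sample dropout is a common alternative but is not what this code does.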
ldm/modules/attention.py CHANGED

@@ -12,20 +12,23 @@ from ldm.modules.diffusionmodules.util import checkpoint
 try:
     import xformers
     import xformers.ops
+
     XFORMERS_IS_AVAILBLE = True
 except:
     XFORMERS_IS_AVAILBLE = False
 
 # CrossAttn precision handling
 import os
+
 _ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")
 
+
 def exists(val):
     return val is not None
 
 
 def uniq(arr):
-    return{el: True for el in arr}.keys()
+    return {el: True for el in arr}.keys()
 
 
 def default(val, d):
@@ -33,6 +36,7 @@ def default(val, d):
         return val
     return d() if isfunction(d) else d
 
+
 class GEGLU(nn.Module):
     def __init__(self, dim_in, dim_out):
         super().__init__()
@@ -110,12 +114,12 @@ class SpatialSelfAttention(nn.Module):
         k = self.k(h_)
         v = self.v(h_)
 
-        b,c,h,w = q.shape
+        b, c, h, w = q.shape
         q = rearrange(q, 'b c h w -> b (h w) c')
         k = rearrange(k, 'b c h w -> b c (h w)')
         w_ = torch.einsum('bij,bjk->bik', q, k)
 
-        w_ = w_ * (int(c)**(-0.5))
+        w_ = w_ * (int(c) ** (-0.5))
         w_ = torch.nn.functional.softmax(w_, dim=2)
 
         v = rearrange(v, 'b c h w -> b c (h w)')
@@ -124,7 +128,8 @@ class SpatialSelfAttention(nn.Module):
         h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h)
         h_ = self.proj_out(h_)
 
-        return x+h_
+        return x + h_
+
 
 class CrossAttention(nn.Module):
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0., **kwargs):
@@ -143,7 +148,6 @@ class CrossAttention(nn.Module):
             nn.Linear(inner_dim, query_dim),
             nn.Dropout(dropout)
         )
-
 
     def forward(self, x, context=None, mask=None):
         h = self.heads
@@ -153,26 +157,27 @@ class CrossAttention(nn.Module):
         v = self.to_v(context)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
 
-        if _ATTN_PRECISION =="fp32":
-            with torch.autocast(enabled=False, device_type = 'cuda'):
+        if _ATTN_PRECISION == "fp32":
+            with torch.autocast(enabled=False, device_type='cuda'):
                 q, k = q.float(), k.float()
                 sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
         else:
             sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
-
+
         del q, k
         if exists(mask):
             mask = rearrange(mask, 'b ... -> b (...)')
             max_neg_value = -torch.finfo(sim.dtype).max
             mask = repeat(mask, 'b j -> (b h) () j', h=h)
             sim.masked_fill_(~mask, max_neg_value)
-
-        sim = sim.softmax(dim=-1)
-
+
+        sim = sim.softmax(dim=-1)
+
         out = einsum('b i j, b j d -> b i d', sim, v)
         out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
         return self.to_out(out)
 
+
 class MemoryEfficientCrossAttention(nn.Module):
     # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223
     def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0, zero_init=False, **kwargs):
@@ -195,7 +200,6 @@ class MemoryEfficientCrossAttention(nn.Module):
 
         self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
         self.attention_op: Optional[Any] = None
-
 
     def forward(self, x, context=None, mask=None, **kwargs):
         q = self.to_q(x)
@@ -221,23 +225,25 @@ class MemoryEfficientCrossAttention(nn.Module):
             .reshape(b, out.shape[1], self.heads * self.dim_head)
         )
         return self.to_out(out)
-
+
+
 class BasicTransformerBlock(nn.Module):
     ATTENTION_MODES = {
         "softmax": CrossAttention,  # vanilla attention
         "softmax-xformers": MemoryEfficientCrossAttention
     }
+
     def __init__(
-        self,
-        dim,
-        n_heads,
-        d_head,
-        dropout=0.,
-        context_dim=None,
-        gated_ff=True,
+            self,
+            dim,
+            n_heads,
+            d_head,
+            dropout=0.,
+            context_dim=None,
+            gated_ff=True,
             checkpoint=True,
             disable_self_attn=False
-    ):
+    ):
         super().__init__()
         attn_mode = "softmax-xformers" if XFORMERS_IS_AVAILBLE else "softmax"
         assert attn_mode in self.ATTENTION_MODES
@@ -247,24 +253,25 @@ class BasicTransformerBlock(nn.Module):
                               context_dim=context_dim if self.disable_self_attn else None)
         self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
         self.attn2 = attn_cls(query_dim=dim, context_dim=context_dim,
-                              heads=n_heads, dim_head=d_head, dropout=dropout)
+                              heads=n_heads, dim_head=d_head, dropout=dropout)
         self.norm1 = nn.LayerNorm(dim)
         self.norm2 = nn.LayerNorm(dim)
         self.norm3 = nn.LayerNorm(dim)
         self.checkpoint = checkpoint
 
-    def forward(self, x, context=None,hint=None):
+    def forward(self, x, context=None, hint=None):
         if hint is None:
             return checkpoint(self._forward, (x, context), self.parameters(), self.checkpoint)
         else:
             return checkpoint(self._forward, (x, context, hint), self.parameters(), self.checkpoint)
 
-    def _forward(self, x, context=None,hint=None):
-        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None,hint=hint) + x
+    def _forward(self, x, context=None, hint=None):
+        x = self.attn1(self.norm1(x), context=context if self.disable_self_attn else None, hint=hint) + x
         x = self.attn2(self.norm2(x), context=context) + x
         x = self.ff(self.norm3(x)) + x
         return x
 
+
 class SpatialTransformer(nn.Module):
     """
     Transformer block for image-like data.
@@ -274,6 +281,7 @@ class SpatialTransformer(nn.Module):
     Finally, reshape to image
     NEW: use_linear for more efficiency instead of the 1x1 convs
     """
+
     def __init__(self, in_channels, n_heads, d_head,
                  depth=1, dropout=0., context_dim=None,
                  disable_self_attn=False, use_linear=False,
@@ -296,7 +304,7 @@ class SpatialTransformer(nn.Module):
         self.transformer_blocks = nn.ModuleList(
             [BasicTransformerBlock(inner_dim, n_heads, d_head, dropout=dropout, context_dim=context_dim[d],
                                    disable_self_attn=disable_self_attn, checkpoint=use_checkpoint)
-                for d in range(depth)]
+             for d in range(depth)]
         )
         if not use_linear:
             self.proj_out = zero_module(nn.Conv2d(inner_dim,
@@ -308,7 +316,7 @@ class SpatialTransformer(nn.Module):
             self.proj_out = zero_module(nn.Linear(in_channels, inner_dim))
         self.use_linear = use_linear
 
-    def forward(self, x, context=None,hint=None):
+    def forward(self, x, context=None, hint=None):
         # note: if no context is given, cross-attention defaults to self-attention
         if not isinstance(context, list):
             context = [context]
@@ -321,10 +329,10 @@ class SpatialTransformer(nn.Module):
         if self.use_linear:
             x = self.proj_in(x)
         for i, block in enumerate(self.transformer_blocks):
-            x = block(x, context=context[i],hint=hint)
+            x = block(x, context=context[i])
         if self.use_linear:
             x = self.proj_out(x)
         x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
         if not self.use_linear:
             x = self.proj_out(x)
-        return x + x_in
+        return x + x_in
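Note on the CrossAttention.forward hunk above: it keeps a float32 escape hatch for the attention logits. When the ATTN_PRECISION environment variable is "fp32", the q·kᵀ product is computed with autocast disabled and float32 operands, which avoids fp16 overflow under mixed-precision training. Below is a minimal sketch of just that guard; the helper name scaled_similarity and the tensor shapes in the usage lines are assumptions for illustration, not code from this repository.

import os

import torch
from torch import einsum

_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32")


def scaled_similarity(q: torch.Tensor, k: torch.Tensor, scale: float) -> torch.Tensor:
    # q, k: (batch*heads, tokens, dim_head), as produced by the rearrange in forward().
    if _ATTN_PRECISION == "fp32":
        # Leave autocast, upcast the operands, and compute the logits in float32.
        with torch.autocast(enabled=False, device_type="cuda"):
            q, k = q.float(), k.float()
            sim = einsum('b i d, b j d -> b i j', q, k) * scale
    else:
        sim = einsum('b i d, b j d -> b i j', q, k) * scale
    return sim


if __name__ == "__main__":
    q = torch.randn(16, 64, 40)   # 2 samples x 8 heads, 64 query tokens, dim_head=40
    k = torch.randn(16, 77, 40)   # 77 context tokens
    print(scaled_similarity(q, k, scale=40 ** -0.5).shape)  # torch.Size([16, 64, 77])

The same diff also guards the xformers import with the XFORMERS_IS_AVAILBLE flag, so MemoryEfficientCrossAttention is only selected by BasicTransformerBlock when xformers actually imported.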