diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..23f414e236f4475ef71a2272ae076cc70c3de154
--- /dev/null
+++ b/app.py
@@ -0,0 +1,211 @@
+import gradio as gr
+import os
+import sys
+import argparse
+import random
+import time
+from omegaconf import OmegaConf
+import torch
+import torchvision
+from pytorch_lightning import seed_everything
+from huggingface_hub import hf_hub_download
+from einops import repeat
+import torchvision.transforms as transforms
+from torchvision.utils import make_grid
+from utils.utils import instantiate_from_config
+
+from collections import OrderedDict
+
+sys.path.insert(0, "scripts/evaluation")
+from lvdm.models.samplers.ddim import DDIMSampler, DDIMStyleSampler
+
+
+def load_model_checkpoint(model, ckpt):
+    """Load the weights stored in ``ckpt`` into ``model`` and return the model.
+
+    Two checkpoint layouts are handled:
+
+    * a dict with a ``"state_dict"`` entry, which is loaded as-is;
+    * otherwise a DeepSpeed-style dict whose weights live under ``"module"``;
+      the first 16 characters of every key are stripped before loading
+      (presumably the ``_forward_module.`` wrapper prefix -- confirm).
+
+    Loading uses ``strict=False``, so missing or unexpected keys do not raise.
+    """
+    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
+    checkpoint = torch.load(ckpt, map_location="cpu")
+    if "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    else:
+        # deepspeed: build a *new* mapping from the loaded one. The previous
+        # code rebound `state_dict` to an empty OrderedDict before reading
+        # state_dict['module'], so this branch always raised KeyError.
+        state_dict = OrderedDict(
+            (key[16:], value) for key, value in checkpoint["module"].items()
+        )
+
+    model.load_state_dict(state_dict, strict=False)
+    print('>>> model checkpoint loaded.')
+    return model
+
+
+def download_model():
+    """Download every checkpoint the demo needs that is not yet on disk.
+
+    For each (Hugging Face repo, local directory, file names) group the
+    directory is created if necessary, and each file that is missing locally
+    is fetched into it with ``hf_hub_download``.
+    """
+    download_plan = (
+        ('VideoCrafter/Text2Video-512', './checkpoints/videocrafter_t2v_320_512/', ['model.ckpt']),
+        ('liuhuohuo/StyleCrafter', './checkpoints/stylecrafter', ['adapter_v1.pth', 'temporal_v1.pth']),
+    )
+    for repo_id, target_dir, wanted_files in download_plan:
+        os.makedirs(target_dir, exist_ok=True)
+        for wanted in wanted_files:
+            if os.path.exists(os.path.join(target_dir, wanted)):
+                continue  # already present, nothing to fetch
+            hf_hub_download(repo_id=repo_id, filename=wanted, local_dir=target_dir, force_download=True)
+    
+
+def infer(image, prompt, infer_type='image', seed=123, style_strength=1.0, steps=50):
+    """Generate a stylized image or video and return the path of the saved result.
+
+    Args:
+        image: style reference, either a PIL image or an array/tensor that
+            ``torchvision.transforms.ToPILImage`` can convert.
+        prompt: text prompt describing the content to generate.
+        infer_type: ``'image'`` or ``'video'``; selects the config file,
+            latent size, guidance scales, sampler and output format.
+        seed: seed passed to ``seed_everything``.
+        style_strength: written into the adapter config as ``scale``.
+        steps: number of DDIM sampling steps.
+
+    Returns:
+        ``"./output.png"`` in image mode, ``"./output.mp4"`` in video mode.
+
+    Raises:
+        ValueError: if ``infer_type`` is neither ``'image'`` nor ``'video'``.
+        gr.Error: if ``image`` is ``None``.
+    """
+    # Fail fast on an unknown mode. Before, any value other than 'image' was
+    # configured as video but matched neither output branch, so the call only
+    # failed on the unbound `out_path` after the whole sampling run.
+    if infer_type not in ('image', 'video'):
+        raise ValueError(f"infer_type must be 'image' or 'video', got {infer_type!r}")
+    if image is None:
+        raise gr.Error("Please provide a style reference image.")
+    # UI sliders may hand over floats; cast defensively before seeding/sampling.
+    seed, steps = int(seed), int(steps)
+
+    download_model()
+    ckpt_path = 'checkpoints/videocrafter_t2v_320_512/model.ckpt'
+    adapter_ckpt_path = 'checkpoints/stylecrafter/adapter_v1.pth'
+    temporal_ckpt_path = 'checkpoints/stylecrafter/temporal_v1.pth'
+    if infer_type == 'image':
+        config_file = 'configs/inference_image_512_512.yaml'
+        h, w = 512 // 8, 512 // 8
+        unconditional_guidance_scale = 7.5
+        unconditional_guidance_scale_style = None
+    else:
+        config_file = 'configs/inference_video_320_512.yaml'
+        h, w = 320 // 8, 512 // 8
+        unconditional_guidance_scale = 15.0
+        unconditional_guidance_scale_style = 7.5
+
+    config = OmegaConf.load(config_file)
+    model_config = config.pop("model", OmegaConf.create())
+    model_config['params']['adapter_config']['params']['scale'] = style_strength
+
+    # NOTE(review): the model is rebuilt and its checkpoints reloaded on every
+    # request; consider caching one model per infer_type.
+    model = instantiate_from_config(model_config)
+    model = model.cuda()
+
+    # load ckpt
+    assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
+    assert os.path.exists(adapter_ckpt_path), "Error: adapter checkpoint Not Found!"
+    assert os.path.exists(temporal_ckpt_path), "Error: temporal checkpoint Not Found!"
+    model = load_model_checkpoint(model, ckpt_path)
+    model.load_pretrained_adapter(adapter_ckpt_path)
+    if infer_type == 'video':
+        model.load_pretrained_temporal(temporal_ckpt_path)
+    model.eval()
+
+    seed_everything(seed)
+
+    batch_size = 1
+    channels = model.channels
+    frames = model.temporal_length if infer_type == 'video' else 1
+    noise_shape = [batch_size, channels, frames, h, w]
+
+    # Resize/CenterCrop below need a PIL image for non-tensor input, but a
+    # gr.Image component delivers a numpy array unless type="pil" is set.
+    if not hasattr(image, 'convert'):
+        image = torchvision.transforms.ToPILImage()(image)
+    image = image.convert('RGB')  # drop alpha / expand grayscale to 3 channels
+
+    style_transforms = torchvision.transforms.Compose([
+        torchvision.transforms.Resize(512),
+        torchvision.transforms.CenterCrop(512),
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Lambda(lambda x: x * 2. - 1.),
+    ])
+
+    # Pure inference: do not record an autograd graph.
+    with torch.no_grad():
+        # text cond
+        cond = model.get_learned_conditioning([prompt])
+        neg_prompt = batch_size * [""]
+        uc = model.get_learned_conditioning(neg_prompt)
+
+        # style cond
+        style_img = style_transforms(image).unsqueeze(0).cuda()
+        style_cond = model.get_batch_style(style_img)
+        append_to_context = model.adapter(style_cond)
+
+        scale_scalar = model.adapter.scale_predictor(torch.concat([append_to_context, cond], dim=1))
+
+        ddim_sampler = DDIMSampler(model) if infer_type == 'image' else DDIMStyleSampler(model)
+
+        samples, _ = ddim_sampler.sample(S=steps,
+                                         conditioning=cond,
+                                         batch_size=noise_shape[0],
+                                         shape=noise_shape[1:],
+                                         verbose=False,
+                                         unconditional_guidance_scale=unconditional_guidance_scale,
+                                         unconditional_guidance_scale_style=unconditional_guidance_scale_style,
+                                         unconditional_conditioning=uc,
+                                         eta=1.0,
+                                         temporal_length=noise_shape[2],
+                                         append_to_context=append_to_context,
+                                         scale_scalar=scale_scalar
+                                         )
+        samples = model.decode_first_stage(samples)
+
+    if infer_type == 'image':
+        frame = samples[:, :, 0, :, :].detach().cpu()
+        out_path = "./output.png"
+        # Map [-1, 1] to [0, 1] by hand instead of normalize=True with
+        # range=(-1, 1): newer torchvision renamed that keyword to
+        # `value_range`, so the old spelling raises TypeError there.
+        frame = (torch.clamp(frame, -1, 1) + 1.0) / 2.0
+        torchvision.utils.save_image(frame, out_path, nrow=1)
+    else:
+        samples = samples.detach().cpu()
+        out_path = "./output.mp4"
+        video = torch.clamp(samples, -1, 1)
+        video = video.permute(2, 0, 1, 3, 4) # [T, B, C, H, W]
+        frame_grids = [torchvision.utils.make_grid(video[t], nrow=1) for t in range(video.shape[0])]
+        grid = torch.stack(frame_grids, dim=0)
+        grid = (grid + 1.0) / 2.0
+        grid = (grid * 255).permute(0, 2, 3, 1).numpy().astype('uint8')
+        torchvision.io.write_video(out_path, grid, fps=8, video_codec='h264', options={'crf': '10'})
+
+    return out_path
+
+
+def read_content(file_path: str) -> str:
+    """Return the full text of ``file_path``, decoded as UTF-8."""
+    with open(file_path, mode='r', encoding='utf-8') as handle:
+        return handle.read()
+
+
+# Example rows: [style image, prompt, generation type, seed, style strength, steps].
+# The image paths must match the assets shipped in eval_data/ exactly
+# (craft_1 is a .jpg and craft_2 is a .png).
+demo_exaples = [
+    ['eval_data/3d_1.png', 'A bouquet of flowers in a vase.', 'image', 123, 1.0, 50],
+    ['eval_data/craft_1.jpg', 'A modern cityscape with towering skyscrapers.', 'image', 124, 1.0, 50],
+    ['eval_data/digital_art_2.jpeg', 'A lighthouse standing tall on a rocky coast.', 'image', 123, 1.0, 50],
+    ['eval_data/oil_paint_2.jpg', 'A man playing the guitar on a city street.', 'image', 123, 1.0, 50],
+    ['eval_data/craft_2.png', 'City street at night with bright lights and busy traffic.', 'video', 123, 1.0, 50],
+    ['eval_data/anime_1.jpg', 'A field of sunflowers on a sunny day.', 'video', 123, 1.0, 50],
+    ['eval_data/ink_2.jpeg', 'A knight riding a horse through a field.', 'video', 123, 1.0, 50],
+    ['eval_data/oil_paint_2.jpg', 'A street performer playing the guitar.', 'video', 121, 1.0, 50],
+    ['eval_data/icon_1.png', 'A campfire surrounded by tents.', 'video', 123, 1.0, 50],
+]
+css = """
+#input_img {max-height: 512px} 
+#output_vid {max-width: 512px;}
+"""
+
+with gr.Blocks(analytics_enabled=False, css=css) as demo_iface:
+    gr.HTML(read_content("header.html"))
+
+    with gr.Tab(label='Stylized Generation'):
+        with gr.Column():
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        # type="pil": infer() pushes the image through torchvision
+                        # Resize/CenterCrop, which reject the default numpy array.
+                        input_style_ref = gr.Image(label="Style Reference", elem_id="input_img", type="pil")
+                    with gr.Row():
+                        input_prompt = gr.Text(label='Prompts')
+                    with gr.Row():
+                        input_seed = gr.Slider(label='Random Seed', minimum=0, maximum=10000, step=1, value=123)
+                        input_style_strength = gr.Slider(minimum=0.0, maximum=2.0, step=0.01, label='Style Strength', value=1.0)
+                    with gr.Row():
+                        input_step = gr.Slider(minimum=1, maximum=75, step=1, elem_id="i2v_steps", label="Sampling steps", value=50)
+                        input_type = gr.Radio(choices=["image", "video"], label="Generation Type", value="image")
+                    input_end_btn = gr.Button("Generate")
+                # NOTE(review): in image mode infer() returns a .png path into this
+                # gr.Video component -- confirm that it renders as intended.
+                with gr.Row():
+                    output_result = gr.Video(label="Generated Results",elem_id="output_vid",autoplay=True,show_share_button=True)
+
+            gr.Examples(examples=demo_exaples,
+                        inputs=[input_style_ref, input_prompt, input_type, input_seed, input_style_strength, input_step],
+                        outputs=[output_result],
+                        fn = infer,
+            )
+        input_end_btn.click(inputs=[input_style_ref, input_prompt, input_type, input_seed, input_style_strength, input_step],
+                        outputs=[output_result],
+                        fn = infer
+        )
+
+demo_iface.queue(max_size=12).launch(show_api=True)
\ No newline at end of file
diff --git a/configs/inference_image_512_512.yaml b/configs/inference_image_512_512.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..01e3ecd14d27fc7db9bd935c63aee5c0244a47b2
--- /dev/null
+++ b/configs/inference_image_512_512.yaml
@@ -0,0 +1,118 @@
+model:
+  target: lvdm.models.ddpm3d_cond.T2IAdapterStyleAS
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size: [64, 64]
+    channels: 4
+    #monitor: val/loss_simple
+    scale_by_std: false
+    scale_factor: 0.18215
+    # training related
+    use_ema: false
+    uncond_prob: 0.0
+    uncond_type: 'empty_seq'
+    scheduler_config:
+        target: utils.lr_scheduler.LambdaLRScheduler
+        interval: 'step'
+        frequency: 100
+        params:
+          start_step: 0
+          final_decay_ratio: 0.01
+          decay_steps: 20000
+
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNet2DModel
+      params:
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        #num_heads: 8
+        num_head_channels: 64 # need to fix for flash-attn
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: false
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: true
+        use_causal_attention: false
+        temporal_length: 16
+        addition_attention: true
+
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        freeze: true
+        layer: "penultimate"
+        # version: checkpoints/open_clip/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin
+
+    style_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedder
+      params:
+        # version: checkpoints/open_clip/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin
+        freeze: true
+        only_cls: false
+        use_proj: false
+        use_shuffle: false
+        mask_ratio: 0.0
+
+    adapter_config:
+      target: lvdm.modules.encoders.adapter.StyleAdapterDualAttnAS
+      cond_name: style
+      trainable: true
+      params:
+        scale: 1.0
+        use_norm: true
+        image_context_config:
+          target: lvdm.modules.encoders.adapter.StyleTransformer
+          params:
+            in_dim: 1280
+            out_dim: 1024
+            num_heads: 8
+            num_tokens: 8
+            n_layers: 3
+        scale_predictor_config:
+          target: lvdm.modules.encoders.adapter.ScaleEncoder
+          params:
+            in_dim: 1024
+            out_dim: 1
+            num_heads: 8
+            num_tokens: 16
+            n_layers: 2
+          # target: lvdm.modules.encoders.adapter.ImageContext
+          # params:
+          #   width: 1024
+          #   context_dim: 1024
+          #   token_num: 4
+      
\ No newline at end of file
diff --git a/configs/inference_video_320_512.yaml b/configs/inference_video_320_512.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f6d11d0e090fd2a4113cb9733dd92fc589ed5add
--- /dev/null
+++ b/configs/inference_video_320_512.yaml
@@ -0,0 +1,122 @@
+model:
+  target: lvdm.models.ddpm3d_cond.T2VFintoneStyleAS
+  params:
+    linear_start: 0.00085
+    linear_end: 0.012
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: video
+    cond_stage_key: caption
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    image_size: [64, 64]
+    channels: 4
+    #monitor: val/loss_simple
+    scale_by_std: false
+    scale_factor: 0.18215
+    # training related
+    use_ema: false
+    uncond_prob: 0.0
+    uncond_type: 'empty_seq'
+
+
+    scheduler_config:
+        target: utils.lr_scheduler.LambdaLRScheduler
+        interval: 'step'
+        frequency: 100
+        params:
+          start_step: 0
+          final_decay_ratio: 0.01
+          decay_steps: 20000
+
+    # train_strategy: 'video_only'
+
+    unet_config:
+      target: lvdm.modules.networks.openaimodel3d.UNetModel
+      params:
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2, 1]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        #num_heads: 8
+        num_head_channels: 64 # need to fix for flash-attn
+        transformer_depth: 1
+        context_dim: 1024
+        use_linear: true
+        use_checkpoint: true
+        temporal_conv: false
+        temporal_attention: true
+        temporal_selfatt_only: true
+        use_relative_position: true
+        use_causal_attention: false
+        temporal_length: 16
+        addition_attention: true
+
+    first_stage_config:
+      target: lvdm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPEmbedder
+      params:
+        # `version` (a local weights path) is commented out, as in
+        # inference_image_512_512.yaml: download_model() in app.py never fetches
+        # checkpoints/open_clip/..., so the path is absent on a fresh deployment.
+        # NOTE(review): presumably the embedder then falls back to its built-in
+        # default weights -- confirm.
+        # version: checkpoints/open_clip/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin
+        freeze: true
+        layer: "penultimate"
+
+    style_stage_config:
+      target: lvdm.modules.encoders.condition.FrozenOpenCLIPImageEmbedder
+      params:
+        # Same reasoning as for cond_stage_config above.
+        # version: checkpoints/open_clip/CLIP-ViT-H-14-laion2B-s32B-b79K/open_clip_pytorch_model.bin
+        freeze: true
+        only_cls: false
+        use_proj: false
+        use_shuffle: false
+        mask_ratio: 0.0
+
+    adapter_config:
+      target: lvdm.modules.encoders.adapter.StyleAdapterDualAttnAS
+      cond_name: style
+      trainable: true
+      params:
+        scale: 1.0
+        use_norm: true
+        image_context_config:
+          target: lvdm.modules.encoders.adapter.StyleTransformer
+          params:
+            in_dim: 1280
+            out_dim: 1024
+            num_heads: 8
+            num_tokens: 8
+            n_layers: 3
+        scale_predictor_config:
+          target: lvdm.modules.encoders.adapter.ScaleEncoder
+          params:
+            in_dim: 1024
+            out_dim: 1
+            num_heads: 8
+            num_tokens: 16
+            n_layers: 2
+          # target: lvdm.modules.encoders.adapter.ImageContext
+          # params:
+          #   width: 1024
+          #   context_dim: 1024
+          #   token_num: 4
+      
\ No newline at end of file
diff --git a/eval_data/3d_1.png b/eval_data/3d_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..89328eaac12832a554a0fa120fb485b17805c579
Binary files /dev/null and b/eval_data/3d_1.png differ
diff --git a/eval_data/anime_1.jpg b/eval_data/anime_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e47149a1f343e86aa741cd890748a8cb9e17b628
Binary files /dev/null and b/eval_data/anime_1.jpg differ
diff --git a/eval_data/craft_1.jpg b/eval_data/craft_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..aa0e5f646e169f233247da22b902f083d2510399
Binary files /dev/null and b/eval_data/craft_1.jpg differ
diff --git a/eval_data/craft_2.png b/eval_data/craft_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6eeb02fad17e2c03be84be31e7508b440ddb80e
Binary files /dev/null and b/eval_data/craft_2.png differ
diff --git a/eval_data/digital_art_2.jpeg b/eval_data/digital_art_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..1efc72fd02eec874ab638ce21d1be9656818fb16
Binary files /dev/null and b/eval_data/digital_art_2.jpeg differ
diff --git a/eval_data/icon_1.png b/eval_data/icon_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa34b21b277b49a49a09fd18e62382f5eae518e9
Binary files /dev/null and b/eval_data/icon_1.png differ
diff --git a/eval_data/ink_2.jpeg b/eval_data/ink_2.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..e5c214f0e13c0376b83492664917570559d6508e
Binary files /dev/null and b/eval_data/ink_2.jpeg differ
diff --git a/eval_data/oil_paint_2.jpg b/eval_data/oil_paint_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..82e523e1cfa5a40e41eaa47258bfd7c7a15e3145
Binary files /dev/null and b/eval_data/oil_paint_2.jpg differ
diff --git a/header.html b/header.html
new file mode 100644
index 0000000000000000000000000000000000000000..342de756ed503b836c771860f355410395909e83
--- /dev/null
+++ b/header.html
@@ -0,0 +1,36 @@
+<style>
+    .button-container {
+        display: flex;
+        justify-content: center;
+        align-items: center;
+        gap: 1rem;
+    }
+</style>
+
+<div style="text-align: center; max-width: 900px; margin: 0 auto;">
+    <div>
+        <h1>
+            StyleCrafter: Enhancing Stylized Text-to-Video Generation with Style Adapter
+        </h1>
+    </div>
+
+    &nbsp;
+
+    <div style="text-align: center; max-width: 600px; margin: 0 auto;">
+        <p style="align-items: center; margin-bottom: 7px;">
+            This is an online demo for StyleCrafter, a model that can generate images/videos with your favorite style.
+        </p>
+        <p style="align-items: center; margin-bottom: 7px;">
+            You can upload your own style image and text description, and StyleCrafter will intelligently combine the style elements from the image and the text to create a unique and visually appealing output.
+        </p>
+    </div>
+
+    &nbsp;
+
+    <div class="column has-text-centered button-container">
+        <a href='https://arxiv.org/abs/2312.00330'><img src='https://img.shields.io/badge/arXiv-2312.00330-b31b1b.svg'></a> 
+        <a href='https://gongyeliu.github.io/StyleCrafter.github.io/'><img src='https://img.shields.io/badge/Project-Page-Green'></a>
+        <a href='https://github.com/GongyeLiu/StyleCrafter'><img src='https://img.shields.io/badge/GitHub-Code-181717?logo=github&labelCase=asis'></a>
+    </div>
+
+</div>
\ No newline at end of file
diff --git a/lvdm/__pycache__/basics.cpython-39.pyc b/lvdm/__pycache__/basics.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77719833258b3a45939d75388e4d83f53928e9db
Binary files /dev/null and b/lvdm/__pycache__/basics.cpython-39.pyc differ
diff --git a/lvdm/__pycache__/common.cpython-39.pyc b/lvdm/__pycache__/common.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72baee08d6810baa2cbff614b885f1d59e17aa03
Binary files /dev/null and b/lvdm/__pycache__/common.cpython-39.pyc differ
diff --git a/lvdm/__pycache__/distributions.cpython-39.pyc b/lvdm/__pycache__/distributions.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..741802a65f21d5cbf6b0f7cb9845b24f5f159d38
Binary files /dev/null and b/lvdm/__pycache__/distributions.cpython-39.pyc differ
diff --git a/lvdm/__pycache__/ema.cpython-39.pyc b/lvdm/__pycache__/ema.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b84e899b225cfe87b02d2b1c8cb2156d0bde6ae
Binary files /dev/null and b/lvdm/__pycache__/ema.cpython-39.pyc differ
diff --git a/lvdm/basics.py b/lvdm/basics.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c771d13a7f4a932ac370f08797a8b6ba9e85ff
--- /dev/null
+++ b/lvdm/basics.py
@@ -0,0 +1,100 @@
+# adopted from
+# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+# and
+# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+# and
+# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+#
+# thanks!
+
+import torch.nn as nn
+from utils.utils import instantiate_from_config
+
+
+def disabled_train(self, mode=True):
+    """Replacement for ``model.train`` that leaves the train/eval mode untouched.
+
+    ``mode`` is accepted for signature compatibility and ignored; the method
+    does nothing except return ``self``.
+    """
+    return self
+
+def zero_module(module):
+    """Set every parameter of ``module`` to zero in place and return the module."""
+    for param in module.parameters():
+        param.detach().zero_()
+    return module
+
+def scale_module(module, scale):
+    """Multiply every parameter of ``module`` by ``scale`` in place and return the module."""
+    for param in module.parameters():
+        param.detach().mul_(scale)
+    return module
+
+
+def conv_nd(dims, *args, **kwargs):
+    """
+    Build an ``nn.Conv1d``, ``nn.Conv2d`` or ``nn.Conv3d`` for ``dims`` 1, 2 or 3.
+
+    All remaining arguments are forwarded to the chosen constructor; any other
+    ``dims`` raises ``ValueError``.
+    """
+    for rank, conv_cls in ((1, nn.Conv1d), (2, nn.Conv2d), (3, nn.Conv3d)):
+        if dims == rank:
+            return conv_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def linear(*args, **kwargs):
+    """
+    Create a linear module.
+
+    All positional and keyword arguments are forwarded unchanged to
+    ``nn.Linear``.
+    """
+    return nn.Linear(*args, **kwargs)
+
+
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Build an ``nn.AvgPool1d``, ``nn.AvgPool2d`` or ``nn.AvgPool3d`` for ``dims`` 1, 2 or 3.
+
+    All remaining arguments are forwarded to the chosen constructor; any other
+    ``dims`` raises ``ValueError``.
+    """
+    for rank, pool_cls in ((1, nn.AvgPool1d), (2, nn.AvgPool2d), (3, nn.AvgPool3d)):
+        if dims == rank:
+            return pool_cls(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def nonlinearity(type='silu'):
+    """
+    Create an activation module selected by name.
+
+    :param type: ``'silu'`` for ``nn.SiLU`` or ``'leaky_relu'`` for ``nn.LeakyReLU``.
+    :return: a new instance of the selected activation module.
+    :raises ValueError: for any other name. Previously the function fell
+        through and returned ``None``, which only failed later at the call site.
+    """
+    if type == 'silu':
+        return nn.SiLU()
+    if type == 'leaky_relu':
+        return nn.LeakyReLU()
+    raise ValueError(f"unsupported nonlinearity: {type}")
+
+
+class GroupNormSpecific(nn.GroupNorm):
+    """GroupNorm that normalizes in float32 and casts the result back to the input dtype."""
+
+    def forward(self, x):
+        input_dtype = x.dtype
+        normalized = super().forward(x.float())
+        return normalized.type(input_dtype)
+
+
+def normalization(channels, num_groups=32):
+    """
+    Build the standard normalization layer: a ``GroupNormSpecific``.
+    :param channels: number of input channels.
+    :param num_groups: number of groups the channels are split into.
+    :return: an nn.Module for normalization.
+    """
+    layer = GroupNormSpecific(num_groups, channels)
+    return layer
+
+
+class HybridConditioner(nn.Module):
+    """Pair of conditioners: one for concat inputs, one for cross-attention inputs.
+
+    Both sub-modules are built from their configs with
+    ``instantiate_from_config``.
+    """
+
+    def __init__(self, c_concat_config, c_crossattn_config):
+        super().__init__()
+        self.concat_conditioner = instantiate_from_config(c_concat_config)
+        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
+
+    def forward(self, c_concat, c_crossattn):
+        """Run each conditioner on its input and return both results, each wrapped in a list."""
+        return {
+            'c_concat': [self.concat_conditioner(c_concat)],
+            'c_crossattn': [self.crossattn_conditioner(c_crossattn)],
+        }
\ No newline at end of file
diff --git a/lvdm/common.py b/lvdm/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..35569b25aa97236d7d083d8b6ef0c0f3187c2388
--- /dev/null
+++ b/lvdm/common.py
@@ -0,0 +1,95 @@
+import math
+from inspect import isfunction
+import torch
+from torch import nn
+import torch.distributed as dist
+
+
+def gather_data(data, return_np=True):
+    ''' all-gather `data` from every process into one list (numpy arrays when return_np is set) '''
+    world_size = dist.get_world_size()
+    gathered = [torch.zeros_like(data) for _ in range(world_size)]
+    dist.all_gather(gathered, data)  # gather not supported with NCCL
+    if not return_np:
+        return gathered
+    return [chunk.cpu().numpy() for chunk in gathered]
+
+def autocast(f):
+    """Decorator that runs ``f`` inside ``torch.cuda.amp.autocast``.
+
+    The autocast dtype and cache setting are read from the current global
+    autocast state each time the wrapped function is called. The wrapper keeps
+    ``f``'s name, docstring and signature metadata via ``functools.wraps``.
+    """
+    from functools import wraps  # local import keeps this block self-contained
+
+    @wraps(f)
+    def do_autocast(*args, **kwargs):
+        with torch.cuda.amp.autocast(enabled=True,
+                                     dtype=torch.get_autocast_gpu_dtype(),
+                                     cache_enabled=torch.is_autocast_cache_enabled()):
+            return f(*args, **kwargs)
+    return do_autocast
+
+
+def extract_into_tensor(a, t, x_shape):
+    """Gather ``a`` at indices ``t`` along the last axis and reshape to ``(b, 1, ..., 1)``.
+
+    ``b`` is the first dimension of ``t``; the result has ``len(x_shape)``
+    dimensions in total.
+    """
+    batch, *_ = t.shape
+    picked = a.gather(-1, t)
+    singleton_dims = (1,) * (len(x_shape) - 1)
+    return picked.reshape(batch, *singleton_dims)
+
+
+def noise_like(shape, device, repeat=False):
+    """Draw standard-normal noise of ``shape`` on ``device``.
+
+    With ``repeat`` set, a single sample of shape ``(1, *shape[1:])`` is drawn
+    and tiled along the first axis, so every batch entry gets the same noise.
+    """
+    if repeat:
+        single = torch.randn((1, *shape[1:]), device=device)
+        return single.repeat(shape[0], *((1,) * (len(shape) - 1)))
+    return torch.randn(shape, device=device)
+
+
+def default(val, d):
+    """Return ``val`` unless it is ``None``; otherwise return the fallback ``d``.
+
+    When ``d`` is a plain Python function it is called and its result is
+    returned, so expensive fallbacks are only computed on demand.
+    """
+    if val is not None:
+        return val
+    if isfunction(d):
+        return d()
+    return d
+
+def exists(val):
+    """Return True when ``val`` is not ``None`` (falsy values such as 0 or "" still count)."""
+    return val is not None
+
+def identity(*args, **kwargs):
+    """Return a new ``nn.Identity`` module; every argument is accepted and ignored."""
+    return nn.Identity()
+
+def uniq(arr):
+    """Return a dict-keys view of the distinct elements of ``arr`` in first-seen order."""
+    return dict.fromkeys(arr, True).keys()
+
+def mean_flat(tensor):
+    """
+    Average ``tensor`` over every dimension except the first (batch) one.
+    """
+    non_batch_axes = list(range(1, tensor.dim()))
+    return tensor.mean(dim=non_batch_axes)
+
+def ismap(x):
+    """Return True for a 4-D tensor whose second dimension is larger than 3."""
+    if isinstance(x, torch.Tensor):
+        return (x.dim() == 4) and (x.shape[1] > 3)
+    return False
+
+def isimage(x):
+    """Return True for a 4-D tensor whose second dimension is 3 or 1."""
+    if isinstance(x, torch.Tensor):
+        return (x.dim() == 4) and (x.shape[1] in (3, 1))
+    return False
+
+def max_neg_value(t):
+    """Return the negated largest finite value of ``t``'s dtype (via ``torch.finfo``)."""
+    return -torch.finfo(t.dtype).max
+
+def shape_to_str(x):
+    """Format ``x.shape`` as its dimensions joined by "x", e.g. ``2x3x4``."""
+    return "x".join(map(str, x.shape))
+
+def init_(tensor):
+    """Fill ``tensor`` in place with U(-1/sqrt(d), 1/sqrt(d)), d being its last dimension, and return it."""
+    bound = 1 / math.sqrt(tensor.shape[-1])
+    tensor.uniform_(-bound, bound)
+    return tensor
+
+ckpt = torch.utils.checkpoint.checkpoint
+def checkpoint(func, inputs, params, flag):
+    """
+    Call ``func(*inputs)``, optionally through ``torch.utils.checkpoint`` so that
+    intermediate activations are recomputed in the backward pass instead of
+    being stored.
+    :param func: the function to evaluate.
+    :param inputs: the argument sequence to pass to `func`.
+    :param params: kept for interface compatibility; not used by this
+                   implementation.
+    :param flag: if False, call `func` directly without checkpointing.
+    """
+    if not flag:
+        return func(*inputs)
+    return ckpt(func, *inputs)
+
diff --git a/lvdm/distributions.py b/lvdm/distributions.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b69b6984880ec24279b658384ed8031335e3474
--- /dev/null
+++ b/lvdm/distributions.py
@@ -0,0 +1,95 @@
+import torch
+import numpy as np
+
+
+class AbstractDistribution:
+    """Interface for distributions: subclasses provide ``sample`` and ``mode``."""
+
+    def sample(self):
+        """Draw a sample; not implemented in the base class."""
+        raise NotImplementedError()
+
+    def mode(self):
+        """Return the mode; not implemented in the base class."""
+        raise NotImplementedError()
+
+
+class DiracDistribution(AbstractDistribution):
+    """Point-mass distribution: sampling and mode both return the stored value."""
+
+    def __init__(self, value):
+        # The single value this distribution always yields.
+        self.value = value
+
+    def sample(self):
+        """Return the stored value unchanged."""
+        return self.value
+
+    def mode(self):
+        """Return the stored value unchanged."""
+        return self.value
+
+
+class DiagonalGaussianDistribution(object):
+    """Gaussian with diagonal covariance, parameterized by mean and log-variance.
+
+    ``parameters`` is split into two equal halves along dim 1: the first half
+    is the mean, the second the log-variance (clamped to [-30, 20]). With
+    ``deterministic=True`` the variance and standard deviation are zero, so
+    sampling returns the mean.
+    """
+
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            # zeros_like already matches the mean's device and dtype.
+            self.var = self.std = torch.zeros_like(self.mean)
+
+    def sample(self, noise=None):
+        """Return ``mean + std * noise``; standard-normal noise is drawn when none is given."""
+        if noise is None:
+            # Drawn on the CPU and moved afterwards, so the random stream does
+            # not depend on which device the parameters live on.
+            noise = torch.randn(self.mean.shape)
+
+        x = self.mean + self.std * noise.to(device=self.parameters.device)
+        return x
+
+    def kl(self, other=None):
+        """KL divergence to ``other`` (to a standard normal when ``other`` is None).
+
+        Sums over dims 1-3, giving one value per batch entry; returns a
+        one-element zero tensor for a deterministic distribution.
+        """
+        if self.deterministic:
+            return torch.Tensor([0.])
+        if other is None:
+            return 0.5 * torch.sum(torch.pow(self.mean, 2)
+                                   + self.var - 1.0 - self.logvar,
+                                   dim=[1, 2, 3])
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var - 1.0 - self.logvar + other.logvar,
+            dim=[1, 2, 3])
+
+    def nll(self, sample, dims=(1, 2, 3)):
+        """Negative log-likelihood of ``sample``, summed over ``dims``.
+
+        ``dims`` defaults to an immutable tuple rather than a shared list.
+        Returns a one-element zero tensor for a deterministic distribution.
+        """
+        if self.deterministic:
+            return torch.Tensor([0.])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims)
+
+    def mode(self):
+        """Return the mean, which is the mode of a Gaussian."""
+        return self.mean
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
+    Element-wise KL divergence KL(N(mean1, exp(logvar1)) || N(mean2, exp(logvar2))).
+    Arguments broadcast against each other, so tensors can be mixed with
+    scalars as long as at least one argument is a Tensor.
+    """
+    reference = next(
+        (obj for obj in (mean1, logvar1, mean2, logvar2) if isinstance(obj, torch.Tensor)),
+        None,
+    )
+    assert reference is not None, "at least one argument must be a Tensor"
+
+    def _as_tensor(value):
+        # torch.exp() needs real Tensors; scalars adopt the reference's dtype/device.
+        if isinstance(value, torch.Tensor):
+            return value
+        return torch.tensor(value).to(reference)
+
+    logvar1 = _as_tensor(logvar1)
+    logvar2 = _as_tensor(logvar2)
+
+    variance_ratio = torch.exp(logvar1 - logvar2)
+    scaled_sq_diff = ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
+    return 0.5 * (-1.0 + logvar2 - logvar1 + variance_ratio + scaled_sq_diff)
diff --git a/lvdm/ema.py b/lvdm/ema.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8c75af43565f6e140287644aaaefa97dd6e67c5
--- /dev/null
+++ b/lvdm/ema.py
@@ -0,0 +1,76 @@
+import torch
+from torch import nn
+
+
+class LitEma(nn.Module):
+    """Exponential moving average (EMA) of a model's trainable parameters, stored as buffers."""
+
+    def __init__(self, model, decay=0.9999, use_num_upates=True):
+        super().__init__()
+        if decay < 0.0 or decay > 1.0:
+            raise ValueError('Decay must be between 0 and 1')
+
+        # Maps each tracked model parameter name to the name of its shadow buffer.
+        self.m_name2s_name = {}
+        self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32))
+        # A negative num_updates disables the decay warm-up applied in forward().
+        self.register_buffer('num_updates', torch.tensor(0,dtype=torch.int) if use_num_upates
+                             else torch.tensor(-1,dtype=torch.int))
+
+        for name, p in model.named_parameters():
+            if p.requires_grad:
+                #remove as '.'-character is not allowed in buffers
+                s_name = name.replace('.','')
+                self.m_name2s_name.update({name:s_name})
+                self.register_buffer(s_name,p.clone().detach().data)
+
+        self.collected_params = []
+
+    def forward(self,model):
+        """Move every shadow buffer towards the current trainable parameters of `model`."""
+        decay = self.decay
+        if self.num_updates >= 0:
+            self.num_updates += 1
+            # Warm-up: cap the decay at (1 + n) / (10 + n) after n updates.
+            decay = min(self.decay,(1 + self.num_updates) / (10 + self.num_updates))
+        one_minus_decay = 1.0 - decay
+
+        with torch.no_grad():
+            m_param = dict(model.named_parameters())
+            shadow_params = dict(self.named_buffers())
+            for key in m_param:
+                if m_param[key].requires_grad:
+                    shadow = shadow_params[self.m_name2s_name[key]]
+                    # Convert the parameter to the buffer's dtype/device and update the buffer in
+                    # place; converting the buffer via type_as() would only update a temporary copy.
+                    shadow.sub_(one_minus_decay * (shadow - m_param[key].to(shadow)))
+                else:
+                    assert not key in self.m_name2s_name
+
+    def copy_to(self, model):
+        """Overwrite the trainable parameters of `model` with their shadow (EMA) values."""
+        m_param = dict(model.named_parameters())
+        shadow_params = dict(self.named_buffers())
+        for key in m_param:
+            if m_param[key].requires_grad:
+                m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
+            else:
+                assert not key in self.m_name2s_name
+
+    def store(self, parameters):
+        """Keep clones of `parameters` (an iterable of `torch.nn.Parameter`) for a later `restore`."""
+        self.collected_params = [param.clone() for param in parameters]
+
+    def restore(self, parameters):
+        """
+        Restore the parameters stored with the `store` method.
+        Useful to validate the model with EMA parameters without affecting the
+        original optimization process. Store the parameters before the
+        `copy_to` method. After validation (or model saving), use this to
+        restore the former parameters.
+        Args:
+          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            updated with the stored parameters.
+        """
+        for c_param, param in zip(self.collected_params, parameters):
+            param.data.copy_(c_param.data)
diff --git a/lvdm/models/__pycache__/autoencoder.cpython-39.pyc b/lvdm/models/__pycache__/autoencoder.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..031d18cdfacc78c66477b6c1b0404b19f6159448
Binary files /dev/null and b/lvdm/models/__pycache__/autoencoder.cpython-39.pyc differ
diff --git a/lvdm/models/__pycache__/ddpm3d.cpython-39.pyc b/lvdm/models/__pycache__/ddpm3d.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6502f0edde5e5da1a6e25f77d655dcbeb55053f
Binary files /dev/null and b/lvdm/models/__pycache__/ddpm3d.cpython-39.pyc differ
diff --git a/lvdm/models/__pycache__/ddpm3d_cond.cpython-39.pyc b/lvdm/models/__pycache__/ddpm3d_cond.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a819bc3eed1a356c6cf032d53c4ab4730b40156
Binary files /dev/null and b/lvdm/models/__pycache__/ddpm3d_cond.cpython-39.pyc differ
diff --git a/lvdm/models/__pycache__/utils_diffusion.cpython-39.pyc b/lvdm/models/__pycache__/utils_diffusion.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f939b56a0eac01c2bd48dab9e180e0e7007032b9
Binary files /dev/null and b/lvdm/models/__pycache__/utils_diffusion.cpython-39.pyc differ
diff --git a/lvdm/models/autoencoder.py b/lvdm/models/autoencoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc479d8b446b530885f4a3cc5d25cb58f0c00d74
--- /dev/null
+++ b/lvdm/models/autoencoder.py
@@ -0,0 +1,219 @@
+import os
+from contextlib import contextmanager
+import torch
+import numpy as np
+from einops import rearrange
+import torch.nn.functional as F
+import pytorch_lightning as pl
+from lvdm.modules.networks.ae_modules import Encoder, Decoder
+from lvdm.distributions import DiagonalGaussianDistribution
+from utils.utils import instantiate_from_config
+
+
+class AutoencoderKL(pl.LightningModule):
+    def __init__(self,
+                 ddconfig,
+                 lossconfig,
+                 embed_dim,
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 test=False,
+                 logdir=None,
+                 input_dim=4,
+                 test_args=None,
+                 ):
+        super().__init__()
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        self.loss = instantiate_from_config(lossconfig)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        self.input_dim = input_dim
+        self.test = test
+        self.test_args = test_args
+        self.logdir = logdir
+        if colorize_nlabels is not None:
+            assert type(colorize_nlabels)==int
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)
+        if self.test:
+            self.init_test()
+    
+    def init_test(self,):
+        """Enter test mode and create the output dirs chosen in `self.test_args`; the epoch tag is 'null' without a loaded checkpoint."""
+        assert(self.test_args is not None)  # checked before test_args is first used
+        self.test = True
+        save_dir = os.path.join(self.logdir, "test")
+        if 'ckpt' in self.test_args:
+            ckpt_name = os.path.basename(self.test_args.ckpt).split('.ckpt')[0] + f"_epoch{getattr(self, '_cur_epoch', 'null')}"
+            self.root = os.path.join(save_dir, ckpt_name)
+        else:
+            self.root = save_dir
+        if 'test_subdir' in self.test_args:
+            self.root = os.path.join(save_dir, self.test_args.test_subdir)
+
+        self.root_zs = os.path.join(self.root, "zs")
+        self.root_dec = os.path.join(self.root, "reconstructions")
+        self.root_inputs = os.path.join(self.root, "inputs")
+        os.makedirs(self.root, exist_ok=True)
+        if self.test_args.save_z:
+            os.makedirs(self.root_zs, exist_ok=True)
+        if self.test_args.save_reconstruction:
+            os.makedirs(self.root_dec, exist_ok=True)
+        if self.test_args.save_input:
+            os.makedirs(self.root_inputs, exist_ok=True)
+        self.test_maximum = getattr(self.test_args, 'test_maximum', None)
+        self.count = 0
+        self.eval_metrics = {}
+        self.decodes = []
+        self.save_decode_samples = 2048
+
+    def init_from_ckpt(self, path, ignore_keys=list()):
+        """Load weights from `path` (non-strict), dropping keys that start with any of `ignore_keys`."""
+        sd = torch.load(path, map_location="cpu")
+        # Remember the checkpoint epoch when present; init_test() uses it to name its output dir.
+        self._cur_epoch = sd.get('epoch', 'null')
+        # Unwrap the nested weights even if the checkpoint carries no 'epoch' entry.
+        if "state_dict" in sd:
+            sd = sd["state_dict"]
+        for k in list(sd.keys()):
+            # any() removes each key at most once, even if several prefixes match it.
+            if any(k.startswith(ik) for ik in ignore_keys):
+                print("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        self.load_state_dict(sd, strict=False)
+        # self.load_state_dict(sd, strict=True)
+        print(f"Restored from {path}")
+
+    def encode(self, x, **kwargs):
+        
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+
+    def decode(self, z, **kwargs):
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z)
+        return dec
+
+    def forward(self, input, sample_posterior=True):
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior
+
+    def get_input(self, batch, k):
+        x = batch[k]
+        if x.dim() == 5 and self.input_dim == 4:
+            b,c,t,h,w = x.shape
+            self.b = b
+            self.t = t 
+            x = rearrange(x, 'b c t h w -> (b t) c h w')
+
+        return x
+
+    def training_step(self, batch, batch_idx, optimizer_idx):
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+
+        if optimizer_idx == 0:
+            # train encoder+decoder+logvar
+            aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                            last_layer=self.get_last_layer(), split="train")
+            self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return aeloss
+
+        if optimizer_idx == 1:
+            # train the discriminator
+            discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step,
+                                                last_layer=self.get_last_layer(), split="train")
+
+            self.log("discloss", discloss, prog_bar=True, logger=True, on_step=True, on_epoch=True)
+            self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False)
+            return discloss
+
+    def validation_step(self, batch, batch_idx):
+        """Evaluate `self.loss` for optimizer index 0 and index 1 on a validation batch and log both result dicts."""
+        inputs = self.get_input(batch, self.image_key)
+        reconstructions, posterior = self(inputs)
+        aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step,
+                                        last_layer=self.get_last_layer(), split="val")
+        discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step,
+                                            last_layer=self.get_last_layer(), split="val")
+        self.log("val/rec_loss", log_dict_ae["val/rec_loss"])
+        self.log_dict(log_dict_ae)
+        self.log_dict(log_dict_disc)
+        # NOTE(review): this returns the bound `log_dict` method itself, not a dict of metrics — confirm intent.
+        return self.log_dict
+    
+    def configure_optimizers(self):
+        lr = self.learning_rate
+        opt_ae = torch.optim.Adam(list(self.encoder.parameters())+
+                                  list(self.decoder.parameters())+
+                                  list(self.quant_conv.parameters())+
+                                  list(self.post_quant_conv.parameters()),
+                                  lr=lr, betas=(0.5, 0.9))
+        opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(),
+                                    lr=lr, betas=(0.5, 0.9))
+        return [opt_ae, opt_disc], []
+
+    def get_last_layer(self):
+        return self.decoder.conv_out.weight
+
+    @torch.no_grad()
+    def log_images(self, batch, only_inputs=False, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        x = x.to(self.device)
+        if not only_inputs:
+            xrec, posterior = self(x)
+            if x.shape[1] > 3:
+                # colorize with random projection
+                assert xrec.shape[1] > 3
+                x = self.to_rgb(x)
+                xrec = self.to_rgb(xrec)
+            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+            log["reconstructions"] = xrec
+        log["inputs"] = x
+        return log
+
+    def to_rgb(self, x):
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+
+class IdentityFirstStage(torch.nn.Module):
+    """First-stage stand-in whose encode/decode/forward return their input unchanged."""
+    def __init__(self, *args, vq_interface=False, **kwargs):
+        super().__init__()
+        self.vq_interface = vq_interface  # TODO: Should be true by default but check to not break older stuff
+
+    def encode(self, x, *args, **kwargs):
+        return x
+
+    def decode(self, x, *args, **kwargs):
+        return x
+
+    def quantize(self, x, *args, **kwargs):
+        # With vq_interface set, return the 3-tuple (x, None, [None, None, None]) instead of x.
+        return (x, None, [None, None, None]) if self.vq_interface else x
+
+    def forward(self, x, *args, **kwargs):
+        return x
diff --git a/lvdm/models/ddpm3d.py b/lvdm/models/ddpm3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d8286e3082804be95394f7f75e787897b9152de
--- /dev/null
+++ b/lvdm/models/ddpm3d.py
@@ -0,0 +1,781 @@
+"""
+wild mixture of
+https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+https://github.com/CompVis/taming-transformers
+-- merci
+"""
+
+from functools import partial
+from contextlib import contextmanager
+import numpy as np
+from tqdm import tqdm
+from einops import rearrange, repeat
+import logging
+mainlogger = logging.getLogger('mainlogger')
+import torch
+import torch.nn as nn
+from torchvision.utils import make_grid
+import pytorch_lightning as pl
+from utils.utils import instantiate_from_config
+from lvdm.ema import LitEma
+from lvdm.distributions import DiagonalGaussianDistribution
+from lvdm.models.utils_diffusion import make_beta_schedule
+from lvdm.modules.encoders.ip_resampler import ImageProjModel, Resampler
+from lvdm.basics import disabled_train
+from lvdm.common import (
+    extract_into_tensor,
+    noise_like,
+    exists,
+    default
+)
+
+
+__conditioning_keys__ = {'concat': 'c_concat',
+                         'crossattn': 'c_crossattn',
+                         'adm': 'y'}
+
+class DDPM(pl.LightningModule):
+    # classic DDPM with Gaussian diffusion, in image space
+    def __init__(self,
+                 unet_config,
+                 timesteps=1000,
+                 beta_schedule="linear",
+                 loss_type="l2",
+                 ckpt_path=None,
+                 ignore_keys=[],
+                 load_only_unet=False,
+                 monitor=None,
+                 use_ema=True,
+                 first_stage_key="image",
+                 image_size=256,
+                 channels=3,
+                 log_every_t=100,
+                 clip_denoised=True,
+                 linear_start=1e-4,
+                 linear_end=2e-2,
+                 cosine_s=8e-3,
+                 given_betas=None,
+                 original_elbo_weight=0.,
+                 v_posterior=0.,  # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta
+                 l_simple_weight=1.,
+                 conditioning_key=None,
+                 parameterization="eps",  # all assuming fixed variance schedules
+                 scheduler_config=None,
+                 use_positional_encodings=False,
+                 learn_logvar=False,
+                 logvar_init=0.
+                 ):
+        super().__init__()
+        assert parameterization in ["eps", "x0"], 'currently only supporting "eps" and "x0"'
+        self.parameterization = parameterization
+        mainlogger.info(f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode")
+        self.cond_stage_model = None
+        self.clip_denoised = clip_denoised
+        self.log_every_t = log_every_t
+        self.first_stage_key = first_stage_key
+        self.channels = channels
+        self.temporal_length = unet_config.params.temporal_length
+        self.image_size = image_size 
+        if isinstance(self.image_size, int):
+            self.image_size = [self.image_size, self.image_size]
+        self.use_positional_encodings = use_positional_encodings
+        self.model = DiffusionWrapper(unet_config, conditioning_key)
+        self.use_ema = use_ema
+        if self.use_ema:
+            self.model_ema = LitEma(self.model)
+            mainlogger.info(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")
+
+        self.use_scheduler = scheduler_config is not None
+        if self.use_scheduler:
+            self.scheduler_config = scheduler_config
+
+        self.v_posterior = v_posterior
+        self.original_elbo_weight = original_elbo_weight
+        self.l_simple_weight = l_simple_weight
+
+        if monitor is not None:
+            self.monitor = monitor
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+
+        self.register_schedule(given_betas=given_betas, beta_schedule=beta_schedule, timesteps=timesteps,
+                               linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+
+        self.loss_type = loss_type
+
+        self.learn_logvar = learn_logvar
+        self.logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,))
+        if self.learn_logvar:
+            self.logvar = nn.Parameter(self.logvar, requires_grad=True)
+
+
+    def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
+                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+        if exists(given_betas):
+            betas = given_betas
+        else:
+            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end,
+                                       cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+
+        self.register_buffer('betas', to_torch(betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
+        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
+        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        posterior_variance = (1 - self.v_posterior) * betas * (1. - alphas_cumprod_prev) / (
+                    1. - alphas_cumprod) + self.v_posterior * betas
+        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+        self.register_buffer('posterior_variance', to_torch(posterior_variance))
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.register_buffer('posterior_log_variance_clipped', to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
+        self.register_buffer('posterior_mean_coef1', to_torch(
+            betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)))
+        self.register_buffer('posterior_mean_coef2', to_torch(
+            (1. - alphas_cumprod_prev) * np.sqrt(alphas) / (1. - alphas_cumprod)))
+
+        if self.parameterization == "eps":
+            lvlb_weights = self.betas ** 2 / (
+                        2 * self.posterior_variance * to_torch(alphas) * (1 - self.alphas_cumprod))
+        elif self.parameterization == "x0":
+            lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (2. * 1 - torch.Tensor(alphas_cumprod))
+        else:
+            raise NotImplementedError("mu not supported")
+        # TODO how to choose this term
+        lvlb_weights[0] = lvlb_weights[1]
+        self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
+        assert not torch.isnan(self.lvlb_weights).all()
+
+    @contextmanager
+    def ema_scope(self, context=None):
+        if self.use_ema:
+            self.model_ema.store(self.model.parameters())
+            self.model_ema.copy_to(self.model)
+            if context is not None:
+                mainlogger.info(f"{context}: Switched to EMA weights")
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.model.parameters())
+                if context is not None:
+                    mainlogger.info(f"{context}: Restored training weights")
+
+    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+        """Load a checkpoint non-strictly into the whole module, or only into `self.model` if `only_model`."""
+        sd = torch.load(path, map_location="cpu")
+        if "state_dict" in list(sd.keys()):
+            sd = sd["state_dict"]
+        for k in list(sd.keys()):
+            # any() drops each key once, even when several ignore prefixes match it.
+            if any(k.startswith(ik) for ik in ignore_keys):
+                mainlogger.info("Deleting key {} from state_dict.".format(k))
+                del sd[k]
+        target = self.model if only_model else self
+        missing, unexpected = target.load_state_dict(sd, strict=False)
+        mainlogger.info(f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys")
+        if len(missing) > 0:
+            mainlogger.info(f"Missing Keys: {missing}")
+        if len(unexpected) > 0:
+            mainlogger.info(f"Unexpected Keys: {unexpected}")
+
+    def q_mean_variance(self, x_start, t):
+        """
+        Get the distribution q(x_t | x_0).
+        :param x_start: the [N x C x ...] tensor of noiseless inputs.
+        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
+        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
+        """
+        mean = (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start)
+        variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
+        log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, t, x_start.shape)
+        return mean, variance, log_variance
+
+    def predict_start_from_noise(self, x_t, t, noise):
+        return (
+                extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t -
+                extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
+        )
+
+    def q_posterior(self, x_start, x_t, t):
+        posterior_mean = (
+                extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start +
+                extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
+        )
+        posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape)
+        posterior_log_variance_clipped = extract_into_tensor(self.posterior_log_variance_clipped, t, x_t.shape)
+        return posterior_mean, posterior_variance, posterior_log_variance_clipped
+
+    def p_mean_variance(self, x, t, clip_denoised: bool):
+        model_out = self.model(x, t)
+        if self.parameterization == "eps":
+            x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
+        elif self.parameterization == "x0":
+            x_recon = model_out
+        if clip_denoised:
+            x_recon.clamp_(-1., 1.)
+
+        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+        return model_mean, posterior_variance, posterior_log_variance
+
+    @torch.no_grad()
+    def p_sample(self, x, t, clip_denoised=True, repeat_noise=False):
+        b, *_, device = *x.shape, x.device
+        model_mean, _, model_log_variance = self.p_mean_variance(x=x, t=t, clip_denoised=clip_denoised)
+        noise = noise_like(x.shape, device, repeat_noise)
+        # no noise when t == 0
+        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
+        return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+    @torch.no_grad()
+    def p_sample_loop(self, shape, return_intermediates=False):
+        device = self.betas.device
+        b = shape[0]
+        img = torch.randn(shape, device=device)
+        intermediates = [img]
+        for i in tqdm(reversed(range(0, self.num_timesteps)), desc='Sampling t', total=self.num_timesteps):
+            img = self.p_sample(img, torch.full((b,), i, device=device, dtype=torch.long),
+                                clip_denoised=self.clip_denoised)
+            if i % self.log_every_t == 0 or i == self.num_timesteps - 1:
+                intermediates.append(img)
+        if return_intermediates:
+            return img, intermediates
+        return img
+
+    @torch.no_grad()
+    def sample(self, batch_size=16, return_intermediates=False):
+        """Draw `batch_size` samples by running `p_sample_loop` from pure noise."""
+        # __init__ stores image_size as [height, width], so unpack it into the shape tuple.
+        shape = (batch_size, self.channels, *self.image_size)
+        return self.p_sample_loop(shape, return_intermediates=return_intermediates)
+
+    def q_sample(self, x_start, t, noise=None):
+        noise = default(noise, lambda: torch.randn_like(x_start))
+        scale = extract_into_tensor(self.scale_arr, t, x_start.shape) if hasattr(self, 'scale_arr') else 1.  # scale_arr is only registered with use_scale
+        return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start * scale +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+
+    def get_input(self, batch, k):
+        x = batch[k]
+        x = x.to(memory_format=torch.contiguous_format).float()
+        return x
+
+    def _get_rows_from_list(self, samples):
+        n_imgs_per_row = len(samples)
+        denoise_grid = rearrange(samples, 'n b c h w -> b n c h w')
+        denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
+        denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row)
+        return denoise_grid
+
+    @torch.no_grad()
+    def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs):
+        log = dict()
+        x = self.get_input(batch, self.first_stage_key)
+        N = min(x.shape[0], N)
+        n_row = min(x.shape[0], n_row)
+        x = x.to(self.device)[:N]
+        log["inputs"] = x
+
+        # get diffusion row
+        diffusion_row = list()
+        x_start = x[:n_row]
+
+        for t in range(self.num_timesteps):
+            if t % self.log_every_t == 0 or t == self.num_timesteps - 1:
+                t = repeat(torch.tensor([t]), '1 -> b', b=n_row)
+                t = t.to(self.device).long()
+                noise = torch.randn_like(x_start)
+                x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise)
+                diffusion_row.append(x_noisy)
+
+        log["diffusion_row"] = self._get_rows_from_list(diffusion_row)
+
+        if sample:
+            # get denoise row
+            with self.ema_scope("Plotting"):
+                samples, denoise_row = self.sample(batch_size=N, return_intermediates=True)
+
+            log["samples"] = samples
+            log["denoise_row"] = self._get_rows_from_list(denoise_row)
+
+        if return_keys:
+            if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0:
+                return log
+            else:
+                return {key: log[key] for key in return_keys}
+        return log
+
+
+class LatentDiffusion(DDPM):
+    """main class"""
+    def __init__(self,
+                 first_stage_config,
+                 cond_stage_config,
+                 num_timesteps_cond=None,
+                 cond_stage_key="caption",
+                 cond_stage_trainable=False,
+                 cond_stage_forward=None,
+                 conditioning_key=None,
+                 uncond_prob=0.2,
+                 uncond_type="empty_seq",
+                 scale_factor=1.0,
+                 scale_by_std=False,
+                 encoder_type="2d",
+                 only_model=False,
+                 use_scale=False,
+                 scale_a=1,
+                 scale_b=0.3,
+                 mid_step=400,
+                 fix_scale_bug=False,
+                 perframe_ae=True,
+                 *args, **kwargs):
+        """Set up the diffusion model, the optional per-timestep input scale, and the first-stage and conditioning models."""
+        self.num_timesteps_cond = default(num_timesteps_cond, 1)
+        self.scale_by_std = scale_by_std
+        assert self.num_timesteps_cond <= kwargs.get('timesteps', 1000)  # 1000 is the DDPM default
+        # for backwards compatibility after implementation of DiffusionWrapper
+        ckpt_path = kwargs.pop("ckpt_path", None)
+        ignore_keys = kwargs.pop("ignore_keys", [])
+        conditioning_key = default(conditioning_key, 'crossattn')
+        super().__init__(conditioning_key=conditioning_key, *args, **kwargs)
+
+        self.cond_stage_trainable = cond_stage_trainable
+        self.cond_stage_key = cond_stage_key
+        self.perframe_ae = perframe_ae
+
+        # scale factor
+        self.use_scale=use_scale
+        if self.use_scale:
+            self.scale_a=scale_a
+            self.scale_b=scale_b
+            if fix_scale_bug:
+                scale_step=self.num_timesteps-mid_step
+            else: #bug
+                scale_step = self.num_timesteps
+
+            # Linear ramp from scale_a to scale_b over mid_step entries, then scale_step entries of scale_b.
+            scale_arr1 = np.linspace(scale_a, scale_b, mid_step)
+            scale_arr2 = np.full(scale_step, scale_b)
+            scale_arr = np.concatenate((scale_arr1, scale_arr2))
+            to_torch = partial(torch.tensor, dtype=torch.float32)
+            self.register_buffer('scale_arr', to_torch(scale_arr))
+
+        try:
+            self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1
+        except Exception:
+            self.num_downs = 0
+        if not scale_by_std:
+            self.scale_factor = scale_factor
+        else:
+            self.register_buffer('scale_factor', torch.tensor(scale_factor))
+        self.instantiate_first_stage(first_stage_config)
+        self.instantiate_cond_stage(cond_stage_config)
+        self.first_stage_config = first_stage_config
+        self.cond_stage_config = cond_stage_config
+        self.clip_denoised = False
+
+        self.cond_stage_forward = cond_stage_forward
+        self.encoder_type = encoder_type
+        assert(encoder_type in ["2d", "3d"])
+        self.uncond_prob = uncond_prob
+        self.classifier_free_guidance = True if uncond_prob > 0 else False
+        assert(uncond_type in ["zero_embed", "empty_seq"])
+        self.uncond_type = uncond_type
+
+        self.restarted_from_ckpt = False
+        if ckpt_path is not None:
+            self.init_from_ckpt(ckpt_path, ignore_keys, only_model=only_model)
+            self.restarted_from_ckpt = True
+                
+
+    def make_cond_schedule(self, ):
+        self.cond_ids = torch.full(size=(self.num_timesteps,), fill_value=self.num_timesteps - 1, dtype=torch.long)
+        ids = torch.round(torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond)).long()
+        self.cond_ids[:self.num_timesteps_cond] = ids
+
+    def q_sample(self, x_start, t, noise=None):
+        noise = default(noise, lambda: torch.randn_like(x_start))
+        if self.use_scale:  
+            return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start *
+                extract_into_tensor(self.scale_arr, t, x_start.shape) +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+        else:
+            return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start +
+                extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise)
+
+
+    def _freeze_model(self):
+        for name, para in self.model.diffusion_model.named_parameters():
+            para.requires_grad = False
+
+    def instantiate_first_stage(self, config):
+        model = instantiate_from_config(config)
+        self.first_stage_model = model.eval()
+        self.first_stage_model.train = disabled_train
+        for param in self.first_stage_model.parameters():
+            param.requires_grad = False
+
+    def instantiate_cond_stage(self, config):
+        if not self.cond_stage_trainable:
+            model = instantiate_from_config(config)
+            self.cond_stage_model = model.eval()
+            self.cond_stage_model.train = disabled_train
+            for param in self.cond_stage_model.parameters():
+                param.requires_grad = False
+        else:
+            model = instantiate_from_config(config)
+            self.cond_stage_model = model
+    
+    def get_learned_conditioning(self, c):
+        if self.cond_stage_forward is None:
+            if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode):
+                c = self.cond_stage_model.encode(c)
+                if isinstance(c, DiagonalGaussianDistribution):
+                    c = c.mode()
+            else:
+                c = self.cond_stage_model(c)
+        else:
+            assert hasattr(self.cond_stage_model, self.cond_stage_forward)
+            c = getattr(self.cond_stage_model, self.cond_stage_forward)(c)
+        return c
+
+    def get_first_stage_encoding(self, encoder_posterior, noise=None):
+        if isinstance(encoder_posterior, DiagonalGaussianDistribution):
+            z = encoder_posterior.sample(noise=noise)
+        elif isinstance(encoder_posterior, torch.Tensor):
+            z = encoder_posterior
+        else:
+            raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
+        return self.scale_factor * z
+   
+    @torch.no_grad()
+    def encode_first_stage(self, x):
+        if self.encoder_type == "2d" and x.dim() == 5 and not self.perframe_ae:
+            b, _, t, _, _ = x.shape
+            x = rearrange(x, 'b c t h w -> (b t) c h w')
+            reshape_back = True
+        else:
+            reshape_back = False
+        
+        if not self.perframe_ae:
+            encoder_posterior = self.first_stage_model.encode(x)
+            results = self.get_first_stage_encoding(encoder_posterior).detach()
+        else:
+            results = []
+            for index in range(x.shape[2]):
+                frame_batch = self.first_stage_model.encode(x[:,:,index,:,:])
+                frame_result = self.get_first_stage_encoding(frame_batch).detach()
+                results.append(frame_result)
+            results = torch.stack(results, dim=2)
+            
+        if reshape_back:
+            results = rearrange(results, '(b t) c h w -> b c t h w', b=b,t=t)
+        
+        return results
+    
+    @torch.no_grad()
+    def encode_first_stage_2DAE(self, x):
+
+        b, _, t, _, _ = x.shape
+        results = torch.cat([self.get_first_stage_encoding(self.first_stage_model.encode(x[:,:,i])).detach().unsqueeze(2) for i in range(t)], dim=2)
+        
+        return results
+    
+    def decode_core(self, z, **kwargs):
+        if self.encoder_type == "2d" and z.dim() == 5 and not self.perframe_ae:
+            b, _, t, _, _ = z.shape
+            z = rearrange(z, 'b c t h w -> (b t) c h w')
+            reshape_back = True
+        else:
+            reshape_back = False
+
+        if not self.perframe_ae:            
+            z = 1. / self.scale_factor * z
+            results = self.first_stage_model.decode(z, **kwargs)
+        else:
+            results = []
+            for index in range(z.shape[2]):
+                frame_z = 1. / self.scale_factor * z[:,:,index,:,:]
+                frame_result = self.first_stage_model.decode(frame_z, **kwargs)
+                results.append(frame_result)
+            results = torch.stack(results, dim=2)
+
+
+        if reshape_back:
+            results = rearrange(results, '(b t) c h w -> b c t h w', b=b,t=t)
+        return results
+
+    @torch.no_grad()
+    def decode_first_stage(self, z, **kwargs):
+        return self.decode_core(z, **kwargs)
+
+    def apply_model(self, x_noisy, t, cond, **kwargs):
+        if isinstance(cond, dict):
+            # hybrid case, cond is exptected to be a dict
+            pass
+        else:
+            if not isinstance(cond, list):
+                cond = [cond]
+            key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn'
+            cond = {key: cond}
+
+        x_recon = self.model(x_noisy, t, **cond, **kwargs)
+
+        if isinstance(x_recon, tuple):
+            return x_recon[0]
+        else:
+            return x_recon
+
+    def _get_denoise_row_from_list(self, samples, desc=''):
+        denoise_row = []
+        for zd in tqdm(samples, desc=desc):
+            denoise_row.append(self.decode_first_stage(zd.to(self.device)))
+        n_log_timesteps = len(denoise_row)
+
+        denoise_row = torch.stack(denoise_row)  # n_log_timesteps, b, C, H, W
+        
+        if denoise_row.dim() == 5:
+            # img, num_imgs= n_log_timesteps * bs, grid_size=[bs,n_log_timesteps]
+            denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w')
+            denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w')
+            denoise_grid = make_grid(denoise_grid, nrow=n_log_timesteps)
+        elif denoise_row.dim() == 6:
+            # video, grid_size=[n_log_timesteps*bs, t]
+            video_length = denoise_row.shape[3]
+            denoise_grid = rearrange(denoise_row, 'n b c t h w -> b n c t h w')
+            denoise_grid = rearrange(denoise_grid, 'b n c t h w -> (b n) c t h w')
+            denoise_grid = rearrange(denoise_grid, 'n c t h w -> (n t) c h w')
+            denoise_grid = make_grid(denoise_grid, nrow=video_length)
+        else:
+            raise ValueError
+
+        return denoise_grid
+ 
+
+    @torch.no_grad()
+    def decode_first_stage_2DAE(self, z, **kwargs):
+
+        b, _, t, _, _ = z.shape
+        z = 1. / self.scale_factor * z
+        results = torch.cat([self.first_stage_model.decode(z[:,:,i], **kwargs).unsqueeze(2) for i in range(t)], dim=2)
+
+        return results
+
+
+    def p_mean_variance(self, x, c, t, clip_denoised: bool, return_x0=False, score_corrector=None, corrector_kwargs=None, **kwargs):
+        t_in = t
+        model_out = self.apply_model(x, t_in, c, **kwargs)
+
+        if score_corrector is not None:
+            assert self.parameterization == "eps"
+            model_out = score_corrector.modify_score(self, model_out, x, t, c, **corrector_kwargs)
+
+        if self.parameterization == "eps":
+            x_recon = self.predict_start_from_noise(x, t=t, noise=model_out)
+        elif self.parameterization == "x0":
+            x_recon = model_out
+        else:
+            raise NotImplementedError()
+
+        if clip_denoised:
+            x_recon.clamp_(-1., 1.)
+
+        model_mean, posterior_variance, posterior_log_variance = self.q_posterior(x_start=x_recon, x_t=x, t=t)
+
+        if return_x0:
+            return model_mean, posterior_variance, posterior_log_variance, x_recon
+        else:
+            return model_mean, posterior_variance, posterior_log_variance
+
+    @torch.no_grad()
+    def p_sample(self, x, c, t, clip_denoised=False, repeat_noise=False, return_x0=False, \
+                 temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, **kwargs):
+        b, *_, device = *x.shape, x.device
+        outputs = self.p_mean_variance(x=x, c=c, t=t, clip_denoised=clip_denoised, return_x0=return_x0, \
+                                       score_corrector=score_corrector, corrector_kwargs=corrector_kwargs, **kwargs)
+        if return_x0:
+            model_mean, _, model_log_variance, x0 = outputs
+        else:
+            model_mean, _, model_log_variance = outputs
+
+        noise = noise_like(x.shape, device, repeat_noise) * temperature
+        if noise_dropout > 0.:
+            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+        # no noise when t == 0
+        nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1)))
+
+        if return_x0:
+            return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, x0
+        else:
+            return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise
+
+    @torch.no_grad()
+    def p_sample_loop(self, cond, shape, return_intermediates=False, x_T=None, verbose=True, callback=None, \
+                      timesteps=None, mask=None, x0=None, img_callback=None, start_T=None, log_every_t=None, **kwargs):
+        """Run the full reverse sampling loop by calling ``p_sample`` repeatedly.
+
+        Args:
+            cond: conditioning forwarded to ``p_sample``; re-noised every step
+                when ``self.shorten_cond_schedule`` is set.
+            shape: shape of the sample; ``shape[0]`` is used as the batch size.
+            return_intermediates: also return the list of logged samples.
+            x_T: starting sample; drawn with ``torch.randn`` when None.
+            verbose: wrap the step iterator in ``tqdm``.
+            callback: if truthy, called as ``callback(i)`` after each step.
+            timesteps: number of steps; defaults to ``self.num_timesteps``.
+            mask, x0: when ``mask`` is given, ``x0`` is required and each step's
+                result is blended with a freshly noised ``x0``.
+            img_callback: if truthy, called as ``img_callback(img, i)`` after each step.
+            start_T: optional upper bound applied to ``timesteps``.
+            log_every_t: logging interval; falls back to ``self.log_every_t``
+                when falsy.
+            **kwargs: forwarded to ``p_sample``.
+
+        Returns:
+            The final sample, or ``(sample, intermediates)`` when
+            ``return_intermediates`` is true.
+        """
+
+        if not log_every_t:
+            log_every_t = self.log_every_t
+        # all tensors created here live on the device of the ``betas`` buffer
+        device = self.betas.device
+        b = shape[0]        
+        # sample an initial noise
+        if x_T is None:
+            img = torch.randn(shape, device=device)
+        else:
+            img = x_T
+
+        # the starting sample is always the first logged intermediate
+        intermediates = [img]
+        if timesteps is None:
+            timesteps = self.num_timesteps
+        if start_T is not None:
+            timesteps = min(timesteps, start_T)
+
+        # steps run from timesteps - 1 down to 0
+        iterator = tqdm(reversed(range(0, timesteps)), desc='Sampling t', total=timesteps) if verbose else reversed(range(0, timesteps))
+
+        if mask is not None:
+            assert x0 is not None
+            # NOTE(review): the slice [2:3] compares only dimension 2 of the two shapes -- confirm this is the intended check
+            assert x0.shape[2:3] == mask.shape[2:3]  # spatial size has to match
+
+        for i in iterator:
+            # same timestep index for every batch element
+            ts = torch.full((b,), i, device=device, dtype=torch.long)
+            if self.shorten_cond_schedule:
+                assert self.model.conditioning_key != 'hybrid'
+                tc = self.cond_ids[ts].to(cond.device)
+                # cond is reassigned, so the noised conditioning carries over to the next iteration
+                cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond))
+
+            img = self.p_sample(img, cond, ts, clip_denoised=self.clip_denoised, **kwargs)
+            if mask is not None:
+                # where mask is 1 use x0 noised to the current step; elsewhere keep the sample
+                img_orig = self.q_sample(x0, ts)
+                img = img_orig * mask + (1. - mask) * img
+
+            # log every log_every_t-th step and the very first iteration (i == timesteps - 1)
+            if i % log_every_t == 0 or i == timesteps - 1:
+                intermediates.append(img)
+            if callback: callback(i)
+            if img_callback: img_callback(img, i)
+
+        if return_intermediates:
+            return img, intermediates
+        return img
+
+
+class LatentVisualDiffusion(LatentDiffusion):
+    def __init__(self, cond_img_config, finegrained=False, random_cond=False, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.random_cond = random_cond
+        self.instantiate_img_embedder(cond_img_config, freeze=True)
+        num_tokens = 16 if finegrained else 4
+        self.image_proj_model = self.init_projector(use_finegrained=finegrained, num_tokens=num_tokens, input_dim=1024,\
+                                            cross_attention_dim=1024, dim=1280)    
+
+    def instantiate_img_embedder(self, config, freeze=True):
+        embedder = instantiate_from_config(config)
+        if freeze:
+            self.embedder = embedder.eval()
+            self.embedder.train = disabled_train
+            for param in self.embedder.parameters():
+                param.requires_grad = False
+
+    def init_projector(self, use_finegrained, num_tokens, input_dim, cross_attention_dim, dim):
+        if not use_finegrained:
+            image_proj_model = ImageProjModel(clip_extra_context_tokens=num_tokens, cross_attention_dim=cross_attention_dim,
+                clip_embeddings_dim=input_dim
+            )
+        else:
+            image_proj_model = Resampler(dim=input_dim, depth=4, dim_head=64, heads=12, num_queries=num_tokens,
+                embedding_dim=dim, output_dim=cross_attention_dim, ff_mult=4
+            )
+        return image_proj_model
+
+    ## Never delete this func: it is used in log_images() and inference stage
+    def get_image_embeds(self, batch_imgs):
+        ## img: b c h w
+        img_token = self.embedder(batch_imgs)
+        img_emb = self.image_proj_model(img_token)
+        return img_emb
+
+
+class DiffusionWrapper(pl.LightningModule):
+    def __init__(self, diff_model_config, conditioning_key):
+        super().__init__()
+        self.diffusion_model = instantiate_from_config(diff_model_config)
+        self.conditioning_key = conditioning_key
+
+    def forward(self, x, t, c_concat: list = None, c_crossattn: list = None,
+                c_adm=None, s=None, mask=None, **kwargs):
+        # temporal_context = fps is foNone
+        if self.conditioning_key is None:
+            out = self.diffusion_model(x, t)
+        elif self.conditioning_key == 'concat':
+            xc = torch.cat([x] + c_concat, dim=1)
+            out = self.diffusion_model(xc, t, **kwargs)
+        elif self.conditioning_key == 'crossattn':
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(x, t, context=cc, **kwargs)
+        elif self.conditioning_key == 'hybrid':
+            ## it is just right [b,c,t,h,w]: concatenate in channel dim
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc)
+        elif self.conditioning_key == 'resblockcond':
+            cc = c_crossattn[0]
+            out = self.diffusion_model(x, t, context=cc)
+        elif self.conditioning_key == 'adm':
+            cc = c_crossattn[0]
+            out = self.diffusion_model(x, t, y=cc)
+        elif self.conditioning_key == 'hybrid-adm':
+            assert c_adm is not None
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, y=c_adm)
+        elif self.conditioning_key == 'hybrid-time':
+            assert s is not None
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, s=s)
+        elif self.conditioning_key == 'concat-time-mask':
+            # assert s is not None
+            # mainlogger.info('x & mask:',x.shape,c_concat[0].shape)
+            xc = torch.cat([x] + c_concat, dim=1)
+            out = self.diffusion_model(xc, t, context=None, s=s, mask=mask)
+        elif self.conditioning_key == 'concat-adm-mask':
+            # assert s is not None
+            # mainlogger.info('x & mask:',x.shape,c_concat[0].shape)
+            if c_concat is not None:
+                xc = torch.cat([x] + c_concat, dim=1)
+            else:
+                xc = x
+            out = self.diffusion_model(xc, t, context=None, y=s, mask=mask)
+        elif self.conditioning_key == 'hybrid-adm-mask':
+            cc = torch.cat(c_crossattn, 1)
+            if c_concat is not None:
+                xc = torch.cat([x] + c_concat, dim=1)
+            else:
+                xc = x
+            out = self.diffusion_model(xc, t, context=cc, y=s, mask=mask)
+        elif self.conditioning_key == 'hybrid-time-adm': # adm means y, e.g., class index
+            # assert s is not None
+            assert c_adm is not None
+            xc = torch.cat([x] + c_concat, dim=1)
+            cc = torch.cat(c_crossattn, 1)
+            out = self.diffusion_model(xc, t, context=cc, s=s, y=c_adm)
+        else:
+            raise NotImplementedError()
+
+        return out
\ No newline at end of file
diff --git a/lvdm/models/ddpm3d_cond.py b/lvdm/models/ddpm3d_cond.py
new file mode 100644
index 0000000000000000000000000000000000000000..987c1f2861041ac8c69cb21209e258402c469ec5
--- /dev/null
+++ b/lvdm/models/ddpm3d_cond.py
@@ -0,0 +1,141 @@
+import os, random
+from einops import rearrange, repeat
+
+import torch
+from utils.utils import instantiate_from_config
+from lvdm.models.ddpm3d import LatentDiffusion
+from lvdm.models.samplers.ddim import DDIMSampler
+from lvdm.modules.attention import TemporalTransformer
+
+class T2VAdapterDepth(LatentDiffusion):
+    def __init__(self, depth_stage_config, adapter_config, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.depth_stage = instantiate_from_config(depth_stage_config)
+        self.adapter = instantiate_from_config(adapter_config)
+        self.condtype = adapter_config.cond_name
+        
+        if 'pretrained' in adapter_config: 
+            self.load_pretrained_adapter(adapter_config.pretrained)
+        
+        for param in self.depth_stage.parameters():
+            param.requires_grad = False
+    
+    def prepare_midas_input(self, x):
+        # x: (b, c, h, w)
+        h, w = x.shape[-2:]
+        x_midas = torch.nn.functional.interpolate(x, size=(h, w), mode='bilinear')
+        return x_midas
+
+    @torch.no_grad()
+    def get_batch_depth(self, x, target_size):
+        # x: (b, c, t, h, w)
+        # get depth image, reshape to target_size and normalize to [-1, 1]
+        b, c, t, h, w = x.shape
+        x = rearrange(x, 'b c t h w -> (b t) c h w')
+        x_midas = self.prepare_midas_input(x)
+        cond_depth = self.depth_stage(x_midas)
+        cond_depth = torch.nn.functional.interpolate(cond_depth, size=target_size, mode='bilinear')
+        depth_min, depth_max = torch.amin(cond_depth, dim=[1, 2, 3], keepdim=True), torch.amax(cond_depth, dim=[1, 2, 3], keepdim=True)
+        cond_depth = (cond_depth - depth_min) / (depth_max - depth_min + 1e-7)
+        cond_depth = 2. * cond_depth - 1.
+        cond_depth = rearrange(cond_depth, '(b t) c h w -> b c t h w', b=b, t=t)
+        return cond_depth
+    
+    def load_pretrained_adapter(self, adapter_ckpt):
+        # load pretrained adapter
+        print(">>> Load pretrained adapter checkpoint.")
+        try:
+            state_dict = torch.load(adapter_ckpt, map_location="cpu")
+            if "state_dict" in list(state_dict.keys()):
+                state_dict = state_dict["state_dict"]
+            self.adapter.load_state_dict(state_dict, strict=True)
+        except:
+            state_dict = torch.load(adapter_ckpt, map_location=f"cpu")
+            if "state_dict" in list(state_dict.keys()):
+                state_dict = state_dict["state_dict"]
+            model_state_dict = self.adapter.state_dict()
+            n_unmatched = 0
+            for n, p in model_state_dict.items():
+                if p.shape != state_dict[n].shape:
+                    state_dict.pop(n)
+                    n_unmatched += 1
+            model_state_dict.update(state_dict)
+            self.adapter.load_state_dict(model_state_dict)
+            print(f"Pretrained adapter IS NOT complete [{n_unmatched} units have unmatched shape].")
+
+
+class T2IAdapterStyleAS(LatentDiffusion):
+    def __init__(self, style_stage_config, adapter_config, *args, **kwargs):
+        super(T2IAdapterStyleAS, self).__init__(*args, **kwargs)
+        self.adapter = instantiate_from_config(adapter_config)
+        self.condtype = adapter_config.cond_name
+        ## adapter loading / saving paths
+        self.style_stage_model = instantiate_from_config(style_stage_config)
+
+        self.adapter.create_cross_attention_adapter(self.model.diffusion_model)
+            
+        if 'pretrained' in adapter_config:
+            self.load_pretrained_adapter(adapter_config.pretrained)
+        
+        # freeze the style stage model  
+        for param in self.style_stage_model.parameters():
+            param.requires_grad = False
+    
+    def load_pretrained_adapter(self, pretrained):
+        state_dict = torch.load(pretrained, map_location=f"cpu")
+        
+        if "state_dict" in list(state_dict.keys()):
+            state_dict = state_dict["state_dict"]
+        self.adapter.load_state_dict(state_dict, strict=False)
+        print('>>> adapter checkpoint loaded.')
+
+    @torch.no_grad()
+    def get_batch_style(self, batch_x):
+        b, c, h, w = batch_x.shape
+        cond_style = self.style_stage_model(batch_x)
+        return cond_style
+    
+class T2VFintoneStyleAS(T2IAdapterStyleAS):
+    def _get_temp_attn_parameters(self):
+        temp_attn_params = []
+        def register_recr(net_, name):
+            if isinstance(net_, TemporalTransformer):
+                temp_attn_params.extend(net_.parameters())
+            else:
+                for sub_name, net in net_.named_children():
+                    register_recr(net, f"{name}.{sub_name}")
+                
+        for name, net in self.model.diffusion_model.named_children():
+            register_recr(net, name)
+        return temp_attn_params
+
+    def _get_temp_attn_state_dict(self):
+        temp_attn_state_dict = {}
+        def register_recr(net_, name):
+            if isinstance(net_, TemporalTransformer):
+                temp_attn_state_dict[name] = net_.state_dict()
+            else:
+                for sub_name, net in net_.named_children():
+                    register_recr(net, f"{name}.{sub_name}")
+                
+        for name, net in self.model.diffusion_model.named_children():
+            register_recr(net, name)
+        return temp_attn_state_dict
+
+    def _load_temp_attn_state_dict(self, temp_attn_state_dict):
+        def register_recr(net_, name):
+            if isinstance(net_, TemporalTransformer):
+                net_.load_state_dict(temp_attn_state_dict[name], strict=True)
+            else:
+                for sub_name, net in net_.named_children():
+                    register_recr(net, f"{name}.{sub_name}")
+                
+        for name, net in self.model.diffusion_model.named_children():
+            register_recr(net, name)
+
+    def load_pretrained_temporal(self, pretrained):
+        temp_attn_ckpt = torch.load(pretrained, map_location=f"cpu")
+        if "state_dict" in list(temp_attn_ckpt.keys()):
+            temp_attn_ckpt = temp_attn_ckpt["state_dict"]
+        self._load_temp_attn_state_dict(temp_attn_ckpt)
+        print('>>> Temporal Attention checkpoint loaded.')
\ No newline at end of file
diff --git a/lvdm/models/samplers/__pycache__/ddim.cpython-39.pyc b/lvdm/models/samplers/__pycache__/ddim.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ee19b52c772f632758874e7c5dfb9412d7d2211
Binary files /dev/null and b/lvdm/models/samplers/__pycache__/ddim.cpython-39.pyc differ
diff --git a/lvdm/models/samplers/ddim.py b/lvdm/models/samplers/ddim.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f336d17d679a2808d2767415c4e705f87f10298
--- /dev/null
+++ b/lvdm/models/samplers/ddim.py
@@ -0,0 +1,420 @@
+import numpy as np
+from tqdm import tqdm
+import torch
+from lvdm.models.utils_diffusion import make_ddim_sampling_parameters, make_ddim_timesteps
+from lvdm.common import noise_like
+
+
+class DDIMSampler(object):
+    """DDIM sampler that steps a diffusion model along a reduced timestep schedule.
+
+    The wrapped ``model`` must expose the attributes read in this class
+    (``num_timesteps``, ``device``, ``betas``, ``alphas_cumprod``,
+    ``alphas_cumprod_prev``, ``use_scale`` and, when ``use_scale`` is true,
+    ``scale_arr``) together with ``apply_model`` and ``q_sample``.
+    """
+
+    def __init__(self, model, schedule="linear", **kwargs):
+        super().__init__()
+        self.model = model
+        self.ddpm_num_timesteps = model.num_timesteps
+        self.schedule = schedule
+        self.counter = 0
+
+    def register_buffer(self, name, attr):
+        """Store ``attr`` on the sampler under ``name``.
+
+        Tensors are first moved to the wrapped model's device (instead of a
+        hard-coded "cuda"), so the sampler also works on CPU or on a non-default
+        GPU. Any other value (e.g. a NumPy array) is stored unchanged.
+        """
+        if isinstance(attr, torch.Tensor):
+            attr = attr.to(self.model.device)
+        setattr(self, name, attr)
+
+    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
+        """Precompute the DDIM timestep subset and the per-step coefficient tables.
+
+        Args:
+            ddim_num_steps: requested number of DDIM steps.
+            ddim_discretize: discretization method forwarded to ``make_ddim_timesteps``.
+            ddim_eta: eta used for the sigma schedule.
+            verbose: forwarded to the schedule helpers.
+        """
+        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
+                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
+        alphas_cumprod = self.model.alphas_cumprod
+        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
+        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
+
+        self.register_buffer('betas', to_torch(self.model.betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
+        self.use_scale = self.model.use_scale
+
+        if self.use_scale:
+            self.register_buffer('scale_arr', to_torch(self.model.scale_arr))
+            ddim_scale_arr = self.scale_arr.cpu()[self.ddim_timesteps]
+            self.register_buffer('ddim_scale_arr', ddim_scale_arr)
+            # "prev" table: first full-schedule entry followed by all but the last selected entry
+            ddim_scale_arr = np.asarray([self.scale_arr.cpu()[0]] + self.scale_arr.cpu()[self.ddim_timesteps[:-1]].tolist())
+            self.register_buffer('ddim_scale_arr_prev', ddim_scale_arr)
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
+        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
+
+        # ddim sampling parameters
+        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
+                                                                                   ddim_timesteps=self.ddim_timesteps,
+                                                                                   eta=ddim_eta,verbose=verbose)
+        self.register_buffer('ddim_sigmas', ddim_sigmas)
+        self.register_buffer('ddim_alphas', ddim_alphas)
+        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
+        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
+        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
+            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
+                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
+        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
+
+    @torch.no_grad()
+    def sample(self,
+               S,
+               batch_size,
+               shape,
+               conditioning=None,
+               callback=None,
+               normals_sequence=None,
+               img_callback=None,
+               quantize_x0=False,
+               eta=0.,
+               mask=None,
+               x0=None,
+               temperature=1.,
+               noise_dropout=0.,
+               score_corrector=None,
+               corrector_kwargs=None,
+               verbose=True,
+               schedule_verbose=False,
+               x_T=None,
+               log_every_t=100,
+               unconditional_guidance_scale=1.,
+               unconditional_conditioning=None,
+               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
+               **kwargs
+               ):
+        """Build the schedule for ``S`` steps and run DDIM sampling.
+
+        Args:
+            S: number of DDIM steps.
+            batch_size: leading dimension of the sampled tensor.
+            shape: ``(C, H, W)`` or ``(C, T, H, W)``; the batch dimension is prepended.
+            conditioning: tensor, or dict whose first value is a tensor or a
+                sequence of tensors; only used here for a batch-size warning
+                before being forwarded.
+            normals_sequence: accepted but not used by this method.
+            Remaining arguments are forwarded to ``ddim_sampling``.
+
+        Returns:
+            ``(samples, intermediates)`` as returned by ``ddim_sampling``.
+
+        Raises:
+            ValueError: if ``shape`` has neither 3 nor 4 entries.
+        """
+        # check condition bs
+        if conditioning is not None:
+            if isinstance(conditioning, dict):
+                first_cond = conditioning[list(conditioning.keys())[0]]
+                try:
+                    cbs = first_cond.shape[0]
+                except Exception:
+                    # value has no usable .shape (e.g. a list of tensors): use its first element
+                    cbs = first_cond[0].shape[0]
+
+                if cbs != batch_size:
+                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
+            else:
+                if conditioning.shape[0] != batch_size:
+                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
+
+        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=schedule_verbose)
+
+        # make shape
+        if len(shape) == 3:
+            C, H, W = shape
+            size = (batch_size, C, H, W)
+        elif len(shape) == 4:
+            C, T, H, W = shape
+            size = (batch_size, C, T, H, W)
+        else:
+            # previously fell through and failed later with an undefined `size`
+            raise ValueError(f"shape must have 3 (C, H, W) or 4 (C, T, H, W) entries, got {len(shape)}")
+
+        samples, intermediates = self.ddim_sampling(conditioning, size,
+                                                    callback=callback,
+                                                    img_callback=img_callback,
+                                                    quantize_denoised=quantize_x0,
+                                                    mask=mask, x0=x0,
+                                                    ddim_use_original_steps=False,
+                                                    noise_dropout=noise_dropout,
+                                                    temperature=temperature,
+                                                    score_corrector=score_corrector,
+                                                    corrector_kwargs=corrector_kwargs,
+                                                    x_T=x_T,
+                                                    log_every_t=log_every_t,
+                                                    unconditional_guidance_scale=unconditional_guidance_scale,
+                                                    unconditional_conditioning=unconditional_conditioning,
+                                                    verbose=verbose,
+                                                    **kwargs)
+        return samples, intermediates
+
+    @torch.no_grad()
+    def ddim_sampling(self, cond, shape,
+                      x_T=None, ddim_use_original_steps=False,
+                      callback=None, timesteps=None, quantize_denoised=False,
+                      mask=None, x0=None, img_callback=None, log_every_t=100,
+                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None, verbose=True,
+                      cond_tau=1., target_size=None, start_timesteps=None,
+                      **kwargs):
+        """Run the reverse loop from the largest to the smallest selected timestep.
+
+        Args:
+            cond: conditioning forwarded to ``p_sample_ddim``.
+            shape: full shape (including batch) of the tensor to sample.
+            x_T: optional starting tensor; random normal noise is drawn when omitted.
+            timesteps: optional limit on the schedule; ``None`` uses the whole schedule.
+            mask, x0: when ``mask`` is given, each step blends ``x0`` (noised via
+                ``model.q_sample`` unless ``clean_cond=True`` is passed in ``kwargs``)
+                with the current sample as ``orig * mask + (1 - mask) * img``.
+            cond_tau, target_size: once the step index is at or below
+                ``int((1 - cond_tau) * total_steps)`` and ``target_size`` is given,
+                the sample is resized (nearest) to
+                ``[target_size[0], target_size[1] // 8, target_size[2] // 8]``.
+            start_timesteps: when given (requires ``x0``), steps larger than
+                ``start_timesteps * first_step`` are skipped and sampling starts
+                from ``model.q_sample(x0, ts)`` at the first step that is kept.
+            callback, img_callback: called after each step with ``i`` and
+                ``(pred_x0, i)`` respectively.
+
+        Returns:
+            ``(img, intermediates)`` where ``intermediates`` holds the lists
+            ``'x_inter'`` and ``'pred_x0'`` logged every ``log_every_t`` indices.
+        """
+        device = self.model.betas.device
+        b = shape[0]
+        if x_T is None:
+            img = torch.randn(shape, device=device)
+        else:
+            img = x_T
+
+        if timesteps is None:
+            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
+        elif timesteps is not None and not ddim_use_original_steps:
+            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
+            timesteps = self.ddim_timesteps[:subset_end]
+
+        intermediates = {'x_inter': [img], 'pred_x0': [img]}
+        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
+        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
+
+        # First (largest) step of the loop. Resolved up front because the reversed()
+        # iterator used for the original schedule cannot be indexed with [0].
+        first_step = None
+        if start_timesteps is not None and total_steps > 0:
+            first_step = timesteps - 1 if ddim_use_original_steps else time_range[0]
+
+        if verbose:
+            iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
+        else:
+            iterator = time_range
+
+        init_x0 = False
+        clean_cond = kwargs.pop("clean_cond", False)
+        for i, step in enumerate(iterator):
+            index = total_steps - i - 1
+            ts = torch.full((b,), step, device=device, dtype=torch.long)
+            if start_timesteps is not None:
+                assert x0 is not None
+                if step > start_timesteps*first_step:
+                    continue
+                elif not init_x0:
+                    img = self.model.q_sample(x0, ts)
+                    init_x0 = True
+
+            # use mask to blend noised original latent (img_orig) & new sampled latent (img)
+            if mask is not None:
+                assert x0 is not None
+                if clean_cond:
+                    img_orig = x0
+                else:
+                    img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass? <ddim inversion>
+                img = img_orig * mask + (1. - mask) * img # keep original & modify use img
+
+            index_clip =  int((1 - cond_tau) * total_steps)
+            if index <= index_clip and target_size is not None:
+                target_size_ = [target_size[0], target_size[1]//8, target_size[2]//8]
+                img = torch.nn.functional.interpolate(
+                img,
+                size=target_size_,
+                mode="nearest",
+                )
+            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
+                                      quantize_denoised=quantize_denoised, temperature=temperature,
+                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
+                                      corrector_kwargs=corrector_kwargs,
+                                      unconditional_guidance_scale=unconditional_guidance_scale,
+                                      unconditional_conditioning=unconditional_conditioning,
+                                      x0=x0,
+                                      **kwargs)
+
+            img, pred_x0 = outs
+            if callback: callback(i)
+            if img_callback: img_callback(pred_x0, i)
+
+            if index % log_every_t == 0 or index == total_steps - 1:
+                intermediates['x_inter'].append(img)
+                intermediates['pred_x0'].append(pred_x0)
+
+        return img, intermediates
+
+    @torch.no_grad()
+    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_conditioning=None,
+                      uc_type=None, conditional_guidance_scale_temporal=None, **kwargs):
+        """Perform one DDIM update.
+
+        Args:
+            x: current sample; 5-D input is treated as video, anything else as image.
+            c: conditioning (tensor or dict when guidance is active).
+            t: timestep tensor passed to the model.
+            index: position in the coefficient tables for this step.
+            uc_type: ``None`` (standard guidance), ``'cfg_original'`` or ``'cfg_ours'``.
+            conditional_guidance_scale_temporal: optional scale for the difference
+                between predictions with and without temporal attention; only
+                evaluated inside the classifier-free-guidance branch.
+            kwargs: forwarded to ``model.apply_model``; the unconditional call gets
+                a copy with ``append_to_context`` set to ``None``.
+
+        Returns:
+            ``(x_prev, pred_x0)``.
+
+        Raises:
+            NotImplementedError: for an unsupported conditioning type or ``uc_type``.
+        """
+        b, *_, device = *x.shape, x.device
+
+        uncond_kwargs = kwargs.copy()
+        uncond_kwargs['append_to_context'] = None
+
+        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+            e_t = self.model.apply_model(x, t, c, **kwargs) # unet denoiser
+        else:
+            # with unconditional condition (tensor and dict conditioning are handled alike)
+            if not isinstance(c, (torch.Tensor, dict)):
+                raise NotImplementedError
+            e_t = self.model.apply_model(x, t, c, **kwargs)
+            e_t_uncond = self.model.apply_model(x, t, unconditional_conditioning, **uncond_kwargs)
+            # text cfg
+            if uc_type is None:
+                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+            else:
+                if uc_type == 'cfg_original':
+                    e_t = e_t + unconditional_guidance_scale * (e_t - e_t_uncond)
+                elif uc_type == 'cfg_ours':
+                    e_t = e_t + unconditional_guidance_scale * (e_t_uncond - e_t)
+                else:
+                    raise NotImplementedError
+            # temporal guidance
+            if conditional_guidance_scale_temporal is not None:
+                e_t_temporal = self.model.apply_model(x, t, c, **kwargs)
+                e_t_image = self.model.apply_model(x, t, c, no_temporal_attn=True, **kwargs)
+                e_t = e_t + conditional_guidance_scale_temporal * (e_t_temporal - e_t_image)
+
+        if score_corrector is not None:
+            assert self.model.parameterization == "eps"
+            # corrector_kwargs defaults to None, which cannot be unpacked with **
+            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **(corrector_kwargs or {}))
+
+        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
+        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
+        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
+        # NOTE(review): make_schedule stores ddim_sigmas_for_original_num_steps on the sampler,
+        # but this reads it from the model -- confirm the model actually defines it.
+        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+
+        # select parameters corresponding to the currently considered timestep,
+        # shaped so they broadcast over every non-batch dimension
+        size = (b, 1, 1, 1, 1) if x.dim() == 5 else (b, 1, 1, 1)
+        a_t = torch.full(size, alphas[index], device=device)
+        a_prev = torch.full(size, alphas_prev[index], device=device)
+        sigma_t = torch.full(size, sigmas[index], device=device)
+        sqrt_one_minus_at = torch.full(size, sqrt_one_minus_alphas[index],device=device)
+
+        # current prediction for x_0
+        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        if quantize_denoised:
+            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+        # direction pointing to x_t
+        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+
+        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+        if noise_dropout > 0.:
+            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+
+        if self.use_scale:
+            scale_arr = self.model.scale_arr if use_original_steps else self.ddim_scale_arr
+            scale_t = torch.full(size, scale_arr[index], device=device)
+            scale_arr_prev = self.model.scale_arr_prev if use_original_steps else self.ddim_scale_arr_prev
+            scale_t_prev = torch.full(size, scale_arr_prev[index], device=device)
+            # undo the current step's scale, then apply the previous step's scale
+            pred_x0 /= scale_t
+            x_prev = a_prev.sqrt() * scale_t_prev * pred_x0 + dir_xt + noise
+        else:
+            x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+
+        return x_prev, pred_x0
+
+
+    @torch.no_grad()
+    def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
+        """Noise ``x0`` to the level selected by ``t`` in a single step.
+
+        ``t`` is used as an index into the coefficient tables. Random normal
+        noise is drawn when ``noise`` is omitted.
+        """
+        # fast, but does not allow for exact reconstruction
+        # t serves as an index to gather the correct alphas
+        if use_original_steps:
+            sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
+            sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
+        else:
+            sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
+            sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
+
+        if noise is None:
+            noise = torch.randn_like(x0)
+
+        def extract_into_tensor(a, t, x_shape):
+            # gather one coefficient per batch element and reshape it for broadcasting
+            b, *_ = t.shape
+            out = a.gather(-1, t)
+            return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+        return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
+                extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
+
+    @torch.no_grad()
+    def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
+               use_original_steps=False):
+        """Denoise ``x_latent`` over the first ``t_start`` schedule entries, largest step first.
+
+        Returns:
+            The tensor obtained after the last ``p_sample_ddim`` update.
+        """
+        timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
+        timesteps = timesteps[:t_start]
+
+        time_range = np.flip(timesteps)
+        total_steps = timesteps.shape[0]
+        print(f"Running DDIM Sampling with {total_steps} timesteps")
+
+        iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
+        x_dec = x_latent
+        for i, step in enumerate(iterator):
+            index = total_steps - i - 1
+            ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
+            x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
+                                          unconditional_guidance_scale=unconditional_guidance_scale,
+                                          unconditional_conditioning=unconditional_conditioning)
+        return x_dec
+
+
+class DDIMStyleSampler(DDIMSampler):
+    """DDIMSampler whose update step accepts a separate style guidance scale."""
+
+    @torch.no_grad()
+    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
+                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
+                      unconditional_guidance_scale=1., unconditional_guidance_scale_style=None, unconditional_conditioning=None,
+                      uc_type=None, conditional_guidance_scale_temporal=None, **kwargs):
+        """Perform one DDIM update with optional style guidance.
+
+        Three predictions can be involved when guidance is active:
+          * ``e_t``: conditioning ``c`` with the caller's ``kwargs``;
+          * ``e_t_uncond``: ``unconditional_conditioning`` with ``append_to_context=None``;
+          * ``e_t_uncond_style``: conditioning ``c`` with ``append_to_context=None``
+            (only computed when ``unconditional_guidance_scale_style`` is given).
+
+        Args:
+            unconditional_guidance_scale_style: when ``None`` the standard
+                guidance ``e_t_uncond + s * (e_t - e_t_uncond)`` is used; otherwise
+                ``e_t + s_style * (e_t - e_t_uncond_style) + s * (e_t_uncond_style - e_t_uncond)``.
+            uc_type: accepted for signature compatibility; not used by this override.
+            Other arguments: same meaning as in ``DDIMSampler.p_sample_ddim``.
+
+        Returns:
+            ``(x_prev, pred_x0)``.
+
+        Raises:
+            NotImplementedError: if guidance is active and ``c`` is neither a
+                tensor nor a dict.
+        """
+        # NOTE(review): unlike the base class, this override never applies the
+        # `use_scale` rescaling -- confirm that is intended.
+        b, *_, device = *x.shape, x.device
+        uncond_kwargs = kwargs.copy()
+        uncond_kwargs['append_to_context'] = None
+
+        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
+            e_t = self.model.apply_model(x, t, c, **kwargs) # unet denoiser
+        else:
+            # with unconditional condition (tensor and dict conditioning are handled alike)
+            if not isinstance(c, (torch.Tensor, dict)):
+                raise NotImplementedError
+            e_t = self.model.apply_model(x, t, c, **kwargs)
+            e_t_uncond = self.model.apply_model(x, t, unconditional_conditioning, **uncond_kwargs)
+
+            if unconditional_guidance_scale_style is None:
+                e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
+            else:
+                e_t_uncond_style = self.model.apply_model(x, t, c, **uncond_kwargs)
+                e_t = e_t + unconditional_guidance_scale_style * (e_t - e_t_uncond_style) + \
+                    unconditional_guidance_scale * (e_t_uncond_style - e_t_uncond)
+
+            # temporal guidance
+            if conditional_guidance_scale_temporal is not None:
+                e_t_temporal = self.model.apply_model(x, t, c, **kwargs)
+                e_t_image = self.model.apply_model(x, t, c, no_temporal_attn=True, **kwargs)
+                e_t = e_t + conditional_guidance_scale_temporal * (e_t_temporal - e_t_image)
+
+        if score_corrector is not None:
+            assert self.model.parameterization == "eps"
+            # corrector_kwargs defaults to None, which cannot be unpacked with **
+            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **(corrector_kwargs or {}))
+
+        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
+        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
+        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
+        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
+
+        # select parameters corresponding to the currently considered timestep,
+        # shaped so they broadcast over every non-batch dimension
+        size = (b, 1, 1, 1, 1) if x.dim() == 5 else (b, 1, 1, 1)
+        a_t = torch.full(size, alphas[index], device=device)
+        a_prev = torch.full(size, alphas_prev[index], device=device)
+        sigma_t = torch.full(size, sigmas[index], device=device)
+        sqrt_one_minus_at = torch.full(size, sqrt_one_minus_alphas[index],device=device)
+
+        # current prediction for x_0
+        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
+        if quantize_denoised:
+            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
+        # direction pointing to x_t
+        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
+
+        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
+        if noise_dropout > 0.:
+            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
+
+        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
+
+        return x_prev, pred_x0
\ No newline at end of file
diff --git a/lvdm/models/utils_diffusion.py b/lvdm/models/utils_diffusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..603fa817b07cea3581a70ff225d479b7d1518463
--- /dev/null
+++ b/lvdm/models/utils_diffusion.py
@@ -0,0 +1,104 @@
+import math
+import numpy as np
+from einops import repeat
+import torch
+import torch.nn.functional as F
+
+
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
+    """
+    Build sinusoidal embeddings for a batch of timesteps.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :param repeat_only: if True, skip the sinusoids and simply repeat each
+                        timestep value ``dim`` times.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    if repeat_only:
+        return repeat(timesteps, 'b -> b d', d=dim)
+
+    half = dim // 2
+    scaled = -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32)
+    freqs = torch.exp(scaled / half).to(device=timesteps.device)
+    args = timesteps[:, None].float() * freqs[None]
+    # cosine half first, then sine half
+    parts = [torch.cos(args), torch.sin(args)]
+    embedding = torch.cat(parts, dim=-1)
+    if dim % 2:
+        # odd dim: pad one zero column so the width equals dim
+        zero_col = torch.zeros_like(embedding[:, :1])
+        embedding = torch.cat([embedding, zero_col], dim=-1)
+    return embedding
+
+
+def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
+    """
+    Create a beta schedule as a float64 NumPy array of length ``n_timestep``.
+    :param schedule: one of "linear", "cosine", "sqrt_linear", "sqrt".
+    :param n_timestep: number of betas to produce.
+    :param linear_start: first value of the non-cosine schedules.
+    :param linear_end: last value of the non-cosine schedules.
+    :param cosine_s: offset used by the cosine schedule.
+    :return: a NumPy array of betas.
+    :raises ValueError: if ``schedule`` is not one of the names above.
+    """
+    if schedule == "linear":
+        # linear in sqrt(beta), then squared
+        betas = (
+                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
+        )
+
+    elif schedule == "cosine":
+        timesteps = (
+                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
+        )
+        alphas = timesteps / (1 + cosine_s) * np.pi / 2
+        alphas = torch.cos(alphas).pow(2)
+        alphas = alphas / alphas[0]
+        betas = 1 - alphas[1:] / alphas[:-1]
+        # Clamp with torch so `betas` is guaranteed to still be a tensor for the
+        # `.numpy()` call below (np.clip on a tensor depends on NumPy/torch interop).
+        betas = torch.clamp(betas, min=0, max=0.999)
+
+    elif schedule == "sqrt_linear":
+        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
+    elif schedule == "sqrt":
+        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
+    else:
+        raise ValueError(f"schedule '{schedule}' unknown.")
+    return betas.numpy()
+
+
+def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
+    """
+    Select the subset of DDPM timesteps visited by the DDIM sampler.
+    :param ddim_discr_method: 'uniform' (constant stride) or 'quad' (quadratic spacing).
+    :param num_ddim_timesteps: requested number of DDIM steps.
+    :param num_ddpm_timesteps: number of steps of the full schedule.
+    :param verbose: if True, print the selected timesteps.
+    :return: a NumPy integer array of the selected timesteps, each shifted by +1.
+             For 'uniform' the stride is ``num_ddpm_timesteps // num_ddim_timesteps``,
+             so more than ``num_ddim_timesteps`` entries are returned when the
+             division is not exact.
+    :raises ValueError: for 'uniform' when more DDIM steps than DDPM steps are requested.
+    :raises NotImplementedError: for an unknown discretization method.
+    """
+    if ddim_discr_method == 'uniform':
+        if num_ddim_timesteps > num_ddpm_timesteps:
+            # the stride would be 0; fail with a clear message instead of range()'s
+            raise ValueError(
+                f'num_ddim_timesteps ({num_ddim_timesteps}) must not exceed '
+                f'num_ddpm_timesteps ({num_ddpm_timesteps})')
+        c = num_ddpm_timesteps // num_ddim_timesteps
+        ddim_timesteps = np.arange(0, num_ddpm_timesteps, c)
+    elif ddim_discr_method == 'quad':
+        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
+    else:
+        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
+
+    # add one to get the final alpha values right (the ones from first scale to data during sampling)
+    # NOTE(review): with a stride of 1 the largest returned value equals
+    # num_ddpm_timesteps -- confirm callers never index a length-num_ddpm_timesteps table with it.
+    steps_out = ddim_timesteps + 1
+    if verbose:
+        print(f'Selected timesteps for ddim sampler: {steps_out}')
+    return steps_out
+
+
+def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
+    """
+    Compute the per-step sigma / alpha tables used by the DDIM sampler.
+    :param alphacums: cumulative alpha values for the full schedule, indexable
+                      by an integer array.
+    :param ddim_timesteps: integer array of selected timesteps.
+    :param eta: multiplier applied to the sigma schedule (0 gives all-zero sigmas).
+    :param verbose: if True, print the selected alphas and the sigma schedule.
+    :return: tuple ``(sigmas, alphas, alphas_prev)``.
+    """
+    # alpha at each selected step, and at the step before it (the first
+    # selected step is paired with alphacums[0])
+    alphas = alphacums[ddim_timesteps]
+    prev_values = [alphacums[0]]
+    prev_values = prev_values + alphacums[ddim_timesteps[:-1]].tolist()
+    alphas_prev = np.asarray(prev_values)
+
+    # according to the formula provided in https://arxiv.org/abs/2010.02502
+    variance_ratio = (1 - alphas_prev) / (1 - alphas)
+    sigmas = eta * np.sqrt(variance_ratio * (1 - alphas / alphas_prev))
+    if verbose:
+        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
+        print(f'For the chosen value of eta, which is {eta}, '
+              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
+    return sigmas, alphas, alphas_prev
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
+    """
+    Discretize a continuous alpha_bar(t) function into a beta schedule.
+    alpha_bar describes the cumulative product of (1-beta) over t in [0,1].
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a callable taking t in [0, 1] and returning the
+                      cumulative product of (1-beta) up to that point of
+                      the diffusion process.
+    :param max_beta: upper bound applied to every beta; values below 1
+                     prevent singularities.
+    :return: a NumPy array with one beta per timestep.
+    """
+    n = num_diffusion_timesteps
+    # beta_i = 1 - alpha_bar(end of interval i) / alpha_bar(start of interval i), capped at max_beta
+    return np.array([
+        min(1 - alpha_bar((i + 1) / n) / alpha_bar(i / n), max_beta)
+        for i in range(n)
+    ])
\ No newline at end of file
diff --git a/lvdm/modules/__pycache__/attention.cpython-39.pyc b/lvdm/modules/__pycache__/attention.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2a4af398b1a7f61e526e5eec89629b51cff7166
Binary files /dev/null and b/lvdm/modules/__pycache__/attention.cpython-39.pyc differ
diff --git a/lvdm/modules/attention.py b/lvdm/modules/attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a63a06189255fa6e3693bb60448a2d29dec26f9
--- /dev/null
+++ b/lvdm/modules/attention.py
@@ -0,0 +1,851 @@
+from functools import partial
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from einops import rearrange, repeat
+try:
+    import xformers
+    import xformers.ops
+    XFORMERS_IS_AVAILBLE = True
+except:
+    XFORMERS_IS_AVAILBLE = False
+from lvdm.common import (
+    checkpoint,
+    exists,
+    default,
+)
+from lvdm.basics import (
+    zero_module,
+)
+
+class RelativePosition(nn.Module):
+    """
+    Learned embedding table indexed by the clipped offset (key index - query index).
+
+    Adapted from https://github.com/evelinehong/Transformer_Relative_Position_PyTorch/blob/master/relative_position.py
+    """
+
+    def __init__(self, num_units, max_relative_position):
+        super().__init__()
+        self.num_units = num_units
+        self.max_relative_position = max_relative_position
+        # one row per offset in [-max_relative_position, max_relative_position]
+        table = torch.Tensor(2 * max_relative_position + 1, num_units)
+        self.embeddings_table = nn.Parameter(table)
+        nn.init.xavier_uniform_(self.embeddings_table)
+
+    def forward(self, length_q, length_k):
+        """Return a (length_q, length_k, num_units) tensor of offset embeddings."""
+        max_rel = self.max_relative_position
+        dev = self.embeddings_table.device
+        q_pos = torch.arange(length_q, device=dev).unsqueeze(1)
+        k_pos = torch.arange(length_k, device=dev).unsqueeze(0)
+        # offset k - q, clipped to the supported range, then shifted to a non-negative row index
+        rows = (torch.clamp(k_pos - q_pos, -max_rel, max_rel) + max_rel).long()
+        return self.embeddings_table[rows]
+
+
+class CrossAttention(nn.Module):
+    """
+    Multi-head attention with queries from `x` and keys/values from `context`.
+
+    When `context` is None the layer attends over `x` itself. With
+    `img_cross_attention=True` the context is split at `text_context_len`
+    tokens: the leading part goes through `to_k`/`to_v`, the remainder through
+    `to_k_ip`/`to_v_ip`, and the second attention result is added to the first,
+    weighted by `image_cross_attention_scale`.
+    With `relative_position=True`, learned relative-position terms are added to
+    both the attention logits and the attended values.
+
+    :param query_dim: feature size of `x`.
+    :param context_dim: feature size of `context`; defaults to `query_dim`.
+    :param heads: number of attention heads.
+    :param dim_head: feature size per head.
+    :param dropout: dropout probability applied after the output projection.
+    :param relative_position: enable the relative-position terms (requires `temporal_length`).
+    :param temporal_length: maximum relative offset used to size the position tables.
+    :param img_cross_attention: enable the extra key/value branch described above.
+    """
+
+    def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.,
+                 relative_position=False, temporal_length=None, img_cross_attention=False):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+        self.to_q = nn.Linear(query_dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_out = nn.Sequential(nn.Linear(inner_dim, query_dim), nn.Dropout(dropout))
+
+        self.image_cross_attention_scale = 1.0
+        # number of leading context tokens routed through to_k/to_v when img_cross_attention is on
+        self.text_context_len = 77
+        self.img_cross_attention = img_cross_attention
+        if self.img_cross_attention:
+            self.to_k_ip = nn.Linear(context_dim, inner_dim, bias=False)
+            self.to_v_ip = nn.Linear(context_dim, inner_dim, bias=False)
+
+        self.relative_position = relative_position
+        if self.relative_position:
+            assert(temporal_length is not None)
+            self.relative_position_k = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
+            self.relative_position_v = RelativePosition(num_units=dim_head, max_relative_position=temporal_length)
+        else:
+            ## only used for spatial attention, while NOT for temporal attention
+            if XFORMERS_IS_AVAILBLE and temporal_length is None:
+                # instance-level override: this layer always runs the xformers path
+                self.forward = self.efficient_forward
+
+    def forward(self, x, context=None, mask=None, is_imgbatch=False, **kwargs):
+        """
+        Plain (einsum) attention path.
+
+        :param x: query input laid out as (batch, tokens, query_dim).
+        :param context: key/value input; falls back to `x` when None.
+        :param mask: optional (batch, i, j) mask; entries > 0.5 are kept, the rest
+                     are filled with the most negative finite value before softmax.
+        :param is_imgbatch: when True the relative-position terms are skipped.
+        :param kwargs: accepted and ignored so callers can pass extra keywords.
+        :return: tensor laid out as (batch, tokens, query_dim).
+        """
+        h = self.heads
+
+        q = self.to_q(x)
+        context = default(context, x)
+        ## considering image token additionally
+        if context is not None and self.img_cross_attention:
+            context, context_img = context[:,:self.text_context_len,:], context[:,self.text_context_len:,:]
+            k = self.to_k(context)
+            v = self.to_v(context)
+            k_ip = self.to_k_ip(context_img)
+            v_ip = self.to_v_ip(context_img)
+        else:
+            k = self.to_k(context)
+            v = self.to_v(context)
+
+        # fold the heads into the batch axis
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
+        if self.relative_position and not is_imgbatch:
+            len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
+            k2 = self.relative_position_k(len_q, len_k)
+            sim2 = einsum('b t d, t s d -> b t s', q, k2) * self.scale # TODO check
+            sim += sim2
+        del k
+
+        if exists(mask):
+            ## feasible for causal attention mask only
+            max_neg_value = -torch.finfo(sim.dtype).max
+            mask = repeat(mask, 'b i j -> (b h) i j', h=h)
+            sim.masked_fill_(~(mask>0.5), max_neg_value)
+
+        # attention, what we cannot get enough of
+        sim = sim.softmax(dim=-1)
+        out = torch.einsum('b i j, b j d -> b i d', sim, v)
+        if self.relative_position and not is_imgbatch:
+            v2 = self.relative_position_v(len_q, len_v)
+            out2 = einsum('b t s, t s d -> b t d', sim, v2) # TODO check
+            out += out2
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+
+        ## considering image token additionally
+        if context is not None and self.img_cross_attention:
+            k_ip, v_ip = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (k_ip, v_ip))
+            sim_ip =  torch.einsum('b i d, b j d -> b i j', q, k_ip) * self.scale
+            del k_ip
+            sim_ip = sim_ip.softmax(dim=-1)
+            out_ip = torch.einsum('b i j, b j d -> b i d', sim_ip, v_ip)
+            # fix: merge the heads of the image-branch result itself; the previous code
+            # rearranged `out` here, discarding `out_ip` and re-merging an already merged tensor
+            out_ip = rearrange(out_ip, '(b h) n d -> b n (h d)', h=h)
+            out = out + self.image_cross_attention_scale * out_ip
+        del q
+
+        return self.to_out(out)
+
+    def efficient_forward(self, x, context=None, mask=None, is_imgbatch=False, **kwargs):
+        """
+        xformers `memory_efficient_attention` path with the same inputs as `forward`.
+
+        The relative-position terms are not applied here, and a non-None
+        `mask` raises NotImplementedError.
+        """
+        q = self.to_q(x)
+        context = default(context, x)
+
+        ## considering image token additionally
+        if context is not None and self.img_cross_attention:
+            context, context_img = context[:,:self.text_context_len,:], context[:,self.text_context_len:,:]
+            k = self.to_k(context)
+            v = self.to_v(context)
+            k_ip = self.to_k_ip(context_img)
+            v_ip = self.to_v_ip(context_img)
+        else:
+            k = self.to_k(context)
+            v = self.to_v(context)
+
+        b, _, _ = q.shape
+        # (b, n, heads*dim_head) -> (b*heads, n, dim_head)
+        q, k, v = map(
+            lambda t: t.unsqueeze(3)
+            .reshape(b, t.shape[1], self.heads, self.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b * self.heads, t.shape[1], self.dim_head)
+            .contiguous(),
+            (q, k, v),
+        )
+        # actually compute the attention, what we cannot get enough of
+        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=None)
+
+        ## considering image token additionally
+        if context is not None and self.img_cross_attention:
+            k_ip, v_ip = map(
+                lambda t: t.unsqueeze(3)
+                .reshape(b, t.shape[1], self.heads, self.dim_head)
+                .permute(0, 2, 1, 3)
+                .reshape(b * self.heads, t.shape[1], self.dim_head)
+                .contiguous(),
+                (k_ip, v_ip),
+            )
+            out_ip = xformers.ops.memory_efficient_attention(q, k_ip, v_ip, attn_bias=None, op=None)
+            # (b*heads, n, dim_head) -> (b, n, heads*dim_head)
+            out_ip = (
+                out_ip.unsqueeze(0)
+                .reshape(b, self.heads, out.shape[1], self.dim_head)
+                .permute(0, 2, 1, 3)
+                .reshape(b, out.shape[1], self.heads * self.dim_head)
+            )
+
+        if exists(mask):
+            raise NotImplementedError
+        out = (
+            out.unsqueeze(0)
+            .reshape(b, self.heads, out.shape[1], self.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, out.shape[1], self.heads * self.dim_head)
+        )
+        if context is not None and self.img_cross_attention:
+            out = out + self.image_cross_attention_scale * out_ip
+        return self.to_out(out)
+
+
+class BasicTransformerBlock(nn.Module):
+    """Pre-norm block: attn1, attn2 and a feed-forward, each followed by a residual add."""
+
+    def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True,
+                disable_self_attn=False, attention_cls=None, img_cross_attention=False):
+        super().__init__()
+        make_attn = attention_cls if attention_cls is not None else CrossAttention
+        self.disable_self_attn = disable_self_attn
+        # attn1 only gets a context dimension when self-attention is disabled
+        attn1_context_dim = context_dim if disable_self_attn else None
+        self.attn1 = make_attn(query_dim=dim, heads=n_heads, dim_head=d_head, dropout=dropout,
+            context_dim=attn1_context_dim)
+        self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff)
+        self.attn2 = make_attn(query_dim=dim, context_dim=context_dim, heads=n_heads, dim_head=d_head,
+            dropout=dropout, img_cross_attention=img_cross_attention)
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+        self.norm3 = nn.LayerNorm(dim)
+        self.checkpoint = checkpoint
+
+    def forward(self, x, context=None, mask=None, emb=None, scale_scalar=None, is_imgbatch=False):
+        """
+        Run `_forward` through the `checkpoint` helper.
+
+        The argument tuple is assembled by hand because, per the original note,
+        checkpointing does not support non-tensor keyword arguments.
+        """
+        if mask is not None:
+            # masked path: only x, mask and is_imgbatch reach _forward;
+            # context, emb and scale_scalar are not forwarded here
+            masked_forward = partial(self._forward, mask=mask, is_imgbatch=is_imgbatch)
+            return checkpoint(masked_forward, (x,), self.parameters(), self.checkpoint)
+        if context is None:
+            ## must stay a 1-tuple: (x) would be unpacked into multiple arguments
+            args = (x,)
+        else:
+            args = (x, context, None, emb, scale_scalar, is_imgbatch)
+        return checkpoint(self._forward, args, self.parameters(), self.checkpoint)
+
+    def _forward(self, x, context=None, mask=None, emb=None, scale_scalar=None, is_imgbatch=False):
+        """Apply the three residual sub-layers; attn1 sees `context` only if self-attention is disabled."""
+        shared = dict(mask=mask, emb=emb, scale_scalar=scale_scalar, is_imgbatch=is_imgbatch)
+        attn1_context = context if self.disable_self_attn else None
+        x = self.attn1(self.norm1(x), context=attn1_context, **shared) + x
+        x = self.attn2(self.norm2(x), context=context, **shared) + x
+        x = self.ff(self.norm3(x)) + x
+        return x
+
+
+class SpatialTransformer(nn.Module):
+    """
+    Transformer block for image-like data in spatial axis.
+    The input is group-normalized, projected to the inner width and flattened
+    to (b, h*w, c); the transformer blocks run on that sequence; the result is
+    projected back, reshaped to (b, c, h, w) and added to the input.
+    use_linear swaps the 1x1 convs for linear layers applied on the flattened sequence.
+    """
+
+    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
+                 use_checkpoint=True, disable_self_attn=False, use_linear=False, img_cross_attention=False):
+        super().__init__()
+        self.in_channels = in_channels
+        self.use_linear = use_linear
+        inner_dim = n_heads * d_head
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        if use_linear:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+
+        blocks = [
+            BasicTransformerBlock(
+                inner_dim,
+                n_heads,
+                d_head,
+                dropout=dropout,
+                context_dim=context_dim,
+                img_cross_attention=img_cross_attention,
+                disable_self_attn=disable_self_attn,
+                checkpoint=use_checkpoint)
+            for _ in range(depth)
+        ]
+        self.transformer_blocks = nn.ModuleList(blocks)
+        # the output projection goes through zero_module before use
+        if use_linear:
+            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
+        else:
+            self.proj_out = zero_module(nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
+
+
+    def forward(self, x, context=None, emb=None, scale_scalar=None):
+        """Apply the blocks to `x` of layout (b, c, h, w) and return `x` plus the projected result."""
+        _, _, h, w = x.shape
+        residual = x
+        x = self.norm(x)
+        if self.use_linear:
+            # linear projection acts on the flattened sequence
+            x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
+            x = self.proj_in(x)
+        else:
+            # conv projection acts on the 2D map before flattening
+            x = self.proj_in(x)
+            x = rearrange(x, 'b c h w -> b (h w) c').contiguous()
+        for block in self.transformer_blocks:
+            x = block(x, context=context, emb=emb, scale_scalar=scale_scalar)
+        if self.use_linear:
+            x = self.proj_out(x)
+            x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+        else:
+            x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous()
+            x = self.proj_out(x)
+        return x + residual
+    
+    
+class TemporalTransformer(nn.Module):
+    """
+    Transformer block for image-like data in temporal axis.
+    The input (b, c, t, h, w) is group-normalized and reshaped so that every
+    spatial location becomes a sequence of t tokens; the transformer blocks run
+    on those sequences; the result is projected back, reshaped to
+    (b, c, t, h, w) and added to the input.
+
+    :param only_self_att: when True the blocks are built without a context
+                          dimension and called without context.
+    :param causal_attention: build a lower-triangular (temporal_length x temporal_length)
+                             mask and pass it to the blocks (self-attention path only).
+    :param relative_position: build the blocks with relative-position CrossAttention.
+    :param temporal_length: required by causal_attention and relative_position.
+    """
+    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0., context_dim=None,
+                 use_checkpoint=True, use_linear=False, only_self_att=True, causal_attention=False,
+                 relative_position=False, temporal_length=None):
+        super().__init__()
+        self.only_self_att = only_self_att
+        self.relative_position = relative_position
+        self.causal_attention = causal_attention
+        self.in_channels = in_channels
+        inner_dim = n_heads * d_head
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        # NOTE(review): this assignment is overwritten by the branch right below; it is left
+        # in place so the sequence of module constructions stays exactly as before.
+        self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        if not use_linear:
+            self.proj_in = nn.Conv1d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        else:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+
+        if relative_position:
+            assert(temporal_length is not None)
+            attention_cls = partial(CrossAttention, relative_position=True, temporal_length=temporal_length)
+        else:
+            attention_cls = None
+        if self.causal_attention:
+            assert(temporal_length is not None)
+            # plain tensor attribute (not a registered buffer); moved to the input device in forward
+            self.mask = torch.tril(torch.ones([1, temporal_length, temporal_length]))
+
+        if self.only_self_att:
+            context_dim = None
+        self.transformer_blocks = nn.ModuleList([
+            BasicTransformerBlock(
+                inner_dim,
+                n_heads,
+                d_head,
+                dropout=dropout,
+                context_dim=context_dim,
+                attention_cls=attention_cls,
+                checkpoint=use_checkpoint) for d in range(depth)
+        ])
+        if not use_linear:
+            self.proj_out = zero_module(nn.Conv1d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0))
+        else:
+            self.proj_out = zero_module(nn.Linear(inner_dim, in_channels))
+        self.use_linear = use_linear
+
+    def forward(self, x, context=None, is_imgbatch=False, emb=None):
+        """
+        :param x: input laid out as (b, c, t, h, w).
+        :param context: conditioning laid out as ((b t), l, con); only read when
+                        `only_self_att` is False.
+        :param is_imgbatch: when True a (t x t) identity mask is built for the
+                            self-attention path and the flag is forwarded to the
+                            blocks on the cross-attention path.
+        :param emb: accepted for interface compatibility; not used in this method.
+        :return: tensor with the same layout as `x`.
+        """
+        b, c, t, h, w = x.shape
+        x_in = x
+        x = self.norm(x)
+        x = rearrange(x, 'b c t h w -> (b h w) c t').contiguous()
+        if not self.use_linear:
+            x = self.proj_in(x)
+        x = rearrange(x, 'bhw c t -> bhw t c').contiguous()
+        if self.use_linear:
+            x = self.proj_in(x)
+
+        if is_imgbatch:
+            # fix: this branch used to assign to a misspelled `maks`, leaving `mask`
+            # unbound and raising UnboundLocalError in the self-attention loop below
+            mask = torch.eye(t).unsqueeze(0)
+            mask = mask.to(x.device)
+            mask = repeat(mask, 'l i j -> (l bhw) i j', bhw=b*h*w)
+        elif self.causal_attention:
+            mask = self.mask.to(x.device)
+            mask = repeat(mask, 'l i j -> (l bhw) i j', bhw=b*h*w)
+        else:
+            mask = None
+
+        if self.only_self_att:
+            ## note: if no context is given, cross-attention defaults to self-attention
+            for i, block in enumerate(self.transformer_blocks):
+                x = block(x, mask=mask)
+            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
+        else:
+            x = rearrange(x, '(b hw) t c -> b hw t c', b=b).contiguous()
+            context = rearrange(context, '(b t) l con -> b t l con', t=t).contiguous()
+            for i, block in enumerate(self.transformer_blocks):
+                # calculate each batch one by one (since number in shape could not greater then 65,535 for some package)
+                for j in range(b):
+                    context_j = repeat(
+                        context[j],
+                        't l con -> (t r) l con', r=(h * w) // t, t=t).contiguous()
+                    ## note: causal mask will not applied in cross-attention case
+                    x[j] = block(x[j], context=context_j, is_imgbatch=is_imgbatch)
+
+        if self.use_linear:
+            x = self.proj_out(x)
+            x = rearrange(x, 'b (h w) t c -> b c t h w', h=h, w=w).contiguous()
+        if not self.use_linear:
+            x = rearrange(x, 'b hw t c -> (b hw) c t').contiguous()
+            x = self.proj_out(x)
+            x = rearrange(x, '(b h w) c t -> b c t h w', b=b, h=h, w=w).contiguous()
+
+        return x + x_in
+    
+
+class GEGLU(nn.Module):
+    """Gated linear unit with a GELU gate: one projection yields a value half and a gate half."""
+
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        # a single linear layer emits value and gate side by side along the last axis
+        self.proj = nn.Linear(dim_in, 2 * dim_out)
+
+    def forward(self, x):
+        """Return value * gelu(gate), where both halves come from `self.proj(x)`."""
+        projected = self.proj(x)
+        value, gate = projected.chunk(2, dim=-1)
+        return value * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    """
+    Position-wise MLP: input projection, dropout, output projection.
+
+    :param dim: input feature size.
+    :param dim_out: output feature size; defaults to `dim`.
+    :param mult: hidden width is int(dim * mult).
+    :param glu: use a GEGLU input projection instead of Linear + GELU.
+    :param dropout: dropout probability between the two projections.
+    """
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+        super().__init__()
+        hidden = int(dim * mult)
+        dim_out = default(dim_out, dim)
+        if glu:
+            project_in = GEGLU(dim, hidden)
+        else:
+            project_in = nn.Sequential(nn.Linear(dim, hidden), nn.GELU())
+
+        self.net = nn.Sequential(project_in, nn.Dropout(dropout), nn.Linear(hidden, dim_out))
+
+    def forward(self, x):
+        return self.net(x)
+
+
+class LinearAttention(nn.Module):
+    """Multi-head attention on 2D feature maps that combines keys with values before applying queries."""
+
+    def __init__(self, dim, heads=4, dim_head=32):
+        super().__init__()
+        self.heads = heads
+        hidden_dim = heads * dim_head
+        # one bias-free 1x1 conv emits q, k and v stacked along the channel axis
+        self.to_qkv = nn.Conv2d(dim, 3 * hidden_dim, 1, bias = False)
+        self.to_out = nn.Conv2d(hidden_dim, dim, 1)
+
+    def forward(self, x):
+        """Map `x` of layout (b, c, h, w) to a tensor of the same layout."""
+        _, _, height, width = x.shape
+        packed = rearrange(self.to_qkv(x), 'b (qkv heads c) h w -> qkv b heads c (h w)', heads = self.heads, qkv=3)
+        q, k, v = packed
+        # keys are normalized over the flattened spatial axis
+        k = k.softmax(dim=-1)
+        # (d x e) key/value summary per head, then read out with the queries
+        context = torch.einsum('bhdn,bhen->bhde', k, v)
+        out = torch.einsum('bhde,bhdn->bhen', context, q)
+        out = rearrange(out, 'b heads c (h w) -> b (heads c) h w', heads=self.heads, h=height, w=width)
+        return self.to_out(out)
+
+
+class SpatialSelfAttention(nn.Module):
+    """Single-head self-attention over the spatial positions of a (b, c, h, w) map, with a residual add."""
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        def pointwise_conv():
+            # 1x1 convolution that keeps the channel count
+            return torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = pointwise_conv()
+        self.k = pointwise_conv()
+        self.v = pointwise_conv()
+        self.proj_out = pointwise_conv()
+
+    def forward(self, x):
+        """Return `x` plus the projected attention output."""
+        normed = self.norm(x)
+        q = self.q(normed)
+        k = self.k(normed)
+        v = self.v(normed)
+
+        # logits between every pair of spatial positions, scaled by c ** -0.5
+        b, c, h, w = q.shape
+        q = rearrange(q, 'b c h w -> b (h w) c')
+        k = rearrange(k, 'b c h w -> b c (h w)')
+        attn = torch.einsum('bij,bjk->bik', q, k)
+        attn = attn * (int(c)**(-0.5))
+        attn = torch.nn.functional.softmax(attn, dim=2)
+
+        # weight the values with the transposed attention matrix
+        v = rearrange(v, 'b c h w -> b c (h w)')
+        attn = rearrange(attn, 'b i j -> b j i')
+        out = torch.einsum('bij,bjk->bik', v, attn)
+        out = rearrange(out, 'b c (h w) -> b c h w', h=h)
+        return x + self.proj_out(out)
+
+
+class CrossAttentionProcessor(nn.Module):
+    """
+    Attention routine executed on behalf of an attention layer `attn`.
+
+    The processor owns no projections: it reads `to_q`/`to_k`/`to_v`/`to_out`,
+    `heads`, `dim_head`, `scale` and the relative-position attributes from the
+    `attn` object it is handed. `__call__` picks the xformers path when
+    XFORMERS_IS_AVAILBLE is set and the einsum path otherwise.
+    """
+    def forward(self, attn, x, context=None, mask=None, is_imgbatch=False, **kwargs):
+        """
+        Einsum attention path.
+
+        :param attn: attention layer supplying the projections and settings.
+        :param x: query input laid out as (batch, tokens, features).
+        :param context: key/value input; falls back to `x` when None.
+        :param mask: not supported; a non-None value raises NotImplementedError.
+        :param is_imgbatch: when True the relative-position terms are skipped.
+        :param kwargs: extra keywords (e.g. emb, scale_scalar) are accepted and ignored,
+                       matching `efficient_forward`.
+        """
+        h = attn.heads
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        sim = torch.einsum('b i d, b j d -> b i j', q, k) * attn.scale
+        if attn.relative_position and not is_imgbatch:
+            len_q, len_k, len_v = q.shape[1], k.shape[1], v.shape[1]
+            k2 = attn.relative_position_k(len_q, len_k)
+            sim2 = einsum('b t d, t s d -> b t s', q, k2) * attn.scale # TODO check
+            sim += sim2
+        del q, k
+
+        if exists(mask):
+            raise NotImplementedError
+
+        # attention, what we cannot get enough of
+        sim = sim.softmax(dim=-1)
+
+        out = torch.einsum('b i j, b j d -> b i d', sim, v)
+        if attn.relative_position and not is_imgbatch:
+            v2 = attn.relative_position_v(len_q, len_v)
+            out2 = einsum('b t s, t s d -> b t d', sim, v2) # TODO check
+            out += out2
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+        return attn.to_out(out)
+
+    def efficient_forward(self, attn, x, context=None, mask=None, **kwargs):
+        """xformers attention path; relative-position terms are not applied and `mask` is unsupported."""
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        b, _, _ = q.shape
+
+        # (b, n, heads*dim_head) -> (b*heads, n, dim_head)
+        q, k, v = map(
+            lambda t: t.unsqueeze(3)
+            .reshape(b, t.shape[1], attn.heads, attn.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b * attn.heads, t.shape[1], attn.dim_head)
+            .contiguous(),
+            (q, k, v),
+        )
+        # actually compute the attention, what we cannot get enough of
+        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=None)
+
+        if exists(mask):
+            raise NotImplementedError
+        # (b*heads, n, dim_head) -> (b, n, heads*dim_head)
+        out = (
+            out.unsqueeze(0)
+            .reshape(b, attn.heads, out.shape[1], attn.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, out.shape[1], attn.heads * attn.dim_head)
+        )
+        return attn.to_out(out)
+
+    def __call__(self, *args, **kwargs):
+        # fix: positional arguments are accepted too, because the patched attention
+        # forward invokes the processor as processor(attn, x, context, mask, **kwargs)
+        if XFORMERS_IS_AVAILBLE:
+            return self.efficient_forward(*args, **kwargs)
+        else:
+            return self.forward(*args, **kwargs)
+        
+
+def register_attn_processor(unet):
+    """
+    Attach a CrossAttentionProcessor to `attn2` of every BasicTransformerBlock
+    found inside a SpatialTransformer, and route that layer's forward through it.
+
+    :param unet: module whose children are searched recursively.
+    :return: dict mapping "<dotted module path>.attn2.processor" to the new processor.
+    """
+    registered = {}
+
+    def make_forward(attn):
+        assert hasattr(attn, "processor")
+
+        def forward(x, context=None, mask=None, **kwargs):
+            # the processor is looked up at call time, so later replacements take effect
+            return attn.processor(attn, x, context, mask, **kwargs)
+
+        return forward
+
+    def patch_blocks(module, prefix):
+        """
+        find and register cross attention in the SpatialTransformer block
+        assert only one cross attention in each block
+        """
+        if module.__class__.__name__ != 'BasicTransformerBlock':
+            if hasattr(module, 'children'):
+                for child_name, child in module.named_children():
+                    patch_blocks(child, f"{prefix}.{child_name}")
+            return
+        key = f"{prefix}.attn2.processor"
+        module.attn2.processor = CrossAttentionProcessor()
+        module.attn2.forward = make_forward(module.attn2)
+        registered[key] = module.attn2.processor
+        print(f"Register Attention Processor in {key} successfully!")
+
+    def find_spatial(module, prefix):
+        # descend until a SpatialTransformer is reached, then patch the blocks inside it
+        if isinstance(module, SpatialTransformer):
+            patch_blocks(module, prefix)
+        elif hasattr(module, 'children'):
+            for child_name, child in module.named_children():
+                find_spatial(child, f"{prefix}.{child_name}")
+
+    for top_name, top_module in unet.named_children():
+        find_spatial(top_module, top_name)
+
+    print("==========================================")
+    print(f"Totally {len(registered)} processors are registered successfully! hiahiahia")
+
+    return registered
+
+
+def set_attn_processor(unet, processor):
+    """
+    Replace the `processor` attribute of every descendant module that has one.
+
+    :param unet: module whose children are searched recursively; the search does
+                 not descend into a module once it has a `processor` attribute.
+    :param processor: mapping from "<dotted module path>.processor" to the new
+                      processor; a missing key raises KeyError.
+    """
+    def assign(module, prefix):
+        if not hasattr(module, "processor"):
+            for child_name, child in module.named_children():
+                assign(child, f"{prefix}.{child_name}")
+            return
+        key = f"{prefix}.processor"
+        module.processor = processor[key]
+        print(f"Set New Attention Processor in {key} successfully!")
+
+    for top_name, top_module in unet.named_children():
+        assign(top_module, top_name)
+
+    return
+
+
+def get_attn_processor(unet):
+    """
+    Collect the `processor` attribute of every descendant module that has one.
+
+    :param unet: module whose children are searched recursively; the search does
+                 not descend into a module once it has a `processor` attribute.
+    :return: dict mapping "<dotted module path>.processor" to the processor found there.
+    """
+    def collect(module, prefix):
+        if hasattr(module, "processor"):
+            yield f"{prefix}.processor", module.processor
+            return
+        for child_name, child in module.named_children():
+            yield from collect(child, f"{prefix}.{child_name}")
+
+    found = {}
+    for top_name, top_module in unet.named_children():
+        found.update(collect(top_module, top_name))
+
+    return found
+
+
+class DualCrossAttnProcessor(nn.Module):
+    """
+    Attention processor that adds a second cross-attention over extra context tokens.
+
+    `__call__` treats the first 77 context tokens as the regular context and any
+    tokens after them as `context_style`. The regular context uses the host layer's
+    `to_k`/`to_v`; the style tokens use this processor's own `to_k_style`/`to_v_style`.
+    Both attention results share the host layer's queries and are summed before
+    `attn.to_out`.
+
+    :param context_dim: feature size of the style context tokens.
+    :param inner_dim: output size of the style key/value projections.
+    :param scale: stored as `self.scale`; not read by the methods of this class.
+    :param state_dict: optional dict with entries 'k' and 'v' holding the state
+                       dicts for `to_k_style` and `to_v_style` (loaded strictly).
+    :param use_norm: when True `norm_style` is a LayerNorm over `inner_dim`,
+                     otherwise an identity function.
+    :param layer_idx: stored as `self.layer_idx`; not read by the methods of this class.
+    """
+    def __init__(self, context_dim, inner_dim, scale=1.0, state_dict=None, use_norm=False, layer_idx=0):
+        super().__init__()
+        self.to_k_style = nn.Linear(context_dim, inner_dim, bias=False)
+        self.to_v_style = nn.Linear(context_dim, inner_dim, bias=False)
+        self.scale = scale
+        self.layer_idx = layer_idx
+
+        if state_dict is not None:
+            self.to_k_style.load_state_dict(state_dict['k'], strict=True)
+            self.to_v_style.load_state_dict(state_dict['v'], strict=True)
+
+        self.use_norm = use_norm
+        if use_norm:
+            self.norm_style = nn.LayerNorm(inner_dim)
+        else:
+            self.norm_style = lambda x: x
+
+    def forward(self, attn, x, context=None, mask=None, context_style=None, **kwargs):
+        """
+        Einsum attention path.
+
+        :param attn: attention layer supplying to_q/to_k/to_v/to_out, heads and scale.
+        :param x: query input laid out as (batch, tokens, features).
+        :param context: regular key/value input; falls back to `x` when None.
+        :param mask: optional (batch, i, j) mask applied to the regular branch only;
+                     entries > 0.5 are kept.
+        :param context_style: optional style tokens for the second attention.
+        """
+        h = attn.heads
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        sim = torch.einsum('b i d, b j d -> b i j', q, k) * attn.scale
+
+        if exists(mask):
+            ## feasible for causal attention mask only
+            max_neg_value = -torch.finfo(sim.dtype).max
+            mask = repeat(mask, 'b i j -> (b h) i j', h=h)
+            sim.masked_fill_(~(mask>0.5), max_neg_value)
+
+        # attention, what we cannot get enough of
+        sim = sim.softmax(dim=-1)
+
+        out = torch.einsum('b i j, b j d -> b i d', sim, v)
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+
+        # for another cross attention
+        if context_style is not None:
+            k_style = self.to_k_style(context_style)
+            v_style = self.to_v_style(context_style)
+
+            k_style, v_style = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (k_style, v_style))
+            # fix: scale the style logits like the regular branch above; the xformers
+            # path in efficient_forward also scales its logits (library default),
+            # so leaving them unscaled made the two code paths disagree
+            sim_style = torch.einsum('b i d, b j d -> b i j', q, k_style) * attn.scale
+            sim_style = sim_style.softmax(dim=-1)
+            out_style = torch.einsum('b i j, b j d -> b i d', sim_style, v_style)
+            out_style = rearrange(out_style, '(b h) n d -> b n (h d)', h=h)
+
+            out = out + out_style
+
+        return attn.to_out(out)
+
+    def efficient_forward(self, attn, x, context=None, mask=None, context_style=None, **kwargs):
+        """xformers attention path with the same inputs as `forward`; `mask` is not applied here."""
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        b, _, _ = q.shape
+
+        # (b, n, heads*dim_head) -> (b*heads, n, dim_head)
+        q, k, v = map(
+            lambda t: t.unsqueeze(3)
+            .reshape(b, t.shape[1], attn.heads, attn.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b * attn.heads, t.shape[1], attn.dim_head)
+            .contiguous(),
+            (q, k, v),
+        )
+        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=None)
+
+        # (b*heads, n, dim_head) -> (b, n, heads*dim_head)
+        out = (
+            out.unsqueeze(0)
+            .reshape(b, attn.heads, out.shape[1], attn.dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, out.shape[1], attn.heads * attn.dim_head)
+        )
+
+
+        if context_style is not None:
+            k_style = self.to_k_style(context_style)
+            v_style = self.to_v_style(context_style)
+
+            k_style, v_style = map(
+                lambda t: t.unsqueeze(3)
+                .reshape(b, t.shape[1], attn.heads, attn.dim_head)
+                .permute(0, 2, 1, 3)
+                .reshape(b * attn.heads, t.shape[1], attn.dim_head)
+                .contiguous(),
+                (k_style, v_style),
+            )
+            out_style = xformers.ops.memory_efficient_attention(q, k_style, v_style, attn_bias=None, op=None)
+
+            out_style = (
+                out_style.unsqueeze(0)
+                .reshape(b, attn.heads, out_style.shape[1], attn.dim_head)
+                .permute(0, 2, 1, 3)
+                .reshape(b, out_style.shape[1], attn.heads * attn.dim_head)
+            )
+
+            out = out + out_style
+
+        return attn.to_out(out)
+
+    def __call__(self, attn, x, context=None, mask=None, **kwargs):
+        """Split `context` into regular and style tokens, then dispatch to the available attention path."""
+        # separate the context: the first 77 tokens are the regular context, the rest is style
+        # fix: a missing context no longer crashes on `.shape`; both forwards fall back to `x`
+        if context is None or context.shape[1] == 77:
+            context_style = None
+        else:
+            context_style = context[:, 77:, :]
+            context = context[:, :77, :]
+
+        if XFORMERS_IS_AVAILBLE:
+            return self.efficient_forward(attn, x, context=context, mask=mask, context_style=context_style, **kwargs)
+        else:
+            return self.forward(attn, x, context=context, mask=mask, context_style=context_style, **kwargs)
+
+
+
+class DualCrossAttnProcessorAS(DualCrossAttnProcessor):
+    """Dual cross-attention processor with an optional per-layer style gain.
+
+    Both entry points run the regular cross-attention of ``attn`` on
+    ``context`` and, when ``context_style`` is given, a second attention over
+    the style tokens using ``self.to_k_style`` / ``self.to_v_style``.  The
+    style result is optionally normalised (``self.use_norm``), multiplied by
+    ``self.scale`` and by the per-layer gain derived from ``scale_scalar``,
+    then added to the regular attention output before ``attn.to_out``.
+
+    ``mask`` and extra ``kwargs`` are accepted for interface compatibility
+    but are not used by either path.
+    """
+
+    def _style_gain(self, scale_scalar):
+        """Return the multiplier for the style branch of this layer."""
+        if scale_scalar is None:
+            return 1.0
+        # Pick this layer's entry, then add an axis for broadcasting.
+        # NOTE(review): assumes scale_scalar is (batch, n_layers, 1) -- confirm.
+        gain = 1 + scale_scalar[:, self.layer_idx]
+        return gain[:, None]
+
+    @staticmethod
+    def _split_heads(t, b, heads, dim_head):
+        """Reshape (b, n, heads * dim_head) -> (b * heads, n, dim_head)."""
+        n = t.shape[1]
+        return (
+            t.reshape(b, n, heads, dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b * heads, n, dim_head)
+            .contiguous()
+        )
+
+    @staticmethod
+    def _merge_heads(t, b, heads, dim_head):
+        """Reshape (b * heads, n, dim_head) -> (b, n, heads * dim_head)."""
+        n = t.shape[1]
+        return (
+            t.reshape(b, heads, n, dim_head)
+            .permute(0, 2, 1, 3)
+            .reshape(b, n, heads * dim_head)
+        )
+
+    def forward(self, attn, x, context=None, mask=None, context_style=None, scale_scalar=None, **kwargs):
+        """Plain PyTorch attention path (no xformers)."""
+        h = attn.heads
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
+        sim = torch.einsum('b i d, b j d -> b i j', q, k) * attn.scale
+        sim = sim.softmax(dim=-1)
+
+        out = torch.einsum('b i j, b j d -> b i d', sim, v)
+        out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
+
+        # second cross attention over the style tokens
+        if context_style is not None:
+            k_style = self.to_k_style(context_style)
+            v_style = self.to_v_style(context_style)
+
+            k_style, v_style = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (k_style, v_style))
+            # Scale the style logits exactly like the text logits above.  The
+            # xformers path gets this scaling implicitly (its default softmax
+            # scale is 1/sqrt(head_dim)); without it the two paths disagree.
+            # NOTE(review): assumes attn.scale == dim_head ** -0.5 -- confirm.
+            sim_style = torch.einsum('b i d, b j d -> b i j', q, k_style) * attn.scale
+            sim_style = sim_style.softmax(dim=-1)
+            out_style = torch.einsum('b i j, b j d -> b i d', sim_style, v_style)
+            out_style = rearrange(out_style, '(b h) n d -> b n (h d)', h=h)
+
+            scale = self._style_gain(scale_scalar)
+
+            if self.use_norm:
+                out_style = self.norm_style(out_style)
+
+            out = out + scale * out_style * self.scale
+
+        return attn.to_out(out)
+
+    def efficient_forward(self, attn, x, context=None, mask=None, context_style=None, scale_scalar=None, **kwargs):
+        """Memory-efficient attention path backed by xformers."""
+        q = attn.to_q(x)
+        context = default(context, x)
+        k = attn.to_k(context)
+        v = attn.to_v(context)
+
+        b, _, _ = q.shape
+        heads, dim_head = attn.heads, attn.dim_head
+
+        q, k, v = (self._split_heads(t, b, heads, dim_head) for t in (q, k, v))
+        out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=None)
+        out = self._merge_heads(out, b, heads, dim_head)
+
+        if context_style is not None:
+            k_style = self._split_heads(self.to_k_style(context_style), b, heads, dim_head)
+            v_style = self._split_heads(self.to_v_style(context_style), b, heads, dim_head)
+            out_style = xformers.ops.memory_efficient_attention(q, k_style, v_style, attn_bias=None, op=None)
+            out_style = self._merge_heads(out_style, b, heads, dim_head)
+
+            scale = self._style_gain(scale_scalar)
+
+            if self.use_norm:
+                out_style = self.norm_style(out_style)
+
+            out = out + scale * out_style * self.scale
+
+        return attn.to_out(out)
+
+  
+    
\ No newline at end of file
diff --git a/lvdm/modules/encoders/__pycache__/adapter.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/adapter.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c74c66a6e0ba49b33366a219c994837d555ace8b
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/adapter.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/__pycache__/arch_transformer.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/arch_transformer.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c3363f8e92581cf0bfca9f3c035c0d35b37c3a8
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/arch_transformer.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/__pycache__/condition.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/condition.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..821e12905a9b5a5842f664e90656ecb5214ddd2a
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/condition.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/__pycache__/condition2.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/condition2.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8b56b7c3babf80a29f17b97ff72aa9edd7a80d0
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/condition2.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/__pycache__/ip_resampler.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/ip_resampler.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..251c460d1f789ed1e49e910feed65c7e34dd138c
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/ip_resampler.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/__pycache__/transformers.cpython-39.pyc b/lvdm/modules/encoders/__pycache__/transformers.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9c9135826d2b5562a843d456ac4d1836f43aeb9
Binary files /dev/null and b/lvdm/modules/encoders/__pycache__/transformers.cpython-39.pyc differ
diff --git a/lvdm/modules/encoders/adapter.py b/lvdm/modules/encoders/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb793073bf55ad29a5f673f91cce4323c8560ec9
--- /dev/null
+++ b/lvdm/modules/encoders/adapter.py
@@ -0,0 +1,190 @@
+import torch
+import torch.nn as nn
+from collections import OrderedDict
+from lvdm.basics import (
+    zero_module,
+    conv_nd,
+    avg_pool_nd
+)
+from einops import rearrange
+from lvdm.modules.attention import register_attn_processor, set_attn_processor, DualCrossAttnProcessor, get_attn_processor
+from lvdm.modules.attention import DualCrossAttnProcessorAS
+from utils.utils import instantiate_from_config
+
+from lvdm.modules.encoders.arch_transformer import Transformer
+
+
+class StyleTransformer(nn.Module):
+    """Summarise a token sequence into ``num_tokens`` projected style tokens.
+
+    Learned embeddings are prepended to the input, the joint sequence is run
+    through a ``Transformer``, and only the positions of the learned
+    embeddings are kept, layer-normalised and multiplied by ``proj``.
+    """
+
+    def __init__(self, in_dim=1024, out_dim=1024, num_heads=8, num_tokens=4, n_layers=2):
+        super().__init__()
+        self.num_tokens = num_tokens
+        init_std = in_dim ** -0.5
+        # learned query tokens, shared across the batch
+        self.style_emb = nn.Parameter(init_std * torch.randn(1, num_tokens, in_dim))
+        self.transformer_blocks = Transformer(width=in_dim, layers=n_layers, heads=num_heads)
+        self.ln1 = nn.LayerNorm(in_dim)
+        self.ln2 = nn.LayerNorm(in_dim)
+        self.proj = nn.Parameter(init_std * torch.randn(in_dim, out_dim))
+
+    def forward(self, x):
+        batch_size = x.shape[0]
+        queries = self.style_emb.repeat(batch_size, 1, 1)
+        tokens = self.ln1(torch.cat([queries, x], dim=1))
+
+        # the transformer is fed with the first two axes swapped
+        tokens = self.transformer_blocks(tokens.permute(1, 0, 2)).permute(1, 0, 2)
+
+        # keep only the positions that held the learned queries
+        style_tokens = self.ln2(tokens[:, :self.num_tokens, :])
+        return style_tokens @ self.proj
+
+
+class ScaleEncoder(nn.Module):
+    """Predict ``num_tokens`` values of size ``out_dim`` in (-1, 1) from a token sequence.
+
+    Learned embeddings are prepended to the input and processed by a
+    ``Transformer``; the positions of the learned embeddings are then
+    layer-normalised and passed through a small MLP ending in ``Tanh``.
+    """
+
+    def __init__(self, in_dim=1024, out_dim=1, num_heads=8, num_tokens=16, n_layers=2):
+        super().__init__()
+        self.num_tokens = num_tokens
+        init_std = in_dim ** -0.5
+        # learned query tokens, shared across the batch
+        self.scale_emb = nn.Parameter(init_std * torch.randn(1, num_tokens, in_dim))
+        self.transformer_blocks = Transformer(width=in_dim, layers=n_layers, heads=num_heads)
+        self.ln1 = nn.LayerNorm(in_dim)
+        self.ln2 = nn.LayerNorm(in_dim)
+
+        head_layers = [
+            nn.Linear(in_dim, 32),
+            nn.GELU(),
+            nn.Linear(32, out_dim),
+            nn.Tanh(),
+        ]
+        self.out = nn.Sequential(*head_layers)
+
+    def forward(self, x):
+        batch_size = x.shape[0]
+        queries = self.scale_emb.repeat(batch_size, 1, 1)
+        tokens = self.ln1(torch.cat([queries, x], dim=1))
+
+        # the transformer is fed with the first two axes swapped
+        tokens = self.transformer_blocks(tokens.permute(1, 0, 2)).permute(1, 0, 2)
+
+        # keep only the positions that held the learned queries
+        query_out = self.ln2(tokens[:, :self.num_tokens, :])
+        return self.out(query_out)
+
+
+class DropPath(nn.Module):
+    r"""Zero out whole batch entries at random, without rescaling the rest.
+
+    ``zero`` marks entries that are always dropped and ``keep`` marks entries
+    that are never picked by the random draw.  Outside of training mode the
+    inputs are returned unchanged.
+    """
+    def __init__(self, p):
+        super().__init__()
+        self.p = p
+
+    def forward(self, *args, zero=None, keep=None):
+        single = len(args) == 1
+        if not self.training:
+            return args[0] if single else args
+
+        first = args[0]
+        batch = first.size(0)
+        # how many entries to drop: one Bernoulli(p) trial per batch entry
+        num_drop = (torch.rand(batch) < self.p).sum()
+
+        # entries that the random draw is allowed to pick
+        candidates = first.new_ones(batch, dtype=torch.bool)
+        if keep is not None:
+            candidates[keep] = False
+        if zero is not None:
+            candidates[zero] = False
+
+        # pick up to ``num_drop`` candidates in random order
+        pool = torch.where(candidates)[0]
+        dropped = pool[torch.randperm(len(pool))[:num_drop]]
+        if zero is not None:
+            forced = torch.where(zero)[0]
+            dropped = torch.cat([dropped, forced], dim=0)
+
+        # 1 for surviving entries, 0 for dropped ones
+        factor = first.new_ones(batch)
+        factor[dropped] = 0.0
+        results = tuple(t * self.broadcast(factor, t) for t in args)
+        return results[0] if single else results
+
+    def broadcast(self, src, dst):
+        """View ``src`` as ``(B, 1, ..., 1)`` so it broadcasts against ``dst``."""
+        assert src.size(0) == dst.size(0)
+        trailing_ones = (1, ) * (dst.ndim - 1)
+        return src.view((dst.size(0), ) + trailing_ones)
+    
+
+class ImageContext(nn.Module):
+    """Map a ``[B, context_dim]`` vector to ``token_num`` context tokens.
+
+    A two-layer MLP expands the input to ``token_num * context_dim`` features,
+    ``DropPath(0.5)`` is applied to the result, and the features are split
+    into ``token_num`` tokens of width ``context_dim``.
+    """
+
+    def __init__(self, width=1024, context_dim=768, token_num=1):
+        super().__init__()
+        self.width = width
+        self.token_num = token_num
+        self.context_dim = context_dim
+
+        mlp_layers = [
+            nn.Linear(context_dim, width),
+            nn.SiLU(),
+            nn.Linear(width, token_num * context_dim),
+        ]
+        self.fc = nn.Sequential(*mlp_layers)
+        self.drop_path = DropPath(0.5)
+
+    def forward(self, x):
+        # x shape [B, C]
+        features = self.fc(x)
+        features = self.drop_path(features)
+        return rearrange(features, 'b (n c) -> b n c', n=self.token_num)
+
+
+class StyleAdapterDualAttnAS(nn.Module):
+    """Style adapter that installs ``DualCrossAttnProcessorAS`` processors on a UNet.
+
+    Holds an image-context model and a scale predictor (both built from
+    configs) and, after ``create_cross_attention_adapter``, a ``ModuleDict``
+    named ``kv_attn_layers`` with one processor per registered attention key.
+    """
+
+    def __init__(self, image_context_config, scale_predictor_config, scale=1.0, use_norm=False, time_embed_dim=1024, mid_dim=32):
+        super().__init__()
+        self.image_context_model = instantiate_from_config(image_context_config)
+        self.scale_predictor = instantiate_from_config(scale_predictor_config)
+        self.scale = scale
+        self.use_norm = use_norm
+        self.time_embed_dim = time_embed_dim
+        self.mid_dim = mid_dim
+
+    def create_cross_attention_adapter(self, unet):
+        """Create one processor per attention key of ``unet`` and install them.
+
+        Each processor receives the UNet's existing ``to_k`` / ``to_v``
+        weights and its position in the key order as ``layer_idx``.
+        """
+        ori_processor = register_attn_processor(unet)
+        # Build the state dict once: every call to ``state_dict()`` walks the
+        # whole module tree, so calling it twice per key is wasted work.
+        unet_state = unet.state_dict()
+        dual_attn_processor = {}
+        for idx, key in enumerate(ori_processor):
+            # Drop the 10-character suffix of the processor key to get the
+            # attention module's prefix in the state dict.
+            # NOTE(review): presumably the suffix is '.processor' -- confirm.
+            prefix = key[:-10]
+            kv_state_dicts = {
+                'k': {'weight': unet_state[prefix + '.to_k.weight']},
+                'v': {'weight': unet_state[prefix + '.to_v.weight']},
+            }
+            # nn.Linear-style weight layout: (out_features, in_features)
+            inner_dim, context_dim = kv_state_dicts['k']['weight'].shape[:2]
+            print(key, context_dim, inner_dim)
+
+            dual_attn_processor[key] = DualCrossAttnProcessorAS(
+                context_dim=context_dim,
+                inner_dim=inner_dim,
+                state_dict=kv_state_dicts,
+                scale=self.scale,
+                use_norm=self.use_norm,
+                layer_idx=idx,
+            )
+
+        set_attn_processor(unet, dual_attn_processor)
+
+        # ModuleDict keys may not contain '.', so store them with '_' instead.
+        dual_attn_processor = {key.replace('.', '_'): value for key, value in dual_attn_processor.items()}
+        self.add_module('kv_attn_layers', nn.ModuleDict(dual_attn_processor))
+
+    def set_cross_attention_adapter(self, unet):
+        """Install the processors stored in ``kv_attn_layers`` on ``unet``."""
+        dual_attn_processor = get_attn_processor(unet)
+        for key in dual_attn_processor.keys():
+            module_key = key.replace('.', '_')
+            dual_attn_processor[key] = self.kv_attn_layers[module_key]
+            print('set', key, module_key)
+        set_attn_processor(unet, dual_attn_processor)
+
+    def forward(self, x):
+        # x shape [B, C]
+        return self.image_context_model(x)
diff --git a/lvdm/modules/encoders/arch_transformer.py b/lvdm/modules/encoders/arch_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..931774891f7830381ee8725b9e4bc5774b9cd4ac
--- /dev/null
+++ b/lvdm/modules/encoders/arch_transformer.py
@@ -0,0 +1,252 @@
+from collections import OrderedDict
+import math
+from typing import Callable, Optional, Sequence, Tuple
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.utils.checkpoint import checkpoint
+
+class LayerNormFp32(nn.LayerNorm):
+    """LayerNorm that normalises a float32 copy of the input and returns the input's dtype."""
+
+    def forward(self, x: torch.Tensor):
+        as_fp32 = x.to(torch.float32)
+        normed = F.layer_norm(as_fp32, self.normalized_shape, self.weight, self.bias, self.eps)
+        return normed.to(x.dtype)
+
+
+class LayerNorm(nn.LayerNorm):
+    """LayerNorm whose result is cast back to the dtype of the input."""
+
+    def forward(self, x: torch.Tensor):
+        input_dtype = x.dtype
+        normed = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps)
+        return normed.to(input_dtype)
+    
+
+class QuickGELU(nn.Module):
+    """Sigmoid-based GELU approximation: ``x * sigmoid(1.702 * x)``."""
+    # NOTE This is slower than nn.GELU or nn.SiLU and uses more GPU memory
+    def forward(self, x: torch.Tensor):
+        gate = torch.sigmoid(1.702 * x)
+        return x * gate
+    
+
+class LayerScale(nn.Module):
+    """Multiply the input by a learnable per-channel vector ``gamma``.
+
+    ``gamma`` has ``dim`` entries, all initialised to ``init_values``.  With
+    ``inplace=True`` the input tensor itself is modified and returned.
+    """
+
+    def __init__(self, dim, init_values=1e-5, inplace=False):
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+
+    def forward(self, x):
+        if self.inplace:
+            return x.mul_(self.gamma)
+        return x * self.gamma
+
+
+class PatchDropout(nn.Module):
+    """
+    Randomly keep a subset of tokens during training.
+    https://arxiv.org/abs/2212.00794
+
+    A fraction ``1 - prob`` of the tokens (at least one) is kept per sample.
+    With ``exclude_first_token`` the first token is always kept and is not
+    counted among the droppable tokens.  In eval mode, or with ``prob == 0``,
+    the input is returned unchanged.
+    """
+
+    def __init__(self, prob, exclude_first_token=True):
+        super().__init__()
+        assert 0 <= prob < 1.
+        self.prob = prob
+        self.exclude_first_token = exclude_first_token  # exclude CLS token
+
+    def forward(self, x):
+        if self.prob == 0. or not self.training:
+            return x
+
+        if self.exclude_first_token:
+            # set the first token aside; only the remaining ones may be dropped
+            cls_tokens, x = x[:, :1], x[:, 1:]
+        else:
+            cls_tokens = torch.jit.annotate(torch.Tensor, x[:, :1])
+
+        batch, num_tokens = x.size()[0], x.size()[1]
+
+        # column vector of sample indices, paired with the kept token indices
+        row_index = torch.arange(batch)[..., None]
+
+        num_patches_keep = max(1, int(num_tokens * (1 - self.prob)))
+
+        # the positions of the largest random scores are the tokens to keep
+        scores = torch.randn(batch, num_tokens)
+        keep_index = scores.topk(num_patches_keep, dim=-1).indices
+
+        x = x[row_index, keep_index]
+
+        if self.exclude_first_token:
+            x = torch.cat((cls_tokens, x), dim=1)
+
+        return x
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention on input of shape ``(L, N, C)``.
+
+    Options:
+      * ``scaled_cosine``: use cosine similarity between normalised queries
+        and keys, multiplied by a learned per-head logit scale that is
+        clamped (in log space) to ``logit_scale_max``.
+      * ``scale_heads``: multiply each head's output by a learned scalar
+        before the output projection.
+      * ``attn_drop`` / ``proj_drop``: dropout on the attention weights and
+        on the projected output.
+    """
+
+    def __init__(
+            self,
+            dim,
+            num_heads=8,
+            qkv_bias=True,
+            scaled_cosine=False,
+            scale_heads=False,
+            logit_scale_max=math.log(1. / 0.01),
+            attn_drop=0.,
+            proj_drop=0.
+    ):
+        super().__init__()
+        self.scaled_cosine = scaled_cosine
+        self.scale_heads = scale_heads
+        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim ** -0.5
+        self.logit_scale_max = logit_scale_max
+
+        # keeping in_proj in this form (instead of nn.Linear) to match weight scheme of original
+        self.in_proj_weight = nn.Parameter(torch.randn((dim * 3, dim)) * self.scale)
+        if qkv_bias:
+            self.in_proj_bias = nn.Parameter(torch.zeros(dim * 3))
+        else:
+            self.in_proj_bias = None
+
+        if self.scaled_cosine:
+            self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1))))
+        else:
+            self.logit_scale = None
+        self.attn_drop = nn.Dropout(attn_drop)
+        if self.scale_heads:
+            self.head_scale = nn.Parameter(torch.ones((num_heads, 1, 1)))
+        else:
+            self.head_scale = None
+        self.out_proj = nn.Linear(dim, dim)
+        self.out_drop = nn.Dropout(proj_drop)
+
+    def forward(self, x, attn_mask: Optional[torch.Tensor] = None):
+        """Apply self-attention to ``x`` of shape ``(L, N, C)``.
+
+        A boolean ``attn_mask`` is converted to an additive mask with
+        ``-inf`` at the ``True`` positions; any other mask is added to the
+        attention logits as is.  The result has the same shape as ``x``.
+        """
+        L, N, C = x.shape
+        q, k, v = F.linear(x, self.in_proj_weight, self.in_proj_bias).chunk(3, dim=-1)
+        # (L, N, C) -> (N * num_heads, L, head_dim)
+        q = q.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
+        k = k.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
+        v = v.contiguous().view(L, N * self.num_heads, -1).transpose(0, 1)
+
+        if self.logit_scale is not None:
+            attn = torch.bmm(F.normalize(q, dim=-1), F.normalize(k, dim=-1).transpose(-1, -2))
+            logit_scale = torch.clamp(self.logit_scale, max=self.logit_scale_max).exp()
+            attn = attn.view(N, self.num_heads, L, L) * logit_scale
+            attn = attn.view(-1, L, L)
+        else:
+            q = q * self.scale
+            attn = torch.bmm(q, k.transpose(-1, -2))
+
+        if attn_mask is not None:
+            if attn_mask.dtype == torch.bool:
+                new_attn_mask = torch.zeros_like(attn_mask, dtype=q.dtype)
+                new_attn_mask.masked_fill_(attn_mask, float("-inf"))
+                attn_mask = new_attn_mask
+            attn += attn_mask
+
+        attn = attn.softmax(dim=-1)
+        attn = self.attn_drop(attn)
+
+        x = torch.bmm(attn, v)  # (N * num_heads, L, head_dim)
+        if self.head_scale is not None:
+            # The per-head tensor is head_dim wide, not C wide; viewing it
+            # with C fails for any num_heads > 1.
+            x = x.view(N, self.num_heads, L, self.head_dim) * self.head_scale
+            x = x.view(-1, L, self.head_dim)
+        x = x.transpose(0, 1).reshape(L, N, C)
+        x = self.out_proj(x)
+        x = self.out_drop(x)
+        return x
+    
+
+class ResidualAttentionBlock(nn.Module):
+    """Pre-norm transformer block: attention and MLP, each with a residual connection.
+
+    ``ls_init_value`` enables ``LayerScale`` on both residual branches (they
+    are identities otherwise).  ``is_cross_attention`` adds a separate norm
+    ``ln_1_kv`` that is applied to explicitly passed keys/values.
+    """
+
+    def __init__(
+            self,
+            d_model: int,
+            n_head: int,
+            mlp_ratio: float = 4.0,
+            ls_init_value: float = None,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = LayerNorm,
+            is_cross_attention: bool = False,
+    ):
+        super().__init__()
+        use_layer_scale = ls_init_value is not None
+
+        # attention branch
+        self.ln_1 = norm_layer(d_model)
+        self.attn = nn.MultiheadAttention(d_model, n_head)
+        self.ls_1 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()
+        if is_cross_attention:
+            self.ln_1_kv = norm_layer(d_model)
+
+        # MLP branch
+        self.ln_2 = norm_layer(d_model)
+        mlp_width = int(d_model * mlp_ratio)
+        mlp_layers = OrderedDict()
+        mlp_layers["c_fc"] = nn.Linear(d_model, mlp_width)
+        mlp_layers["gelu"] = act_layer()
+        mlp_layers["c_proj"] = nn.Linear(mlp_width, d_model)
+        self.mlp = nn.Sequential(mlp_layers)
+        self.ls_2 = LayerScale(d_model, ls_init_value) if use_layer_scale else nn.Identity()
+
+    def attention(
+            self,
+            q_x: torch.Tensor,
+            k_x: Optional[torch.Tensor] = None,
+            v_x: Optional[torch.Tensor] = None,
+            attn_mask: Optional[torch.Tensor] = None,
+    ):
+        """Run ``self.attn``; missing keys/values default to the queries."""
+        if k_x is None:
+            k_x = q_x
+        if v_x is None:
+            v_x = q_x
+        if attn_mask is not None:
+            # the mask is cast to the dtype of the queries
+            attn_mask = attn_mask.to(q_x.dtype)
+
+        attn_out = self.attn(q_x, k_x, v_x, need_weights=False, attn_mask=attn_mask)
+        return attn_out[0]
+
+    def forward(
+            self,
+            q_x: torch.Tensor,
+            k_x: Optional[torch.Tensor] = None,
+            v_x: Optional[torch.Tensor] = None,
+            attn_mask: Optional[torch.Tensor] = None,
+    ):
+        # Keys/values are only used when this block has the ``ln_1_kv`` norm;
+        # otherwise they are replaced by None and the block self-attends.
+        has_kv_norm = hasattr(self, "ln_1_kv")
+        k_x = self.ln_1_kv(k_x) if has_kv_norm and k_x is not None else None
+        v_x = self.ln_1_kv(v_x) if has_kv_norm and v_x is not None else None
+
+        attn_out = self.attention(q_x=self.ln_1(q_x), k_x=k_x, v_x=v_x, attn_mask=attn_mask)
+        x = q_x + self.ls_1(attn_out)
+        mlp_out = self.mlp(self.ln_2(x))
+        return x + self.ls_2(mlp_out)
+    
+
+class Transformer(nn.Module):
+    """Stack of ``layers`` ``ResidualAttentionBlock`` modules applied in sequence.
+
+    Setting ``grad_checkpointing`` to True runs every block through
+    ``torch.utils.checkpoint.checkpoint`` (unless scripting).
+    """
+
+    def __init__(
+            self,
+            width: int,
+            layers: int,
+            heads: int,
+            mlp_ratio: float = 4.0,
+            ls_init_value: float = None,
+            act_layer: Callable = nn.GELU,
+            norm_layer: Callable = LayerNorm,
+    ):
+        super().__init__()
+        self.width = width
+        self.layers = layers
+        self.grad_checkpointing = False
+
+        blocks = []
+        for _ in range(layers):
+            blocks.append(ResidualAttentionBlock(
+                width, heads, mlp_ratio, ls_init_value=ls_init_value, act_layer=act_layer, norm_layer=norm_layer))
+        self.resblocks = nn.ModuleList(blocks)
+
+    def get_cast_dtype(self) -> torch.dtype:
+        """Return the dtype of the first block's ``c_fc`` layer.
+
+        ``int8_original_dtype`` takes precedence when the layer has it.
+        """
+        first_fc = self.resblocks[0].mlp.c_fc
+        if hasattr(first_fc, 'int8_original_dtype'):
+            return first_fc.int8_original_dtype
+        return first_fc.weight.dtype
+
+    def forward(self, x: torch.Tensor, attn_mask: Optional[torch.Tensor] = None):
+        for block in self.resblocks:
+            use_checkpoint = self.grad_checkpointing and not torch.jit.is_scripting()
+            if not use_checkpoint:
+                x = block(x, attn_mask=attn_mask)
+                continue
+            # TODO: handle kwargs https://github.com/pytorch/pytorch/issues/79887#issuecomment-1161758372
+            x = checkpoint(block, x, None, None, attn_mask)
+        return x
\ No newline at end of file
diff --git a/lvdm/modules/encoders/condition.py b/lvdm/modules/encoders/condition.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba54b2d2064a2c731df8b953428d63119c0a1f0
--- /dev/null
+++ b/lvdm/modules/encoders/condition.py
@@ -0,0 +1,461 @@
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+import kornia
+import open_clip
+from transformers import T5Tokenizer, T5EncoderModel, CLIPTokenizer, CLIPTextModel
+from lvdm.common import autocast
+from utils.utils import count_params
+import os
+
+class AbstractEncoder(nn.Module):
+    """Base class for encoders; subclasses are expected to override ``encode``."""
+
+    def __init__(self):
+        super().__init__()
+
+    def encode(self, *args, **kwargs):
+        """Not implemented in the base class."""
+        raise NotImplementedError
+
+
+class IdentityEncoder(AbstractEncoder):
+    """Encoder that returns its input unchanged."""
+
+    def encode(self, x):
+        encoded = x
+        return encoded
+
+
+class ClassEmbedder(nn.Module):
+    """Embed integer class labels, with optional random label dropout.
+
+    During ``forward`` each label is replaced by the last class index
+    (``n_classes - 1``) with probability ``ucg_rate``, unless
+    ``disable_dropout`` is set.  The same index is what
+    ``get_unconditional_conditioning`` returns.
+    """
+
+    def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1):
+        super().__init__()
+        self.key = key
+        self.embedding = nn.Embedding(n_classes, embed_dim)
+        self.n_classes = n_classes
+        self.ucg_rate = ucg_rate
+
+    def forward(self, batch, key=None, disable_dropout=False):
+        """Return embeddings of ``batch[key]`` with an added axis at dim 1.
+
+        ``key`` defaults to ``self.key``.
+        """
+        if key is None:
+            key = self.key
+        # this is for use in crossattn
+        c = batch[key][:, None]
+        if self.ucg_rate > 0. and not disable_dropout:
+            mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate)
+            c = mask * c + (1 - mask) * torch.ones_like(c) * (self.n_classes - 1)
+        # Always cast to integer indices: nn.Embedding rejects float input,
+        # and the tensor from get_unconditional_conditioning() is float.
+        c = self.embedding(c.long())
+        return c
+
+    def get_unconditional_conditioning(self, bs, device="cuda"):
+        """Return ``{self.key: tensor}`` of ``bs`` entries, all set to the last class index."""
+        uc_class = self.n_classes - 1  # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
+        uc = torch.ones((bs,), device=device) * uc_class
+        uc = {self.key: uc}
+        return uc
+
+
+def disabled_train(self, mode=True):
+    """Replacement for ``model.train`` that ignores ``mode`` and returns ``self``,
+    so the train/eval state cannot be changed through it."""
+    module = self
+    return module
+
+
+class FrozenT5Embedder(AbstractEncoder):
+    """Text encoder built on the T5 encoder; returns the last hidden state."""
+
+    def __init__(self, version="google/t5-v1_1-large", device="cuda", max_length=77,
+                 freeze=True):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
+        super().__init__()
+        self.tokenizer = T5Tokenizer.from_pretrained(version)
+        self.transformer = T5EncoderModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length  # TODO: typical value?
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        """Put the transformer in eval mode and disable gradients for all parameters."""
+        self.transformer = self.transformer.eval()
+        # self.train = disabled_train
+        for p in self.parameters():
+            p.requires_grad = False
+
+    def forward(self, text):
+        # tokenize, truncating and padding every prompt to ``max_length``
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        input_ids = encoded["input_ids"].to(self.device)
+        return self.transformer(input_ids=input_ids).last_hidden_state
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenCLIPEmbedder(AbstractEncoder):
+    """Text encoder built on the HuggingFace CLIP text model.
+
+    ``layer`` selects the output: ``"last"`` (last hidden state),
+    ``"pooled"`` (pooler output with an added axis at dim 1) or ``"hidden"``
+    (the hidden state at index ``layer_idx``).
+    """
+    LAYERS = [
+        "last",
+        "pooled",
+        "hidden"
+    ]
+
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cuda", max_length=77,
+                 freeze=True, layer="last", layer_idx=None):  # clip-vit-base-patch32
+        super().__init__()
+        assert layer in self.LAYERS
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        self.layer_idx = layer_idx
+        if layer == "hidden":
+            # a hidden-state index is mandatory for this mode
+            assert layer_idx is not None
+            assert 0 <= abs(layer_idx) <= 12
+
+    def freeze(self):
+        """Put the transformer in eval mode and disable gradients for all parameters."""
+        self.transformer = self.transformer.eval()
+        # self.train = disabled_train
+        for p in self.parameters():
+            p.requires_grad = False
+
+    def forward(self, text):
+        # tokenize, truncating and padding every prompt to ``max_length``
+        encoded = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        input_ids = encoded["input_ids"].to(self.device)
+        want_hidden = self.layer == "hidden"
+        outputs = self.transformer(input_ids=input_ids, output_hidden_states=want_hidden)
+
+        if self.layer == "last":
+            return outputs.last_hidden_state
+        if self.layer == "pooled":
+            return outputs.pooler_output[:, None, :]
+        return outputs.hidden_states[self.layer_idx]
+
+    def encode(self, text):
+        return self(text)
+
+
+class ClipImageEmbedder(nn.Module):
+    """Image encoder built on a model loaded with ``clip.load``.
+
+    With ``ucg_rate > 0`` whole embeddings are zeroed at random in
+    ``forward`` unless ``no_dropout`` is passed.
+    """
+
+    def __init__(
+            self,
+            model,
+            jit=False,
+            device='cuda' if torch.cuda.is_available() else 'cpu',
+            antialias=True,
+            ucg_rate=0.
+    ):
+        super().__init__()
+        from clip import load as load_clip
+        self.model, _ = load_clip(name=model, device=device, jit=jit)
+
+        self.antialias = antialias
+
+        # per-channel statistics used by ``preprocess``
+        clip_mean = torch.Tensor([0.48145466, 0.4578275, 0.40821073])
+        clip_std = torch.Tensor([0.26862954, 0.26130258, 0.27577711])
+        self.register_buffer('mean', clip_mean, persistent=False)
+        self.register_buffer('std', clip_std, persistent=False)
+        self.ucg_rate = ucg_rate
+
+    def preprocess(self, x):
+        """Resize to 224x224, map [-1, 1] to [0, 1], then normalise with ``mean`` / ``std``."""
+        resized = kornia.geometry.resize(x, (224, 224),
+                                         interpolation='bicubic', align_corners=True,
+                                         antialias=self.antialias)
+        unit_range = (resized + 1.) / 2.
+        return kornia.enhance.normalize(unit_range, self.mean, self.std)
+
+    def forward(self, x, no_dropout=False):
+        # x is assumed to be in range [-1,1]
+        out = self.model.encode_image(self.preprocess(x)).to(x.dtype)
+        if self.ucg_rate > 0. and not no_dropout:
+            # one Bernoulli keep/drop decision per sample
+            keep_prob = (1. - self.ucg_rate) * torch.ones(out.shape[0], device=out.device)
+            keep = torch.bernoulli(keep_prob)
+            out = keep[:, None] * out
+        return out
+
+
+class FrozenOpenCLIPEmbedder(AbstractEncoder):
+    """
+    Text encoder built on the OpenCLIP text transformer.
+
+    ``layer="last"`` runs every residual block; ``layer="penultimate"`` stops
+    one block early.  The visual tower of the loaded model is deleted.
+    """
+    LAYERS = [
+        # "pooled",
+        "last",
+        "penultimate"
+    ]
+
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
+                 freeze=True, layer="last"):
+        super().__init__()
+        assert layer in self.LAYERS
+        clip_model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=version,)
+        del clip_model.visual
+        self.model = clip_model
+
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        # number of trailing residual blocks to skip
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+
+    def freeze(self):
+        """Put the model in eval mode and disable gradients for all parameters."""
+        self.model = self.model.eval()
+        for p in self.parameters():
+            p.requires_grad = False
+
+    def forward(self, text):
+        # follow the model to whatever device it currently lives on
+        self.device = self.model.positional_embedding.device
+        token_ids = open_clip.tokenize(text).to(self.device)
+        return self.encode_with_transformer(token_ids)
+
+    def encode_with_transformer(self, text):
+        """Embed token ids, run the (possibly truncated) transformer, apply ``ln_final``."""
+        h = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        h = h + self.model.positional_embedding
+        h = h.permute(1, 0, 2)  # NLD -> LND
+        h = self.text_transformer_forward(h, attn_mask=self.model.attn_mask)
+        h = h.permute(1, 0, 2)  # LND -> NLD
+        return self.model.ln_final(h)
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
+        """Run the residual blocks in order, stopping ``layer_idx`` blocks before the end."""
+        transformer = self.model.transformer
+        stop_at = len(transformer.resblocks) - self.layer_idx
+        for i, block in enumerate(transformer.resblocks):
+            if i == stop_at:
+                break
+            use_checkpoint = transformer.grad_checkpointing and not torch.jit.is_scripting()
+            if use_checkpoint:
+                x = checkpoint(block, x, attn_mask)
+            else:
+                x = block(x, attn_mask=attn_mask)
+        return x
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenOpenCLIPImageEmbedder(AbstractEncoder):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+    """
+
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda", max_length=77,
+                 freeze=True, layer="pooled", antialias=True, ucg_rate=0., only_cls=True, use_proj=True, 
+                 use_shuffle=False, mask_ratio=0.0):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
+                                                            pretrained=version, )
+        del model.transformer
+        self.model = model
+        self.mask_ratio = mask_ratio
+        # self.patch_dropout = PatchDropout(prob=patch_dropout, exclude_first_token=True) if patch_dropout > 0.0 else nn.Identity()
+
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "penultimate":
+            raise NotImplementedError()
+            self.layer_idx = 1
+
+        self.antialias = antialias
+
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+        self.ucg_rate = ucg_rate
+        self.only_cls = only_cls
+        self.use_proj = use_proj
+        self.use_shuffle = use_shuffle
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(x, (224, 224),
+                                   interpolation='bicubic', align_corners=True,
+                                   antialias=self.antialias)
+        x = (x + 1.) / 2.
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, image, use_shuffle=False, drop_prob=None):
+        with torch.no_grad():
+            z = self.encode_with_vision_transformer(image, use_shuffle, drop_prob)
+        return z.detach().half()
+
+    @torch.no_grad()
+    def encode_with_vision_transformer(self, img, use_shuffle=False, mask_ratio=None):
+        """Run ``img`` through the OpenCLIP visual tower step by step.
+
+        Args:
+            img: image batch handed to ``self.preprocess``.
+            use_shuffle: when true, the patch tokens are randomly permuted before
+                the class token and positional embedding are added. Only this
+                argument is consulted here, not ``self.use_shuffle``.
+            mask_ratio: fraction of patch tokens dropped by ``random_masking``;
+                ``None`` falls back to ``self.mask_ratio``.
+
+        Returns:
+            ``pooled`` with a length-1 token axis inserted when ``self.only_cls``
+            is set, otherwise that tensor concatenated with ``tokens`` along dim 1.
+        """
+        if mask_ratio is None:
+            mask_ratio = self.mask_ratio
+        assert 0 <= mask_ratio < 1.
+
+        x = self.preprocess(img)
+
+        # The patch-norm input path is not handled by this method.
+        assert not self.model.visual.input_patchnorm
+        x = self.model.visual.conv1(x)  # shape = [*, width, grid, grid]
+        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+
+        # shuffle: one random permutation of the patch axis, shared by the whole batch
+        if use_shuffle:
+            x = x[:, torch.randperm(x.shape[1]), :]
+
+        # class embeddings and positional embeddings
+        # (adding the class embedding to a zeros tensor broadcasts it to one token per sample)
+        x = torch.cat(
+            [self.model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
+             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.model.visual.positional_embedding.to(x.dtype)
+
+        # patch dropout (the first token, i.e. the class token, is always kept)
+        x = self.random_masking(x, mask_ratio, exclude_first_token=True)
+
+        x = self.model.visual.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.model.visual.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        # Attention pooling is not supported by this method.
+        assert self.model.visual.attn_pool is None
+        pooled, tokens = self.model.visual._global_pool(x)
+        pooled = self.model.visual.ln_post(pooled)
+
+        # The projection is applied to the pooled vector only, and only if enabled.
+        if self.model.visual.proj is not None and self.use_proj:
+            pooled = pooled @ self.model.visual.proj
+
+        if self.only_cls:
+            out = pooled.unsqueeze(1)
+        else:
+            out = torch.cat([pooled.unsqueeze(1), tokens], dim=1)
+        return out
+
+    def encode(self, text):
+        return self(text)
+
+    def random_masking(self, x, mask_ratio, exclude_first_token=True):
+        if mask_ratio == 0.:
+            return x
+
+        N, L, D = x.shape
+        if exclude_first_token:
+            L = L - 1
+
+        len_keep = int(L * (1 - mask_ratio))
+        noise = torch.rand(N, L, device=x.device)
+
+        # sort noise for each sample
+        ids_shuffle = torch.argsort(noise, dim=1)
+        ids_restore = torch.argsort(ids_shuffle, dim=1)
+
+        # keep the first subset
+        ids_keep = ids_shuffle[:, :len_keep]
+        if exclude_first_token:
+            ids_keep = ids_keep + 1
+            ids_keep = torch.cat([torch.zeros(N, 1, device=x.device, dtype=torch.long), ids_keep], dim=1)
+        x_masked = torch.gather(x, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, D))
+
+        return x_masked
+
+
+class FrozenOpenCLIPImageEmbedderV2(AbstractEncoder):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+
+    ``forward`` returns the full token sequence produced by the visual
+    transformer; no ``ln_post``, pooling or projection is applied in this class.
+    """
+
+    def __init__(self, arch="ViT-H-14", version="laion2b_s32b_b79k", device="cuda",
+                 freeze=True, layer="pooled", antialias=True):
+        super().__init__()
+        # The model is always created on CPU here; ``device`` is only stored.
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'),
+                                                            pretrained=version, )
+        # Drop the model's top-level ``transformer`` attribute; only ``model.visual`` is used below
+        # (presumably this is the text tower -- TODO confirm against open_clip).
+        del model.transformer
+        self.model = model
+        self.device = device
+
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "penultimate":
+            raise NotImplementedError()
+            # NOTE(review): unreachable -- the raise above always fires first.
+            self.layer_idx = 1
+
+        self.antialias = antialias
+        # Normalisation constants kept as non-persistent buffers (not saved in the state dict).
+        self.register_buffer('mean', torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False)
+        self.register_buffer('std', torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False)
+
+
+    def preprocess(self, x):
+        """Resize to 224x224, map values with ``(x + 1) / 2`` and normalise with ``mean``/``std``."""
+        # normalize to [0,1]
+        x = kornia.geometry.resize(x, (224, 224),
+                                   interpolation='bicubic', align_corners=True,
+                                   antialias=self.antialias)
+        x = (x + 1.) / 2.
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        """Put the wrapped model in eval mode and disable gradients on its parameters."""
+        self.model = self.model.eval()
+        for param in self.model.parameters():
+            param.requires_grad = False
+
+    def forward(self, image, no_dropout=False):
+        """Encode ``image``; ``no_dropout`` is accepted but not used by this method."""
+        ## image: b c h w
+        z = self.encode_with_vision_transformer(image)
+        return z
+
+    def encode_with_vision_transformer(self, x):
+        """Preprocess ``x`` and return the visual transformer's output tokens in NLD layout."""
+        x = self.preprocess(x)
+
+        # to patches - whether to use dual patchnorm - https://arxiv.org/abs/2302.01327v1
+        if self.model.visual.input_patchnorm:
+            # einops - rearrange(x, 'b c (h p1) (w p2) -> b (h w) (c p1 p2)')
+            x = x.reshape(x.shape[0], x.shape[1], self.model.visual.grid_size[0], self.model.visual.patch_size[0], self.model.visual.grid_size[1], self.model.visual.patch_size[1])
+            x = x.permute(0, 2, 4, 1, 3, 5)
+            x = x.reshape(x.shape[0], self.model.visual.grid_size[0] * self.model.visual.grid_size[1], -1)
+            x = self.model.visual.patchnorm_pre_ln(x)
+            x = self.model.visual.conv1(x)
+        else:
+            x = self.model.visual.conv1(x)  # shape = [*, width, grid, grid]
+            x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
+            x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
+
+        # class embeddings and positional embeddings
+        # (adding the class embedding to a zeros tensor broadcasts it to one token per sample)
+        x = torch.cat(
+            [self.model.visual.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device),
+             x], dim=1)  # shape = [*, grid ** 2 + 1, width]
+        x = x + self.model.visual.positional_embedding.to(x.dtype)
+
+        # a patch_dropout of 0. would mean it is disabled and this function would do nothing but return what was passed in
+        x = self.model.visual.patch_dropout(x)
+        x = self.model.visual.ln_pre(x)
+
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.model.visual.transformer(x)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+
+        return x
+
+
+class FrozenCLIPT5Encoder(AbstractEncoder):
+    def __init__(self, clip_version="openai/clip-vit-large-patch14", t5_version="google/t5-v1_1-xl", device="cuda",
+                 clip_max_length=77, t5_max_length=77):
+        super().__init__()
+        self.clip_encoder = FrozenCLIPEmbedder(clip_version, device, max_length=clip_max_length)
+        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
+        print(f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, "
+              f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder) * 1.e-6:.2f} M params.")
+
+    def encode(self, text):
+        return self(text)
+
+    def forward(self, text):
+        clip_z = self.clip_encoder.encode(text)
+        t5_z = self.t5_encoder.encode(text)
+        return [clip_z, t5_z]
\ No newline at end of file
diff --git a/lvdm/modules/encoders/ip_resampler.py b/lvdm/modules/encoders/ip_resampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..500820a789150a55d6e8fdca4dd3e4d6ad542d4a
--- /dev/null
+++ b/lvdm/modules/encoders/ip_resampler.py
@@ -0,0 +1,136 @@
+# modified from https://github.com/mlfoundations/open_flamingo/blob/main/open_flamingo/src/helpers.py
+import math
+import torch
+import torch.nn as nn
+
+
+class ImageProjModel(nn.Module):
+    """Projection Model"""
+    def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
+        super().__init__()        
+        self.cross_attention_dim = cross_attention_dim
+        self.clip_extra_context_tokens = clip_extra_context_tokens
+        self.proj = nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
+        self.norm = nn.LayerNorm(cross_attention_dim)
+        
+    def forward(self, image_embeds):
+        #embeds = image_embeds
+        embeds = image_embeds.type(list(self.proj.parameters())[0].dtype)
+        clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens, self.cross_attention_dim)
+        clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
+        return clip_extra_context_tokens
+
+# FFN
+def FeedForward(dim, mult=4):
+    inner_dim = int(dim * mult)
+    return nn.Sequential(
+        nn.LayerNorm(dim),
+        nn.Linear(dim, inner_dim, bias=False),
+        nn.GELU(),
+        nn.Linear(inner_dim, dim, bias=False),
+    )
+    
+    
+def reshape_tensor(x, heads):
+    bs, length, width = x.shape
+    #(bs, length, width) --> (bs, length, n_heads, dim_per_head)
+    x = x.view(bs, length, heads, -1)
+    # (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
+    x = x.transpose(1, 2)
+    # (bs, n_heads, length, dim_per_head) --> (bs*n_heads, length, dim_per_head)
+    x = x.reshape(bs, heads, length, -1)
+    return x
+
+
+class PerceiverAttention(nn.Module):
+    def __init__(self, *, dim, dim_head=64, heads=8):
+        super().__init__()
+        self.scale = dim_head**-0.5
+        self.dim_head = dim_head
+        self.heads = heads
+        inner_dim = dim_head * heads
+
+        self.norm1 = nn.LayerNorm(dim)
+        self.norm2 = nn.LayerNorm(dim)
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
+        self.to_out = nn.Linear(inner_dim, dim, bias=False)
+
+
+    def forward(self, x, latents):
+        """
+        Args:
+            x (torch.Tensor): image features
+                shape (b, n1, D)
+            latent (torch.Tensor): latent features
+                shape (b, n2, D)
+        """
+        x = self.norm1(x)
+        latents = self.norm2(latents)
+        
+        b, l, _ = latents.shape
+
+        q = self.to_q(latents)
+        kv_input = torch.cat((x, latents), dim=-2)
+        k, v = self.to_kv(kv_input).chunk(2, dim=-1)
+        
+        q = reshape_tensor(q, self.heads)
+        k = reshape_tensor(k, self.heads)
+        v = reshape_tensor(v, self.heads)
+
+        # attention
+        scale = 1 / math.sqrt(math.sqrt(self.dim_head))
+        weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
+        weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
+        out = weight @ v
+        
+        out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
+
+        return self.to_out(out)
+
+
+class Resampler(nn.Module):
+    def __init__(
+        self,
+        dim=1024,
+        depth=8,
+        dim_head=64,
+        heads=16,
+        num_queries=8,
+        embedding_dim=768,
+        output_dim=1024,
+        ff_mult=4,
+    ):
+        super().__init__()
+        
+        self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
+        
+        self.proj_in = nn.Linear(embedding_dim, dim)
+
+        self.proj_out = nn.Linear(dim, output_dim)
+        self.norm_out = nn.LayerNorm(output_dim)
+        
+        self.layers = nn.ModuleList([])
+        for _ in range(depth):
+            self.layers.append(
+                nn.ModuleList(
+                    [
+                        PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
+                        FeedForward(dim=dim, mult=ff_mult),
+                    ]
+                )
+            )
+
+    def forward(self, x):
+        
+        latents = self.latents.repeat(x.size(0), 1, 1)
+        
+        x = self.proj_in(x)
+        
+        for attn, ff in self.layers:
+            latents = attn(x, latents) + latents
+            latents = ff(latents) + latents
+            
+        latents = self.proj_out(latents)
+        return self.norm_out(latents)
\ No newline at end of file
diff --git a/lvdm/modules/networks/__pycache__/ae_modules.cpython-39.pyc b/lvdm/modules/networks/__pycache__/ae_modules.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c97de5331a7ba45503240f6f34967d9177cc5f0
Binary files /dev/null and b/lvdm/modules/networks/__pycache__/ae_modules.cpython-39.pyc differ
diff --git a/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-39.pyc b/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bf7f70fefbd5781519b21fb282669c6d2153169
Binary files /dev/null and b/lvdm/modules/networks/__pycache__/openaimodel3d.cpython-39.pyc differ
diff --git a/lvdm/modules/networks/ae_modules.py b/lvdm/modules/networks/ae_modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c2e93fbadb4a0d86957a5cd73b5c2bf5b01a4b7
--- /dev/null
+++ b/lvdm/modules/networks/ae_modules.py
@@ -0,0 +1,845 @@
+# pytorch_diffusion + derived encoder decoder
+import math
+import torch
+import numpy as np
+import torch.nn as nn
+from einops import rearrange
+from utils.utils import instantiate_from_config
+from lvdm.modules.attention import LinearAttention
+
+def nonlinearity(x):
+    # swish
+    return x*torch.sigmoid(x)
+
+
+def Normalize(in_channels, num_groups=32):
+    return torch.nn.GroupNorm(num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+
+
+
+class LinAttnBlock(LinearAttention):
+    """to match AttnBlock usage"""
+    def __init__(self, in_channels):
+        super().__init__(dim=in_channels, heads=1, dim_head=in_channels)
+
+
+class AttnBlock(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = Normalize(in_channels)
+        self.q = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.k = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.v = torch.nn.Conv2d(in_channels,
+                                 in_channels,
+                                 kernel_size=1,
+                                 stride=1,
+                                 padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=1,
+                                        stride=1,
+                                        padding=0)
+
+    def forward(self, x):
+        h_ = x
+        h_ = self.norm(h_)
+        q = self.q(h_)
+        k = self.k(h_)
+        v = self.v(h_)
+
+        # compute attention
+        b,c,h,w = q.shape
+        q = q.reshape(b,c,h*w) # bcl
+        q = q.permute(0,2,1)   # bcl -> blc l=hw
+        k = k.reshape(b,c,h*w) # bcl
+        
+        w_ = torch.bmm(q,k)    # b,hw,hw    w[b,i,j]=sum_c q[b,i,c]k[b,c,j]
+        w_ = w_ * (int(c)**(-0.5))
+        w_ = torch.nn.functional.softmax(w_, dim=2)
+
+        # attend to values
+        v = v.reshape(b,c,h*w)
+        w_ = w_.permute(0,2,1)   # b,hw,hw (first hw of k, second of q)
+        h_ = torch.bmm(v,w_)     # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j]
+        h_ = h_.reshape(b,c,h,w)
+
+        h_ = self.proj_out(h_)
+
+        return x+h_
+
+def make_attn(in_channels, attn_type="vanilla"):
+    assert attn_type in ["vanilla", "linear", "none"], f'attn_type {attn_type} unknown'
+    #print(f"making attention of type '{attn_type}' with {in_channels} in_channels")
+    if attn_type == "vanilla":
+        return AttnBlock(in_channels)
+    elif attn_type == "none":
+        return nn.Identity(in_channels)
+    else:
+        return LinAttnBlock(in_channels)
+ 
+class Downsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        self.in_channels = in_channels
+        if self.with_conv:
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=3,
+                                        stride=2,
+                                        padding=0)
+    def forward(self, x):
+        if self.with_conv:
+            pad = (0,1,0,1)
+            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
+            x = self.conv(x)
+        else:
+            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        return x
+
+class Upsample(nn.Module):
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        self.in_channels = in_channels
+        if self.with_conv:
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x):
+        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if self.with_conv:
+            x = self.conv(x)
+        return x
+
+def get_timestep_embedding(timesteps, embedding_dim):
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models:
+    From Fairseq.
+    Build sinusoidal embeddings.
+    This matches the implementation in tensor2tensor, but differs slightly
+    from the description in Section 3.5 of "Attention Is All You Need".
+    """
+    assert len(timesteps.shape) == 1
+
+    half_dim = embedding_dim // 2
+    emb = math.log(10000) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb)
+    emb = emb.to(device=timesteps.device)
+    emb = timesteps.float()[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0,1,0,0))
+    return emb
+
+
+
+class ResnetBlock(nn.Module):
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512):
+        super().__init__()
+        self.in_channels = in_channels
+        out_channels = in_channels if out_channels is None else out_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels,
+                                             out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels,
+                                     out_channels,
+                                     kernel_size=3,
+                                     stride=1,
+                                     padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels,
+                                                     out_channels,
+                                                     kernel_size=3,
+                                                     stride=1,
+                                                     padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels,
+                                                    out_channels,
+                                                    kernel_size=1,
+                                                    stride=1,
+                                                    padding=0)
+
+    def forward(self, x, temb):
+        h = x
+        h = self.norm1(h)
+        h = nonlinearity(h)
+        h = self.conv1(h)
+
+        if temb is not None:
+            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+
+        h = self.norm2(h)
+        h = nonlinearity(h)
+        h = self.dropout(h)
+        h = self.conv2(h)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                x = self.conv_shortcut(x)
+            else:
+                x = self.nin_shortcut(x)
+
+        return x+h
+
+class Model(nn.Module):
+    """Convolutional encoder-decoder with skip connections and optional timestep conditioning.
+
+    The down path stores every intermediate activation; the up path pops them
+    in reverse order and concatenates each one with the current feature map
+    along the channel axis before the next ResnetBlock.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, use_timestep=True, use_linear_attn=False, attn_type="vanilla"):
+        super().__init__()
+        # ``use_linear_attn`` overrides whatever ``attn_type`` was passed.
+        if use_linear_attn: attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = self.ch*4
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        self.use_timestep = use_timestep
+        if self.use_timestep:
+            # timestep embedding
+            self.temb = nn.Module()
+            self.temb.dense = nn.ModuleList([
+                torch.nn.Linear(self.ch,
+                                self.temb_ch),
+                torch.nn.Linear(self.temb_ch,
+                                self.temb_ch),
+            ])
+
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+
+        curr_res = resolution
+        # in_ch_mult[i] is the channel multiplier entering level i (1 for the first level).
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                # One attention module per res block, only at the listed resolutions.
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Every level except the last one halves the resolution.
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            skip_in = ch*ch_mult[i_level]
+            # One more block than on the way down, so every stored activation is consumed.
+            for i_block in range(self.num_res_blocks+1):
+                # The last block of a level receives the skip that entered this level on the way down.
+                if i_block == self.num_res_blocks:
+                    skip_in = ch*in_ch_mult[i_level]
+                block.append(ResnetBlock(in_channels=block_in+skip_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x, t=None, context=None):
+        """Run the network on ``x``.
+
+        Args:
+            x: input feature map.
+            t: timesteps; must not be ``None`` when ``use_timestep`` is set.
+            context: optional tensor concatenated to ``x`` along dim 1.
+
+        Returns:
+            The output of the final convolution.
+        """
+        #assert x.shape[2] == x.shape[3] == self.resolution
+        if context is not None:
+            # assume aligned context, cat along channel axis
+            x = torch.cat((x, context), dim=1)
+        if self.use_timestep:
+            # timestep embedding
+            assert t is not None
+            temb = get_timestep_embedding(t, self.ch)
+            temb = self.temb.dense[0](temb)
+            temb = nonlinearity(temb)
+            temb = self.temb.dense[1](temb)
+        else:
+            temb = None
+
+        # downsampling
+        # ``hs`` collects every activation of the down path for use as skip inputs.
+        hs = [self.conv_in(x)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                hs.append(self.down[i_level].downsample(hs[-1]))
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                # Pop the most recent stored activation and concatenate it on the channel axis.
+                h = self.up[i_level].block[i_block](
+                    torch.cat([h, hs.pop()], dim=1), temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+    def get_last_layer(self):
+        """Return the weight tensor of the final convolution."""
+        return self.conv_out.weight
+
+
+class Encoder(nn.Module):
+    """Downsampling convolutional encoder without timestep conditioning.
+
+    Stacks ResnetBlocks (with optional attention) over ``len(ch_mult)``
+    resolution levels, followed by a middle section and an output convolution
+    that emits ``2*z_channels`` channels when ``double_z`` is set and
+    ``z_channels`` otherwise.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+                 **ignore_kwargs):
+        # ``out_ch`` and any extra keyword arguments are accepted but not used in this constructor.
+        super().__init__()
+        # ``use_linear_attn`` overrides whatever ``attn_type`` was passed.
+        if use_linear_attn: attn_type = "linear"
+        self.ch = ch
+        # No timestep embedding: ResnetBlocks are built without a ``temb_proj`` layer.
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(in_channels,
+                                       self.ch,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+
+        curr_res = resolution
+        # in_ch_mult[i] is the channel multiplier entering level i (1 for the first level).
+        in_ch_mult = (1,)+tuple(ch_mult)
+        self.in_ch_mult = in_ch_mult
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = ch*in_ch_mult[i_level]
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                # One attention module per res block, only at the listed resolutions.
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            # Every level except the last one halves the resolution.
+            if i_level != self.num_resolutions-1:
+                down.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        2*z_channels if double_z else z_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x):
+        """Encode ``x`` and return the output of the final convolution."""
+        # timestep embedding
+        temb = None
+
+        # print(f'encoder-input={x.shape}')
+        # downsampling
+        # ``hs`` keeps every intermediate activation, but only ``hs[-1]`` is read afterwards.
+        hs = [self.conv_in(x)]
+        # print(f'encoder-conv in feat={hs[0].shape}')
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                h = self.down[i_level].block[i_block](hs[-1], temb)
+                # print(f'encoder-down feat={h.shape}')
+                if len(self.down[i_level].attn) > 0:
+                    h = self.down[i_level].attn[i_block](h)
+                hs.append(h)
+            if i_level != self.num_resolutions-1:
+                # print(f'encoder-downsample (input)={hs[-1].shape}')
+                hs.append(self.down[i_level].downsample(hs[-1]))
+                # print(f'encoder-downsample (output)={hs[-1].shape}')
+
+        # middle
+        h = hs[-1]
+        h = self.mid.block_1(h, temb)
+        # print(f'encoder-mid1 feat={h.shape}')
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # print(f'encoder-mid2 feat={h.shape}')
+
+        # end
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        # print(f'end feat={h.shape}')
+        return h
+
+
+class Decoder(nn.Module):
+    """Convolutional decoder: ``conv_in`` -> middle (res / attn / res) -> upsampling levels -> output head.
+
+    All constructor arguments are keyword-only; unknown keyword arguments are
+    accepted and ignored via ``**ignorekwargs``.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+                 attn_type="vanilla", **ignorekwargs):
+        """Build the decoder layers.
+
+        :param ch: base channel count; level ``i`` works with ``ch * ch_mult[i]`` channels.
+        :param out_ch: number of channels produced by the final convolution.
+        :param ch_mult: per-level channel multipliers; its length sets the number of levels.
+        :param num_res_blocks: every level holds ``num_res_blocks + 1`` ResnetBlocks.
+        :param attn_resolutions: values compared against the running ``curr_res``; on a
+            match an attention layer is appended after each ResnetBlock of that level.
+        :param dropout: forwarded to every ResnetBlock.
+        :param resamp_with_conv: forwarded to ``Upsample``.
+        :param in_channels: only stored on the instance; no layer here is sized by it.
+        :param resolution: nominal full resolution; the lowest level starts at
+            ``resolution // 2**(len(ch_mult) - 1)``.
+        :param z_channels: channel count of the latent input ``z``.
+        :param give_pre_end: if True, ``forward`` returns the features before the output head.
+        :param tanh_out: if True, ``forward`` applies ``tanh`` to the output.
+        :param use_linear_attn: if True, overrides ``attn_type`` with ``"linear"``.
+        :param attn_type: forwarded to ``make_attn``.
+        """
+        super().__init__()
+        if use_linear_attn: attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        self.tanh_out = tanh_out
+
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        # NOTE: in_ch_mult is computed but not used anywhere below.
+        in_ch_mult = (1,)+tuple(ch_mult)
+        block_in = ch*ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        self.z_shape = (1,z_channels,curr_res,curr_res)
+        print("AE working on z of shape {} = {} dimensions.".format(
+            self.z_shape, np.prod(self.z_shape)))
+
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = make_attn(block_in, attn_type=attn_type)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+
+        # upsampling
+        # Levels are built from the highest index down to 0 and each one is
+        # prepended, so self.up[i] ends up holding the modules of level i.
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(make_attn(block_in, attn_type=attn_type))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            # every level except level 0 ends with an upsample, doubling curr_res
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, z):
+        """Decode the latent ``z``.
+
+        Records ``z.shape`` on ``self.last_z_shape`` as a side effect. Returns the
+        pre-head features when ``give_pre_end`` is set; otherwise the output of
+        ``conv_out``, passed through ``tanh`` when ``tanh_out`` is set.
+        """
+        #assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+
+        # print(f'decoder-input={z.shape}')
+        # timestep embedding
+        # (none is used: every ResnetBlock below is called with temb=None)
+        temb = None
+
+        # z to block_in
+        h = self.conv_in(z)
+        # print(f'decoder-conv in feat={h.shape}')
+
+        # middle
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # print(f'decoder-mid feat={h.shape}')
+
+        # upsampling
+        # walk the levels from the highest index down to 0, the same order in
+        # which they were constructed
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb)
+                # a level either has one attention layer per block or none at all
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+                # print(f'decoder-up feat={h.shape}')
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+                # print(f'decoder-upsample feat={h.shape}')
+
+        # end
+        if self.give_pre_end:
+            return h
+
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        # print(f'decoder-conv_out feat={h.shape}')
+        if self.tanh_out:
+            h = torch.tanh(h)
+        return h
+
+
+class SimpleDecoder(nn.Module):
+    def __init__(self, in_channels, out_channels, *args, **kwargs):
+        super().__init__()
+        self.model = nn.ModuleList([nn.Conv2d(in_channels, in_channels, 1),
+                                     ResnetBlock(in_channels=in_channels,
+                                                 out_channels=2 * in_channels,
+                                                 temb_channels=0, dropout=0.0),
+                                     ResnetBlock(in_channels=2 * in_channels,
+                                                out_channels=4 * in_channels,
+                                                temb_channels=0, dropout=0.0),
+                                     ResnetBlock(in_channels=4 * in_channels,
+                                                out_channels=2 * in_channels,
+                                                temb_channels=0, dropout=0.0),
+                                     nn.Conv2d(2*in_channels, in_channels, 1),
+                                     Upsample(in_channels, with_conv=True)])
+        # end
+        self.norm_out = Normalize(in_channels)
+        self.conv_out = torch.nn.Conv2d(in_channels,
+                                        out_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x):
+        for i, layer in enumerate(self.model):
+            if i in [1,2,3]:
+                x = layer(x, None)
+            else:
+                x = layer(x)
+
+        h = self.norm_out(x)
+        h = nonlinearity(h)
+        x = self.conv_out(h)
+        return x
+
+
+class UpsampleDecoder(nn.Module):
+    def __init__(self, in_channels, out_channels, ch, num_res_blocks, resolution,
+                 ch_mult=(2,2), dropout=0.0):
+        super().__init__()
+        # upsampling
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        block_in = in_channels
+        curr_res = resolution // 2 ** (self.num_resolutions - 1)
+        self.res_blocks = nn.ModuleList()
+        self.upsample_blocks = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            res_block = []
+            block_out = ch * ch_mult[i_level]
+            for i_block in range(self.num_res_blocks + 1):
+                res_block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+            self.res_blocks.append(nn.ModuleList(res_block))
+            if i_level != self.num_resolutions - 1:
+                self.upsample_blocks.append(Upsample(block_in, True))
+                curr_res = curr_res * 2
+
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_channels,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+
+    def forward(self, x):
+        # upsampling
+        h = x
+        for k, i_level in enumerate(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks + 1):
+                h = self.res_blocks[i_level][i_block](h, None)
+            if i_level != self.num_resolutions - 1:
+                h = self.upsample_blocks[k](h)
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        return h
+
+
+class LatentRescaler(nn.Module):
+    def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2):
+        super().__init__()
+        # residual block, interpolate, residual block
+        self.factor = factor
+        self.conv_in = nn.Conv2d(in_channels,
+                                 mid_channels,
+                                 kernel_size=3,
+                                 stride=1,
+                                 padding=1)
+        self.res_block1 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+                                                     out_channels=mid_channels,
+                                                     temb_channels=0,
+                                                     dropout=0.0) for _ in range(depth)])
+        self.attn = AttnBlock(mid_channels)
+        self.res_block2 = nn.ModuleList([ResnetBlock(in_channels=mid_channels,
+                                                     out_channels=mid_channels,
+                                                     temb_channels=0,
+                                                     dropout=0.0) for _ in range(depth)])
+
+        self.conv_out = nn.Conv2d(mid_channels,
+                                  out_channels,
+                                  kernel_size=1,
+                                  )
+
+    def forward(self, x):
+        x = self.conv_in(x)
+        for block in self.res_block1:
+            x = block(x, None)
+        x = torch.nn.functional.interpolate(x, size=(int(round(x.shape[2]*self.factor)), int(round(x.shape[3]*self.factor))))
+        x = self.attn(x)
+        for block in self.res_block2:
+            x = block(x, None)
+        x = self.conv_out(x)
+        return x
+
+
+class MergedRescaleEncoder(nn.Module):
+    def __init__(self, in_channels, ch, resolution, out_ch, num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True,
+                 ch_mult=(1,2,4,8), rescale_factor=1.0, rescale_module_depth=1):
+        super().__init__()
+        intermediate_chn = ch * ch_mult[-1]
+        self.encoder = Encoder(in_channels=in_channels, num_res_blocks=num_res_blocks, ch=ch, ch_mult=ch_mult,
+                               z_channels=intermediate_chn, double_z=False, resolution=resolution,
+                               attn_resolutions=attn_resolutions, dropout=dropout, resamp_with_conv=resamp_with_conv,
+                               out_ch=None)
+        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=intermediate_chn,
+                                       mid_channels=intermediate_chn, out_channels=out_ch, depth=rescale_module_depth)
+
+    def forward(self, x):
+        x = self.encoder(x)
+        x = self.rescaler(x)
+        return x
+
+
+class MergedRescaleDecoder(nn.Module):
+    def __init__(self, z_channels, out_ch, resolution, num_res_blocks, attn_resolutions, ch, ch_mult=(1,2,4,8),
+                 dropout=0.0, resamp_with_conv=True, rescale_factor=1.0, rescale_module_depth=1):
+        super().__init__()
+        tmp_chn = z_channels*ch_mult[-1]
+        self.decoder = Decoder(out_ch=out_ch, z_channels=tmp_chn, attn_resolutions=attn_resolutions, dropout=dropout,
+                               resamp_with_conv=resamp_with_conv, in_channels=None, num_res_blocks=num_res_blocks,
+                               ch_mult=ch_mult, resolution=resolution, ch=ch)
+        self.rescaler = LatentRescaler(factor=rescale_factor, in_channels=z_channels, mid_channels=tmp_chn,
+                                       out_channels=tmp_chn, depth=rescale_module_depth)
+
+    def forward(self, x):
+        x = self.rescaler(x)
+        x = self.decoder(x)
+        return x
+
+
+class Upsampler(nn.Module):
+    def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2):
+        super().__init__()
+        assert out_size >= in_size
+        num_blocks = int(np.log2(out_size//in_size))+1
+        factor_up = 1.+ (out_size % in_size)
+        print(f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}")
+        self.rescaler = LatentRescaler(factor=factor_up, in_channels=in_channels, mid_channels=2*in_channels,
+                                       out_channels=in_channels)
+        self.decoder = Decoder(out_ch=out_channels, resolution=out_size, z_channels=in_channels, num_res_blocks=2,
+                               attn_resolutions=[], in_channels=None, ch=in_channels,
+                               ch_mult=[ch_mult for _ in range(num_blocks)])
+
+    def forward(self, x):
+        x = self.rescaler(x)
+        x = self.decoder(x)
+        return x
+
+
+class Resize(nn.Module):
+    def __init__(self, in_channels=None, learned=False, mode="bilinear"):
+        super().__init__()
+        self.with_conv = learned
+        self.mode = mode
+        if self.with_conv:
+            print(f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode")
+            raise NotImplementedError()
+            assert in_channels is not None
+            # no asymmetric padding in torch conv, must do it ourselves
+            self.conv = torch.nn.Conv2d(in_channels,
+                                        in_channels,
+                                        kernel_size=4,
+                                        stride=2,
+                                        padding=1)
+
+    def forward(self, x, scale_factor=1.0):
+        if scale_factor==1.0:
+            return x
+        else:
+            x = torch.nn.functional.interpolate(x, mode=self.mode, align_corners=False, scale_factor=scale_factor)
+        return x
+
+class FirstStagePostProcessor(nn.Module):
+
+    def __init__(self, ch_mult:list, in_channels,
+                 pretrained_model:nn.Module=None,
+                 reshape=False,
+                 n_channels=None,
+                 dropout=0.,
+                 pretrained_config=None):
+        super().__init__()
+        if pretrained_config is None:
+            assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+            self.pretrained_model = pretrained_model
+        else:
+            assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None'
+            self.instantiate_pretrained(pretrained_config)
+
+        self.do_reshape = reshape
+
+        if n_channels is None:
+            n_channels = self.pretrained_model.encoder.ch
+
+        self.proj_norm = Normalize(in_channels,num_groups=in_channels//2)
+        self.proj = nn.Conv2d(in_channels,n_channels,kernel_size=3,
+                            stride=1,padding=1)
+
+        blocks = []
+        downs = []
+        ch_in = n_channels
+        for m in ch_mult:
+            blocks.append(ResnetBlock(in_channels=ch_in,out_channels=m*n_channels,dropout=dropout))
+            ch_in = m * n_channels
+            downs.append(Downsample(ch_in, with_conv=False))
+
+        self.model = nn.ModuleList(blocks)
+        self.downsampler = nn.ModuleList(downs)
+
+
+    def instantiate_pretrained(self, config):
+        model = instantiate_from_config(config)
+        self.pretrained_model = model.eval()
+        # self.pretrained_model.train = False
+        for param in self.pretrained_model.parameters():
+            param.requires_grad = False
+
+
+    @torch.no_grad()
+    def encode_with_pretrained(self,x):
+        c = self.pretrained_model.encode(x)
+        if isinstance(c, DiagonalGaussianDistribution):
+            c = c.mode()
+        return  c
+
+    def forward(self,x):
+        z_fs = self.encode_with_pretrained(x)
+        z = self.proj_norm(z_fs)
+        z = self.proj(z)
+        z = nonlinearity(z)
+
+        for submodel, downmodel in zip(self.model,self.downsampler):
+            z = submodel(z,temb=None)
+            z = downmodel(z)
+
+        if self.do_reshape:
+            z = rearrange(z,'b c h w -> b (h w) c')
+        return z
+
diff --git a/lvdm/modules/networks/openaimodel3d.py b/lvdm/modules/networks/openaimodel3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eb393ad352f1e783f8bb49ca4fefe01715f98ce
--- /dev/null
+++ b/lvdm/modules/networks/openaimodel3d.py
@@ -0,0 +1,641 @@
+from functools import partial
+from abc import abstractmethod
+import torch
+import torch.nn as nn
+from einops import rearrange
+import torch.nn.functional as F
+from lvdm.models.utils_diffusion import timestep_embedding
+from lvdm.common import checkpoint
+from lvdm.basics import (
+    zero_module,
+    conv_nd,
+    linear,
+    avg_pool_nd,
+    normalization
+)
+from lvdm.modules.attention import SpatialTransformer, TemporalTransformer
+
+
+class TimestepBlock(nn.Module):
+    """
+    Any module where forward() takes timestep embeddings as a second argument.
+    """
+    @abstractmethod
+    def forward(self, x, emb):
+        """
+        Apply the module to `x` given `emb` timestep embeddings.
+        """
+
+
+class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
+    """
+    A sequential module that passes timestep embeddings to the children that
+    support it as an extra input.
+    """
+
+    def forward(self, x, emb, context=None, batch_size=None, is_imgbatch=False, use_temp=True, scale_scalar=None):
+        for layer in self:
+            if isinstance(layer, TimestepBlock):
+                x = layer(x, emb, batch_size, is_imgbatch=is_imgbatch)
+            elif isinstance(layer, SpatialTransformer):
+                x = layer(x, context, emb, scale_scalar=scale_scalar)
+            elif isinstance(layer, TemporalTransformer):
+                if use_temp:
+                    x = rearrange(x, '(b f) c h w -> b c f h w', b=batch_size)
+                    x = layer(x, context, is_imgbatch=is_imgbatch, emb=emb)
+                    x = rearrange(x, 'b c f h w -> (b f) c h w')
+                else:
+                    pass
+            else:
+                x = layer(x,)
+        return x
+
+
+class Downsample(nn.Module):
+    """
+    A downsampling layer with an optional convolution.
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 downsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        stride = 2 if dims != 3 else (1, 2, 2)
+        if use_conv:
+            self.op = conv_nd(
+                dims, self.channels, self.out_channels, 3, stride=stride, padding=padding
+            )
+        else:
+            assert self.channels == self.out_channels
+            self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        return self.op(x)
+
+
+class Upsample(nn.Module):
+    """
+    An upsampling layer with an optional convolution.
+    :param channels: channels in the inputs and outputs.
+    :param use_conv: a bool determining if a convolution is applied.
+    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
+                 upsampling occurs in the inner-two dimensions.
+    """
+
+    def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.dims = dims
+        if use_conv:
+            self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding)
+
+    def forward(self, x):
+        assert x.shape[1] == self.channels
+        if self.dims == 3:
+            x = F.interpolate(x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode='nearest')
+        else:
+            x = F.interpolate(x, scale_factor=2, mode='nearest')
+        if self.use_conv:
+            x = self.conv(x)
+        return x
+
+
+class ResBlock(TimestepBlock):
+    """
+    A residual block that can optionally change the number of channels.
+    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout.
+    :param out_channels: if specified, the number of out channels.
+    :param use_conv: if True and out_channels is specified, use a spatial
+        convolution instead of a smaller 1x1 convolution to change the
+        channels in the skip connection.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param up: if True, use this block for upsampling.
+    :param down: if True, use this block for downsampling.
+    """
+
+    def __init__(
+        self,
+        channels,
+        emb_channels,
+        dropout,
+        out_channels=None,
+        use_scale_shift_norm=False,
+        dims=2,
+        use_checkpoint=False,
+        use_conv=False,
+        up=False,
+        down=False,
+        use_temporal_conv=False,
+        tempspatial_aware=False
+    ):
+        super().__init__()
+        self.channels = channels
+        self.emb_channels = emb_channels
+        self.dropout = dropout
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_checkpoint = use_checkpoint
+        self.use_scale_shift_norm = use_scale_shift_norm
+        self.use_temporal_conv = use_temporal_conv
+
+        self.in_layers = nn.Sequential(
+            normalization(channels),
+            nn.SiLU(),
+            conv_nd(dims, channels, self.out_channels, 3, padding=1),
+        )
+
+        self.updown = up or down
+
+        if up:
+            self.h_upd = Upsample(channels, False, dims)
+            self.x_upd = Upsample(channels, False, dims)
+        elif down:
+            self.h_upd = Downsample(channels, False, dims)
+            self.x_upd = Downsample(channels, False, dims)
+        else:
+            self.h_upd = self.x_upd = nn.Identity()
+
+        self.emb_layers = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(
+                emb_channels,
+                2 * self.out_channels if use_scale_shift_norm else self.out_channels,
+            ),
+        )
+        self.out_layers = nn.Sequential(
+            normalization(self.out_channels),
+            nn.SiLU(),
+            nn.Dropout(p=dropout),
+            zero_module(nn.Conv2d(self.out_channels, self.out_channels, 3, padding=1)),
+        )
+
+        if self.out_channels == channels:
+            self.skip_connection = nn.Identity()
+        elif use_conv:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 3, padding=1)
+        else:
+            self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
+
+        if self.use_temporal_conv:
+            self.temopral_conv = TemporalConvBlock(
+                self.out_channels,
+                self.out_channels,
+                dropout=0.1,
+                spatial_aware=tempspatial_aware
+            )
+
+    def forward(self, x, emb,  batch_size=None, is_imgbatch=False):
+        """
+        Apply the block to a Tensor, conditioned on a timestep embedding.
+        :param x: an [N x C x ...] Tensor of features.
+        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
+        :return: an [N x C x ...] Tensor of outputs.
+        """
+        input_tuple = (x, emb,)
+        if batch_size:
+            forward_batchsize = partial(self._forward, batch_size=batch_size, is_imgbatch=is_imgbatch)
+            return checkpoint(forward_batchsize, input_tuple, self.parameters(), self.use_checkpoint)
+        return checkpoint(self._forward, input_tuple, self.parameters(), self.use_checkpoint)
+
+    def _forward(self, x, emb,  batch_size=None, is_imgbatch=False):
+        if self.updown:
+            in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
+            h = in_rest(x)
+            h = self.h_upd(h)
+            x = self.x_upd(x)
+            h = in_conv(h)
+        else:
+            h = self.in_layers(x)
+        emb_out = self.emb_layers(emb).type(h.dtype)
+        while len(emb_out.shape) < len(h.shape):
+            emb_out = emb_out[..., None]
+        if self.use_scale_shift_norm:
+            out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
+            scale, shift = torch.chunk(emb_out, 2, dim=1)
+            h = out_norm(h) * (1 + scale) + shift
+            h = out_rest(h)
+        else:
+            h = h + emb_out
+            h = self.out_layers(h)
+        h = self.skip_connection(x) + h
+        
+        if self.use_temporal_conv and batch_size and not is_imgbatch:
+            h = rearrange(h, '(b t) c h w -> b c t h w', b=batch_size)
+            h = self.temopral_conv(h)
+            h = rearrange(h, 'b c t h w -> (b t) c h w')
+        return h
+
+
+class TemporalConvBlock(nn.Module):
+    """
+    Adapted from modelscope: https://github.com/modelscope/modelscope/blob/master/modelscope/models/multi_modal/video_synthesis/unet_sd.py
+    """
+
+    def __init__(self, in_channels, out_channels=None, dropout=0.0, spatial_aware=False):
+        super(TemporalConvBlock, self).__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        kernel_shape = (3, 1, 1) if not spatial_aware else (3, 3, 3)
+        padding_shape = (1, 0, 0) if not spatial_aware else (1, 1, 1)
+
+        # conv layers
+        self.conv1 = nn.Sequential(
+            nn.GroupNorm(32, in_channels), nn.SiLU(),
+            nn.Conv3d(in_channels, out_channels, kernel_shape, padding=padding_shape))
+        self.conv2 = nn.Sequential(
+            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv3d(out_channels, in_channels, kernel_shape, padding=padding_shape))
+        self.conv3 = nn.Sequential(
+            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv3d(out_channels, in_channels, (3, 1, 1), padding=(1, 0, 0)))
+        self.conv4 = nn.Sequential(
+            nn.GroupNorm(32, out_channels), nn.SiLU(), nn.Dropout(dropout),
+            nn.Conv3d(out_channels, in_channels, (3, 1, 1), padding=(1, 0, 0)))
+
+        # zero out the last layer params,so the conv block is identity
+        nn.init.zeros_(self.conv4[-1].weight)
+        nn.init.zeros_(self.conv4[-1].bias)
+
+    def forward(self, x):
+        identity = x
+        x = self.conv1(x)
+        x = self.conv2(x)
+        x = self.conv3(x)
+        x = self.conv4(x)
+
+        return x + identity
+
+
+class UNetModel(nn.Module):
+    """
+    The full UNet model with attention and timestep embedding.
+    :param in_channels: in_channels in the input Tensor.
+    :param model_channels: base channel count for the model.
+    :param out_channels: channels in the output Tensor.
+    :param num_res_blocks: number of residual blocks per downsample.
+    :param attention_resolutions: a collection of downsample rates at which
+        attention will take place. May be a set, list, or tuple.
+        For example, if this contains 4, then at 4x downsampling, attention
+        will be used.
+    :param dropout: the dropout probability.
+    :param channel_mult: channel multiplier for each level of the UNet.
+    :param conv_resample: if True, use learned convolutions for upsampling and
+        downsampling.
+    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param num_classes: if specified (as an int), then this model will be
+        class-conditional with `num_classes` classes.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param num_heads: the number of attention heads in each attention layer.
+    :param num_heads_channels: if specified, ignore num_heads and instead use
+                               a fixed channel width per attention head.
+    :param num_heads_upsample: works with num_heads to set a different number
+                               of heads for upsampling. Deprecated.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param resblock_updown: use residual blocks for up/downsampling.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 model_channels,
+                 out_channels,
+                 num_res_blocks,
+                 attention_resolutions,
+                 dropout=0.0,
+                 channel_mult=(1, 2, 4, 8),
+                 conv_resample=True,
+                 dims=2,
+                 context_dim=None,
+                 use_scale_shift_norm=False,
+                 resblock_updown=False,
+                 num_heads=-1,
+                 num_head_channels=-1,
+                 transformer_depth=1,
+                 use_linear=False,
+                 use_checkpoint=False,
+                 temporal_conv=False,
+                 tempspatial_aware=False,
+                 temporal_attention=True,
+                 temporal_selfatt_only=True,
+                 use_relative_position=True,
+                 use_causal_attention=False,
+                 temporal_length=None,
+                 use_fp16=False,
+                 addition_attention=False,
+                 use_image_attention=False,
+                 temporal_transformer_depth=1,
+                 fps_cond=False,
+                ):
+        super(UNetModel, self).__init__()
+        if num_heads == -1:
+            assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
+        if num_head_channels == -1:
+            assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
+
+        self.in_channels = in_channels
+        self.model_channels = model_channels
+        self.out_channels = out_channels
+        self.num_res_blocks = num_res_blocks
+        self.attention_resolutions = attention_resolutions
+        self.dropout = dropout
+        self.channel_mult = channel_mult
+        self.conv_resample = conv_resample
+        self.temporal_attention = temporal_attention
+        time_embed_dim = model_channels * 4
+        self.use_checkpoint = use_checkpoint
+        self.dtype = torch.float16 if use_fp16 else torch.float32
+        self.addition_attention=addition_attention
+        self.use_image_attention = use_image_attention
+        self.fps_cond=fps_cond
+
+
+
+        self.time_embed = nn.Sequential(
+            linear(model_channels, time_embed_dim),
+            nn.SiLU(),
+            linear(time_embed_dim, time_embed_dim),
+        )
+        if self.fps_cond:
+            self.fps_embedding = nn.Sequential(
+                linear(model_channels, time_embed_dim),
+                nn.SiLU(),
+                linear(time_embed_dim, time_embed_dim),
+            )
+
+        self.input_blocks = nn.ModuleList(
+            [
+                TimestepEmbedSequential(conv_nd(dims, in_channels, model_channels, 3, padding=1))
+            ]
+        )
+        if self.addition_attention:
+            self.init_attn=TimestepEmbedSequential(
+                TemporalTransformer(
+                    model_channels,
+                    n_heads=8,
+                    d_head=num_head_channels,
+                    depth=transformer_depth,
+                    context_dim=context_dim,
+                    use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, 
+                    causal_attention=use_causal_attention, relative_position=use_relative_position, 
+                    temporal_length=temporal_length))
+            
+        input_block_chans = [model_channels]
+        ch = model_channels
+        ds = 1
+        for level, mult in enumerate(channel_mult):
+            for _ in range(num_res_blocks):
+                layers = [
+                    ResBlock(ch, time_embed_dim, dropout,
+                        out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
+                        use_temporal_conv=temporal_conv
+                    )
+                ]
+                ch = mult * model_channels
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    layers.append(
+                        SpatialTransformer(ch, num_heads, dim_head, 
+                            depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                            use_checkpoint=use_checkpoint, disable_self_attn=False,
+                            img_cross_attention=self.use_image_attention
+                        )
+                    )
+                    if self.temporal_attention:
+                        layers.append(
+                            TemporalTransformer(ch, num_heads, dim_head,
+                                depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                                use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, 
+                                causal_attention=use_causal_attention, relative_position=use_relative_position, 
+                                temporal_length=temporal_length
+                            )
+                        )
+                self.input_blocks.append(TimestepEmbedSequential(*layers))
+                input_block_chans.append(ch)
+            if level != len(channel_mult) - 1:
+                out_ch = ch
+                self.input_blocks.append(
+                    TimestepEmbedSequential(
+                        ResBlock(ch, time_embed_dim, dropout, 
+                            out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            down=True
+                        )
+                        if resblock_updown
+                        else Downsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                )
+                ch = out_ch
+                input_block_chans.append(ch)
+                ds *= 2
+
+        if num_head_channels == -1:
+            dim_head = ch // num_heads
+        else:
+            num_heads = ch // num_head_channels
+            dim_head = num_head_channels
+        layers = [
+            ResBlock(ch, time_embed_dim, dropout,
+                dims=dims, use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
+                use_temporal_conv=temporal_conv
+            ),
+            SpatialTransformer(ch, num_heads, dim_head, 
+                depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                use_checkpoint=use_checkpoint, disable_self_attn=False,
+                img_cross_attention=self.use_image_attention
+            )
+        ]
+        if self.temporal_attention:
+            layers.append(
+                TemporalTransformer(ch, num_heads, dim_head,
+                    depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                    use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, 
+                    causal_attention=use_causal_attention, relative_position=use_relative_position, 
+                    temporal_length=temporal_length
+                )
+            )
+        layers.append(
+            ResBlock(ch, time_embed_dim, dropout,
+                dims=dims, use_checkpoint=use_checkpoint,
+                use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
+                use_temporal_conv=temporal_conv
+                )
+        )
+        self.middle_block = TimestepEmbedSequential(*layers)
+
+        self.output_blocks = nn.ModuleList([])
+        for level, mult in list(enumerate(channel_mult))[::-1]:
+            for i in range(num_res_blocks + 1):
+                ich = input_block_chans.pop()
+                layers = [
+                    ResBlock(ch + ich, time_embed_dim, dropout,
+                        out_channels=mult * model_channels, dims=dims, use_checkpoint=use_checkpoint,
+                        use_scale_shift_norm=use_scale_shift_norm, tempspatial_aware=tempspatial_aware,
+                        use_temporal_conv=temporal_conv
+                    )
+                ]
+                ch = model_channels * mult
+                if ds in attention_resolutions:
+                    if num_head_channels == -1:
+                        dim_head = ch // num_heads
+                    else:
+                        num_heads = ch // num_head_channels
+                        dim_head = num_head_channels
+                    layers.append(
+                        SpatialTransformer(ch, num_heads, dim_head, 
+                            depth=transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                            use_checkpoint=use_checkpoint, disable_self_attn=False,
+                            img_cross_attention=self.use_image_attention
+                        )
+                    )
+                    if self.temporal_attention:
+                        layers.append(
+                            TemporalTransformer(ch, num_heads, dim_head,
+                                depth=temporal_transformer_depth, context_dim=context_dim, use_linear=use_linear,
+                                use_checkpoint=use_checkpoint, only_self_att=temporal_selfatt_only, 
+                                causal_attention=use_causal_attention, relative_position=use_relative_position, 
+                                temporal_length=temporal_length
+                            )
+                        )
+                if level and i == num_res_blocks:
+                    out_ch = ch
+                    layers.append(
+                        ResBlock(ch, time_embed_dim, dropout,
+                            out_channels=out_ch, dims=dims, use_checkpoint=use_checkpoint,
+                            use_scale_shift_norm=use_scale_shift_norm,
+                            up=True
+                        )
+                        if resblock_updown
+                        else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
+                    )
+                    ds //= 2
+                self.output_blocks.append(TimestepEmbedSequential(*layers))
+
+        self.out = nn.Sequential(
+            normalization(ch),
+            nn.SiLU(),
+            zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)),
+        )
+
+    def forward(self, x, timesteps, context=None, append_to_context=None, features_adapter=None, scale_scalar=None, is_imgbatch=False, fps=16, **kwargs):
+        """Run the UNet over a video latent and return a tensor in the same layout.
+
+        Args:
+            x: input, unpacked as ``(b, c, t, h, w)``.
+            timesteps: diffusion timesteps, embedded via ``timestep_embedding``.
+            context: conditioning tensor passed to every block; repeated ``t``
+                times along dim 0 unless ``is_imgbatch`` is true.
+            append_to_context: optional extra (style) tokens concatenated to
+                ``context`` along dim 1 before the per-frame repeat.
+            features_adapter: optional indexable of feature maps; one entry is
+                added to the hidden state after every third input block.
+            scale_scalar: optional tensor forwarded to every block; repeated
+                like ``context``.
+            is_imgbatch: forwarded to every block; when true ``context`` and
+                ``scale_scalar`` are used without the per-frame repeat.
+            fps: int or tensor, only read when ``self.fps_cond`` is set.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            The result of ``self.out`` reshaped back to ``(b, c, t, h, w)``.
+        """
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
+
+        # add style context
+        if append_to_context is not None:
+            context = torch.cat((context, append_to_context), dim=1)
+
+        if self.fps_cond:
+            if type(fps) is int:
+                # a plain int is broadcast to one value per timestep entry
+                fps = torch.full_like(timesteps, fps)
+            emb += self.fps_embedding(timestep_embedding(fps, self.model_channels, repeat_only=False))
+
+        batch, _, frames, _, _ = x.shape
+        ## one copy of the conditioning per frame, matching the (b t) layout below
+        if not is_imgbatch:
+            context = context.repeat_interleave(repeats=frames, dim=0)
+            if scale_scalar is not None:
+                scale_scalar = scale_scalar.repeat_interleave(repeats=frames, dim=0)
+        emb = emb.repeat_interleave(repeats=frames, dim=0)
+
+        ## always in shape (b t) c h w, except for temporal layer
+        x = rearrange(x, 'b c t h w -> (b t) c h w')
+
+        # keyword arguments shared by every block call
+        block_kwargs = dict(context=context, batch_size=batch, is_imgbatch=is_imgbatch, scale_scalar=scale_scalar)
+
+        h = x.type(self.dtype)
+        skips = []
+        adapters_used = 0
+        for block_idx, block in enumerate(self.input_blocks):
+            h = block(h, emb, **block_kwargs)
+            if block_idx == 0 and self.addition_attention:
+                h = self.init_attn(h, emb, **block_kwargs)
+            ## plug-in adapter features after every third input block
+            if features_adapter is not None and (block_idx + 1) % 3 == 0:
+                h = h + features_adapter[adapters_used]
+                adapters_used += 1
+            skips.append(h)
+        if features_adapter is not None:
+            assert len(features_adapter)==adapters_used, 'Wrong features_adapter'
+
+        h = self.middle_block(h, emb, **block_kwargs)
+        for block in self.output_blocks:
+            # skip connections are consumed in reverse order of creation
+            h = block(torch.cat([h, skips.pop()], dim=1), emb, **block_kwargs)
+        y = self.out(h.type(x.dtype))
+
+        # reshape back to (b c t h w)
+        return rearrange(y, '(b t) c h w -> b c t h w', b=batch)
+
+
+
+class UNet2DModel(UNetModel):
+    """``UNetModel`` subclass whose forward always calls its blocks with ``is_imgbatch=True``."""
+
+    def forward(self, x, timesteps, context=None, append_to_context=None, features_adapter=None, scale_scalar=None, fps=16, use_temp=False, **kwargs):
+        """Run the UNet over ``x`` and return a tensor in the same layout.
+
+        Args:
+            x: input, unpacked as ``(b, c, t, h, w)``.
+            timesteps: diffusion timesteps, embedded via ``timestep_embedding``.
+            context: conditioning tensor; repeated ``t`` times along dim 0
+                only when its leading dimension is not already ``b * t``.
+            append_to_context: optional extra (style) tokens concatenated to
+                ``context`` along dim 1 after that repeat.
+            features_adapter: optional indexable of feature maps; one entry is
+                added to the hidden state after every third input block.
+            scale_scalar: optional tensor forwarded unchanged to every block.
+            fps: int or tensor, only read when ``self.fps_cond`` is set.
+            use_temp: flag forwarded to every block.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            The result of ``self.out`` reshaped back to ``(b, c, t, h, w)``.
+        """
+        emb = self.time_embed(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
+
+        if self.fps_cond:
+            if type(fps) is int:
+                # a plain int is broadcast to one value per timestep entry
+                fps = torch.full_like(timesteps, fps)
+            emb += self.fps_embedding(timestep_embedding(fps, self.model_channels, repeat_only=False))
+
+        batch, _, frames, _, _ = x.shape
+        flat_batch = batch * frames
+        ## repeat t times for context & time embedding unless already per-frame
+        if context.shape[0] != flat_batch:
+            context = context.repeat_interleave(repeats=frames, dim=0)
+        if emb.shape[0] != flat_batch:
+            emb = emb.repeat_interleave(repeats=frames, dim=0)
+
+        # add style context
+        if append_to_context is not None:
+            context = torch.cat((context, append_to_context), dim=1)
+
+        ## always in shape (b t) c h w, except for temporal layer
+        x = rearrange(x, 'b c t h w -> (b t) c h w')
+
+        # keyword arguments shared by every block call
+        block_kwargs = dict(context=context, batch_size=batch, is_imgbatch=True, use_temp=use_temp, scale_scalar=scale_scalar)
+
+        h = x.type(self.dtype)
+        skips = []
+        adapters_used = 0
+        for block_idx, block in enumerate(self.input_blocks):
+            h = block(h, emb, **block_kwargs)
+            if block_idx == 0 and self.addition_attention:
+                h = self.init_attn(h, emb, **block_kwargs)
+            ## plug-in adapter features after every third input block
+            if features_adapter is not None and (block_idx + 1) % 3 == 0:
+                h = h + features_adapter[adapters_used]
+                adapters_used += 1
+            skips.append(h)
+        if features_adapter is not None:
+            assert len(features_adapter)==adapters_used, 'Wrong features_adapter'
+
+        h = self.middle_block(h, emb, **block_kwargs)
+        for block in self.output_blocks:
+            # skip connections are consumed in reverse order of creation
+            h = block(torch.cat([h, skips.pop()], dim=1), emb, **block_kwargs)
+        y = self.out(h.type(x.dtype))
+
+        # reshape back to (b c t h w)
+        return rearrange(y, '(b t) c h w -> b c t h w', b=batch)
diff --git a/lvdm/modules/x_transformer.py b/lvdm/modules/x_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f252ab4032a78407ed487495807940c4ba802ffa
--- /dev/null
+++ b/lvdm/modules/x_transformer.py
@@ -0,0 +1,640 @@
+"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers"""
+from functools import partial
+from inspect import isfunction
+from collections import namedtuple
+from einops import rearrange, repeat
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+
+# constants
+DEFAULT_DIM_HEAD = 64
+
+# Attention scores of one attention call, before and after the softmax.
+Intermediates = namedtuple('Intermediates', [
+    'pre_softmax_attn',
+    'post_softmax_attn'
+])
+
+# Per-stack record: hidden states plus the per-call attention records.
+# The typename matches the variable name so that repr() and pickling refer to
+# this class rather than to `Intermediates` above.
+LayerIntermediates = namedtuple('LayerIntermediates', [
+    'hiddens',
+    'attn_intermediates'
+])
+
+
+class AbsolutePositionalEmbedding(nn.Module):
+    """Learned embedding table indexed by position along dim 1 of the input."""
+
+    def __init__(self, dim, max_seq_len):
+        super().__init__()
+        self.emb = nn.Embedding(max_seq_len, dim)
+        self.init_()
+
+    def init_(self):
+        """Re-draw the table weights from a normal distribution with std 0.02."""
+        nn.init.normal_(self.emb.weight, std=0.02)
+
+    def forward(self, x):
+        """Return the embeddings of positions ``0 .. x.shape[1] - 1`` with a leading axis of size 1."""
+        seq_len = x.shape[1]
+        positions = torch.arange(seq_len, device=x.device)
+        return self.emb(positions).unsqueeze(0)
+
+
+class FixedPositionalEmbedding(nn.Module):
+    """Non-learned sinusoidal position table built from fixed inverse frequencies."""
+
+    def __init__(self, dim):
+        super().__init__()
+        exponents = torch.arange(0, dim, 2).float() / dim
+        self.register_buffer('inv_freq', 1. / (10000 ** exponents))
+
+    def forward(self, x, seq_dim=1, offset=0):
+        """Return a ``(1, length, 2 * len(inv_freq))`` table of sines followed by cosines.
+
+        ``length`` is ``x.shape[seq_dim]``; ``offset`` is added to every
+        position index before the angles are computed.
+        """
+        length = x.shape[seq_dim]
+        positions = torch.arange(length, device=x.device).type_as(self.inv_freq) + offset
+        angles = torch.einsum('i , j -> i j', positions, self.inv_freq)
+        table = torch.cat((angles.sin(), angles.cos()), dim=-1)
+        return table.unsqueeze(0)
+
+
+# helpers
+
+def exists(val):
+    """Tell whether ``val`` is anything other than ``None`` (falsy values still count)."""
+    is_missing = val is None
+    return not is_missing
+
+
+def default(val, d):
+    """Return ``val`` unless it is ``None``, otherwise fall back to ``d``.
+
+    A fallback that is a plain Python function (per ``inspect.isfunction``)
+    is called with no arguments and its result is returned; any other
+    fallback is returned as-is.
+    """
+    if not exists(val):
+        return d() if isfunction(d) else d
+    return val
+
+
+def always(val):
+    """Build a callable that ignores all of its arguments and returns ``val``."""
+    def constant(*_args, **_kwargs):
+        return val
+
+    return constant
+
+
+def not_equals(val):
+    """Build a one-argument predicate that is true when its argument differs from ``val``."""
+    def differs(candidate):
+        return candidate != val
+
+    return differs
+
+
+def equals(val):
+    """Build a one-argument predicate that is true when its argument equals ``val``."""
+    def matches(candidate):
+        return candidate == val
+
+    return matches
+
+
+def max_neg_value(tensor):
+    """Return the negated largest finite value of ``tensor``'s floating-point dtype."""
+    limits = torch.finfo(tensor.dtype)
+    return -limits.max
+
+
+# keyword argument helpers
+
+def pick_and_pop(keys, d):
+    """Remove ``keys`` from ``d`` and return them, with their values, as a new dict.
+
+    Args:
+        keys: iterable of keys to extract. It is traversed exactly once, so a
+            one-shot iterator or generator works as well as a list.
+        d: dictionary to pop from; it is mutated in place.
+
+    Returns:
+        A dict mapping each requested key to the value it had in ``d``.
+
+    Raises:
+        KeyError: if a requested key is not (or is no longer) present in ``d``.
+    """
+    return {key: d.pop(key) for key in keys}
+
+
+def group_dict_by_key(cond, d):
+    """Partition ``d`` by a predicate on its keys.
+
+    Returns a 2-tuple ``(matching, rest)`` of new dicts: entries whose key
+    makes ``cond`` truthy go into the first, all others into the second.
+    """
+    matching, rest = {}, {}
+    for key, value in d.items():
+        target = matching if cond(key) else rest
+        target[key] = value
+    return matching, rest
+
+
+def string_begins_with(prefix, str):
+    """Return whether ``str`` starts with ``prefix`` (prefix-first argument order suits ``partial``)."""
+    text = str  # the parameter name shadows the builtin; use a local alias
+    return text.startswith(prefix)
+
+
+def group_by_key_prefix(prefix, d):
+    """Split ``d`` into ``(entries whose key starts with prefix, all other entries)``."""
+    has_prefix = partial(string_begins_with, prefix)
+    return group_dict_by_key(has_prefix, d)
+
+
+def groupby_prefix_and_trim(prefix, d):
+    """Split ``d`` by key prefix and strip the prefix from the matching keys.
+
+    Returns ``(trimmed, rest)``: ``trimmed`` holds the entries whose key
+    starts with ``prefix``, re-keyed without it; ``rest`` holds every other
+    entry unchanged.
+    """
+    has_prefix = partial(string_begins_with, prefix)
+    prefixed, rest = group_dict_by_key(has_prefix, d)
+    cut = len(prefix)
+    trimmed = {key[cut:]: value for key, value in prefixed.items()}
+    return trimmed, rest
+
+
+# classes
+class Scale(nn.Module):
+    """Wrap ``fn`` and multiply its primary output by the constant ``value``."""
+
+    def __init__(self, value, fn):
+        super().__init__()
+        self.value = value
+        self.fn = fn
+
+    def forward(self, x, **kwargs):
+        """Call ``fn(x, **kwargs)`` and scale the result.
+
+        Returns:
+            If ``fn`` returns a tuple or list, a tuple whose first element is
+            scaled and whose remaining elements are passed through untouched.
+            Otherwise the single result, scaled.
+        """
+        out = self.fn(x, **kwargs)
+        # A bare result (e.g. a feed-forward block returning one tensor) must
+        # be scaled as a whole: star-unpacking it would iterate over its first
+        # axis and split the tensor apart.
+        if not isinstance(out, (tuple, list)):
+            return out * self.value
+        first, *rest = out
+        return (first * self.value, *rest)
+
+
+class Rezero(nn.Module):
+    """Wrap ``fn`` and multiply its primary output by a learned gate initialised to zero."""
+
+    def __init__(self, fn):
+        super().__init__()
+        self.fn = fn
+        self.g = nn.Parameter(torch.zeros(1))
+
+    def forward(self, x, **kwargs):
+        """Call ``fn(x, **kwargs)`` and gate the result with ``self.g``.
+
+        Returns:
+            If ``fn`` returns a tuple or list, a tuple whose first element is
+            gated and whose remaining elements are passed through untouched.
+            Otherwise the single result, gated.
+        """
+        out = self.fn(x, **kwargs)
+        # A bare tensor must be gated as a whole: star-unpacking it would
+        # iterate over its first axis and split the tensor apart.
+        if not isinstance(out, (tuple, list)):
+            return out * self.g
+        first, *rest = out
+        return (first * self.g, *rest)
+
+
+class ScaleNorm(nn.Module):
+    """Divide by the scaled L2 norm of the last axis and apply a single learned gain."""
+
+    def __init__(self, dim, eps=1e-5):
+        super().__init__()
+        self.scale = dim ** -0.5
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(1))
+
+    def forward(self, x):
+        """Return ``x / max(||x|| * dim**-0.5, eps) * g`` with the norm taken over the last axis."""
+        scaled_norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+        # the clamp keeps the division finite for all-zero rows
+        denom = scaled_norm.clamp(min=self.eps)
+        return x / denom * self.g
+
+
+class RMSNorm(nn.Module):
+    """Divide by the scaled L2 norm of the last axis and apply a per-feature learned gain."""
+
+    def __init__(self, dim, eps=1e-8):
+        super().__init__()
+        self.scale = dim ** -0.5
+        self.eps = eps
+        self.g = nn.Parameter(torch.ones(dim))
+
+    def forward(self, x):
+        """Return ``x / max(||x|| * dim**-0.5, eps) * g`` with the norm taken over the last axis."""
+        scaled_norm = torch.norm(x, dim=-1, keepdim=True) * self.scale
+        # the clamp keeps the division finite for all-zero rows
+        denom = scaled_norm.clamp(min=self.eps)
+        return x / denom * self.g
+
+
+class Residual(nn.Module):
+    """Plain residual connection: the branch output plus the skip input."""
+
+    def forward(self, x, residual):
+        """Return ``x + residual``."""
+        combined = x + residual
+        return combined
+
+
+class GRUGating(nn.Module):
+    """Combine a branch output with its skip input through a ``GRUCell`` instead of addition."""
+
+    def __init__(self, dim):
+        super().__init__()
+        self.gru = nn.GRUCell(dim, dim)
+
+    def forward(self, x, residual):
+        """Feed ``x`` as the GRU input and ``residual`` as its hidden state.
+
+        Both are flattened from ``(b, n, d)`` to ``(b * n, d)`` for the cell;
+        the result is reshaped back to the shape of ``x``.
+        """
+        flat_x = rearrange(x, 'b n d -> (b n) d')
+        flat_residual = rearrange(residual, 'b n d -> (b n) d')
+        gated = self.gru(flat_x, flat_residual)
+        return gated.reshape_as(x)
+
+
+# feedforward
+
+class GEGLU(nn.Module):
+    """GELU-gated linear unit: one projection split into a value half and a gate half."""
+
+    def __init__(self, dim_in, dim_out):
+        super().__init__()
+        self.proj = nn.Linear(dim_in, dim_out * 2)
+
+    def forward(self, x):
+        """Return ``value * gelu(gate)`` where both halves come from ``self.proj(x)``."""
+        projected = self.proj(x)
+        value, gate = projected.chunk(2, dim=-1)
+        return value * F.gelu(gate)
+
+
+class FeedForward(nn.Module):
+    """Two-layer position-wise feed-forward block with dropout in between.
+
+    The hidden width is ``int(dim * mult)``. With ``glu=True`` the input
+    projection is a ``GEGLU``; otherwise it is ``Linear`` followed by ``GELU``.
+    ``dim_out`` falls back to ``dim`` when not given.
+    """
+
+    def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.):
+        super().__init__()
+        hidden = int(dim * mult)
+        dim_out = default(dim_out, dim)
+
+        if glu:
+            project_in = GEGLU(dim, hidden)
+        else:
+            project_in = nn.Sequential(nn.Linear(dim, hidden), nn.GELU())
+
+        self.net = nn.Sequential(
+            project_in,
+            nn.Dropout(dropout),
+            nn.Linear(hidden, dim_out),
+        )
+
+    def forward(self, x):
+        """Apply the block to ``x``."""
+        return self.net(x)
+
+
+# attention.
+class Attention(nn.Module):
+    """Multi-head dot-product attention with several optional variations.
+
+    Options selected in the constructor: a causal mask (``causal``), "talking
+    heads" projections applied across the head axis before and after the
+    softmax, top-k sparsification of the scores (``sparse_topk``), learned
+    memory keys/values prepended to the keys and values (``num_mem_kv``), and
+    a GLU-gated output projection (``on_attn``). ``use_entmax15`` is not
+    implemented and raises ``NotImplementedError``.
+
+    The ``mask`` constructor argument is stored on ``self.mask`` but is not
+    read anywhere in this class.
+    """
+    def __init__(
+            self,
+            dim,
+            dim_head=DEFAULT_DIM_HEAD,
+            heads=8,
+            causal=False,
+            mask=None,
+            talking_heads=False,
+            sparse_topk=None,
+            use_entmax15=False,
+            num_mem_kv=0,
+            dropout=0.,
+            on_attn=False
+    ):
+        super().__init__()
+        if use_entmax15:
+            raise NotImplementedError("Check out entmax activation instead of softmax activation!")
+        self.scale = dim_head ** -0.5
+        self.heads = heads
+        self.causal = causal
+        self.mask = mask
+
+        inner_dim = dim_head * heads
+
+        self.to_q = nn.Linear(dim, inner_dim, bias=False)
+        self.to_k = nn.Linear(dim, inner_dim, bias=False)
+        self.to_v = nn.Linear(dim, inner_dim, bias=False)
+        self.dropout = nn.Dropout(dropout)
+
+        # talking heads
+        self.talking_heads = talking_heads
+        if talking_heads:
+            self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+            self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads))
+
+        # explicit topk sparse attention
+        self.sparse_topk = sparse_topk
+
+        # entmax
+        #self.attn_fn = entmax15 if use_entmax15 else F.softmax
+        self.attn_fn = F.softmax
+
+        # add memory key / values
+        self.num_mem_kv = num_mem_kv
+        if num_mem_kv > 0:
+            self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+            self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head))
+
+        # attention on attention
+        self.attn_on_attn = on_attn
+        self.to_out = nn.Sequential(nn.Linear(inner_dim, dim * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim)
+
+    def forward(
+            self,
+            x,
+            context=None,
+            mask=None,
+            context_mask=None,
+            rel_pos=None,
+            sinusoidal_emb=None,
+            prev_attn=None,
+            mem=None
+    ):
+        """Attend from ``x`` to ``context``, or to ``x`` itself when ``context`` is None.
+
+        Args:
+            x: query input; its shape is unpacked as ``(b, n, _)``.
+            context: optional key/value source; defaults to ``x``.
+            mask: optional mask over query positions, rearranged from
+                ``(b, i)``; also used for the keys when ``context`` is None.
+            context_mask: optional mask over key positions, rearranged from
+                ``(b, j)``; used for the keys when ``context`` is given.
+            rel_pos: optional callable applied to the score tensor.
+            sinusoidal_emb: optional callable whose output is added to the
+                query and key inputs (not to the value input).
+            prev_attn: optional tensor added to the scores before softmax.
+            mem: optional tensor prepended to the key/value inputs on dim -2.
+
+        Returns:
+            ``(out, intermediates)``: ``out`` is the result of ``self.to_out``
+            and ``intermediates`` is an ``Intermediates`` tuple holding the
+            pre- and post-softmax attention tensors.
+        """
+        b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device
+        kv_input = default(context, x)
+
+        q_input = x
+        k_input = kv_input
+        v_input = kv_input
+
+        if exists(mem):
+            k_input = torch.cat((mem, k_input), dim=-2)
+            v_input = torch.cat((mem, v_input), dim=-2)
+
+        if exists(sinusoidal_emb):
+            # in shortformer, the query would start at a position offset depending on the past cached memory
+            offset = k_input.shape[-2] - q_input.shape[-2]
+            q_input = q_input + sinusoidal_emb(q_input, offset=offset)
+            k_input = k_input + sinusoidal_emb(k_input)
+
+        q = self.to_q(q_input)
+        k = self.to_k(k_input)
+        v = self.to_v(v_input)
+
+        # split the projected feature axis into heads: (b, n, h*d) -> (b, h, n, d)
+        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), (q, k, v))
+
+        # combine query and key masks into one (b, 1, i, j) mask; a missing
+        # side defaults to all-True
+        input_mask = None
+        if any(map(exists, (mask, context_mask))):
+            q_mask = default(mask, lambda: torch.ones((b, n), device=device).bool())
+            k_mask = q_mask if not exists(context) else context_mask
+            k_mask = default(k_mask, lambda: torch.ones((b, k.shape[-2]), device=device).bool())
+            q_mask = rearrange(q_mask, 'b i -> b () i ()')
+            k_mask = rearrange(k_mask, 'b j -> b () () j')
+            input_mask = q_mask * k_mask
+
+        if self.num_mem_kv > 0:
+            mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), (self.mem_k, self.mem_v))
+            k = torch.cat((mem_k, k), dim=-2)
+            v = torch.cat((mem_v, v), dim=-2)
+            if exists(input_mask):
+                # the learned memory slots are always attendable
+                input_mask = F.pad(input_mask, (self.num_mem_kv, 0), value=True)
+
+        dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale
+        mask_value = max_neg_value(dots)
+
+        if exists(prev_attn):
+            dots = dots + prev_attn
+
+        # NOTE: this is a reference, not a copy. The in-place masked_fill_
+        # calls below also modify it unless `dots` is rebound to a new tensor
+        # in between (the talking-heads einsum does rebind it).
+        pre_softmax_attn = dots
+
+        if talking_heads:
+            dots = einsum('b h i j, h k -> b k i j', dots, self.pre_softmax_proj).contiguous()
+
+        if exists(rel_pos):
+            dots = rel_pos(dots)
+
+        if exists(input_mask):
+            dots.masked_fill_(~input_mask, mask_value)
+            del input_mask
+
+        if self.causal:
+            # True marks (query, key) pairs where the key index is greater
+            # than the query index; the left padding of (j - i) False columns
+            # leaves the extra leading keys unmasked.
+            i, j = dots.shape[-2:]
+            r = torch.arange(i, device=device)
+            mask = rearrange(r, 'i -> () () i ()') < rearrange(r, 'j -> () () () j')
+            mask = F.pad(mask, (j - i, 0), value=False)
+            dots.masked_fill_(mask, mask_value)
+            del mask
+
+        if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]:
+            # keep the top-k scores per query row; anything below the k-th
+            # largest score is masked out
+            top, _ = dots.topk(self.sparse_topk, dim=-1)
+            vk = top[..., -1].unsqueeze(-1).expand_as(dots)
+            mask = dots < vk
+            dots.masked_fill_(mask, mask_value)
+            del mask
+
+        attn = self.attn_fn(dots, dim=-1)
+        post_softmax_attn = attn
+
+        attn = self.dropout(attn)
+
+        if talking_heads:
+            attn = einsum('b h i j, h k -> b k i j', attn, self.post_softmax_proj).contiguous()
+
+        out = einsum('b h i j, b h j d -> b h i d', attn, v)
+        # merge the heads back into the feature axis
+        out = rearrange(out, 'b h n d -> b n (h d)')
+
+        intermediates = Intermediates(
+            pre_softmax_attn=pre_softmax_attn,
+            post_softmax_attn=post_softmax_attn
+        )
+
+        return self.to_out(out), intermediates
+
+
+class AttentionLayers(nn.Module):
+    """Configurable stack of self-attention, cross-attention and feed-forward layers.
+
+    The stack is described by ``self.layer_types``, a sequence of the codes
+    ``'a'`` (self-attention), ``'c'`` (cross-attention) and ``'f'``
+    (feed-forward). It is taken from ``custom_layers`` when given, otherwise
+    derived from ``par_ratio`` or ``sandwich_coef``, and otherwise is the
+    default block repeated ``depth`` times. Each entry of ``self.layers`` is a
+    ``ModuleList`` of ``[norm, layer, residual_fn]``.
+
+    Extra keyword arguments prefixed with ``ff_`` are passed (prefix removed)
+    to ``FeedForward`` and those prefixed with ``attn_`` to ``Attention``;
+    any other extra keyword argument is discarded.
+    """
+    def __init__(
+            self,
+            dim,
+            depth,
+            heads=8,
+            causal=False,
+            cross_attend=False,
+            only_cross=False,
+            use_scalenorm=False,
+            use_rmsnorm=False,
+            use_rezero=False,
+            rel_pos_num_buckets=32,
+            rel_pos_max_distance=128,
+            position_infused_attn=False,
+            custom_layers=None,
+            sandwich_coef=None,
+            par_ratio=None,
+            residual_attn=False,
+            cross_residual_attn=False,
+            macaron=False,
+            pre_norm=True,
+            gate_residual=False,
+            **kwargs
+    ):
+        super().__init__()
+        # peel the prefixed keyword arguments off for the sub-layers
+        ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs)
+        attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs)
+
+        # NOTE(review): dim_head is not used again in this constructor
+        dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD)
+
+        self.dim = dim
+        self.depth = depth
+        self.layers = nn.ModuleList([])
+
+        self.has_pos_emb = position_infused_attn
+        self.pia_pos_emb = FixedPositionalEmbedding(dim) if position_infused_attn else None
+        self.rotary_pos_emb = always(None)
+
+        # relative position bias is not wired up here: self.rel_pos stays None
+        # and the two rel_pos_* arguments are only validated against each other
+        assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than the relative position max distance'
+        self.rel_pos = None
+
+        self.pre_norm = pre_norm
+
+        self.residual_attn = residual_attn
+        self.cross_residual_attn = cross_residual_attn
+
+        # use_rmsnorm takes precedence over use_scalenorm; use_rezero replaces
+        # the norm with an identity and wraps attention layers in Rezero
+        norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm
+        norm_class = RMSNorm if use_rmsnorm else norm_class
+        norm_fn = partial(norm_class, dim)
+
+        norm_fn = nn.Identity if use_rezero else norm_fn
+        branch_fn = Rezero if use_rezero else None
+
+        if cross_attend and not only_cross:
+            default_block = ('a', 'c', 'f')
+        elif cross_attend and only_cross:
+            default_block = ('c', 'f')
+        else:
+            default_block = ('a', 'f')
+
+        if macaron:
+            # macaron puts an extra feed-forward in front of each block
+            default_block = ('f',) + default_block
+
+        if exists(custom_layers):
+            layer_types = custom_layers
+        elif exists(par_ratio):
+            par_depth = depth * len(default_block)
+            assert 1 < par_ratio <= par_depth, 'par ratio out of range'
+            default_block = tuple(filter(not_equals('f'), default_block))
+            par_attn = par_depth // par_ratio
+            depth_cut = par_depth * 2 // 3  # 2 / 3 attention layer cutoff suggested by PAR paper
+            par_width = (depth_cut + depth_cut // par_attn) // par_attn
+            assert len(default_block) <= par_width, 'default block is too large for par_ratio'
+            par_block = default_block + ('f',) * (par_width - len(default_block))
+            par_head = par_block * par_attn
+            layer_types = par_head + ('f',) * (par_depth - len(par_head))
+        elif exists(sandwich_coef):
+            assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth'
+            layer_types = ('a',) * sandwich_coef + default_block * (depth - sandwich_coef) + ('f',) * sandwich_coef
+        else:
+            layer_types = default_block * depth
+
+        self.layer_types = layer_types
+        self.num_attn_layers = len(list(filter(equals('a'), layer_types)))
+
+        for layer_type in self.layer_types:
+            if layer_type == 'a':
+                layer = Attention(dim, heads=heads, causal=causal, **attn_kwargs)
+            elif layer_type == 'c':
+                # cross-attention never receives the causal flag
+                layer = Attention(dim, heads=heads, **attn_kwargs)
+            elif layer_type == 'f':
+                layer = FeedForward(dim, **ff_kwargs)
+                layer = layer if not macaron else Scale(0.5, layer)
+            else:
+                raise Exception(f'invalid layer type {layer_type}')
+
+            if isinstance(layer, Attention) and exists(branch_fn):
+                layer = branch_fn(layer)
+
+            if gate_residual:
+                residual_fn = GRUGating(dim)
+            else:
+                residual_fn = Residual()
+
+            self.layers.append(nn.ModuleList([
+                norm_fn(),
+                layer,
+                residual_fn
+            ]))
+
+    def forward(
+            self,
+            x,
+            context=None,
+            mask=None,
+            context_mask=None,
+            mems=None,
+            return_hiddens=False
+    ):
+        """Run ``x`` through every layer in ``self.layer_types`` order.
+
+        Args:
+            x: input passed to the first layer.
+            context: key/value source handed to the cross-attention layers.
+            mask: mask handed to self- and cross-attention layers.
+            context_mask: mask over ``context`` for cross-attention layers.
+            mems: optional list with one memory entry per self-attention
+                layer; it is copied, so the caller's list is not modified.
+            return_hiddens: when true, also return a ``LayerIntermediates``.
+
+        Returns:
+            ``x`` after the last layer, or ``(x, LayerIntermediates)`` when
+            ``return_hiddens`` is true. ``hiddens`` holds the input of each
+            self-attention layer and ``attn_intermediates`` the record
+            returned by every attention layer.
+        """
+        hiddens = []
+        intermediates = []
+        prev_attn = None
+        prev_cross_attn = None
+
+        mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers
+
+        for ind, (layer_type, (norm, block, residual_fn)) in enumerate(zip(self.layer_types, self.layers)):
+            is_last = ind == (len(self.layers) - 1)
+
+            if layer_type == 'a':
+                hiddens.append(x)
+                layer_mem = mems.pop(0)
+
+            residual = x
+
+            if self.pre_norm:
+                x = norm(x)
+
+            if layer_type == 'a':
+                out, inter = block(x, mask=mask, sinusoidal_emb=self.pia_pos_emb, rel_pos=self.rel_pos,
+                                   prev_attn=prev_attn, mem=layer_mem)
+            elif layer_type == 'c':
+                out, inter = block(x, context=context, mask=mask, context_mask=context_mask, prev_attn=prev_cross_attn)
+            elif layer_type == 'f':
+                out = block(x)
+
+            x = residual_fn(out, residual)
+
+            if layer_type in ('a', 'c'):
+                intermediates.append(inter)
+
+            # residual attention: carry the pre-softmax scores to the next
+            # layer of the same kind
+            if layer_type == 'a' and self.residual_attn:
+                prev_attn = inter.pre_softmax_attn
+            elif layer_type == 'c' and self.cross_residual_attn:
+                prev_cross_attn = inter.pre_softmax_attn
+
+            # post-norm: normalise after the residual, except on the last layer
+            if not self.pre_norm and not is_last:
+                x = norm(x)
+
+        if return_hiddens:
+            intermediates = LayerIntermediates(
+                hiddens=hiddens,
+                attn_intermediates=intermediates
+            )
+
+            return x, intermediates
+
+        return x
+
+
+class Encoder(AttentionLayers):
+    """``AttentionLayers`` preset with causal masking switched off."""
+
+    def __init__(self, **kwargs):
+        # causality is fixed for an encoder, so callers may not pass it
+        assert 'causal' not in kwargs, 'cannot set causality on encoder'
+        super().__init__(**kwargs, causal=False)
+
+
+
+class TransformerWrapper(nn.Module):
+    """Token-level wrapper around an ``AttentionLayers`` stack.
+
+    Adds a token embedding, an optional absolute positional embedding,
+    embedding dropout, an optional projection from ``emb_dim`` to the stack's
+    ``dim``, a final ``LayerNorm`` and a projection to token logits (tied to
+    the token embedding when ``tie_embedding`` is set). ``num_memory_tokens``
+    learned vectors can be prepended to every sequence.
+    """
+    def __init__(
+            self,
+            *,
+            num_tokens,
+            max_seq_len,
+            attn_layers,
+            emb_dim=None,
+            max_mem_len=0.,
+            emb_dropout=0.,
+            num_memory_tokens=None,
+            tie_embedding=False,
+            use_pos_emb=True
+    ):
+        super().__init__()
+        assert isinstance(attn_layers, AttentionLayers), 'attention layers must be one of Encoder or Decoder'
+
+        dim = attn_layers.dim
+        emb_dim = default(emb_dim, dim)
+
+        self.max_seq_len = max_seq_len
+        self.max_mem_len = max_mem_len
+        self.num_tokens = num_tokens
+
+        self.token_emb = nn.Embedding(num_tokens, emb_dim)
+        # no learned positions when disabled or when the stack adds its own
+        self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if (
+                    use_pos_emb and not attn_layers.has_pos_emb) else always(0)
+        self.emb_dropout = nn.Dropout(emb_dropout)
+
+        self.project_emb = nn.Linear(emb_dim, dim) if emb_dim != dim else nn.Identity()
+        self.attn_layers = attn_layers
+        self.norm = nn.LayerNorm(dim)
+
+        self.init_()
+
+        self.to_logits = nn.Linear(dim, num_tokens) if not tie_embedding else lambda t: t @ self.token_emb.weight.t()
+
+        # memory tokens (like [cls]) from Memory Transformers paper
+        num_memory_tokens = default(num_memory_tokens, 0)
+        self.num_memory_tokens = num_memory_tokens
+        if num_memory_tokens > 0:
+            self.memory_tokens = nn.Parameter(torch.randn(num_memory_tokens, dim))
+
+            # let funnel encoder know number of memory tokens, if specified
+            if hasattr(attn_layers, 'num_memory_tokens'):
+                attn_layers.num_memory_tokens = num_memory_tokens
+
+    def init_(self):
+        """Re-draw the token embedding weights from a normal distribution with std 0.02."""
+        nn.init.normal_(self.token_emb.weight, std=0.02)
+
+    def forward(
+            self,
+            x,
+            return_embeddings=False,
+            mask=None,
+            return_mems=False,
+            return_attn=False,
+            mems=None,
+            **kwargs
+    ):
+        """Embed the token ids ``x``, run the attention stack and project the result.
+
+        Args:
+            x: 2-D tensor of token ids, unpacked as ``(b, n)``.
+            return_embeddings: return the normalised hidden states instead of
+                logits.
+            mask: optional mask forwarded to the stack; padded with ``True``
+                for the memory tokens when there are any.
+            return_mems: also return the per-layer hidden states, each
+                truncated to its last ``int(max_mem_len)`` positions and
+                detached. A length of 0 keeps the full sequence, because a
+                ``-0:`` slice starts from the beginning.
+            return_attn: also return the post-softmax attention maps; only
+                consulted when ``return_mems`` is false.
+            mems: optional per-layer memories forwarded to the stack.
+            **kwargs: forwarded to the attention stack.
+
+        Returns:
+            ``out``, ``(out, new_mems)`` or ``(out, attn_maps)`` depending on
+            the flags above.
+        """
+        b, _ = x.shape
+        num_mem = self.num_memory_tokens
+        x = self.token_emb(x)
+        x += self.pos_emb(x)
+        x = self.emb_dropout(x)
+
+        x = self.project_emb(x)
+
+        if num_mem > 0:
+            mem = repeat(self.memory_tokens, 'n d -> b n d', b=b)
+            x = torch.cat((mem, x), dim=1)
+
+            # auto-handle masking after appending memory tokens
+            if mask is not None:
+                mask = F.pad(mask, (num_mem, 0), value=True)
+
+        x, intermediates = self.attn_layers(x, mask=mask, mems=mems, return_hiddens=True, **kwargs)
+        x = self.norm(x)
+
+        # drop the memory-token positions again before projecting
+        x = x[:, num_mem:]
+
+        out = self.to_logits(x) if not return_embeddings else x
+
+        if return_mems:
+            hiddens = intermediates.hiddens
+            if mems is not None:
+                new_mems = [torch.cat(pair, dim=-2) for pair in zip(mems, hiddens)]
+            else:
+                new_mems = hiddens
+            # max_mem_len defaults to the float 0., and a float is rejected as
+            # a slice bound, so convert it to an int before slicing.
+            mem_len = int(self.max_mem_len)
+            new_mems = [t[..., -mem_len:, :].detach() for t in new_mems]
+            return out, new_mems
+
+        if return_attn:
+            attn_maps = [inter.post_softmax_attn for inter in intermediates.attn_intermediates]
+            return out, attn_maps
+
+        return out
+
diff --git a/scripts/evaluation/__pycache__/style_inference.cpython-39.pyc b/scripts/evaluation/__pycache__/style_inference.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b53c1a7d591686de7b0fdb90b4d88253cd5c363
Binary files /dev/null and b/scripts/evaluation/__pycache__/style_inference.cpython-39.pyc differ
diff --git a/scripts/evaluation/ddp_wrapper.py b/scripts/evaluation/ddp_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3af8d2fa76c35bc47cbd68026eff182e31b0a68
--- /dev/null
+++ b/scripts/evaluation/ddp_wrapper.py
@@ -0,0 +1,48 @@
+import os, sys
+import datetime, time
+import argparse, importlib
+from pytorch_lightning import seed_everything
+
+import torch
+import torch.distributed as dist
+#from inference import run_inference, get_parser
+
+def setup_dist(local_rank):
+    """Select a CUDA device and join the default process group.
+
+    Does nothing when a process group has already been initialized.
+
+    Args:
+        local_rank: device index handed to ``torch.cuda.set_device``.
+    """
+    if not dist.is_initialized():
+        torch.cuda.set_device(local_rank)
+        # 'env://' takes the rendezvous settings from environment variables.
+        dist.init_process_group('nccl', init_method='env://')
+
+
+def get_dist_info():
+    """Return ``(rank, world_size)`` of the calling process.
+
+    Falls back to ``(0, 1)`` when ``torch.distributed`` is unavailable or no
+    process group has been initialized.
+    """
+    if dist.is_available() and dist.is_initialized():
+        return dist.get_rank(), dist.get_world_size()
+    return 0, 1
+
+
+if __name__ == '__main__':
+    # Timestamp is only used for the start-up log line below.
+    now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--module", type=str, help="module name", default="inference")
+    # Launchers differ in how they hand over the local rank: older
+    # torch.distributed.launch passes --local_rank, newer releases pass
+    # --local-rank, and torchrun only exports LOCAL_RANK. Accept all three so
+    # every process binds to its own GPU instead of silently defaulting to 0.
+    parser.add_argument("--local_rank", "--local-rank", type=int, nargs="?", help="for ddp",
+                        default=int(os.environ.get("LOCAL_RANK", 0)))
+    args, unknown = parser.parse_known_args()
+    # The inference module must expose get_parser() and run_inference().
+    inference_api = importlib.import_module(args.module, package=None)
+
+    # The inference parser re-reads the same command line; options that belong
+    # to this wrapper are ignored there thanks to parse_known_args.
+    inference_parser = inference_api.get_parser()
+    inference_args, unknown = inference_parser.parse_known_args()
+
+    seed_everything(inference_args.seed)
+    setup_dist(args.local_rank)
+    torch.backends.cudnn.benchmark = True
+    rank, gpu_num = get_dist_info()
+
+    print("@CoLVDM Inference [rank%d]: %s"%(rank, now))
+    inference_api.run_inference(inference_args, gpu_num, rank)
\ No newline at end of file
diff --git a/scripts/evaluation/funcs.py b/scripts/evaluation/funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b61562daa11c2be675bf4344cc29a831bd11564
--- /dev/null
+++ b/scripts/evaluation/funcs.py
@@ -0,0 +1,237 @@
+import argparse, os, sys, glob, yaml, math, random
+import datetime, time
+import numpy as np
+from omegaconf import OmegaConf
+from tqdm import trange, tqdm
+from einops import repeat
+from collections import OrderedDict
+from decord import VideoReader, cpu
+
+import torch
+import torchvision
+sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
+from lvdm.models.samplers.ddim import DDIMSampler
+
+
+def batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.0,\
+                        cfg_scale=1.0, temporal_cfg_scale=None, **kwargs):
+    """Draw ``n_samples`` DDIM samples per batch item and decode them.
+
+    Args:
+        model: diffusion model; must provide ``uncond_type``,
+            ``get_learned_conditioning`` and ``decode_first_stage``.
+        cond: conditioning, either an embedding or a dict with the keys
+            ``c_concat`` / ``c_crossattn`` (and optionally ``c_adm``).
+        noise_shape: latent shape; index 0 is used as batch size and index 2
+            as temporal length.
+        n_samples: number of independent samples drawn per batch item.
+        ddim_steps: number of DDIM steps (``S``).
+        ddim_eta: DDIM eta.
+        cfg_scale: guidance scale; ``1.0`` disables unconditional guidance.
+        temporal_cfg_scale: forwarded as ``conditional_guidance_scale_temporal``.
+        **kwargs: forwarded to ``DDIMSampler.sample``; ``clean_cond=True`` is
+            always added.
+
+    Returns:
+        Decoded samples stacked along a new dim 1.
+
+    Raises:
+        NotImplementedError: if guidance is enabled and ``model.uncond_type``
+            is neither ``"empty_seq"`` nor ``"zero_embed"``.
+    """
+    sampler = DDIMSampler(model)
+    uncond_type = model.uncond_type
+    batch_size = noise_shape[0]
+    is_hybrid = isinstance(cond, dict)
+
+    ## construct unconditional guidance
+    uc = None
+    if cfg_scale != 1.0:
+        if is_hybrid:
+            c_cat, text_emb = cond["c_concat"][0], cond["c_crossattn"][0]
+        else:
+            text_emb = cond
+
+        if uncond_type == "empty_seq":
+            uc = model.get_learned_conditioning([""] * batch_size)
+        elif uncond_type == "zero_embed":
+            uc = torch.zeros_like(text_emb)
+        else:
+            raise NotImplementedError
+
+        ## hybrid case: keep the concat condition, swap only the text part
+        if is_hybrid:
+            uc = {"c_concat": [c_cat], "c_crossattn": [uc]}
+            if 'c_adm' in cond:
+                uc['c_adm'] = cond['c_adm']
+
+    ## sampling
+    kwargs["clean_cond"] = True
+    decoded_variants = []
+    for _ in range(n_samples):
+        latents, _ = sampler.sample(S=ddim_steps,
+                                    conditioning=cond,
+                                    batch_size=noise_shape[0],
+                                    shape=noise_shape[1:],
+                                    verbose=False,
+                                    unconditional_guidance_scale=cfg_scale,
+                                    unconditional_conditioning=uc,
+                                    eta=ddim_eta,
+                                    temporal_length=noise_shape[2],
+                                    conditional_guidance_scale_temporal=temporal_cfg_scale,
+                                    x_T=None,
+                                    **kwargs
+                                    )
+        ## reconstruct from latent to pixel space
+        decoded_variants.append(model.decode_first_stage(latents))
+    ## batch, <samples>, c, t, h, w
+    return torch.stack(decoded_variants, dim=1)
+
+
+def batch_sliding_interpolation(model, cond, base_videos, base_stride, noise_shape, n_samples=1,\
+                                ddim_steps=50, ddim_eta=1.0, cfg_scale=1.0, temporal_cfg_scale=None, **kwargs):
+    '''
+    Interpolate between the frames of ``base_videos`` by sampling one window of
+    ``model.temporal_length`` frames at a time and concatenating the windows
+    along the temporal axis.
+
+    Each window is conditioned on ``len(model.cond_frames)`` consecutive base
+    frames (encoded with ``model.encode_first_stage``), written into a proxy
+    latent at the positions ``model.cond_frames`` and handed to
+    ``batch_ddim_sampling`` as ``x0`` together with ``model.cond_mask``.
+
+    Current implementation has a flaw: the inter-episode keyframe is used as pre-last and cur-first, so keyframe repeated.
+    For example, cond_frames=[0,4,7], model.temporal_length=8, base_stride=4, then
+    base frame   : 0   4   8   12  16  20  24  28
+    interpolation: (0~7)   (8~15)  (16~23) (20~27)
+
+    Args:
+        model: diffusion model providing ``encode_first_stage``,
+            ``temporal_length``, ``cond_frames`` and ``cond_mask``.
+        cond: conditioning; when it is a dict, ``c_concat`` is sliced per window.
+        base_videos: key-frame videos; dim 2 is read as the frame axis.
+        base_stride: number of output frames per base-frame step, used to
+            locate the final (shifted) window.
+        noise_shape: unpacked as ``(b, c, t, h, w)``; ``t`` is replaced by
+            ``model.temporal_length`` for every window.
+        n_samples, ddim_steps, ddim_eta, cfg_scale, temporal_cfg_scale, **kwargs:
+            forwarded to ``batch_ddim_sampling``.
+
+    Returns:
+        The concatenated windows (b, samples, c, t, h, w per the inline
+        comments), or ``None`` when no window is processed.
+    '''
+    b,c,t,h,w = noise_shape
+    base_z0 = model.encode_first_stage(base_videos)
+    unit_length = model.temporal_length
+    n_base_frames = base_videos.shape[2]
+    n_refs = len(model.cond_frames)
+    ## number of windows = ceil((n_base_frames-1) / (n_refs-1))
+    ## NOTE(review): divides by zero when model.cond_frames has a single entry
+    sliding_steps = (n_base_frames-1) // (n_refs-1)
+    sliding_steps = sliding_steps+1 if (n_base_frames-1) % (n_refs-1) > 0 else sliding_steps
+
+    ## NOTE(review): device is hard-coded to "cuda" -- confirm it matches the model's device
+    cond_mask = model.cond_mask.to("cuda")
+    proxy_z0 = torch.zeros((b,c,unit_length,h,w), dtype=torch.float32).to("cuda")
+    batch_samples = None
+    ## set only when the final window had to be shifted back to stay in range
+    last_offset = None
+    for idx in range(sliding_steps):
+        ## consecutive windows share one base frame, hence the (n_refs-1) step
+        base_idx = idx * (n_refs-1)
+        ## check index overflow
+        if base_idx+n_refs > n_base_frames:
+            ## shift the window back so it ends on the last base frame and
+            ## remember by how many base frames it was moved
+            last_offset = base_idx - (n_base_frames - n_refs)
+            base_idx = n_base_frames - n_refs
+        cond_z0 = base_z0[:,:,base_idx:base_idx+n_refs,:,:]
+        ## proxy_z0 is reused across windows; only the cond_frames slots are overwritten
+        proxy_z0[:,:,model.cond_frames,:,:] = cond_z0
+
+        if isinstance(cond, dict):
+            c_cat, text_emb = cond["c_concat"][0], cond["c_crossattn"][0]
+            episode_idx = idx * unit_length
+            if last_offset is not None:
+                ## shifted window: move the concat-condition slice back accordingly
+                episode_idx = episode_idx - last_offset * base_stride
+            cond_idx = {"c_concat": [c_cat[:,:,episode_idx:episode_idx+unit_length,:,:]], "c_crossattn": [text_emb]}
+        else:
+            cond_idx = cond
+        noise_shape_idx = [b,c,unit_length,h,w]
+        ## batch, <samples>, c, t, h, w
+        batch_idx = batch_ddim_sampling(model, cond_idx, noise_shape_idx, n_samples, ddim_steps, ddim_eta, cfg_scale, \
+                                        temporal_cfg_scale, mask=cond_mask, x0=proxy_z0, **kwargs)
+        
+        if batch_samples is None:
+            batch_samples = batch_idx
+        else:
+            ## b,s,c,t,h,w
+            ## the last accumulated frame is dropped before appending the new window
+            if last_offset is None:
+                batch_samples = torch.cat([batch_samples[:,:,:,:-1,:,:], batch_idx], dim=3)
+            else:
+                ## shifted window: skip the leading frames that overlap earlier output
+                batch_samples = torch.cat([batch_samples[:,:,:,:-1,:,:], batch_idx[:,:,:,last_offset * base_stride:,:,:]], dim=3)
+                
+    return batch_samples
+
+
+def get_filelist(data_dir, ext='*'):
+    """Return the sorted paths in ``data_dir`` that match ``*.<ext>``.
+
+    Args:
+        data_dir: directory to search (not recursive).
+        ext: file extension without the dot; ``'*'`` matches any extension.
+    """
+    pattern = os.path.join(data_dir, '*.{}'.format(ext))
+    return sorted(glob.glob(pattern))
+
+def get_dirlist(path):
+    """Return the sorted paths of the immediate sub-directories of ``path``.
+
+    Args:
+        path: directory to inspect.
+
+    Returns:
+        A sorted list of ``os.path.join(path, name)`` for every entry that is
+        a directory; an empty list when ``path`` does not exist.
+    """
+    if not os.path.exists(path):
+        return []
+    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
+    return sorted(candidate for candidate in candidates if os.path.isdir(candidate))
+
+
+def load_model_checkpoint(model, ckpt, adapter_ckpt=None):
+    """Load weights from ``ckpt`` (and optionally ``adapter_ckpt``) into ``model``.
+
+    Three checkpoint layouts are recognised:
+      * a dict with a ``'module'`` entry (treated as a deepspeed checkpoint;
+        the first 16 characters of every key are stripped),
+      * a dict with a ``'state_dict'`` entry,
+      * a bare state dict.
+
+    Args:
+        model: module to load into; needs an ``adapter`` attribute when
+            ``adapter_ckpt`` is given.
+        ckpt: path of the main checkpoint.
+        adapter_ckpt: optional path of an adapter checkpoint. When given, the
+            main checkpoint is loaded non-strictly and the adapter strictly;
+            otherwise the main checkpoint is loaded strictly.
+
+    Returns:
+        ``model`` with the weights loaded.
+    """
+    def load_checkpoint(model, ckpt, full_strict):
+        state_dict = torch.load(ckpt, map_location="cpu")
+        # Pick the layout from the keys instead of a bare try/except, so a
+        # genuine loading error is reported rather than masked by a fallback.
+        if "module" in state_dict:
+            ## deepspeed
+            new_pl_sd = OrderedDict(
+                (key[16:], value) for key, value in state_dict["module"].items()
+            )
+            model.load_state_dict(new_pl_sd, strict=full_strict)
+        else:
+            if "state_dict" in state_dict:
+                state_dict = state_dict["state_dict"]
+            model.load_state_dict(state_dict, strict=full_strict)
+        return model
+
+    if adapter_ckpt:
+        ## main model
+        load_checkpoint(model, ckpt, full_strict=False)
+        print('>>> model checkpoint loaded.')
+        ## adapter
+        state_dict = torch.load(adapter_ckpt, map_location="cpu")
+        if "state_dict" in state_dict:
+            state_dict = state_dict["state_dict"]
+        model.adapter.load_state_dict(state_dict, strict=True)
+        print('>>> adapter checkpoint loaded.')
+    else:
+        load_checkpoint(model, ckpt, full_strict=True)
+        print('>>> model checkpoint loaded.')
+    return model
+
+
+def load_prompts(prompt_file):
+    """Read one prompt per line from ``prompt_file``.
+
+    Lines are stripped of surrounding whitespace and empty lines are skipped.
+
+    Args:
+        prompt_file: path of a text file.
+
+    Returns:
+        The list of non-empty, stripped lines in file order.
+    """
+    # The context manager closes the file even when it is empty or a read fails.
+    with open(prompt_file, 'r') as f:
+        stripped_lines = (line.strip() for line in f)
+        return [line for line in stripped_lines if line]
+
+
+def load_video_batch(filepath_list, frame_stride, video_size=(256,256), video_frames=16):
+    '''
+    Load a batch of videos as one float tensor with values scaled to [-1, 1].
+
+    Notice about some special cases:
+    1. video_frames=-1 means to take all the frames (with fs=1)
+    2. when the total video frames is less than required, padding strategy will be used (repeated last frame)
+
+    Args:
+        filepath_list: paths of the videos to read.
+        frame_stride: positive sampling stride; ignored when video_frames < 0.
+        video_size: (height, width) the frames are decoded at.
+        video_frames: number of frames per video, or a negative value for all.
+
+    Returns:
+        Tensor stacked over videos; each item is laid out as [c, t, h, w].
+        Stacking requires every video to yield the same number of frames.
+    '''
+    assert frame_stride > 0, "valid frame stride should be a positive interge!"
+    ## "all frames" mode forces stride 1; decide this before deriving the number
+    ## of reachable frames, otherwise that number is computed with the caller's
+    ## stride and the video is truncated and padded
+    take_all = video_frames < 0
+    stride = 1 if take_all else frame_stride
+
+    batch_tensor = []
+    for filepath in filepath_list:
+        vidreader = VideoReader(filepath, ctx=cpu(0), width=video_size[1], height=video_size[0])
+        total_frames = len(vidreader)
+        max_valid_frames = (total_frames-1) // stride + 1
+        required_frames = total_frames if take_all else video_frames
+        query_frames = min(required_frames, max_valid_frames)
+        frame_indices = [stride*i for i in range(query_frames)]
+
+        ## [t,h,w,c] -> [c,t,h,w]
+        frames = vidreader.get_batch(frame_indices)
+        frame_tensor = torch.tensor(frames.asnumpy()).permute(3, 0, 1, 2).float()
+        frame_tensor = (frame_tensor / 255. - 0.5) * 2
+        if max_valid_frames < required_frames:
+            ## too short: repeat the last frame until the requested length is reached
+            padding_num = required_frames - max_valid_frames
+            frame_tensor = torch.cat([frame_tensor, *([frame_tensor[:,-1:,:,:]]*padding_num)], dim=1)
+            print(f'{os.path.split(filepath)[1]} is not long enough: {padding_num} frames padded.')
+        batch_tensor.append(frame_tensor)
+
+    return torch.stack(batch_tensor, dim=0)
+
+
+def save_videos(batch_tensors, savedir, filenames, fps=10):
+    """Write every batch item as ``<savedir>/<filename>.mp4``.
+
+    The samples of one batch item are laid side by side in each frame.
+
+    Args:
+        batch_tensors: tensor laid out as b,samples,c,t,h,w with values
+            expected in [-1, 1] (values outside are clamped).
+        savedir: output directory.
+        filenames: one base name (without extension) per batch item.
+        fps: frame rate of the written videos.
+    """
+    samples_per_item = int(batch_tensors.shape[1])
+    for idx, vid_tensor in enumerate(batch_tensors):
+        clip = vid_tensor.detach().cpu().float().clamp(-1., 1.)
+        # samples,c,t,h,w -> t,samples,c,h,w so that iteration yields frames
+        per_frame = clip.permute(2, 0, 1, 3, 4)
+        sheets = []
+        for framesheet in per_frame:
+            sheets.append(torchvision.utils.make_grid(framesheet, nrow=samples_per_item))
+        grid = torch.stack(sheets, dim=0)
+        # [-1, 1] -> [0, 255], then channels-last as write_video expects
+        grid = (grid + 1.0) / 2.0
+        frames_uint8 = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+        savepath = os.path.join(savedir, f"{filenames[idx]}.mp4")
+        torchvision.io.write_video(savepath, frames_uint8, fps=fps, video_codec='h264', options={'crf': '10'})
+
diff --git a/scripts/evaluation/style_inference.py b/scripts/evaluation/style_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..27687112ded6680dd4ba2d116b5f1061507ab18e
--- /dev/null
+++ b/scripts/evaluation/style_inference.py
@@ -0,0 +1,313 @@
+import argparse, os, sys, glob
+import datetime, time
+import numpy as np
+from omegaconf import OmegaConf
+from tqdm import tqdm
+from einops import rearrange, repeat
+from collections import OrderedDict
+
+import torch
+import torchvision
+from torch.utils.data import DataLoader
+from pytorch_lightning import seed_everything
+## note: decord should be imported after torch
+from decord import VideoReader, cpu
+from PIL import Image
+import json
+from torchvision.transforms import transforms
+from torchvision.utils import make_grid
+
+sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
+from lvdm.models.samplers.ddim import DDIMSampler, DDIMStyleSampler
+from utils.utils import instantiate_from_config
+from utils.save_video import tensor_to_mp4
+
+
+def save_img(img, path, is_tensor=True):
+    """Save an image with values in [-1, 1] to ``path``.
+
+    Args:
+        img: a (c, h, w) tensor when ``is_tensor`` is true, otherwise an array
+            that supports ``clip``/``astype`` and is already laid out as
+            ``Image.fromarray`` expects.
+        path: destination file; the format follows the extension.
+        is_tensor: whether ``img`` must first be moved to CPU and permuted to
+            channels-last.
+    """
+    pixels = img.permute(1, 2, 0).cpu().numpy() if is_tensor else img
+    # [-1, 1] -> [0, 255]; out-of-range values are clipped
+    pixels = (pixels * 127.5 + 127.5).clip(0, 255).astype(np.uint8)
+    Image.fromarray(pixels).save(path)
+
+def get_filelist(data_dir, ext='*'):
+    """Return the sorted paths in ``data_dir`` that match ``*.<ext>``.
+
+    Args:
+        data_dir: directory to search (not recursive).
+        ext: file extension without the dot; ``'*'`` matches any extension.
+    """
+    pattern = os.path.join(data_dir, '*.{}'.format(ext))
+    return sorted(glob.glob(pattern))
+
+def load_model_checkpoint(model, ckpt):
+    """Load the weights stored in ``ckpt`` into ``model`` (non-strict).
+
+    Two layouts are recognised: a dict with a ``'state_dict'`` entry, or a
+    deepspeed-style dict whose weights live under ``'module'`` with a
+    16-character key prefix that is stripped.
+
+    Args:
+        model: module to load into.
+        ckpt: checkpoint path.
+
+    Returns:
+        ``model`` with the weights loaded.
+    """
+    checkpoint = torch.load(ckpt, map_location="cpu")
+    if "state_dict" in checkpoint:
+        state_dict = checkpoint["state_dict"]
+    else:
+        # deepspeed: build the stripped dict from the loaded checkpoint. The
+        # source must stay bound to its own name; rebinding it to the empty
+        # target dict made the 'module' lookup fail with KeyError.
+        state_dict = OrderedDict(
+            (key[16:], value) for key, value in checkpoint['module'].items()
+        )
+
+    model.load_state_dict(state_dict, strict=False)
+    print('>>> model checkpoint loaded.')
+    return model
+
+def load_data_from_json(data_dir, filename=None, DISABLE_MULTI_REF=False):
+    """Load prompts and their style reference images from a JSON file.
+
+    The JSON content is indexed as a sequence of entries with the keys
+    ``'prompt'`` and ``'style_path'``; style paths are resolved relative to
+    ``data_dir``.
+
+    Args:
+        data_dir: directory holding the JSON file and the style images.
+        filename: JSON file name inside ``data_dir``. When ``None`` the first
+            ``*.json`` file (sorted by name) in ``data_dir`` is used.
+        DISABLE_MULTI_REF: when an entry lists several style images, pick one
+            at random (``np.random``) instead of stacking all of them.
+
+    Returns:
+        A list of ``{'prompt': ..., 'style': tensor}`` dicts. ``style`` is the
+        transformed image, or a stack of them along dim 0 for multi-reference
+        entries.
+
+    Raises:
+        ValueError: if an entry's ``style_path`` is ``None``.
+    """
+    # locate the json file
+    if filename is None:
+        candidates = get_filelist(data_dir, 'json')
+        assert len(candidates) > 0, "Error: found NO prompt file!"
+        default_idx = min(0, len(candidates)-1)
+        if len(candidates) > 1:
+            print(f"Warning: multiple prompt files exist. The one {os.path.split(candidates[default_idx])[1]} is used.")
+        ## only use the first one (sorted by name) if multiple exist
+        json_path = candidates[default_idx]
+    else:
+        json_path = os.path.join(data_dir, filename)
+    with open(json_path, 'r') as f:
+        data = json.load(f)
+
+    # resize + center-crop to 512x512, then scale pixel values to [-1, 1]
+    style_transforms = torchvision.transforms.Compose([
+        torchvision.transforms.Resize(512),
+        torchvision.transforms.CenterCrop(512),
+        torchvision.transforms.ToTensor(),
+        torchvision.transforms.Lambda(lambda x: x * 2. - 1.),
+    ])
+
+    def _load_style(rel_path):
+        # open one style image relative to data_dir and apply the transforms
+        image = Image.open(os.path.join(data_dir, rel_path)).convert('RGB')
+        return style_transforms(image)
+
+    data_list = []
+    for idx in range(len(data)):
+        entry = data[idx]
+        prompt = entry['prompt']
+
+        # load style image
+        style_path = entry['style_path']
+        if style_path is None:
+            raise ValueError("Error: style image path is None!")
+
+        if not isinstance(style_path, list):
+            style_img_tensor = _load_style(style_path)
+        elif DISABLE_MULTI_REF:
+            rand_idx = np.random.randint(0, len(style_path))
+            style_img_tensor = _load_style(style_path[rand_idx])
+            print(f"Warning: multiple style images exist. The one {style_path[rand_idx]} is used.")
+        else:
+            style_img_tensor = torch.stack([_load_style(p) for p in style_path], dim=0)
+
+        data_list.append({
+            'prompt': prompt,
+            'style': style_img_tensor
+        })
+
+    return data_list
+
+def save_results(prompt, samples, filename, sample_dir, prompt_dir, fps=10, out_type='video'):
+    """Write a prompt and its generated samples to disk.
+
+    Args:
+        prompt: prompt text, or a list whose first element is used.
+        samples: generated samples with values expected in [-1, 1];
+            laid out as n,c,h,w for images and b,c,t,h,w for videos.
+        filename: base name (no extension) for both output files.
+        sample_dir: directory receiving ``<filename>.jpg`` / ``<filename>.mp4``.
+        prompt_dir: directory receiving ``<filename>.txt``.
+        fps: frame rate, used for video output only.
+        out_type: ``'image'`` or ``'video'``.
+
+    Raises:
+        ValueError: if ``out_type`` is neither ``'image'`` nor ``'video'``
+            (the prompt file has already been written at that point).
+    """
+    ## save prompt
+    prompt = prompt[0] if isinstance(prompt, list) else prompt
+    path = os.path.join(prompt_dir, "%s.txt"%filename)
+    with open(path, 'w') as f:
+        f.write(f'{prompt}')
+
+    if out_type == 'image':
+        ## save image grid
+        n = samples.shape[0]
+        # Map [-1, 1] to [0, 1] explicitly rather than through make_grid's
+        # normalize/range arguments: the `range` keyword was renamed to
+        # `value_range` in torchvision and is rejected by recent releases.
+        # detach() is needed because make_grid hands a single image straight back.
+        normalized = (samples.detach().clamp(-1., 1.) + 1.) / 2.
+        output = make_grid(normalized, nrow=n)
+        output_img = Image.fromarray(output.mul(255).clamp(0, 255).byte().permute(1, 2, 0).cpu().numpy())
+        output_img.save(os.path.join(sample_dir, "%s.jpg"%filename))
+    elif out_type == 'video':
+        ## save video
+        # b,c,t,h,w
+        video = samples.detach().cpu()
+        video = torch.clamp(video.float(), -1., 1.)
+        n = video.shape[0]
+        video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+        # one grid per time step, batch items side by side
+        frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n)) for framesheet in video]
+        grid = torch.stack(frame_grids, dim=0) # stack in temporal dim
+        grid = (grid + 1.0) / 2.0
+        grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+        path = os.path.join(sample_dir, "%s.mp4"%filename)
+        torchvision.io.write_video(path, grid, fps=fps, video_codec='h264', options={'crf': '10'})
+    else:
+        raise ValueError("Error: output type should be image or video!")
+
+def style_guided_synthesis(model, prompts, style, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1., \
+                        unconditional_guidance_scale=1.0, unconditional_guidance_scale_style=None, **kwargs):
+    """Sample outputs conditioned on text prompts and style reference images.
+
+    Args:
+        model: diffusion model providing ``get_learned_conditioning``,
+            ``get_batch_style``, ``adapter`` and ``decode_first_stage``.
+        prompts: a prompt string or a list of prompts.
+        style: style images, 4-D (one reference per item) or 5-D
+            (b, n, c, h, w: n references per item).
+        noise_shape: latent shape; index 0 is the batch size and index 2 the
+            temporal length.
+        n_samples: number of independent samples per batch item.
+        ddim_steps: number of DDIM steps.
+        ddim_eta: DDIM eta.
+        unconditional_guidance_scale: text guidance scale; ``1.0`` disables
+            the empty-prompt unconditional branch.
+        unconditional_guidance_scale_style: style guidance scale; when ``None``
+            the plain ``DDIMSampler`` is used, else ``DDIMStyleSampler``.
+        **kwargs: forwarded to the sampler's ``sample``.
+
+    Returns:
+        Decoded samples with batch first and variants second.
+    """
+    if unconditional_guidance_scale_style is None:
+        sampler = DDIMSampler(model)
+    else:
+        sampler = DDIMStyleSampler(model)
+
+    batch_size = noise_shape[0]
+    ## get condition embeddings (support single prompt only)
+    if isinstance(prompts, str):
+        prompts = [prompts]
+    cond = model.get_learned_conditioning(prompts)
+
+    uc = None
+    if unconditional_guidance_scale != 1.0:
+        uc = model.get_learned_conditioning([""] * batch_size)
+
+    ## style embeddings -> extra context tokens
+    if len(style.shape) != 4:
+        # several references per item: embed each, then concatenate their tokens
+        bs, _, _, _, _ = style.shape
+        style = rearrange(style, "b n c h w -> (b n) c h w")
+        style_cond = model.get_batch_style(style)
+        style_cond = rearrange(style_cond, "(b n) l c -> b (n l ) c", b=bs)
+    else:
+        style_cond = model.get_batch_style(style)
+    append_to_context = model.adapter(style_cond)
+
+    scale_scalar = None
+    if hasattr(model.adapter, "scale_predictor"):
+        scale_scalar = model.adapter.scale_predictor(torch.concat([append_to_context, cond], dim=1))
+
+    decoded_variants = []
+    for _ in range(n_samples):
+        latents, _ = sampler.sample(S=ddim_steps,
+                                    conditioning=cond,
+                                    batch_size=noise_shape[0],
+                                    shape=noise_shape[1:],
+                                    verbose=False,
+                                    unconditional_guidance_scale=unconditional_guidance_scale,
+                                    unconditional_guidance_scale_style=unconditional_guidance_scale_style,
+                                    unconditional_conditioning=uc,
+                                    eta=ddim_eta,
+                                    temporal_length=noise_shape[2],
+                                    append_to_context=append_to_context,
+                                    scale_scalar=scale_scalar,
+                                    **kwargs
+                                    )
+        ## reconstruct from latent to pixel space
+        decoded_variants.append(model.decode_first_stage(latents))
+    ## variants, batch, c, t, h, w -> batch, variants, c, t, h, w
+    stacked = torch.stack(decoded_variants)
+    return stacked.permute(1, 0, 2, 3, 4, 5)
+
+
+def run_inference(args, gpu_num, gpu_no):
+    """Run style-guided generation for this process's share of the prompt set.
+
+    Builds the model from ``args.base``, loads the base, adapter and (for
+    video output) optional temporal checkpoints, then generates and saves
+    samples, prompts and style images under ``args.savedir``.
+
+    Args:
+        args: parsed options, see ``get_parser``.
+        gpu_num: number of processes the data is split across.
+        gpu_no: index of this process; also used as the CUDA device index for
+            the model and as the offset into the data list.
+    """
+    ## model config
+    config = OmegaConf.load(args.base)
+    model_config = config.pop("model", OmegaConf.create())
+    # the style weight overrides the adapter scale from the config file
+    model_config['params']['adapter_config']['params']['scale'] = args.style_weight
+    print(f"Set adapter scale to {args.style_weight:.2f}")
+    model = instantiate_from_config(model_config)
+    model = model.cuda(gpu_no)
+    assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
+
+    model = load_model_checkpoint(model, args.ckpt_path)
+    model.load_pretrained_adapter(args.adapter_ckpt)
+    # temporal weights are only loaded for video output
+    if args.out_type == 'video' and args.temporal_ckpt is not None:
+        model.load_pretrained_temporal(args.temporal_ckpt)
+    model.eval()
+
+    ## run over data
+    assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
+    ## latent noise shape
+    # latent resolution is 1/8 of the pixel resolution
+    h, w = args.height // 8, args.width // 8
+    channels = model.channels
+    # image output is generated as a single-frame clip
+    frames = model.temporal_length if args.out_type == 'video' else 1
+    noise_shape = [args.bs, channels, frames, h, w]
+
+    sample_dir = os.path.join(args.savedir, "samples")
+    prompt_dir = os.path.join(args.savedir, "prompts")
+    style_dir = os.path.join(args.savedir, "style")
+    os.makedirs(sample_dir, exist_ok=True)
+    os.makedirs(prompt_dir, exist_ok=True)
+    os.makedirs(style_dir, exist_ok=True)
+    
+    ## prompt file setting
+    assert os.path.exists(args.prompt_dir), "Error: prompt file Not Found!"
+    data_list = load_data_from_json(args.prompt_dir, args.filename, args.disable_multi_ref)
+    num_samples = len(data_list)
+    # NOTE(review): floor division -- the trailing num_samples % gpu_num entries
+    # are not assigned to any rank
+    samples_split = num_samples // gpu_num
+    print('Prompts testing [rank:%d] %d/%d samples loaded.'%(gpu_no, samples_split, num_samples))
+    #indices = random.choices(list(range(0, num_samples)), k=samples_per_device)
+    # contiguous slice of the data list owned by this rank
+    indices = list(range(samples_split*gpu_no, samples_split*(gpu_no+1)))
+    data_list_rank = [data_list[i] for i in indices]
+
+    start = time.time() 
+    for idx, indice in tqdm(enumerate(range(0, len(data_list_rank), args.bs)), desc='Sample Batch'):
+        # NOTE(review): the last batch may hold fewer than args.bs items while
+        # noise_shape keeps args.bs -- confirm the sampler tolerates this
+        prompts = [batch_data['prompt'] for batch_data in data_list_rank[indice:indice+args.bs]]
+        styles = [batch_data['style'] for batch_data in data_list_rank[indice:indice+args.bs]]
+
+        # styles is always a list here (built just above), so the else branch is not reached
+        # NOTE(review): "cuda" is the current default device, while the model
+        # was moved to cuda(gpu_no) -- confirm both agree in multi-GPU runs
+        if isinstance(styles, list):
+            styles = torch.stack(styles, dim=0).to("cuda")
+        else:
+            styles = styles.unsqueeze(0).to("cuda")
+        
+
+        # if os.path.exists(os.path.join(args.savedir, 'style/{:04d}_style_randk{:d}.png'.format(idx + 1, gpu_no))):
+        #     continue
+        with torch.cuda.amp.autocast(dtype=torch.float32):
+            batch_samples = style_guided_synthesis(model, prompts, styles, noise_shape, args.n_samples, args.ddim_steps, args.ddim_eta, \
+                                                args.unconditional_guidance_scale, args.unconditional_guidance_scale_style)
+            if args.out_type == 'image':
+                # drop the temporal axis: keep the single generated frame
+                batch_samples = batch_samples[:, :, :, 0, :, :]
+        
+        # save the style references next to the results; the file index is the
+        # position of the item in the full data list
+        if len(styles.shape) == 4:
+            for nn in range(styles.shape[0]):
+                filename = "%04d"%(idx*args.bs+nn + gpu_no * samples_split)
+                save_img(styles[nn], os.path.join(style_dir, f'{filename}.png'))
+        else:
+            # several references per item: one file per reference
+            for nn in range(styles.shape[0]):
+                filename = "%04d"%(idx*args.bs+nn + gpu_no * samples_split)
+                for i in range(styles.shape[1]):
+                    save_img(styles[nn, i], os.path.join(style_dir, f'{filename}_{i:02d}.png'))
+        
+        ## save each example individually
+        for nn, samples in enumerate(batch_samples):
+            ## samples : [n_samples,c,t,h,w]
+            prompt = prompts[nn]
+            filename = "%04d"%(idx*args.bs+nn + gpu_no * samples_split)
+            for i in range(args.n_samples):
+                save_results(prompt, samples[i:i+1], f"{filename}_{i}", sample_dir, prompt_dir, fps=10, out_type=args.out_type)
+
+    print(f"Saved in {args.savedir}. Time used: {(time.time() - start):.2f} seconds")
+
+
+def get_parser():
+    """Build the command-line parser for style-guided inference.
+
+    Returns:
+        An ``argparse.ArgumentParser`` with every option read by
+        ``run_inference``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--savedir", type=str, default=None, help="results saving path")
+    parser.add_argument("--ckpt_path", type=str, default=None, help="checkpoint path")
+    parser.add_argument("--adapter_ckpt", type=str, default=None, help="adapter checkpoint path")
+    parser.add_argument("--temporal_ckpt", type=str, default=None, help="temporal checkpoint path")
+    parser.add_argument("--base", type=str, help="config (yaml) path")
+    parser.add_argument("--cond_type", default='style', type=str, help="conditon type: {style, depth, style_depth}")
+    parser.add_argument("--out_type", default='video', type=str, help="output type: {image, video}")
+    parser.add_argument("--prompt_dir", type=str, default=None, help="a data dir containing videos and prompts")
+    # The help text used to be a copy of --prompt_dir's.
+    parser.add_argument("--filename", type=str, default=None, help="name of the json prompt file inside prompt_dir (default: the first *.json found)")
+    parser.add_argument("--n_samples", type=int, default=1, help="num of samples per prompt",)
+    parser.add_argument("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM",)
+    parser.add_argument("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)",)
+    parser.add_argument("--bs", type=int, default=1, help="batch size for inference")
+    parser.add_argument("--height", type=int, default=512, help="image height, in pixel space")
+    parser.add_argument("--width", type=int, default=512, help="image width, in pixel space")
+    parser.add_argument("--unconditional_guidance_scale", type=float, default=1.0, help="prompt classifier-free guidance")
+    # The help text used to be a copy of the prompt guidance option's.
+    parser.add_argument("--unconditional_guidance_scale_style", type=float, default=None, help="style classifier-free guidance (if unset, the plain DDIM sampler is used)")
+    parser.add_argument("--seed", type=int, default=0, help="seed for seed_everything")
+    parser.add_argument("--style_weight", type=float, default=1.0, help="scale of the style adapter")
+    parser.add_argument("--disable_multi_ref", action='store_true', help="disable multiple style images")
+    return parser
+
+
+if __name__ == '__main__':
+    # Single-process entry point: one worker (index 0) handles the whole data list.
+    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
+    print("@CoLVDM cond-Inference: %s"%timestamp)
+    args = get_parser().parse_args()
+
+    seed_everything(args.seed)
+    run_inference(args, gpu_num=1, gpu_no=0)
\ No newline at end of file
diff --git a/scripts/run_infer_image.sh b/scripts/run_infer_image.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f6622b873c8d3589011332eac1695d3a24b4caf7
--- /dev/null
+++ b/scripts/run_infer_image.sh
@@ -0,0 +1,50 @@
+# Style-guided image generation.
+# All paths below are relative to the directory the script is started from.
+name="style_image_generation"
+config="configs/inference_image_512_512.yaml"
+ckpt="checkpoints/videocrafter_t2v_320_512/model.ckpt"
+adapter_ckpt="checkpoints/stylecrafter/adapter_v1.pth"
+prompt_dir="eval_data"
+filename="eval_image_gen.json"
+res_dir="output"
+seed=123
+n_samples=1
+
+
+use_ddp=0
+# set use_ddp=1 if you want to use multi GPU
+# (HOST_GPU_NUM, HOST_NUM, CHIEF_IP and INDEX must then be set in the environment)
+# export CUDA_VISIBLE_DEVICES=2
+# '=' instead of '==' keeps the test valid under a plain POSIX sh as well as bash;
+# expansions are quoted so that paths containing spaces survive.
+if [ "$use_ddp" = 0 ]; then
+python3 scripts/evaluation/style_inference.py \
+--out_type 'image' \
+--adapter_ckpt "$adapter_ckpt" \
+--seed "$seed" \
+--ckpt_path "$ckpt" \
+--base "$config" \
+--savedir "$res_dir/$name" \
+--n_samples "$n_samples" \
+--bs 1 --height 512 --width 512 \
+--unconditional_guidance_scale 6.0 \
+--ddim_steps 50 \
+--ddim_eta 1.0 \
+--prompt_dir "$prompt_dir" \
+--filename "$filename"
+fi
+
+if [ "$use_ddp" = 1 ]; then
+python3 -m torch.distributed.launch \
+--nproc_per_node="$HOST_GPU_NUM" --nnodes="$HOST_NUM" --master_addr="$CHIEF_IP" --master_port=23466 --node_rank="$INDEX" \
+scripts/evaluation/ddp_wrapper.py \
+--module 'style_inference' \
+--out_type 'image' \
+--adapter_ckpt "$adapter_ckpt" \
+--seed "$seed" \
+--ckpt_path "$ckpt" \
+--base "$config" \
+--savedir "$res_dir/$name" \
+--n_samples "$n_samples" \
+--bs 1 --height 512 --width 512 \
+--unconditional_guidance_scale 6.0 \
+--ddim_steps 50 \
+--ddim_eta 1.0 \
+--prompt_dir "$prompt_dir" \
+--filename "$filename"
+fi
\ No newline at end of file
diff --git a/scripts/run_infer_video.sh b/scripts/run_infer_video.sh
new file mode 100644
index 0000000000000000000000000000000000000000..dd66f71ab6ecbd488d709cea596876bea33b7f76
--- /dev/null
+++ b/scripts/run_infer_video.sh
@@ -0,0 +1,55 @@
+# Style-guided video generation.
+# All paths below are relative to the directory the script is started from.
+name="style_video_generation"
+config="configs/inference_video_320_512.yaml"
+ckpt="checkpoints/videocrafter_t2v_320_512/model.ckpt"
+adapter_ckpt="checkpoints/stylecrafter/adapter_v1.pth"
+temporal_ckpt="checkpoints/stylecrafter/temporal_v1.pth"
+prompt_dir="eval_data"
+filename="eval_video_gen.json"
+res_dir="output"
+seed=123
+n_samples=1
+
+
+use_ddp=0
+# set use_ddp=1 if you want to use multi GPU
+# (HOST_GPU_NUM, HOST_NUM, CHIEF_IP and INDEX must then be set in the environment)
+# export CUDA_VISIBLE_DEVICES=0,1
+# '=' instead of '==' keeps the test valid under a plain POSIX sh as well as bash;
+# expansions are quoted so that paths containing spaces survive.
+if [ "$use_ddp" = 0 ]; then
+python3 scripts/evaluation/style_inference.py \
+--out_type 'video' \
+--adapter_ckpt "$adapter_ckpt" \
+--temporal_ckpt "$temporal_ckpt" \
+--seed "$seed" \
+--ckpt_path "$ckpt" \
+--base "$config" \
+--savedir "$res_dir/$name" \
+--n_samples "$n_samples" \
+--bs 1 --height 320 --width 512 \
+--unconditional_guidance_scale 15.0 \
+--unconditional_guidance_scale_style 7.5 \
+--ddim_steps 50 \
+--ddim_eta 1.0 \
+--prompt_dir "$prompt_dir" \
+--filename "$filename"
+fi
+
+if [ "$use_ddp" = 1 ]; then
+python3 -m torch.distributed.launch \
+--nproc_per_node="$HOST_GPU_NUM" --nnodes="$HOST_NUM" --master_addr="$CHIEF_IP" --master_port=23466 --node_rank="$INDEX" \
+scripts/evaluation/ddp_wrapper.py \
+--module 'style_inference' \
+--out_type 'video' \
+--adapter_ckpt "$adapter_ckpt" \
+--temporal_ckpt "$temporal_ckpt" \
+--seed "$seed" \
+--ckpt_path "$ckpt" \
+--base "$config" \
+--savedir "$res_dir/$name" \
+--n_samples "$n_samples" \
+--bs 1 --height 320 --width 512 \
+--unconditional_guidance_scale 15.0 \
+--unconditional_guidance_scale_style 7.5 \
+--ddim_steps 50 \
+--ddim_eta 1.0 \
+--prompt_dir "$prompt_dir" \
+--filename "$filename"
+fi
\ No newline at end of file
diff --git a/utils/__pycache__/save_video.cpython-39.pyc b/utils/__pycache__/save_video.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..136b9dc392038b5225115dcf455c5fd1cbca9061
Binary files /dev/null and b/utils/__pycache__/save_video.cpython-39.pyc differ
diff --git a/utils/__pycache__/utils.cpython-39.pyc b/utils/__pycache__/utils.cpython-39.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d13533bfa5b167c31ab684062a92ec8368ae046
Binary files /dev/null and b/utils/__pycache__/utils.cpython-39.pyc differ
diff --git a/utils/save_video.py b/utils/save_video.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e3d86f405858ff77f3ebd35f23a9ba41fd6d658
--- /dev/null
+++ b/utils/save_video.py
@@ -0,0 +1,251 @@
+import os
+import numpy as np
+from tqdm import tqdm
+from PIL import Image
+from einops import rearrange
+
+import torch
+import torchvision
+from torch import Tensor
+from torchvision.utils import make_grid
+from torchvision.transforms.functional import to_tensor
+
+def save_video_tensor_to_mp4(video, path, fps):
+    # b,c,t,h,w
+    video = video.detach().cpu()
+    video = torch.clamp(video.float(), -1., 1.)
+    n = video.shape[0]
+    video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+    frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n)) for framesheet in video] #[3, 1*h, n*w]
+    grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w]
+    grid = (grid + 1.0) / 2.0
+    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+    torchvision.io.write_video(path, grid, fps=fps, video_codec='h264', options={'crf': '10'})
+
+def save_video_tensor_to_frames(video, dir):
+    os.makedirs(dir, exist_ok=True)
+    # b,c,t,h,w
+    video = video.detach().cpu()
+    video = torch.clamp(video.float(), -1., 1.)
+    n = video.shape[0]
+    assert(n == 1)
+    video = video[0] # cthw
+    video = video.permute(1,2,3,0) # thwc
+    # video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+    video = (video + 1.0) / 2.0 * 255
+    video = video.to(torch.uint8).numpy()
+    for i in range(video.shape[0]):
+        img = video[i] #hwc
+        image = Image.fromarray(img)
+        image.save(os.path.join(dir, f'frame{i:03d}.jpg'), q=95)
+
+def frames_to_mp4(frame_dir,output_path,fps):
+    def read_first_n_frames(d: os.PathLike, num_frames: int):
+        if num_frames:
+            images = [Image.open(os.path.join(d, f)) for f in sorted(os.listdir(d))[:num_frames]]
+        else:
+            images = [Image.open(os.path.join(d, f)) for f in sorted(os.listdir(d))]
+        images = [to_tensor(x) for x in images]
+        return torch.stack(images)
+    videos = read_first_n_frames(frame_dir, num_frames=None)
+    videos = videos.mul(255).to(torch.uint8).permute(0, 2, 3, 1)
+    torchvision.io.write_video(output_path, videos, fps=fps, video_codec='h264', options={'crf': '10'})
+
+
+def tensor_to_mp4(video, savepath, fps, rescale=True, nrow=None):
+    """
+    video: torch.Tensor, b,c,t,h,w, 0-1
+    if -1~1, enable rescale=True
+    """
+    n = video.shape[0]
+    video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+    nrow = int(np.sqrt(n)) if nrow is None else nrow
+    frame_grids = [torchvision.utils.make_grid(framesheet, nrow=nrow) for framesheet in video] # [3, grid_h, grid_w]
+    grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [T, 3, grid_h, grid_w]
+    grid = torch.clamp(grid.float(), -1., 1.)
+    if rescale:
+        grid = (grid + 1.0) / 2.0
+    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) # [T, 3, grid_h, grid_w] -> [T, grid_h, grid_w, 3]
+    #print(f'Save video to {savepath}')
+    torchvision.io.write_video(savepath, grid, fps=fps, video_codec='h264', options={'crf': '10'})
+
+    
+def tensor2videogrids(video, root, filename, fps, rescale=True, clamp=True):
+    
+    assert(video.dim() == 5) # b,c,t,h,w
+    assert(isinstance(video, torch.Tensor))
+
+    video = video.detach().cpu()
+    if clamp:
+        video = torch.clamp(video, -1., 1.)
+    n = video.shape[0]
+    video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+    frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(np.sqrt(n))) for framesheet in video] # [3, grid_h, grid_w]
+    grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [T, 3, grid_h, grid_w]
+    if rescale:
+        grid = (grid + 1.0) / 2.0
+    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1) # [T, 3, grid_h, grid_w] -> [T, grid_h, grid_w, 3]
+    path = os.path.join(root, filename)
+    # print('Save video ...')
+    torchvision.io.write_video(path, grid, fps=fps, video_codec='h264', options={'crf': '10'})
+    # print('Finish!')
+
+
+def log_txt_as_img(wh, xc, size=10):
+    # wh a tuple of (width, height)
+    # xc a list of captions to plot
+    b = len(xc)
+    txts = list()
+    for bi in range(b):
+        txt = Image.new("RGB", wh, color="white")
+        draw = ImageDraw.Draw(txt)
+        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        nc = int(40 * (wh[0] / 256))
+        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+
+        try:
+            draw.text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+
+        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+        txts.append(txt)
+    txts = np.stack(txts)
+    txts = torch.tensor(txts)
+    return txts
+
+
+def log_local(batch_logs, save_dir, filename, save_fps=10, rescale=True):
+    if batch_logs is None:
+        return None
+    """ save images and videos from images dict """
+    def save_img_grid(grid, path, rescale):
+        if rescale:
+                grid = (grid + 1.0) / 2.0  # -1,1 -> 0,1; c,h,w
+        grid = grid.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        grid = grid.numpy()
+        grid = (grid * 255).astype(np.uint8)
+        os.makedirs(os.path.split(path)[0], exist_ok=True)
+        Image.fromarray(grid).save(path)
+    for key in batch_logs:
+        value = batch_logs[key]
+        if isinstance(value, list) and isinstance(value[0], str):
+            ## a batch of captions
+            path = os.path.join(save_dir, "%s-%s.txt"%(key, filename))
+            with open(path, 'w') as f:
+                for i, txt in enumerate(value):
+                    f.write(f'idx={i}, txt={txt}\n')
+                f.close()
+        elif isinstance(value, torch.Tensor) and value.dim() == 5:
+            ## save video grids
+            video = value # b,c,t,h,w
+            ## only save grayscale or rgb mode
+            if video.shape[1] != 1 and video.shape[1] != 3:
+                continue
+            n = video.shape[0]
+            video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
+            frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(1)) for framesheet in video] #[3, n*h, 1*w]
+            grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w]
+            if rescale:
+                grid = (grid + 1.0) / 2.0
+            grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
+            path = os.path.join(save_dir, "%s-%s.mp4"%(key, filename))
+            torchvision.io.write_video(path, grid, fps=save_fps, video_codec='h264', options={'crf': '10'})
+            
+            ## save frame sheet
+            img = value
+            video_frames = rearrange(img, 'b c t h w -> (b t) c h w')
+            t = img.shape[2]
+            grid = torchvision.utils.make_grid(video_frames, nrow=t)
+            path = os.path.join(save_dir, "%s-%s.jpg"%(key, filename))
+            #save_img_grid(grid, path, rescale)
+        elif isinstance(value, torch.Tensor) and value.dim() == 4:
+            ## save image grids
+            img = value
+            ## only save grayscale or rgb mode
+            if img.shape[1] != 1 and img.shape[1] != 3:
+                continue
+            n = img.shape[0]
+            grid = torchvision.utils.make_grid(img, nrow=1)
+            path = os.path.join(save_dir, "%s-%s.jpg"%(key, filename))
+            save_img_grid(grid, path, rescale)
+        else:
+            pass
+
+def prepare_to_log(batch_logs, max_images=100000, clamp=True):
+    if batch_logs is None:
+        return None
+    # process
+    for key in batch_logs:
+        if batch_logs[key] is not None:
+            N = batch_logs[key].shape[0] if hasattr(batch_logs[key], 'shape') else len(batch_logs[key])
+            N = min(N, max_images)
+            batch_logs[key] = batch_logs[key][:N]
+            ## in batch_logs: images <batched tensor> & caption <text list>
+            if isinstance(batch_logs[key], torch.Tensor):
+                batch_logs[key] = batch_logs[key].detach().cpu()
+                if clamp:
+                    try:
+                        batch_logs[key] = torch.clamp(batch_logs[key].float(), -1., 1.)
+                    except RuntimeError:
+                        print("clamp_scalar_cpu not implemented for Half")
+    return batch_logs
+
+# ----------------------------------------------------------------------------------------------
+
+def fill_with_black_squares(video, desired_len: int) -> Tensor:
+    if len(video) >= desired_len:
+        return video
+
+    return torch.cat([
+        video,
+        torch.zeros_like(video[0]).unsqueeze(0).repeat(desired_len - len(video), 1, 1, 1),
+    ], dim=0)
+
+# ----------------------------------------------------------------------------------------------
+def load_num_videos(data_path, num_videos):
+    # first argument can be either data_path of np array 
+    if isinstance(data_path, str):
+        videos = np.load(data_path)['arr_0'] # NTHWC
+    elif isinstance(data_path, np.ndarray):
+        videos = data_path
+    else:
+        raise Exception
+
+    if num_videos is not None:
+        videos = videos[:num_videos, :, :, :, :]
+    return videos
+
+def npz_to_video_grid(data_path, out_path, num_frames, fps, num_videos=None, nrow=None, verbose=True):
+    # videos = torch.tensor(np.load(data_path)['arr_0']).permute(0,1,4,2,3).div_(255).mul_(2) - 1.0 # NTHWC->NTCHW, np int -> torch tensor 0-1
+    if isinstance(data_path, str):
+        videos = load_num_videos(data_path, num_videos)
+    elif isinstance(data_path, np.ndarray):
+        videos = data_path
+    else:
+        raise Exception
+    n,t,h,w,c = videos.shape
+    videos_th = []
+    for i in range(n):
+        video = videos[i, :,:,:,:]
+        images = [video[j, :,:,:] for j in range(t)]
+        images = [to_tensor(img) for img in images]
+        video = torch.stack(images)
+        videos_th.append(video)
+    if verbose:
+        videos = [fill_with_black_squares(v, num_frames) for v in tqdm(videos_th, desc='Adding empty frames')] # NTCHW
+    else:
+        videos = [fill_with_black_squares(v, num_frames) for v in videos_th] # NTCHW
+
+    frame_grids = torch.stack(videos).permute(1, 0, 2, 3, 4) # [T, N, C, H, W]
+    if nrow is None:
+        nrow = int(np.ceil(np.sqrt(n)))
+    if verbose:
+        frame_grids = [make_grid(fs, nrow=nrow) for fs in tqdm(frame_grids, desc='Making grids')]
+    else:
+        frame_grids = [make_grid(fs, nrow=nrow) for fs in frame_grids]
+
+    if os.path.dirname(out_path) != "":
+        os.makedirs(os.path.dirname(out_path), exist_ok=True)
+    frame_grids = (torch.stack(frame_grids) * 255).to(torch.uint8).permute(0, 2, 3, 1) # [T, H, W, C]
+    torchvision.io.write_video(out_path, frame_grids, fps=fps, video_codec='h264', options={'crf': '10'})
diff --git a/utils/utils.py b/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..c73b93e006c4250161b427e4d1fff512ca046f7c
--- /dev/null
+++ b/utils/utils.py
@@ -0,0 +1,77 @@
+import importlib
+import numpy as np
+import cv2
+import torch
+import torch.distributed as dist
+
+
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+    return total_params
+
+
+def check_istarget(name, para_list):
+    """ 
+    name: full name of source para
+    para_list: partial name of target para 
+    """
+    istarget=False
+    for para in para_list:
+        if para in name:
+            return True
+    return istarget
+
+
+def instantiate_from_config(config):
+    if not "target" in config:
+        if config == '__is_first_stage__':
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+
+
+def get_obj_from_str(string, reload=False):
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+
+
+def load_npz_from_dir(data_dir):
+    data = [np.load(os.path.join(data_dir, data_name))['arr_0'] for data_name in os.listdir(data_dir)]
+    data = np.concatenate(data, axis=0)
+    return data
+
+
+def load_npz_from_paths(data_paths):
+    data = [np.load(data_path)['arr_0'] for data_path in data_paths]
+    data = np.concatenate(data, axis=0)
+    return data   
+
+
+def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
+    h, w = image.shape[:2]
+    if resize_short_edge is not None:
+        k = resize_short_edge / min(h, w)
+    else:
+        k = max_resolution / (h * w)
+        k = k**0.5
+    h = int(np.round(h * k / 64)) * 64
+    w = int(np.round(w * k / 64)) * 64
+    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
+    return image
+
+
+def setup_dist(args):
+    if dist.is_initialized():
+        return
+    torch.cuda.set_device(args.local_rank)
+    torch.distributed.init_process_group(
+        'nccl',
+        init_method='env://'
+    )
\ No newline at end of file