arlaz committed
Commit 9bb001a · 0 Parent(s)

initial commit
.gitattributes ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
+ .DS_Store
README.md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: OnlyFlow
+ emoji: 🐢
+ colorFrom: pink
+ colorTo: red
+ sdk: gradio
+ sdk_version: 5.16.0
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: 'Optical flow based motion conditioned video generation'
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
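For local testing outside the Space, here is a minimal launch sketch. It is only a sketch under assumptions: the repository's dependencies (gradio, torch, diffusers, transformers, and the `spaces` package) are installed and a GPU is available; the OnlyFlow checkpoint is downloaded from the Hub when `app` is imported.

```python
# Minimal local-launch sketch (assumes the Space's dependencies are installed).
# This mirrors the __main__ block of app.py shown below.
from app import ui

demo = ui()              # build the Gradio Blocks interface defined in app.py
demo.queue(max_size=20)  # same queue size as the Space uses
demo.launch()            # serve the demo locally
```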
app.py ADDED
@@ -0,0 +1,641 @@
1
+ import os
2
+
3
+ import imageio
4
+ import numpy as np
5
+ import torch
6
+ import random
7
+
8
+ import spaces
9
+
10
+ import gradio as gr
11
+
12
+ import torchvision
13
+ import torchvision.transforms as T
14
+ from einops import rearrange
15
+ from huggingface_hub import hf_hub_download
16
+ from torchvision.models.optical_flow import raft_large, Raft_Large_Weights
17
+ from torchvision.utils import flow_to_image
18
+
19
+ from diffusers import AutoencoderKL, MotionAdapter, UNet2DConditionModel
20
+ from diffusers import DDIMScheduler
21
+ from transformers import CLIPTextModel, CLIPTokenizer
22
+
23
+ from onlyflow.models.flow_adaptor import FlowEncoder, FlowAdaptor
24
+ from onlyflow.models.unet import UNetMotionModel
25
+ from onlyflow.pipelines.pipeline_animation_long import FlowCtrlPipeline
26
+ from tools.optical_flow import get_optical_flow
27
+
28
+
29
+ def save_videos_grid(videos: torch.Tensor, path: str, rescale=False, n_rows=6, fps=8):
30
+ videos = rearrange(videos, "b c t h w -> t b c h w")
31
+ outputs = []
32
+ for x in videos:
33
+ x = torchvision.utils.make_grid(x, nrow=n_rows)
34
+ x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
35
+ if rescale:
36
+ x = (x + 1.0) / 2.0 # -1,1 -> 0,1
37
+ x = (x * 255).numpy().astype(np.uint8)
38
+ outputs.append(x)
39
+
40
+ os.makedirs(os.path.dirname(path), exist_ok=True)
41
+ imageio.mimsave(path, outputs, fps=fps)
42
+
43
+ css = """
44
+ .toolbutton {
45
+ margin-bottom: 0em;
46
+ max-width: 2.5em;
47
+ min-width: 2.5em !important;
48
+ height: 2.5em;
49
+ }
50
+ """
51
+
52
+
53
+ class AnimateController:
54
+ def __init__(self):
55
+
56
+ # config dirs
57
+ self.basedir = os.getcwd()
58
+ self.stable_diffusion_dir = os.path.join(self.basedir, "models", "StableDiffusion")
59
+ self.motion_module_dir = os.path.join(self.basedir, "models", "Motion_Module")
60
+ self.personalized_model_dir = os.path.join(self.basedir, "models", "DreamBooth_LoRA")
61
+ self.savedir = os.path.join(self.basedir, "samples")
62
+ os.makedirs(self.savedir, exist_ok=True)
63
+
64
+
65
+ ckpt_path = hf_hub_download('obvious-research/onlyflow', 'weights_fp16.ckpt')
66
+ ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=True)
67
+ self.flow_encoder_state_dict = ckpt['flow_encoder_state_dict']
68
+ self.attention_processor_state_dict = ckpt['attention_processor_state_dict']
69
+
70
+ self.tokenizer = None
71
+ self.text_encoder = None
72
+ self.vae = None
73
+ self.unet = None
74
+ self.motion_adapter = None
75
+
76
+ def update_base_model(self, base_model_id, progress=gr.Progress()):
77
+
78
+ progress(0, desc="Starting...")
79
+
80
+ self.tokenizer = CLIPTokenizer.from_pretrained(base_model_id, subfolder="tokenizer")
81
+ self.text_encoder = CLIPTextModel.from_pretrained(base_model_id, subfolder="text_encoder")
82
+ self.vae = AutoencoderKL.from_pretrained(base_model_id, subfolder="vae")
83
+ self.unet = UNet2DConditionModel.from_pretrained(base_model_id, subfolder="unet")
84
+
85
+ return base_model_id
86
+
87
+ def update_motion_module(self, motion_module_id, progress=gr.Progress()):
88
+ self.motion_adapter = MotionAdapter.from_pretrained(motion_module_id)
89
+
90
+ def animate(
91
+ self,
92
+ id_base_model,
93
+ id_motion_module,
94
+ prompt_textbox_positive,
95
+ prompt_textbox_negative,
96
+ seed_textbox,
97
+ input_video,
98
+ height,
99
+ width,
100
+ flow_scale,
101
+ cfg,
102
+ diffusion_steps,
103
+ temporal_ds,
104
+ ctx_stride
105
+ ):
106
+ #if any([x is None for x in [self.tokenizer, self.text_encoder, self.vae, self.unet, self.motion_adapter]]) or isinstance(self.unet, str):
107
+ self.update_base_model(id_base_model)
108
+ self.update_motion_module(id_motion_module)
109
+
110
+ self.unet = UNetMotionModel.from_unet2d(
111
+ self.unet,
112
+ motion_adapter=self.motion_adapter
113
+ )
114
+
115
+ self.raft = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).eval()
116
+
117
+ self.flow_encoder = FlowEncoder(
118
+ downscale_factor=8,
119
+ channels=[320, 640, 1280, 1280],
120
+ nums_rb=2,
121
+ ksize=1,
122
+ sk=True,
123
+ use_conv=False,
124
+ compression_factor=1,
125
+ temporal_attention_nhead=8,
126
+ positional_embeddings="sinusoidal",
127
+ num_positional_embeddings=16,
128
+ checkpointing=False
129
+ ).eval()
130
+
131
+ self.vae.requires_grad_(False)
132
+ self.text_encoder.requires_grad_(False)
133
+ self.unet.requires_grad_(False)
134
+ self.raft.requires_grad_(False)
135
+ self.flow_encoder.requires_grad_(False)
136
+
137
+ self.unet.set_all_attn(
138
+ flow_channels=[320, 640, 1280, 1280],
139
+ add_spatial=False,
140
+ add_temporal=True,
141
+ encoder_only=False,
142
+ query_condition=True,
143
+ key_value_condition=True,
144
+ flow_scale=1.0,
145
+ )
146
+
147
+ self.flow_adaptor = FlowAdaptor(self.unet, self.flow_encoder).eval()
148
+
149
+ # load the flow encoder weights
150
+ pose_enc_m, pose_enc_u = self.flow_adaptor.flow_encoder.load_state_dict(
151
+ self.flow_encoder_state_dict,
152
+ strict=False
153
+ )
154
+ assert len(pose_enc_m) == 0 and len(pose_enc_u) == 0
155
+
156
+ # load the attention processor weights
157
+ _, attention_processor_u = self.flow_adaptor.unet.load_state_dict(
158
+ self.attention_processor_state_dict,
159
+ strict=False
160
+ )
161
+ assert len(attention_processor_u) == 0
162
+
163
+ pipeline = FlowCtrlPipeline(
164
+ vae=self.vae,
165
+ text_encoder=self.text_encoder,
166
+ tokenizer=self.tokenizer,
167
+ unet=self.unet,
168
+ motion_adapter=self.motion_adapter,
169
+ flow_encoder=self.flow_encoder,
170
+ scheduler=DDIMScheduler.from_pretrained(id_base_model, subfolder="scheduler"),
171
+ )
172
+
173
+ if int(seed_textbox) > 0:
174
+ seed = int(seed_textbox)
175
+ else:
176
+ seed = random.randint(1, int(1e16))
177
+
178
+ return animate_diffusion(seed, pipeline, self.raft, input_video, prompt_textbox_positive, prompt_textbox_negative, width, height, flow_scale, cfg, diffusion_steps, temporal_ds, ctx_stride)
179
+
180
+ @spaces.GPU(duration=150)
181
+ def animate_diffusion(seed, pipeline, raft_model, base_video, prompt_textbox, negative_prompt_textbox, width_slider, height_slider, flow_scale, cfg, diffusion_steps, temporal_ds, context_stride):
182
+ savedir = './samples'
183
+ device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
184
+ generator = torch.Generator(device="cpu")
185
+ generator.manual_seed(seed)
186
+
187
+ raft_model = raft_model.to(device)
188
+ pipeline = pipeline.to(device)
189
+
190
+ pixel_values = torchvision.io.read_video(base_video, output_format="TCHW", pts_unit='sec')[0][::temporal_ds]
191
+ print("Video loaded, shape:", pixel_values.shape)
192
+ if width_slider/height_slider > pixel_values.shape[3]/pixel_values.shape[2]:
193
+ print("Resizing video to fit width cause input video is not wide enough")
194
+ temp_height = int(width_slider * pixel_values.shape[2]/pixel_values.shape[3])
195
+ temp_width = width_slider
196
+ else:
197
+ print("Resizing video to fit height cause input video is not tall enough")
198
+ temp_height = height_slider
199
+ temp_width = int(height_slider * pixel_values.shape[3]/pixel_values.shape[2])
200
+ print("Resizing video to:", temp_height, temp_width)
201
+ pixel_values = T.Resize((temp_height, temp_width))(pixel_values)
202
+ pixel_values = T.CenterCrop((height_slider, width_slider))(pixel_values)
203
+ pixel_values = T.ConvertImageDtype(torch.float32)(pixel_values)[None, ...].contiguous().to(device)
204
+
205
+ save_sample_path_input = os.path.join(savedir, f"input.mp4")
206
+ pixel_values_save = pixel_values[0] * 255
207
+ pixel_values_save = pixel_values_save.cpu()
208
+ pixel_values_save = torch.permute(pixel_values_save, (0, 2, 3, 1))
209
+ torchvision.io.write_video(save_sample_path_input, pixel_values_save, fps=8)
210
+ del pixel_values_save
211
+
212
+ print("Video loaded, shape:", pixel_values.shape)
213
+ flow = get_optical_flow(
214
+ raft_model,
215
+ (pixel_values * 2) - 1,
216
+ pixel_values.shape[1] - 1,
217
+ encode_chunk_size=16,
218
+ ).to('cpu')
219
+
220
+ sample_flow = (flow_to_image(rearrange(flow[0], "c f h w -> f c h w"))) # N, 3, H, W
221
+ save_sample_path_flow = os.path.join(savedir, f"flow.mp4")
222
+ sample_flow = (sample_flow).cpu().to(torch.uint8).permute(0, 2, 3, 1)
223
+ torchvision.io.write_video(save_sample_path_flow, sample_flow, fps=8)
224
+ del sample_flow
225
+
226
+ original_flow_shape = flow.shape
227
+ print("Optical flow computed, shape:", flow.shape)
228
+ if flow.shape[2] < 16:
229
+ print("Video is too short, padding to 16 frames")
230
+ video_length = 16
231
+ n = 16 - flow.shape[2]
232
+ # create a tensor containing the last frame optical flow repeated n times
233
+ to_add = flow[:, :, -1].unsqueeze(2).expand(-1, -1, n, -1, -1)
234
+ flow = torch.cat([flow, to_add], dim=2).to(device)
235
+ elif flow.shape[2] > 16:
236
+ print("Video is too long, enabling windowing")
237
+ print("Enabling model CPU offload")
238
+ pipeline.enable_model_cpu_offload()
239
+ print("Enabling VAE slicing")
240
+ pipeline.enable_vae_slicing()
241
+ print("Enabling VAE tiling")
242
+ pipeline.enable_vae_tiling()
243
+
244
+ print("Enabling free noise")
245
+ pipeline.enable_free_noise(
246
+ context_length=16,
247
+ context_stride=context_stride,
248
+ )
249
+
250
+ import math
251
+
252
+ def find_divisors(n: int):
253
+ """
254
+ Return sorted list of all positive divisors of n.
255
+ Uses a sqrt(n) approach for efficiency.
256
+ """
257
+ divs = set()
258
+ limit = int(math.isqrt(n))
259
+ for i in range(1, limit + 1):
260
+ if n % i == 0:
261
+ divs.add(i)
262
+ divs.add(n // i)
263
+ return sorted(divs)
264
+
265
+ def multiples_in_range(k: int, min_val: int, max_val: int):
266
+ """
267
+ Return all multiples of k within [min_val, max_val].
268
+ """
269
+ if k == 0:
270
+ return []
271
+
272
+ # First multiple of k >= min_val
273
+ start = ((min_val + k - 1) // k) * k
274
+ # Last multiple of k <= max_val
275
+ end = (max_val // k) * k
276
+
277
+ return list(range(start, end + 1, k)) if start <= end else []
278
+
279
+ def adjust_video_length(original_length: int,
280
+ context_stride: int,
281
+ chunk_size: int,
282
+ temporal_split_size: int) -> int:
283
+ """
284
+ Find the minimal video_length >= original_length satisfying:
285
+ 1) (video_length - 16) is divisible by context_stride.
286
+ 2) EITHER (2*video_length) is divisible by temporal_split_size
287
+ OR (2*video_length) is divisible by chunk_size
288
+ (when 2*video_length is not multiple of temporal_split_size).
289
+ """
290
+
291
+ # We start at least at 16 (though in practice original_length likely > 16)
292
+ candidate = max(original_length, 16)
293
+
294
+ # We want (candidate - 16) % context_stride == 0
295
+ # so let n be the multiple to step.
296
+ # n is how many times we add `context_stride` beyond 16.
297
+ # This ensures (candidate - 16) is a multiple of context_stride.
298
+ # Then we check the second condition, else keep stepping.
299
+
300
+ # If candidate < 16, bump it to 16
301
+ if candidate < 16:
302
+ candidate = 16
303
+
304
+ # Make sure we jump to the correct "starting multiple" of context_stride
305
+ offset = (candidate - 16) % context_stride
306
+ if offset != 0:
307
+ candidate += (context_stride - offset) # jump to the next multiple
308
+
309
+ while True:
310
+ # Condition: (candidate - 16) is multiple of context_stride (already enforced by stepping)
311
+ # Check second part:
312
+ # - if (2*candidate) % temporal_split_size == 0, we are good
313
+ # - else we require (2*candidate) % chunk_size == 0
314
+ twoL = 2 * candidate
315
+ if (twoL % temporal_split_size == 0) or (twoL % chunk_size == 0):
316
+ return candidate
317
+
318
+ # Go to next valid candidate
319
+ candidate += context_stride
320
+
321
+ def find_valid_configs(original_video_length: int,
322
+ width: int,
323
+ height: int,
324
+ context_stride: int):
325
+ """
326
+ Generate all valid tuples (chunk_size, spatial_split_size, temporal_split_size, video_length)
327
+ subject to the constraints:
328
+ 1) chunk_size divides temporal_split_size
329
+ 2) chunk_size divides spatial_split_size
330
+ 3) chunk_size divides (2 * (width//64) * (height//64))
331
+ 4) if (2*video_length) % temporal_split_size != 0, then chunk_size divides (2*video_length)
332
+ 5) context_stride divides (video_length - 16)
333
+ 6) 480 <= spatial_split_size <= 512
334
+ 7) 1 <= temporal_split_size <= 32
335
+ 8) 1 <= chunk_size <= 32
336
+
337
+ We allow increasing original_video_length minimally if needed to satisfy constraints #4 and #5.
338
+ """
339
+
340
+ factor = 2 * (width // 64) * (height // 64)
341
+
342
+ # 1) find all possible chunk_size as divisors of factor, in [1..32]
343
+ possible_chunks = [d for d in find_divisors(factor) if 1 <= d <= 32]
344
+
345
+ # For storing results
346
+ valid_tuples = []
347
+
348
+ for chunk_size in possible_chunks:
349
+ # 2) generate all spatial_split_size in [480..512] that are multiples of chunk_size
350
+ spatial_splits = multiples_in_range(chunk_size, 480, 512)
351
+
352
+ # 3) generate all temporal_split_size in [1..32] that are multiples of chunk_size
353
+ temporal_splits = multiples_in_range(chunk_size, 1, 32)
354
+
355
+ for ssp in spatial_splits:
356
+ for tsp in temporal_splits:
357
+ # 4) & 5) Adjust video_length minimally to satisfy constraints
358
+ final_length = adjust_video_length(original_video_length,
359
+ context_stride,
360
+ chunk_size,
361
+ tsp)
362
+ # Now we have a valid (chunk_size, ssp, tsp, final_length)
363
+ valid_tuples.append((chunk_size, ssp, tsp, final_length))
364
+
365
+ return valid_tuples
366
+
367
+ def find_pareto_optimal(configs):
368
+ """
369
+ Given a list of tuples (chunk_size, spatial_split_size, temporal_split_size, video_length),
370
+ return the Pareto-optimal subset under the criteria:
371
+ - chunk_size: larger is better
372
+ - spatial_split_size: larger is better
373
+ - temporal_split_size: larger is better
374
+ - video_length: smaller is better
375
+ """
376
+
377
+ def dominates(A, B):
378
+ cA, sA, tA, lA = A
379
+ cB, sB, tB, lB = B
380
+
381
+ # A dominates B if:
382
+ # cA >= cB, tA >= tB, and lA <= lB (spatial_split_size is not used in the dominance test)
383
+ # AND at least one of these is a strict inequality.
384
+
385
+ better_or_equal = (cA >= cB) and (tA >= tB) and (lA <= lB)
386
+ strictly_better = (cA > cB) or (tA > tB) or (lA < lB)
387
+
388
+ return better_or_equal and strictly_better
389
+
390
+ pareto = []
391
+ for i, cfg_i in enumerate(configs):
392
+ # Check if cfg_i is dominated by any cfg_j
393
+ is_dominated = False
394
+ for j, cfg_j in enumerate(configs):
395
+ if i == j:
396
+ continue
397
+ if dominates(cfg_j, cfg_i):
398
+ is_dominated = True
399
+ break
400
+ if not is_dominated:
401
+ pareto.append(cfg_i)
402
+
403
+ return pareto
404
+
405
+ print("Finding valid configurations...")
406
+ valid_configs = find_valid_configs(
407
+ original_video_length=flow.shape[2],
408
+ width=width_slider,
409
+ height=height_slider,
410
+ context_stride=context_stride
411
+ )
412
+
413
+ print("Found", len(valid_configs), "valid configurations")
414
+ print("Finding Pareto-optimal configurations...")
415
+ pareto_optimal = find_pareto_optimal(valid_configs)
416
+
417
+ print("Found", pareto_optimal)
418
+
419
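+ # Tie-break among Pareto-optimal configs: favor larger chunk_size (cs) and temporal_split_size (tss),
+ # and subtract 3 points per ~10 frames of padding beyond the original flow length.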
+ criteria = lambda cs, sss, tss, vl: cs + tss - 3 * int(abs(flow.shape[2] - vl) / 10)
420
+ pareto_optimal.sort(key=lambda x: criteria(*x), reverse=True)
421
+
422
+ print("Found sorted", pareto_optimal)
423
+
424
+ solution = pareto_optimal[0]
425
+ chunk_size, spatial_split_size, temporal_split_size, video_length = solution
426
+
427
+ n = video_length - original_flow_shape[2]
428
+ to_add = flow[:, :, -1].unsqueeze(2).expand(-1, -1, n, -1, -1)
429
+ flow = torch.cat([flow, to_add], dim=2)
430
+
431
+ pipeline.enable_free_noise_split_inference(
432
+ temporal_split_size=temporal_split_size,
433
+ spatial_split_size=spatial_split_size
434
+ )
435
+ pipeline.unet.enable_forward_chunking(chunk_size)
436
+
437
+ print("Chunking enabled with chunk size:", chunk_size)
438
+ print("Temporal split size:", temporal_split_size)
439
+ print("Spatial split size:", spatial_split_size)
440
+ print("Context stride:", context_stride)
441
+ print("Temporal downscale:", temporal_ds)
442
+ print("Video length:", video_length)
443
+ print("Flow shape:", flow.shape)
444
+ else:
445
+ print("Video is just right, no padding or windowing needed")
446
+ flow = flow.to(device)
447
+ video_length = flow.shape[2]
448
+
449
+ sample_vid = pipeline(
450
+ prompt_textbox,
451
+ negative_prompt=negative_prompt_textbox,
452
+ optical_flow=flow,
453
+ num_inference_steps=diffusion_steps,
454
+ guidance_scale=cfg,
455
+ width=width_slider,
456
+ height=height_slider,
457
+ num_frames=video_length,
458
+ val_scale_factor_temporal=flow_scale,
459
+ generator=generator,
460
+ ).frames[0]
461
+
462
+ del flow
463
+ if device == "cuda":
464
+ torch.cuda.synchronize()
465
+ torch.cuda.empty_cache()
466
+
467
+ save_sample_path_video = os.path.join(savedir, f"sample.mp4")
468
+ sample_vid = sample_vid[:original_flow_shape[2]] * 255.
469
+ sample_vid = sample_vid.cpu().numpy()
470
+ sample_vid = np.transpose(sample_vid, axes=(0, 2, 3, 1))
471
+ torchvision.io.write_video(save_sample_path_video, sample_vid, fps=8)
472
+
473
+ return gr.Video(value=save_sample_path_flow), gr.Video(value=save_sample_path_video)
474
+
475
+ controller = AnimateController()
476
+
477
+
478
+ def find_closest_ratio(target_ratio):
479
+ width_list = list(reversed(range(256, 1025, 64)))
480
+ height_list = list(reversed(range(256, 1025, 64)))
481
+ ratio_list = [(h, w, w/h) for h in height_list for w in width_list]
482
+ ratio_list.sort(key=lambda x: abs(x[2] - target_ratio))
483
+ ratio_list = list(filter(lambda x: x[2] == ratio_list[0][2], ratio_list))
484
+ ratio_list.sort(key=lambda x: abs(x[0]*x[1] - 512*512))
485
+ return ratio_list[0][:2]
486
+
487
+
488
+ def find_dimension(video):
489
+ import av
490
+ container = av.open(open(video, 'rb'))
491
+ height, width = container.streams.video[0].height, container.streams.video[0].width
492
+ target_ratio = width / height
493
+ return find_closest_ratio(target_ratio)
494
+
495
+
496
+ def ui():
497
+ with gr.Blocks(css=css) as demo:
498
+ gr.Markdown(
499
+ """
500
+ # <p style="text-align:center;">OnlyFlow: Optical Flow based Motion Conditioning for Video Diffusion Models</p>
501
+ Mathis Koroglu, Hugo Caselles-Dupré, Guillaume Jeanneret Sanmiguel, Matthieu Cord<br>
502
+ [Arxiv Report](https://arxiv.org/abs/2411.10501) | [Project Page](https://obvious-research.github.io/onlyflow/) | [Github](https://github.com/obvious-research/onlyflow/)
503
+ """
504
+ )
505
+ gr.Markdown(
506
+ """
507
+ ### Quick Start:
508
+
509
+ 1. Select desired `Base Model`.
510
+ 2. Select `Motion Module`. We recommend trying guoyww/animatediff-motion-adapter-v1-5-3 for the best results.
511
+ 3. Provide `Positive Prompt` and `Negative Prompt`. You are encouraged to refer to each model's webpage on HuggingFace Hub or CivitAI to learn how to write prompts for them.
512
+ 4. Upload a video to extract optical flow from.
513
+ 5. Select a 'Flow Scale' to modulate the input video optical flow conditioning.
514
+ 6. Select a 'CFG' and 'Diffusion Steps' to control the quality of the generated video and prompt adherence.
515
+ 7. Select a 'Temporal Downsample' to reduce the number of frames in the input video.
516
+ 8. If you want to use a custom dimension, check the `Custom Dimension` box and adjust the `Width` and `Height` sliders.
517
+ 9. If the video is too long, you can adjust the generation window offset with the `Context Stride` slider.
518
+ 10. Click `Generate`, wait for ~1–3 min, and enjoy the result!
519
+
520
+ If you have any error concerning GPU limits, please try again later when your ZeroGPU quota is reset, or try with a shorter video.
521
+ Otherwise, you can also duplicate this space and select a custom GPU plan.
522
+ """
523
+ )
524
+ with gr.Row():
525
+ with gr.Column():
526
+
527
+ gr.Markdown("# INPUTS")
528
+
529
+ with gr.Row(equal_height=True, show_progress=True):
530
+ base_model = gr.Dropdown(
531
+ label="Select or type a base model id",
532
+ choices=[
533
+ "stable-diffusion-v1-5/stable-diffusion-v1-5",
534
+ "digiplay/Photon_v1",
535
+ ],
536
+ interactive=True,
537
+ scale=4,
538
+ allow_custom_value=True,
539
+ show_label=True
540
+ )
541
+ base_model_btn = gr.Button(value="Update", scale=1, size='lg')
542
+ with gr.Row(equal_height=True, show_progress=True):
543
+ motion_module = gr.Dropdown(
544
+ label="Select or type a motion module id",
545
+ choices=[
546
+ "guoyww/animatediff-motion-adapter-v1-5-3",
547
+ "guoyww/animatediff-motion-adapter-v1-5-2"
548
+ ],
549
+ interactive=True,
550
+ scale=4
551
+ )
552
+ motion_module_btn = gr.Button(value="Update", scale=1, size='lg')
553
+
554
+ base_model_btn.click(fn=controller.update_base_model, inputs=[base_model])
555
+ motion_module_btn.click(fn=controller.update_motion_module, inputs=[motion_module])
556
+
557
+ prompt_textbox_positive = gr.Textbox(label="Positive Prompt", lines=3)
558
+ prompt_textbox_negative = gr.Textbox(label="Negative Prompt", lines=2, value="worst quality, low quality, nsfw, logo")
559
+
560
+ flow_scale = gr.Slider(label="Flow Scale", value=1.0, minimum=0, maximum=2, step=0.025)
561
+ diffusion_steps = gr.Slider(label="Diffusion Steps", value=25, minimum=0, maximum=100, step=1)
562
+ cfg = gr.Slider(label="CFG", value=7.5, minimum=0, maximum=30, step=0.1)
563
+
564
+ temporal_ds = gr.Slider(label="Temporal Downsample", value=1, minimum=1, maximum=30, step=1)
565
+
566
+ input_video = gr.Video(label="Input Video", interactive=True)
567
+ ctx_stride = gr.State(12)
568
+
569
+ with gr.Accordion("Advanced", open=False):
570
+ use_custom_dim = gr.Checkbox(label="Custom Dimension", value=False)
571
+
572
+ with gr.Row(equal_height=True):
573
+
574
+ height, width = gr.State(512), gr.State(512)
575
+
576
+ @gr.render(inputs=[use_custom_dim, input_video])
577
+ def render_custom_dim(use_custom_dim, input_video):
578
+ if input_video is not None:
579
+ loc_height, loc_width = find_dimension(input_video)
580
+ else:
581
+ loc_height, loc_width = 512, 512
582
+ slider_width = gr.Slider(label="Width", value=loc_width, minimum=256, maximum=1024,
583
+ step=64, visible=use_custom_dim)
584
+ slider_height = gr.Slider(label="Height", value=loc_height, minimum=256, maximum=1024,
585
+ step=64, visible=use_custom_dim)
586
+
587
+ slider_width.change(lambda x: x, inputs=[slider_width], outputs=[width])
588
+ slider_height.change(lambda x: x, inputs=[slider_height], outputs=[height])
589
+
590
+
591
+ with gr.Row():
592
+ @gr.render(inputs=input_video)
593
+ def render_ctx_stride(input_video):
594
+ if input_video is not None:
595
+ video = open(input_video, 'rb')
596
+ import av
597
+ container = av.open(video)
598
+ num_frames = container.streams.video[0].frames
599
+ if num_frames > 17:
600
+ stride_slider = gr.Slider(label="Context Stride", value=12, minimum=1, maximum=16, step=1)
601
+ stride_slider.input(lambda x: x, inputs=[stride_slider], outputs=[ctx_stride])
602
+ if num_frames > 64:
603
+ raise gr.Error(f"Video is too long ({num_frames} frames), please use a shorter video, increase the context stride, or select a custom GPU plan. The current parameters won't allow generation on ZeroGPU.")
604
+ elif num_frames > 32:
605
+ gr.Warning(f"Video is long ({num_frames} frames), consider using a shorter video, increasing the context stride, or selecting a custom GPU plan.")
606
+
607
+ with gr.Row(equal_height=True):
608
+ seed_textbox = gr.Textbox(label="Seed", value='-1')
609
+
610
+ seed_button = gr.Button(value="\U0001F3B2", elem_classes="toolbutton")
611
+ seed_button.click(
612
+ fn=lambda: random.randint(1, int(1e16)),
613
+ inputs=[],
614
+ outputs=[seed_textbox]
615
+ )
616
+
617
+ with gr.Row():
618
+ clear_btn = gr.ClearButton(value="Clear & Reset", size='lg', variant='secondary', scale=1)
619
+ generate_button = gr.Button(value="Generate", variant='primary', scale=2, size='lg')
620
+
621
+ clear_btn.add([base_model, motion_module, input_video, prompt_textbox_positive, prompt_textbox_negative, seed_textbox, use_custom_dim, ctx_stride])
622
+
623
+ with gr.Column():
624
+
625
+ gr.Markdown("# OUTPUTS")
626
+
627
+ result_optical_flow = gr.Video(label="Optical Flow", interactive=False)
628
+ result_video = gr.Video(label="Generated Animation", interactive=False)
629
+
630
+ inputs = [base_model, motion_module, prompt_textbox_positive, prompt_textbox_negative, seed_textbox, input_video, height, width, flow_scale, cfg, diffusion_steps, temporal_ds, ctx_stride]
631
+ outputs = [result_optical_flow, result_video]
632
+
633
+ generate_button.click(fn=controller.animate, inputs=inputs, outputs=outputs)
634
+
635
+ return demo
636
+
637
+
638
+ if __name__ == "__main__":
639
+ demo = ui()
640
+ demo.queue(max_size=20)
641
+ demo.launch()
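For reference, here is a condensed sketch of the flow-extraction step used in `animate_diffusion` above. It is a sketch under assumptions: `clip.mp4` is a hypothetical short local video, the 512×512 resolution is illustrative, and the repository's `tools.optical_flow.get_optical_flow` helper is importable.

```python
# Sketch of building the optical-flow conditioning tensor, mirroring animate_diffusion above.
import torch
import torchvision
import torchvision.transforms as T
from torchvision.models.optical_flow import raft_large, Raft_Large_Weights

from tools.optical_flow import get_optical_flow

device = "cuda" if torch.cuda.is_available() else "cpu"
raft = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).eval().to(device)

# Load frames as (T, C, H, W), resize/crop to the target resolution, scale to [0, 1].
frames = torchvision.io.read_video("clip.mp4", output_format="TCHW", pts_unit="sec")[0]
frames = T.CenterCrop((512, 512))(T.Resize(512)(frames))
frames = T.ConvertImageDtype(torch.float32)(frames)[None].to(device)  # (1, T, C, H, W)

with torch.no_grad():
    # RAFT expects inputs in [-1, 1]; one flow field per consecutive frame pair.
    flow = get_optical_flow(raft, frames * 2 - 1, frames.shape[1] - 1, encode_chunk_size=16)

print(flow.shape)  # (1, 2, T - 1, H, W), the tensor passed to FlowCtrlPipeline as optical_flow
```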
onlyflow/data/dataset_idx.py ADDED
@@ -0,0 +1,88 @@
1
+ import functools
2
+ from io import BytesIO
3
+
4
+ import torch
5
+ import torchvision
6
+ import torchvision.transforms.v2 as transforms
7
+ import wids
8
+ from torch.utils.data import DataLoader
9
+
10
+
11
+ def _video_shortener(video_tensor, length, generator=None):
12
+ start = torch.randint(0, video_tensor.shape[0] - length, (1,), generator=generator)
13
+ return video_tensor[start:start + length]
14
+
15
+
16
+ def select_video_extract(length=16, generator=None):
17
+ return functools.partial(_video_shortener, length=length, generator=generator)
18
+
19
+
20
+ def my_collate_fn(batch):
21
+ videos = torch.stack([sample[0] for sample in batch])
22
+ txts = [sample[1] for sample in batch]
23
+
24
+ return videos, txts
25
+
26
+
27
+ class WebVidDataset(wids.ShardListDataset):
28
+
29
+ def __init__(self, shards, cache_dir, video_length=16, video_size=256, video_length_offset=1, val=False, seed=42,
30
+ **kwargs):
31
+
32
+ self.val = val
33
+ self.generator = torch.Generator()
34
+ self.generator.manual_seed(seed)
35
+ self.generator_init_state = self.generator.get_state()
36
+ super().__init__(shards, cache_dir=cache_dir, keep=True, **kwargs)
37
+
38
+ if isinstance(video_size, int):
39
+ video_size = (video_size, video_size)
40
+
41
+ self.video_size = video_size
42
+
43
+ for size in video_size:
44
+ if size % 8 != 0:
45
+ raise ValueError("video_size must be divisible by 8")
46
+
47
+ self.transform = transforms.Compose(
48
+ [
49
+ select_video_extract(length=video_length + video_length_offset, generator=self.generator),
50
+ transforms.Resize(size=video_size),
51
+ transforms.RandomCrop(size=video_size) if not self.val else transforms.CenterCrop(size=video_size),
52
+ transforms.RandomHorizontalFlip() if not self.val else transforms.Identity(),
53
+ ]
54
+ )
55
+
56
+ self.add_transform(self._make_sample)
57
+
58
+ def _make_sample(self, sample):
59
+ if self.val:
60
+ self.generator.set_state(self.generator_init_state)
61
+ video = torchvision.io.read_video(BytesIO(sample[".mp4"].read()), output_format="TCHW", pts_unit='sec')[0]
62
+ label = sample[".txt"]
63
+ return self.transform(video), label
64
+
65
+
66
+ if __name__ == "__main__":
67
+
68
+ dataset = WebVidDataset(
69
+ shards='/users/Etu9/3711799/onlyflow/data/webvid/desc.json',  # shard index read by wids.ShardListDataset
70
+ cache_dir='./webvid_cache',  # hypothetical local cache directory
71
+ video_length=16,
72
+ video_size=256,
73
+ video_length_offset=0,
74
+ )
75
+
76
+ sampler = wids.DistributedChunkedSampler(dataset, chunksize=1000, shuffle=True)
77
+ dataloader = DataLoader(
78
+ dataset,
79
+ collate_fn=my_collate_fn,
80
+ batch_size=4,
81
+ sampler=sampler,
82
+ num_workers=4
83
+ )
84
+
85
+ for i, (images, labels) in enumerate(dataloader):
86
+ print(i, images.shape, labels)
87
+ if i > 10:
88
+ break
onlyflow/data/dataset_itr.py ADDED
@@ -0,0 +1,81 @@
1
+ import functools
2
+ import os
3
+ from io import BytesIO
4
+
5
+ import torch
6
+ import torchvision
7
+ import torchvision.transforms.v2 as transforms
8
+ import webdataset as wds
9
+
10
+
11
+ def _video_shortener(video_tensor, length):
12
+ start = torch.randint(0, video_tensor.shape[0] - length, (1,))
13
+ return video_tensor[start:start + length]
14
+
15
+
16
+ def select_video_extract(length=16):
17
+ return functools.partial(_video_shortener, length=length)
18
+
19
+
20
+ def my_collate_fn(batch):
21
+ output = {}
22
+ for key in batch[0].keys():
23
+ if key == 'video':
24
+ output[key] = torch.stack([sample[key] for sample in batch])
25
+ else:
26
+ output[key] = [sample[key] for sample in batch]
27
+
28
+ return output
29
+
30
+
31
+ def map_mp4(sample):
32
+ return torchvision.io.read_video(BytesIO(sample), output_format="TCHW", pts_unit='sec')[0]
33
+
34
+
35
+ def map_txt(sample):
36
+ return sample.decode("utf-8")
37
+
38
+
39
+ class WebVidDataset(wds.DataPipeline):
40
+ def __init__(self, batch_size, tar_index, root_path, video_length=16, video_size=256, video_length_offset=0,
41
+ horizontal_flip=True, seed=None):
42
+
43
+ self.dataset_full_path = os.path.join(root_path, f'webvid-uw-{{{tar_index}}}.tar')
44
+
45
+ if isinstance(video_size, int):
46
+ video_size = (video_size, video_size)
47
+
48
+ for size in video_size:
49
+ if size % 8 != 0:
50
+ raise ValueError("video_size must be divisible by 8")
51
+
52
+ self.pipeline = [
53
+ wds.SimpleShardList('file:' + str(self.dataset_full_path), seed=seed),
54
+ wds.shuffle(50),
55
+ wds.split_by_node,
56
+ wds.tarfile_to_samples(),
57
+ wds.shuffle(100),
58
+ wds.split_by_worker,
59
+ wds.map_dict(
60
+ mp4=map_mp4,
61
+ txt=map_txt,
62
+ ),
63
+ wds.map_dict(
64
+ mp4=transforms.Compose(
65
+ [
66
+ select_video_extract(length=video_length + video_length_offset),
67
+ transforms.Resize(size=video_size),
68
+ transforms.RandomCrop(size=video_size),
69
+ transforms.RandomHorizontalFlip() if horizontal_flip else transforms.Identity(),
70
+ ]
71
+ )
72
+ ),
73
+ wds.rename_keys(video="mp4", text='txt', keep_unselected=True),
74
+ wds.batched(batch_size, collation_fn=my_collate_fn, partial=True)
75
+ ]
76
+
77
+ super().__init__(self.pipeline)
78
+
79
+ self.batch_size = batch_size
80
+ self.video_length = video_length
81
+ self.video_size = video_size
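Since this `WebVidDataset` is a `webdataset` pipeline (an iterable dataset that already batches internally), a minimal consumption sketch could look like the following. It is a sketch under assumptions: the shard location and the `tar_index` brace-expansion range are hypothetical, and shards follow the `webvid-uw-{...}.tar` naming used above.

```python
# Minimal iteration sketch for the iterable WebVid pipeline defined above.
from torch.utils.data import DataLoader

from onlyflow.data.dataset_itr import WebVidDataset

dataset = WebVidDataset(
    batch_size=2,
    tar_index="000000..000009",   # hypothetical brace-expansion range for the shards
    root_path="/data/webvid",     # hypothetical shard directory
    video_length=16,
    video_size=256,
)

# batch_size=None: batching is already performed inside the webdataset pipeline.
loader = DataLoader(dataset, batch_size=None, num_workers=4)

for batch in loader:
    print(batch["video"].shape, batch["text"][:2])
    break
```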
onlyflow/models/attention.py ADDED
@@ -0,0 +1,359 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional
15
+
16
+ import torch
17
+ from diffusers.models.attention import GatedSelfAttentionDense, FeedForward, _chunked_feed_forward
18
+ from diffusers.models.embeddings import SinusoidalPositionalEmbedding
19
+ from diffusers.models.normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero
20
+ from diffusers.utils import logging
21
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
22
+ from torch import nn
23
+
24
+ from onlyflow.models.attention_processor import Attention
25
+
26
+ logger = logging.get_logger(__name__)
27
+
28
+
29
+ @maybe_allow_in_graph
30
+ class BasicTransformerBlock(nn.Module):
31
+ r"""
32
+ A basic Transformer block.
33
+
34
+ Parameters:
35
+ dim (`int`): The number of channels in the input and output.
36
+ num_attention_heads (`int`): The number of heads to use for multi-head attention.
37
+ attention_head_dim (`int`): The number of channels in each head.
38
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
39
+ cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
40
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
41
+ num_embeds_ada_norm (:
42
+ obj: `int`, *optional*): The number of diffusion steps used during training. See `Transformer2DModel`.
43
+ attention_bias (:
44
+ obj: `bool`, *optional*, defaults to `False`): Configure if the attentions should contain a bias parameter.
45
+ only_cross_attention (`bool`, *optional*):
46
+ Whether to use only cross-attention layers. In this case two cross attention layers are used.
47
+ double_self_attention (`bool`, *optional*):
48
+ Whether to use two self-attention layers. In this case no cross attention layers are used.
49
+ upcast_attention (`bool`, *optional*):
50
+ Whether to upcast the attention computation to float32. This is useful for mixed precision training.
51
+ norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
52
+ Whether to use learnable elementwise affine parameters for normalization.
53
+ norm_type (`str`, *optional*, defaults to `"layer_norm"`):
54
+ The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
55
+ final_dropout (`bool` *optional*, defaults to False):
56
+ Whether to apply a final dropout after the last feed-forward layer.
57
+ attention_type (`str`, *optional*, defaults to `"default"`):
58
+ The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
59
+ positional_embeddings (`str`, *optional*, defaults to `None`):
60
+ The type of positional embeddings to apply to.
61
+ num_positional_embeddings (`int`, *optional*, defaults to `None`):
62
+ The maximum number of positional embeddings to apply.
63
+ """
64
+
65
+ def __init__(
66
+ self,
67
+ dim: int,
68
+ num_attention_heads: int,
69
+ attention_head_dim: int,
70
+ dropout=0.0,
71
+ cross_attention_dim: Optional[int] = None,
72
+ activation_fn: str = "geglu",
73
+ num_embeds_ada_norm: Optional[int] = None,
74
+ attention_bias: bool = False,
75
+ only_cross_attention: bool = False,
76
+ double_self_attention: bool = False,
77
+ upcast_attention: bool = False,
78
+ norm_elementwise_affine: bool = True,
79
+ norm_type: str = "layer_norm",
80
+ # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
81
+ norm_eps: float = 1e-5,
82
+ final_dropout: bool = False,
83
+ attention_type: str = "default",
84
+ positional_embeddings: Optional[str] = None,
85
+ num_positional_embeddings: Optional[int] = None,
86
+ ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
87
+ ada_norm_bias: Optional[int] = None,
88
+ ff_inner_dim: Optional[int] = None,
89
+ ff_bias: bool = True,
90
+ attention_out_bias: bool = True,
91
+ ):
92
+ super().__init__()
93
+ self.dim = dim
94
+ self.num_attention_heads = num_attention_heads
95
+ self.attention_head_dim = attention_head_dim
96
+ self.dropout = dropout
97
+ self.cross_attention_dim = cross_attention_dim
98
+ self.activation_fn = activation_fn
99
+ self.attention_bias = attention_bias
100
+ self.double_self_attention = double_self_attention
101
+ self.norm_elementwise_affine = norm_elementwise_affine
102
+ self.positional_embeddings = positional_embeddings
103
+ self.num_positional_embeddings = num_positional_embeddings
104
+ self.only_cross_attention = only_cross_attention
105
+
106
+ # We keep these boolean flags for backward-compatibility.
107
+ self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
108
+ self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
109
+ self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
110
+ self.use_layer_norm = norm_type == "layer_norm"
111
+ self.use_ada_layer_norm_continuous = norm_type == "ada_norm_continuous"
112
+
113
+ if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
114
+ raise ValueError(
115
+ f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
116
+ f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
117
+ )
118
+
119
+ self.norm_type = norm_type
120
+ self.num_embeds_ada_norm = num_embeds_ada_norm
121
+
122
+ if positional_embeddings and (num_positional_embeddings is None):
123
+ raise ValueError(
124
+ "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
125
+ )
126
+
127
+ if positional_embeddings == "sinusoidal":
128
+ self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
129
+ else:
130
+ self.pos_embed = None
131
+
132
+ # Define 3 blocks. Each block has its own normalization layer.
133
+ # 1. Self-Attn
134
+ if norm_type == "ada_norm":
135
+ self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
136
+ elif norm_type == "ada_norm_zero":
137
+ self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
138
+ elif norm_type == "ada_norm_continuous":
139
+ self.norm1 = AdaLayerNormContinuous(
140
+ dim,
141
+ ada_norm_continous_conditioning_embedding_dim,
142
+ norm_elementwise_affine,
143
+ norm_eps,
144
+ ada_norm_bias,
145
+ "rms_norm",
146
+ )
147
+ else:
148
+ self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)
149
+
150
+ self.attn1 = Attention(
151
+ query_dim=dim,
152
+ heads=num_attention_heads,
153
+ dim_head=attention_head_dim,
154
+ dropout=dropout,
155
+ bias=attention_bias,
156
+ cross_attention_dim=cross_attention_dim if only_cross_attention else None,
157
+ upcast_attention=upcast_attention,
158
+ out_bias=attention_out_bias,
159
+ )
160
+
161
+ # 2. Cross-Attn
162
+ if cross_attention_dim is not None or double_self_attention:
163
+ # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
164
+ # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
165
+ # the second cross attention block.
166
+ if norm_type == "ada_norm":
167
+ self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
168
+ elif norm_type == "ada_norm_continuous":
169
+ self.norm2 = AdaLayerNormContinuous(
170
+ dim,
171
+ ada_norm_continous_conditioning_embedding_dim,
172
+ norm_elementwise_affine,
173
+ norm_eps,
174
+ ada_norm_bias,
175
+ "rms_norm",
176
+ )
177
+ else:
178
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
179
+
180
+ self.attn2 = Attention(
181
+ query_dim=dim,
182
+ cross_attention_dim=cross_attention_dim if not double_self_attention else None,
183
+ heads=num_attention_heads,
184
+ dim_head=attention_head_dim,
185
+ dropout=dropout,
186
+ bias=attention_bias,
187
+ upcast_attention=upcast_attention,
188
+ out_bias=attention_out_bias,
189
+ ) # is self-attn if encoder_hidden_states is none
190
+ else:
191
+ if norm_type == "ada_norm_single": # For Latte
192
+ self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
193
+ else:
194
+ self.norm2 = None
195
+ self.attn2 = None
196
+
197
+ # 3. Feed-forward
198
+ if norm_type == "ada_norm_continuous":
199
+ self.norm3 = AdaLayerNormContinuous(
200
+ dim,
201
+ ada_norm_continous_conditioning_embedding_dim,
202
+ norm_elementwise_affine,
203
+ norm_eps,
204
+ ada_norm_bias,
205
+ "layer_norm",
206
+ )
207
+
208
+ elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm"]:
209
+ self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
210
+ elif norm_type == "layer_norm_i2vgen":
211
+ self.norm3 = None
212
+
213
+ self.ff = FeedForward(
214
+ dim,
215
+ dropout=dropout,
216
+ activation_fn=activation_fn,
217
+ final_dropout=final_dropout,
218
+ inner_dim=ff_inner_dim,
219
+ bias=ff_bias,
220
+ )
221
+
222
+ # 4. Fuser
223
+ if attention_type == "gated" or attention_type == "gated-text-image":
224
+ self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)
225
+
226
+ # 5. Scale-shift for PixArt-Alpha.
227
+ if norm_type == "ada_norm_single":
228
+ self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim ** 0.5)
229
+
230
+ # let chunk size default to None
231
+ self._chunk_size = None
232
+ self._chunk_dim = 0
233
+
234
+ def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
235
+ # Sets chunk feed-forward
236
+ self._chunk_size = chunk_size
237
+ self._chunk_dim = dim
238
+
239
+ def forward(
240
+ self,
241
+ hidden_states: torch.Tensor,
242
+ attention_mask: Optional[torch.Tensor] = None,
243
+ encoder_hidden_states: Optional[torch.Tensor] = None,
244
+ encoder_attention_mask: Optional[torch.Tensor] = None,
245
+ timestep: Optional[torch.LongTensor] = None,
246
+ cross_attention_kwargs: Dict[str, Any] = None,
247
+ class_labels: Optional[torch.LongTensor] = None,
248
+ added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
249
+ ) -> torch.Tensor:
250
+ if cross_attention_kwargs is not None:
251
+ if cross_attention_kwargs.get("scale", None) is not None:
252
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
253
+
254
+ # Notice that normalization is always applied before the real computation in the following blocks.
255
+ # 0. Self-Attention
256
+ batch_size = hidden_states.shape[0]
257
+
258
+ if self.norm_type == "ada_norm":
259
+ norm_hidden_states = self.norm1(hidden_states, timestep)
260
+ elif self.norm_type == "ada_norm_zero":
261
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
262
+ hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
263
+ )
264
+ elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
265
+ norm_hidden_states = self.norm1(hidden_states)
266
+ elif self.norm_type == "ada_norm_continuous":
267
+ norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
268
+ elif self.norm_type == "ada_norm_single":
269
+ shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
270
+ self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
271
+ ).chunk(6, dim=1)
272
+ norm_hidden_states = self.norm1(hidden_states)
273
+ norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
274
+ else:
275
+ raise ValueError("Incorrect norm used")
276
+
277
+ if self.pos_embed is not None:
278
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
279
+
280
+ # 1. Prepare GLIGEN inputs
281
+ cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
282
+ gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
283
+
284
+ attn_output = self.attn1(
285
+ hidden_states=norm_hidden_states,
286
+ encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
287
+ attention_mask=attention_mask,
288
+ **cross_attention_kwargs,
289
+ )
290
+
291
+ if self.norm_type == "ada_norm_zero":
292
+ attn_output = gate_msa.unsqueeze(1) * attn_output
293
+ elif self.norm_type == "ada_norm_single":
294
+ attn_output = gate_msa * attn_output
295
+
296
+ hidden_states = attn_output + hidden_states
297
+ if hidden_states.ndim == 4:
298
+ hidden_states = hidden_states.squeeze(1)
299
+
300
+ # 1.2 GLIGEN Control
301
+ if gligen_kwargs is not None:
302
+ hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
303
+
304
+ # 3. Cross-Attention
305
+ if self.attn2 is not None:
306
+ if self.norm_type == "ada_norm":
307
+ norm_hidden_states = self.norm2(hidden_states, timestep)
308
+ elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
309
+ norm_hidden_states = self.norm2(hidden_states)
310
+ elif self.norm_type == "ada_norm_single":
311
+ # For PixArt norm2 isn't applied here:
312
+ # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
313
+ norm_hidden_states = hidden_states
314
+ elif self.norm_type == "ada_norm_continuous":
315
+ norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
316
+ else:
317
+ raise ValueError("Incorrect norm")
318
+
319
+ if self.pos_embed is not None and self.norm_type != "ada_norm_single":
320
+ norm_hidden_states = self.pos_embed(norm_hidden_states)
321
+
322
+ attn_output = self.attn2(
323
+ hidden_states=norm_hidden_states,
324
+ encoder_hidden_states=encoder_hidden_states,
325
+ attention_mask=encoder_attention_mask,
326
+ **cross_attention_kwargs,
327
+ )
328
+ hidden_states = attn_output + hidden_states
329
+
330
+ # 4. Feed-forward
331
+ # i2vgen doesn't have this norm 🤷‍♂️
332
+ if self.norm_type == "ada_norm_continuous":
333
+ norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
334
+ elif not self.norm_type == "ada_norm_single":
335
+ norm_hidden_states = self.norm3(hidden_states)
336
+
337
+ if self.norm_type == "ada_norm_zero":
338
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
339
+
340
+ if self.norm_type == "ada_norm_single":
341
+ norm_hidden_states = self.norm2(hidden_states)
342
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
343
+
344
+ if self._chunk_size is not None:
345
+ # "feed_forward_chunk_size" can be used to save memory
346
+ ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
347
+ else:
348
+ ff_output = self.ff(norm_hidden_states)
349
+
350
+ if self.norm_type == "ada_norm_zero":
351
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
352
+ elif self.norm_type == "ada_norm_single":
353
+ ff_output = gate_mlp * ff_output
354
+
355
+ hidden_states = ff_output + hidden_states
356
+ if hidden_states.ndim == 4:
357
+ hidden_states = hidden_states.squeeze(1)
358
+
359
+ return hidden_states
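As a quick sanity check of the block's interface, a minimal forward pass might look like the following. This is a sketch under assumptions: it uses the default `layer_norm` configuration, illustrative sizes (320-dim hidden states, 768-dim text embeddings as in SD 1.5), and presumes it is run from the repository root.

```python
# Smoke-test sketch for BasicTransformerBlock (illustrative sizes, default norm_type).
import torch

from onlyflow.models.attention import BasicTransformerBlock

block = BasicTransformerBlock(
    dim=320,
    num_attention_heads=8,
    attention_head_dim=40,
    cross_attention_dim=768,   # e.g. CLIP text embedding width for SD 1.5
).eval()

hidden_states = torch.randn(2, 64, 320)          # (batch, tokens, dim)
encoder_hidden_states = torch.randn(2, 77, 768)  # (batch, text tokens, cross-attention dim)

with torch.no_grad():
    out = block(hidden_states, encoder_hidden_states=encoder_hidden_states)

print(out.shape)  # expected: torch.Size([2, 64, 320])
```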
onlyflow/models/attention_processor.py ADDED
@@ -0,0 +1,456 @@
1
+ import inspect
2
+ import logging
3
+ from typing import Optional
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+ import torch.nn.init as init
9
+ from diffusers.models.attention_processor import Attention as AttentionBase
10
+ from diffusers.models.attention_processor import AttnProcessor2_0 as AttnProcessor2_0_Base, SpatialNorm, AttnProcessor
11
+ from diffusers.models.attention_processor import IPAdapterAttnProcessor2_0 as IPAdapterAttnProcessor2_0_Base
12
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ @maybe_allow_in_graph
18
+ class Attention(AttentionBase):
19
+ r"""
20
+ A cross attention layer.
21
+
22
+ Parameters:
23
+ query_dim (`int`):
24
+ The number of channels in the query.
25
+ cross_attention_dim (`int`, *optional*):
26
+ The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
27
+ heads (`int`, *optional*, defaults to 8):
28
+ The number of heads to use for multi-head attention.
29
+ kv_heads (`int`, *optional*, defaults to `None`):
30
+ The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
31
+ `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
32
+ Query Attention (MQA) otherwise GQA is used.
33
+ dim_head (`int`, *optional*, defaults to 64):
34
+ The number of channels in each head.
35
+ dropout (`float`, *optional*, defaults to 0.0):
36
+ The dropout probability to use.
37
+ bias (`bool`, *optional*, defaults to False):
38
+ Set to `True` for the query, key, and value linear layers to contain a bias parameter.
39
+ upcast_attention (`bool`, *optional*, defaults to False):
40
+ Set to `True` to upcast the attention computation to `float32`.
41
+ upcast_softmax (`bool`, *optional*, defaults to False):
42
+ Set to `True` to upcast the softmax computation to `float32`.
43
+ cross_attention_norm (`str`, *optional*, defaults to `None`):
44
+ The type of normalization to use for the cross attention. Can be `None`, `layer_norm`, or `group_norm`.
45
+ cross_attention_norm_num_groups (`int`, *optional*, defaults to 32):
46
+ The number of groups to use for the group norm in the cross attention.
47
+ added_kv_proj_dim (`int`, *optional*, defaults to `None`):
48
+ The number of channels to use for the added key and value projections. If `None`, no projection is used.
49
+ norm_num_groups (`int`, *optional*, defaults to `None`):
50
+ The number of groups to use for the group norm in the attention.
51
+ spatial_norm_dim (`int`, *optional*, defaults to `None`):
52
+ The number of channels to use for the spatial normalization.
53
+ out_bias (`bool`, *optional*, defaults to `True`):
54
+ Set to `True` to use a bias in the output linear layer.
55
+ scale_qk (`bool`, *optional*, defaults to `True`):
56
+ Set to `True` to scale the query and key by `1 / sqrt(dim_head)`.
57
+ only_cross_attention (`bool`, *optional*, defaults to `False`):
58
+ Set to `True` to only use cross attention and not added_kv_proj_dim. Can only be set to `True` if
59
+ `added_kv_proj_dim` is not `None`.
60
+ eps (`float`, *optional*, defaults to 1e-5):
61
+ An additional value added to the denominator in group normalization that is used for numerical stability.
62
+ rescale_output_factor (`float`, *optional*, defaults to 1.0):
63
+ A factor to rescale the output by dividing it with this value.
64
+ residual_connection (`bool`, *optional*, defaults to `False`):
65
+ Set to `True` to add the residual connection to the output.
66
+ _from_deprecated_attn_block (`bool`, *optional*, defaults to `False`):
67
+ Set to `True` if the attention block is loaded from a deprecated state dict.
68
+ processor (`AttnProcessor`, *optional*, defaults to `None`):
69
+ The attention processor to use. If `None`, defaults to `AttnProcessor2_0` if `torch 2.x` is used and
70
+ `AttnProcessor` otherwise.
71
+ """
72
+
73
+ def __init__(
74
+ self,
75
+ query_dim: int,
76
+ cross_attention_dim: Optional[int] = None,
77
+ heads: int = 8,
78
+ kv_heads: Optional[int] = None,
79
+ dim_head: int = 64,
80
+ dropout: float = 0.0,
81
+ bias: bool = False,
82
+ upcast_attention: bool = False,
83
+ upcast_softmax: bool = False,
84
+ cross_attention_norm: Optional[str] = None,
85
+ cross_attention_norm_num_groups: int = 32,
86
+ qk_norm: Optional[str] = None,
87
+ added_kv_proj_dim: Optional[int] = None,
88
+ added_proj_bias: Optional[bool] = True,
89
+ norm_num_groups: Optional[int] = None,
90
+ spatial_norm_dim: Optional[int] = None,
91
+ out_bias: bool = True,
92
+ scale_qk: bool = True,
93
+ only_cross_attention: bool = False,
94
+ eps: float = 1e-5,
95
+ rescale_output_factor: float = 1.0,
96
+ residual_connection: bool = False,
97
+ _from_deprecated_attn_block: bool = False,
98
+ processor: Optional["AttnProcessor"] = None,
99
+ out_dim: int = None,
100
+ context_pre_only=None,
101
+ pre_only=False,
102
+ ):
103
+ nn.Module.__init__(self)
104
+
105
+ # To prevent circular import.
106
+ from diffusers.models.normalization import FP32LayerNorm, RMSNorm
107
+
108
+ self.inner_dim = out_dim if out_dim is not None else dim_head * heads
109
+ self.inner_kv_dim = self.inner_dim if kv_heads is None else dim_head * kv_heads
110
+ self.query_dim = query_dim
111
+ self.use_bias = bias
112
+ self.is_cross_attention = cross_attention_dim is not None
113
+ self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
114
+ self.upcast_attention = upcast_attention
115
+ self.upcast_softmax = upcast_softmax
116
+ self.rescale_output_factor = rescale_output_factor
117
+ self.residual_connection = residual_connection
118
+ self.dropout = dropout
119
+ self.fused_projections = False
120
+ self.out_dim = out_dim if out_dim is not None else query_dim
121
+ self.context_pre_only = context_pre_only
122
+ self.pre_only = pre_only
123
+
124
+ # we make use of this private variable to know whether this class is loaded
125
+ # with a deprecated state dict so that we can convert it on the fly
126
+ self._from_deprecated_attn_block = _from_deprecated_attn_block
127
+
128
+ self.scale_qk = scale_qk
129
+ self.scale = dim_head ** -0.5 if self.scale_qk else 1.0
130
+
131
+ self.heads = out_dim // dim_head if out_dim is not None else heads
132
+ # for slice_size > 0 the attention score computation
133
+ # is split across the batch axis to save memory
134
+ # You can set slice_size with `set_attention_slice`
135
+ self.sliceable_head_dim = heads
136
+
137
+ self.added_kv_proj_dim = added_kv_proj_dim
138
+ self.only_cross_attention = only_cross_attention
139
+
140
+ if self.added_kv_proj_dim is None and self.only_cross_attention:
141
+ raise ValueError(
142
+ "`only_cross_attention` can only be set to True if `added_kv_proj_dim` is not None. Make sure to set either `only_cross_attention=False` or define `added_kv_proj_dim`."
143
+ )
144
+
145
+ if norm_num_groups is not None:
146
+ self.group_norm = nn.GroupNorm(num_channels=query_dim, num_groups=norm_num_groups, eps=eps, affine=True)
147
+ else:
148
+ self.group_norm = None
149
+
150
+ if spatial_norm_dim is not None:
151
+ self.spatial_norm = SpatialNorm(f_channels=query_dim, zq_channels=spatial_norm_dim)
152
+ else:
153
+ self.spatial_norm = None
154
+
155
+ if qk_norm is None:
156
+ self.norm_q = None
157
+ self.norm_k = None
158
+ elif qk_norm == "layer_norm":
159
+ self.norm_q = nn.LayerNorm(dim_head, eps=eps)
160
+ self.norm_k = nn.LayerNorm(dim_head, eps=eps)
161
+ elif qk_norm == "fp32_layer_norm":
162
+ self.norm_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
163
+ self.norm_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
164
+ elif qk_norm == "layer_norm_across_heads":
165
+ # Lumina applies QK norm across all heads
166
+ self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
167
+ self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
168
+ elif qk_norm == "rms_norm":
169
+ self.norm_q = RMSNorm(dim_head, eps=eps)
170
+ self.norm_k = RMSNorm(dim_head, eps=eps)
171
+ else:
172
+ raise ValueError(f"unknown qk_norm: {qk_norm}. Should be None, 'layer_norm', 'fp32_layer_norm', 'layer_norm_across_heads' or 'rms_norm'")
173
+
174
+ if cross_attention_norm is None:
175
+ self.norm_cross = None
176
+ elif cross_attention_norm == "layer_norm":
177
+ self.norm_cross = nn.LayerNorm(self.cross_attention_dim)
178
+ elif cross_attention_norm == "group_norm":
179
+ if self.added_kv_proj_dim is not None:
180
+ # The given `encoder_hidden_states` are initially of shape
181
+ # (batch_size, seq_len, added_kv_proj_dim) before being projected
182
+ # to (batch_size, seq_len, cross_attention_dim). The norm is applied
183
+ # before the projection, so we need to use `added_kv_proj_dim` as
184
+ # the number of channels for the group norm.
185
+ norm_cross_num_channels = added_kv_proj_dim
186
+ else:
187
+ norm_cross_num_channels = self.cross_attention_dim
188
+
189
+ self.norm_cross = nn.GroupNorm(
190
+ num_channels=norm_cross_num_channels, num_groups=cross_attention_norm_num_groups, eps=1e-5, affine=True
191
+ )
192
+ else:
193
+ raise ValueError(
194
+ f"unknown cross_attention_norm: {cross_attention_norm}. Should be None, 'layer_norm' or 'group_norm'"
195
+ )
196
+
197
+ self.to_q = nn.Linear(query_dim, self.inner_dim, bias=bias)
198
+
199
+ if not self.only_cross_attention:
200
+ # only relevant for the `AddedKVProcessor` classes
201
+ self.to_k = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
202
+ self.to_v = nn.Linear(self.cross_attention_dim, self.inner_kv_dim, bias=bias)
203
+ else:
204
+ self.to_k = None
205
+ self.to_v = None
206
+
207
+ self.added_proj_bias = added_proj_bias
208
+ if self.added_kv_proj_dim is not None:
209
+ self.add_k_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
210
+ self.add_v_proj = nn.Linear(added_kv_proj_dim, self.inner_kv_dim, bias=added_proj_bias)
211
+ if self.context_pre_only is not None:
212
+ self.add_q_proj = nn.Linear(added_kv_proj_dim, self.inner_dim, bias=added_proj_bias)
213
+
214
+ if not self.pre_only:
215
+ self.to_out = nn.ModuleList([])
216
+ self.to_out.append(nn.Linear(self.inner_dim, self.out_dim, bias=out_bias))
217
+ self.to_out.append(nn.Dropout(dropout))
218
+
219
+ if self.context_pre_only is not None and not self.context_pre_only:
220
+ self.to_add_out = nn.Linear(self.inner_dim, self.out_dim, bias=out_bias)
221
+
222
+ if qk_norm is not None and added_kv_proj_dim is not None:
223
+ if qk_norm == "fp32_layer_norm":
224
+ self.norm_added_q = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
225
+ self.norm_added_k = FP32LayerNorm(dim_head, elementwise_affine=False, bias=False, eps=eps)
226
+ elif qk_norm == "rms_norm":
227
+ self.norm_added_q = RMSNorm(dim_head, eps=eps)
228
+ self.norm_added_k = RMSNorm(dim_head, eps=eps)
229
+ else:
230
+ self.norm_added_q = None
231
+ self.norm_added_k = None
232
+
233
+ # set attention processor
234
+ # We use the AttnProcessor2_0 by default when torch 2.x is used which uses
235
+ # torch.nn.functional.scaled_dot_product_attention for native Flash/memory_efficient_attention
236
+ # but only if it has the default `scale` argument. TODO remove scale_qk check when we move to torch 2.1
237
+ if processor is None:
238
+ processor = (
239
+ AttnProcessor2_0() if hasattr(F, "scaled_dot_product_attention") and self.scale_qk else AttnProcessor()
240
+ )
241
+ self.set_processor(processor)
242
+
243
+ def forward(
244
+ self,
245
+ hidden_states: torch.Tensor,
246
+ encoder_hidden_states: Optional[torch.Tensor] = None,
247
+ attention_mask: Optional[torch.Tensor] = None,
248
+ **cross_attention_kwargs,
249
+ ) -> torch.Tensor:
250
+ r"""
251
+ The forward method of the `Attention` class.
252
+
253
+ Args:
254
+ hidden_states (`torch.Tensor`):
255
+ The hidden states of the query.
256
+ encoder_hidden_states (`torch.Tensor`, *optional*):
257
+ The hidden states of the encoder.
258
+ attention_mask (`torch.Tensor`, *optional*):
259
+ The attention mask to use. If `None`, no mask is applied.
260
+ **cross_attention_kwargs:
261
+ Additional keyword arguments to pass along to the cross attention.
262
+
263
+ Returns:
264
+ `torch.Tensor`: The output of the attention layer.
265
+ """
266
+ # The `Attention` class can call different attention processors / attention functions
267
+ # here we simply pass along all tensors to the selected processor class
268
+ # For standard processors that are defined here, `**cross_attention_kwargs` is empty
269
+
270
+ return self.processor(
271
+ self,
272
+ hidden_states=hidden_states,
273
+ encoder_hidden_states=encoder_hidden_states,
274
+ attention_mask=attention_mask,
275
+ **cross_attention_kwargs,
276
+ )
277
+
278
+
279
+ class AttnProcessor2_0(AttnProcessor2_0_Base):
280
+ def __call__(
281
+ self,
282
+ attn: Attention,
283
+ hidden_states: torch.Tensor,
284
+ encoder_hidden_states: Optional[torch.Tensor] = None,
285
+ attention_mask: Optional[torch.Tensor] = None,
286
+ temb: Optional[torch.Tensor] = None,
287
+ flow_feature: Optional[torch.Tensor] = None,
288
+ flow_scale: Optional[float] = None,
289
+ *args,
290
+ **kwargs,
291
+ ) -> torch.Tensor:
292
+
293
+ old_scale = attn.scale
294
+ attn.scale *= kwargs.get("attn_scale", 1.0)
295
+
296
+ output = super().__call__(
297
+ attn,
298
+ hidden_states,
299
+ encoder_hidden_states=encoder_hidden_states,
300
+ attention_mask=attention_mask,
301
+ temb=temb,
302
+ *args,
303
+ **kwargs,
304
+ )
305
+
306
+ attn.scale = old_scale
307
+ return output
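Note on the wrapper above: it temporarily multiplies `attn.scale` by an `attn_scale` value pulled out of the extra keyword arguments, delegates to the base processor, then restores the original scale. A minimal usage sketch with hypothetical sizes, assuming the `Attention` class defined above, that `AttnProcessor2_0_Base` is the stock diffusers processor, and torch >= 2.0 for `F.scaled_dot_product_attention`:

import torch

# illustrative wiring only; shapes and sizes are made up
attn = Attention(query_dim=320, heads=8, dim_head=40, processor=AttnProcessor2_0())
x = torch.randn(2, 64, 320)      # (batch, tokens, channels)
out = attn(x, attn_scale=0.5)    # routed through **cross_attention_kwargs; attn.scale is halved only for this call
assert out.shape == x.shape
assert attn.scale == 40 ** -0.5  # the original 1 / sqrt(dim_head) scale is restored afterwards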
308
+
309
+ class IPAdapterAttnProcessor2_0(IPAdapterAttnProcessor2_0_Base):
310
+ def __call__(
311
+ self,
312
+ attn: Attention,
313
+ hidden_states: torch.Tensor,
314
+ encoder_hidden_states: Optional[torch.Tensor] = None,
315
+ attention_mask: Optional[torch.Tensor] = None,
316
+ temb: Optional[torch.Tensor] = None,
317
+ scale: float = 1.0,
318
+ ip_adapter_masks: Optional[torch.Tensor] = None,
319
+ flow_feature: Optional[torch.Tensor] = None,
320
+ flow_scale: Optional[float] = None,
321
+ *args,
322
+ **kwargs,
323
+ ) -> torch.Tensor:
324
+ return super().__call__(
325
+ attn=attn,
326
+ hidden_states=hidden_states,
327
+ encoder_hidden_states=encoder_hidden_states,
328
+ attention_mask=attention_mask,
329
+ temb=temb,
330
+ scale=scale,
331
+ ip_adapter_masks=ip_adapter_masks,
332
+ )
333
+
334
+
335
+ class FlowAdaptorAttnProcessor(nn.Module):
336
+ def __init__(self,
337
+ type: str,
338
+ hidden_size, # dimension of hidden state
339
+ flow_feature_dim=None, # dimension of the flow feature
340
+ cross_attention_dim=None, # dimension of the text embedding
341
+ query_condition=False,
342
+ key_value_condition=False,
343
+ flow_scale=1.0
344
+ ):
345
+ super().__init__()
346
+
347
+ self.type = type
348
+ self.hidden_size = hidden_size
349
+ self.flow_feature_dim = flow_feature_dim
350
+ self.cross_attention_dim = cross_attention_dim
351
+ self.flow_scale = flow_scale
352
+ self.query_condition = query_condition
353
+ self.key_value_condition = key_value_condition
354
+ assert hidden_size == flow_feature_dim
355
+ if self.query_condition and self.key_value_condition:
356
+ self.qkv_merge = nn.Linear(hidden_size, hidden_size)
357
+ init.zeros_(self.qkv_merge.weight)
358
+ init.zeros_(self.qkv_merge.bias)
359
+ elif self.query_condition:
360
+ self.q_merge = nn.Linear(hidden_size, hidden_size)
361
+ init.zeros_(self.q_merge.weight)
362
+ init.zeros_(self.q_merge.bias)
363
+ else:
364
+ self.kv_merge = nn.Linear(hidden_size, hidden_size)
365
+ init.zeros_(self.kv_merge.weight)
366
+ init.zeros_(self.kv_merge.bias)
367
+
368
+ def forward(self,
369
+ attn: Attention,
370
+ hidden_states,
371
+ flow_feature,
372
+ encoder_hidden_states=None,
373
+ attention_mask=None,
374
+ temb=None,
375
+ flow_scale=None,
376
+ *args,
377
+ **kwargs,
378
+ ):
379
+ assert flow_feature is not None
380
+ flow_embedding_scale = (flow_scale if flow_scale is not None else self.flow_scale)
381
+
382
+ residual = hidden_states
383
+ if attn.spatial_norm is not None:
384
+ hidden_states = attn.spatial_norm(hidden_states, temb)
385
+
386
+ if self.query_condition and self.key_value_condition:
387
+ assert encoder_hidden_states is None
388
+
389
+ if encoder_hidden_states is None:
390
+ encoder_hidden_states = hidden_states
391
+
392
+ batch_size, ehs_sequence_length, _ = encoder_hidden_states.shape
393
+
394
+ if attention_mask is not None:
395
+ attention_mask = attn.prepare_attention_mask(attention_mask, ehs_sequence_length, batch_size)
396
+ # scaled_dot_product_attention expects attention_mask shape to be
397
+ # (batch, heads, source_length, target_length)
398
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
399
+
400
+ if attn.group_norm is not None:
401
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
402
+
403
+ if attn.norm_cross:
404
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
405
+
406
+ if self.query_condition and self.key_value_condition: # only self attention
407
+ query_hidden_state = self.qkv_merge(hidden_states + flow_feature) * flow_embedding_scale + hidden_states
408
+ key_value_hidden_state = query_hidden_state
409
+ elif self.query_condition:
410
+ query_hidden_state = self.q_merge(hidden_states + flow_feature) * flow_embedding_scale + hidden_states
411
+ key_value_hidden_state = encoder_hidden_states
412
+ else:
413
+ key_value_hidden_state = self.kv_merge(
414
+ encoder_hidden_states + flow_feature) * flow_embedding_scale + encoder_hidden_states
415
+ query_hidden_state = hidden_states
416
+
417
+ # original attention
418
+ key = attn.to_k(key_value_hidden_state)
419
+ value = attn.to_v(key_value_hidden_state)
420
+ query = attn.to_q(query_hidden_state)
421
+
422
+ inner_dim = key.shape[-1]
423
+ head_dim = inner_dim // attn.heads
424
+
425
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
426
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
427
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
428
+
429
+ if attn.norm_q is not None:
430
+ query = attn.norm_q(query)
431
+ if attn.norm_k is not None:
432
+ key = attn.norm_k(key)
433
+
434
+ hidden_states = F.scaled_dot_product_attention(
435
+ query, key, value,
436
+ attn_mask=attention_mask,
437
+ dropout_p=0.0,
438
+ is_causal=False,
439
+ scale=attn.scale * kwargs.get("attn_scale_flow", 1.0),
440
+ )
441
+
442
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
443
+ hidden_states = hidden_states.to(query.dtype)
444
+
445
+ # linear proj
446
+ hidden_states = attn.to_out[0](hidden_states)
447
+
448
+ # dropout
449
+ hidden_states = attn.to_out[1](hidden_states)
450
+
451
+ if attn.residual_connection:
452
+ hidden_states = hidden_states + residual
453
+
454
+ hidden_states = hidden_states / attn.rescale_output_factor
455
+
456
+ return hidden_states
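To make the flow conditioning concrete, here is a minimal wiring sketch for `FlowAdaptorAttnProcessor` with the `Attention` block above. The sizes are hypothetical, and torch >= 2.1 is assumed because the processor passes `scale=` to `F.scaled_dot_product_attention`. Since the merge layer is zero-initialized, the block initially behaves like plain self-attention:

import torch

attn = Attention(query_dim=320, heads=8, dim_head=40)   # self-attention block
proc = FlowAdaptorAttnProcessor(
    type="temporal_self",            # illustrative label; only stored on the processor
    hidden_size=320, flow_feature_dim=320,
    query_condition=True, key_value_condition=True, flow_scale=1.0,
)
attn.set_processor(proc)

x = torch.randn(2, 64, 320)       # (batch, tokens, channels)
flow = torch.randn(2, 64, 320)    # flow features laid out like the hidden states
out = attn(x, flow_feature=flow)  # extra kwargs are forwarded to the processor
assert out.shape == x.shape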
onlyflow/models/flow_adaptor.py ADDED
@@ -0,0 +1,247 @@
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from torch.utils import checkpoint
7
+
8
+ from onlyflow.models.attention import BasicTransformerBlock
9
+
10
+
11
+ def get_parameter_dtype(parameter: torch.nn.Module):
12
+ params = tuple(parameter.parameters())
13
+ if len(params) > 0:
14
+ return params[0].dtype
15
+
16
+ buffers = tuple(parameter.buffers())
17
+ if len(buffers) > 0:
18
+ return buffers[0].dtype
19
+
20
+
21
+ def conv_nd(dims, *args, **kwargs):
22
+ """
23
+ Create a 1D, 2D, or 3D convolution module.
24
+ """
25
+ if dims == 1:
26
+ return nn.Conv1d(*args, **kwargs)
27
+ elif dims == 2:
28
+ return nn.Conv2d(*args, **kwargs)
29
+ elif dims == 3:
30
+ return nn.Conv3d(*args, **kwargs)
31
+ raise ValueError(f"unsupported dimensions: {dims}")
32
+
33
+
34
+ def avg_pool_nd(dims, *args, **kwargs):
35
+ """
36
+ Create a 1D, 2D, or 3D average pooling module.
37
+ """
38
+ if dims == 1:
39
+ return nn.AvgPool1d(*args, **kwargs)
40
+ elif dims == 2:
41
+ return nn.AvgPool2d(*args, **kwargs)
42
+ elif dims == 3:
43
+ return nn.AvgPool3d(*args, **kwargs)
44
+ raise ValueError(f"unsupported dimensions: {dims}")
45
+
46
+
47
+ class FlowAdaptor(nn.Module):
48
+ def __init__(self, unet, flow_encoder, ckpt_act=True):
49
+ super().__init__()
50
+ self.unet = unet
51
+ self.flow_encoder = flow_encoder
52
+ self.ckpt_act = ckpt_act
53
+
54
+ def forward(self, noisy_latents, timesteps, encoder_hidden_states, flow_embedding):
55
+ assert flow_embedding.ndim == 5
56
+ bs = flow_embedding.shape[0] # b c f h w
57
+ flow_embedding_features = self.flow_encoder(flow_embedding) # list of (b*f, c, h, w) feature maps
58
+ flow_embedding_features = [rearrange(x, '(b f) c h w -> b c f h w', b=bs)
59
+ for x in flow_embedding_features]
60
+
61
+ added_cond_kwargs = {'flow_embedding_features': flow_embedding_features}
62
+
63
+ noise_pred = self.unet(noisy_latents,
64
+ timesteps,
65
+ encoder_hidden_states,
66
+ added_cond_kwargs=added_cond_kwargs,
67
+ )
68
+
69
+ return noise_pred.sample
70
+
71
+
72
+ class Downsample(nn.Module):
73
+ """
74
+ A downsampling layer with an optional convolution.
75
+ :param channels: channels in the inputs and outputs.
76
+ :param use_conv: a bool determining if a convolution is applied.
77
+ :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
78
+ downsampling occurs in the inner-two dimensions.
79
+ """
80
+
81
+ def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1):
82
+ super().__init__()
83
+ self.channels = channels
84
+ self.out_channels = out_channels or channels
85
+ self.use_conv = use_conv
86
+ self.dims = dims
87
+ stride = 2 if dims != 3 else (1, 2, 2)
88
+ if use_conv:
89
+ self.op = conv_nd(dims, self.channels, self.out_channels, 3, stride=stride, padding=padding)
90
+ else:
91
+ assert self.channels == self.out_channels
92
+ self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
93
+
94
+ def forward(self, x):
95
+ assert x.shape[1] == self.channels
96
+ return self.op(x)
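For intuition, a small hypothetical check of the 3D case: with `dims=3` the stride is `(1, 2, 2)`, so the temporal axis is preserved while the spatial axes are halved.

import torch

down = Downsample(channels=64, use_conv=True, dims=3)
x = torch.randn(1, 64, 16, 32, 32)           # (batch, channels, frames, height, width)
assert down(x).shape == (1, 64, 16, 16, 16)  # frames kept, spatial dims halved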
97
+
98
+
99
+ class ResnetBlock(nn.Module):
100
+
101
+ def __init__(self, in_c, out_c, down, ksize=3, sk=False, use_conv=True):
102
+ super().__init__()
103
+ ps = ksize // 2
104
+ if in_c != out_c or not sk:
105
+ self.in_conv = nn.Conv2d(in_c, out_c, ksize, 1, ps)
106
+ else:
107
+ self.in_conv = None
108
+ self.block1 = nn.Conv2d(out_c, out_c, 3, 1, 1)
109
+ self.act = nn.ReLU()
110
+ self.block2 = nn.Conv2d(out_c, out_c, ksize, 1, ps)
111
+ if not sk:
112
+ self.skep = nn.Conv2d(in_c, out_c, ksize, 1, ps)
113
+ else:
114
+ self.skep = None
115
+
116
+ self.down = down
117
+ if self.down:
118
+ self.down_opt = Downsample(in_c, use_conv=use_conv)
119
+
120
+ def forward(self, x):
121
+ if self.down:
122
+ x = self.down_opt(x)
123
+ if self.in_conv is not None: # edit
124
+ x = self.in_conv(x)
125
+
126
+ h = self.block1(x)
127
+ h = self.act(h)
128
+ h = self.block2(h)
129
+ if self.skep is not None:
130
+ return h + self.skep(x)
131
+ else:
132
+ return h + x
133
+
134
+
135
+ class PositionalEncoding(nn.Module):
136
+ def __init__(
137
+ self,
138
+ d_model,
139
+ dropout=0.,
140
+ max_len=32,
141
+ ):
142
+ super().__init__()
143
+ self.dropout = nn.Dropout(p=dropout)
144
+ position = torch.arange(max_len).unsqueeze(1)
145
+ div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
146
+ pe = torch.zeros(1, max_len, d_model)
147
+ pe[0, :, 0::2, ...] = torch.sin(position * div_term)
148
+ pe[0, :, 1::2, ...] = torch.cos(position * div_term)
149
+ pe.unsqueeze_(-1).unsqueeze_(-1)
150
+ self.register_buffer('pe', pe)
151
+
152
+ def forward(self, x):
153
+ x = x + self.pe[:, :x.size(1), ...]
154
+ return self.dropout(x)
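This encoding targets 5-D video features: after the two trailing `unsqueeze_` calls, `pe` has shape `(1, max_len, d_model, 1, 1)`, so the sinusoidal terms are added along the frame axis and broadcast over the spatial axes. A small illustrative check with hypothetical shapes:

import torch

pos_enc = PositionalEncoding(d_model=320, max_len=32)
x = torch.zeros(2, 16, 320, 8, 8)   # (batch, frames, channels, height, width)
y = pos_enc(x)                       # adds per-frame sin/cos terms; shape is unchanged
assert y.shape == x.shape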
155
+
156
+
157
+ class FlowEncoder(nn.Module):
158
+
159
+ def __init__(self,
160
+ downscale_factor,
161
+ channels=None,
162
+ nums_rb=3,
163
+ ksize=3,
164
+ sk=False,
165
+ use_conv=True,
166
+ compression_factor=1,
167
+ temporal_attention_nhead=8,
168
+ positional_embeddings=None,
169
+ num_positional_embeddings=16,
170
+ rescale_output_factor=1.0,
171
+ checkpointing=False):
172
+ super(FlowEncoder, self).__init__()
173
+ if channels is None:
174
+ channels = [320, 640, 1280, 1280]
175
+
176
+ self.checkpointing = checkpointing
177
+ self.unshuffle = nn.PixelUnshuffle(downscale_factor)
178
+ self.channels = channels
179
+ self.nums_rb = nums_rb
180
+ self.encoder_down_conv_blocks = nn.ModuleList()
181
+ self.encoder_down_attention_blocks = nn.ModuleList()
182
+ for i in range(len(channels)):
183
+ conv_layers = nn.ModuleList()
184
+ temporal_attention_layers = nn.ModuleList()
185
+ for j in range(nums_rb):
186
+ if j == 0 and i != 0:
187
+ in_dim = channels[i - 1]
188
+ out_dim = int(channels[i] / compression_factor)
189
+ conv_layer = ResnetBlock(in_dim, out_dim, down=True, ksize=ksize, sk=sk, use_conv=use_conv)
190
+ elif j == 0:
191
+ in_dim = channels[0]
192
+ out_dim = int(channels[i] / compression_factor)
193
+ conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
194
+ elif j == nums_rb - 1:
195
+ in_dim = int(channels[i] / compression_factor) # cast so Conv2d receives an integer channel count
196
+ out_dim = channels[i]
197
+ conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
198
+ else:
199
+ in_dim = int(channels[i] / compression_factor)
200
+ out_dim = int(channels[i] / compression_factor)
201
+ conv_layer = ResnetBlock(in_dim, out_dim, down=False, ksize=ksize, sk=sk, use_conv=use_conv)
202
+ temporal_attention_layer = BasicTransformerBlock(
203
+ dim=out_dim,
204
+ num_attention_heads=temporal_attention_nhead,
205
+ attention_head_dim=int(out_dim / temporal_attention_nhead),
206
+ dropout=0.0,
207
+ positional_embeddings=positional_embeddings,
208
+ num_positional_embeddings=num_positional_embeddings
209
+ )
210
+ conv_layers.append(conv_layer)
211
+ temporal_attention_layers.append(temporal_attention_layer)
212
+ self.encoder_down_conv_blocks.append(conv_layers)
213
+ self.encoder_down_attention_blocks.append(temporal_attention_layers)
214
+
215
+ self.encoder_conv_in = nn.Conv2d(2 * (downscale_factor ** 2), channels[0], 3, 1, 1)
216
+
217
+ @property
218
+ def dtype(self) -> torch.dtype:
219
+ """
220
+ `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype).
221
+ """
222
+ return get_parameter_dtype(self)
223
+
224
+ def forward(self, x):
225
+ # unshuffle
226
+ bs = x.shape[0]
227
+ x = rearrange(x, "b c f h w -> (b f) c h w")
228
+ x = self.unshuffle(x)
229
+ # extract features
230
+ features = []
231
+ x = self.encoder_conv_in(x)
232
+ for i, (res_block, attention_block) in enumerate(
233
+ zip(self.encoder_down_conv_blocks, self.encoder_down_attention_blocks)):
234
+ for j, (res_layer, attention_layer) in enumerate(zip(res_block, attention_block)):
235
+ if self.checkpointing:
236
+ x = checkpoint.checkpoint(res_layer, x, use_reentrant=False)
237
+ else:
238
+ x = res_layer(x)
239
+ h, w = x.shape[-2:]
240
+ x = rearrange(x, '(b f) c h w -> (b h w) f c', b=bs)
241
+ if self.checkpointing:
242
+ x = checkpoint.checkpoint(attention_layer, x, use_reentrant=False)
243
+ else:
244
+ x = attention_layer(x)
245
+ x = rearrange(x, '(b h w) f c -> (b f) c h w', h=h, w=w)
246
+ features.append(x)
247
+ return features
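End to end, the encoder pixel-unshuffles a 2-channel optical-flow clip and then alternates ResNet and temporal-attention blocks per resolution, returning one feature map per entry of `channels`. A hypothetical shape trace, assuming the default configuration and that `BasicTransformerBlock` from `onlyflow.models.attention` follows the usual diffusers calling convention:

import torch

enc = FlowEncoder(downscale_factor=8)    # channels default to [320, 640, 1280, 1280]
flow = torch.randn(1, 2, 16, 256, 256)   # (batch, 2 flow channels, frames, height, width)
feats = enc(flow)                        # list with one tensor per resolution level
# expected shapes, with frames folded into the batch axis:
#   (16, 320, 32, 32), (16, 640, 16, 16), (16, 1280, 8, 8), (16, 1280, 4, 4)
# FlowAdaptor then rearranges each to (batch, channels, frames, h, w) before feeding the UNet.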
onlyflow/models/transformer_2d.py ADDED
@@ -0,0 +1,566 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from typing import Any, Dict, Optional
15
+
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from diffusers.configuration_utils import LegacyConfigMixin, register_to_config
19
+ from diffusers.models.embeddings import ImagePositionalEmbeddings, PatchEmbed, PixArtAlphaTextProjection
20
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
21
+ from diffusers.models.modeling_utils import LegacyModelMixin
22
+ from diffusers.models.normalization import AdaLayerNormSingle
23
+ from diffusers.utils import deprecate, is_torch_version, logging
24
+ from torch import nn
25
+
26
+ from onlyflow.models.attention import BasicTransformerBlock
27
+
28
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
29
+
30
+
31
+ class Transformer2DModelOutput(Transformer2DModelOutput):
32
+ def __init__(self, *args, **kwargs):
33
+ deprecation_message = "Importing `Transformer2DModelOutput` from `diffusers.models.transformer_2d` is deprecated and this will be removed in a future version. Please use `from diffusers.models.modeling_outputs import Transformer2DModelOutput`, instead."
34
+ deprecate("Transformer2DModelOutput", "1.0.0", deprecation_message)
35
+ super().__init__(*args, **kwargs)
36
+
37
+
38
+ class Transformer2DModel(LegacyModelMixin, LegacyConfigMixin):
39
+ """
40
+ A 2D Transformer model for image-like data.
41
+
42
+ Parameters:
43
+ num_attention_heads (`int`, *optional*, defaults to 16): The number of heads to use for multi-head attention.
44
+ attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
45
+ in_channels (`int`, *optional*):
46
+ The number of channels in the input and output (specify if the input is **continuous**).
47
+ num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
48
+ dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
49
+ cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
50
+ sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
51
+ This is fixed during training since it is used to learn a number of position embeddings.
52
+ num_vector_embeds (`int`, *optional*):
53
+ The number of classes of the vector embeddings of the latent pixels (specify if the input is **discrete**).
54
+ Includes the class for the masked latent pixel.
55
+ activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to use in feed-forward.
56
+ num_embeds_ada_norm ( `int`, *optional*):
57
+ The number of diffusion steps used during training. Pass if at least one of the norm_layers is
58
+ `AdaLayerNorm`. This is fixed during training since it is used to learn a number of embeddings that are
59
+ added to the hidden states.
60
+
61
+ During inference, you can denoise for up to, but not more than, `num_embeds_ada_norm` steps.
62
+ attention_bias (`bool`, *optional*):
63
+ Configure if the `TransformerBlocks` attention should contain a bias parameter.
64
+ """
65
+
66
+ _supports_gradient_checkpointing = True
67
+ _no_split_modules = ["BasicTransformerBlock"]
68
+
69
+ @register_to_config
70
+ def __init__(
71
+ self,
72
+ num_attention_heads: int = 16,
73
+ attention_head_dim: int = 88,
74
+ in_channels: Optional[int] = None,
75
+ out_channels: Optional[int] = None,
76
+ num_layers: int = 1,
77
+ dropout: float = 0.0,
78
+ norm_num_groups: int = 32,
79
+ cross_attention_dim: Optional[int] = None,
80
+ attention_bias: bool = False,
81
+ sample_size: Optional[int] = None,
82
+ num_vector_embeds: Optional[int] = None,
83
+ patch_size: Optional[int] = None,
84
+ activation_fn: str = "geglu",
85
+ num_embeds_ada_norm: Optional[int] = None,
86
+ use_linear_projection: bool = False,
87
+ only_cross_attention: bool = False,
88
+ double_self_attention: bool = False,
89
+ upcast_attention: bool = False,
90
+ norm_type: str = "layer_norm",
91
+ # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'ada_norm_continuous', 'layer_norm_i2vgen'
92
+ norm_elementwise_affine: bool = True,
93
+ norm_eps: float = 1e-5,
94
+ attention_type: str = "default",
95
+ caption_channels: int = None,
96
+ interpolation_scale: float = None,
97
+ use_additional_conditions: Optional[bool] = None,
98
+ ):
99
+ super().__init__()
100
+
101
+ # Validate inputs.
102
+ if patch_size is not None:
103
+ if norm_type not in ["ada_norm", "ada_norm_zero", "ada_norm_single"]:
104
+ raise NotImplementedError(
105
+ f"Forward pass is not implemented when `patch_size` is not None and `norm_type` is '{norm_type}'."
106
+ )
107
+ elif norm_type in ["ada_norm", "ada_norm_zero"] and num_embeds_ada_norm is None:
108
+ raise ValueError(
109
+ f"When using a `patch_size` and this `norm_type` ({norm_type}), `num_embeds_ada_norm` cannot be None."
110
+ )
111
+
112
+ # 1. Transformer2DModel can process both standard continuous images of shape `(batch_size, num_channels, width, height)` as well as quantized image embeddings of shape `(batch_size, num_image_vectors)`
113
+ # Define whether input is continuous or discrete depending on configuration
114
+ self.is_input_continuous = (in_channels is not None) and (patch_size is None)
115
+ self.is_input_vectorized = num_vector_embeds is not None
116
+ self.is_input_patches = in_channels is not None and patch_size is not None
117
+
118
+ if self.is_input_continuous and self.is_input_vectorized:
119
+ raise ValueError(
120
+ f"Cannot define both `in_channels`: {in_channels} and `num_vector_embeds`: {num_vector_embeds}. Make"
121
+ " sure that either `in_channels` or `num_vector_embeds` is None."
122
+ )
123
+ elif self.is_input_vectorized and self.is_input_patches:
124
+ raise ValueError(
125
+ f"Cannot define both `num_vector_embeds`: {num_vector_embeds} and `patch_size`: {patch_size}. Make"
126
+ " sure that either `num_vector_embeds` or `num_patches` is None."
127
+ )
128
+ elif not self.is_input_continuous and not self.is_input_vectorized and not self.is_input_patches:
129
+ raise ValueError(
130
+ f"Has to define `in_channels`: {in_channels}, `num_vector_embeds`: {num_vector_embeds}, or patch_size:"
131
+ f" {patch_size}. Make sure that `in_channels`, `num_vector_embeds` or `num_patches` is not None."
132
+ )
133
+
134
+ if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
135
+ deprecation_message = (
136
+ f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
137
+ " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
138
+ " Please make sure to update the config accordingly as leaving `norm_type` might lead to incorrect"
139
+ " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
140
+ " would be very nice if you could open a Pull request for the `transformer/config.json` file"
141
+ )
142
+ deprecate("norm_type!=num_embeds_ada_norm", "1.0.0", deprecation_message, standard_warn=False)
143
+ norm_type = "ada_norm"
144
+
145
+ # Set some common variables used across the board.
146
+ self.use_linear_projection = use_linear_projection
147
+ self.interpolation_scale = interpolation_scale
148
+ self.caption_channels = caption_channels
149
+ self.num_attention_heads = num_attention_heads
150
+ self.attention_head_dim = attention_head_dim
151
+ self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
152
+ self.in_channels = in_channels
153
+ self.out_channels = in_channels if out_channels is None else out_channels
154
+ self.gradient_checkpointing = False
155
+
156
+ if use_additional_conditions is None:
157
+ if norm_type == "ada_norm_single" and sample_size == 128:
158
+ use_additional_conditions = True
159
+ else:
160
+ use_additional_conditions = False
161
+ self.use_additional_conditions = use_additional_conditions
162
+
163
+ # 2. Initialize the right blocks.
164
+ # These functions follow a common structure:
165
+ # a. Initialize the input blocks. b. Initialize the transformer blocks.
166
+ # c. Initialize the output blocks and other projection blocks when necessary.
167
+ if self.is_input_continuous:
168
+ self._init_continuous_input(norm_type=norm_type)
169
+ elif self.is_input_vectorized:
170
+ self._init_vectorized_inputs(norm_type=norm_type)
171
+ elif self.is_input_patches:
172
+ self._init_patched_inputs(norm_type=norm_type)
173
+
174
+ def _init_continuous_input(self, norm_type):
175
+ self.norm = torch.nn.GroupNorm(
176
+ num_groups=self.config.norm_num_groups, num_channels=self.in_channels, eps=1e-6, affine=True
177
+ )
178
+ if self.use_linear_projection:
179
+ self.proj_in = torch.nn.Linear(self.in_channels, self.inner_dim)
180
+ else:
181
+ self.proj_in = torch.nn.Conv2d(self.in_channels, self.inner_dim, kernel_size=1, stride=1, padding=0)
182
+
183
+ self.transformer_blocks = nn.ModuleList(
184
+ [
185
+ BasicTransformerBlock(
186
+ self.inner_dim,
187
+ self.config.num_attention_heads,
188
+ self.config.attention_head_dim,
189
+ dropout=self.config.dropout,
190
+ cross_attention_dim=self.config.cross_attention_dim,
191
+ activation_fn=self.config.activation_fn,
192
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
193
+ attention_bias=self.config.attention_bias,
194
+ only_cross_attention=self.config.only_cross_attention,
195
+ double_self_attention=self.config.double_self_attention,
196
+ upcast_attention=self.config.upcast_attention,
197
+ norm_type=norm_type,
198
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
199
+ norm_eps=self.config.norm_eps,
200
+ attention_type=self.config.attention_type,
201
+ )
202
+ for _ in range(self.config.num_layers)
203
+ ]
204
+ )
205
+
206
+ if self.use_linear_projection:
207
+ self.proj_out = torch.nn.Linear(self.inner_dim, self.out_channels)
208
+ else:
209
+ self.proj_out = torch.nn.Conv2d(self.inner_dim, self.out_channels, kernel_size=1, stride=1, padding=0)
210
+
211
+ def _init_vectorized_inputs(self, norm_type):
212
+ assert self.config.sample_size is not None, "Transformer2DModel over discrete input must provide sample_size"
213
+ assert (
214
+ self.config.num_vector_embeds is not None
215
+ ), "Transformer2DModel over discrete input must provide num_embed"
216
+
217
+ self.height = self.config.sample_size
218
+ self.width = self.config.sample_size
219
+ self.num_latent_pixels = self.height * self.width
220
+
221
+ self.latent_image_embedding = ImagePositionalEmbeddings(
222
+ num_embed=self.config.num_vector_embeds, embed_dim=self.inner_dim, height=self.height, width=self.width
223
+ )
224
+
225
+ self.transformer_blocks = nn.ModuleList(
226
+ [
227
+ BasicTransformerBlock(
228
+ self.inner_dim,
229
+ self.config.num_attention_heads,
230
+ self.config.attention_head_dim,
231
+ dropout=self.config.dropout,
232
+ cross_attention_dim=self.config.cross_attention_dim,
233
+ activation_fn=self.config.activation_fn,
234
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
235
+ attention_bias=self.config.attention_bias,
236
+ only_cross_attention=self.config.only_cross_attention,
237
+ double_self_attention=self.config.double_self_attention,
238
+ upcast_attention=self.config.upcast_attention,
239
+ norm_type=norm_type,
240
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
241
+ norm_eps=self.config.norm_eps,
242
+ attention_type=self.config.attention_type,
243
+ )
244
+ for _ in range(self.config.num_layers)
245
+ ]
246
+ )
247
+
248
+ self.norm_out = nn.LayerNorm(self.inner_dim)
249
+ self.out = nn.Linear(self.inner_dim, self.config.num_vector_embeds - 1)
250
+
251
+ def _init_patched_inputs(self, norm_type):
252
+ assert self.config.sample_size is not None, "Transformer2DModel over patched input must provide sample_size"
253
+
254
+ self.height = self.config.sample_size
255
+ self.width = self.config.sample_size
256
+
257
+ self.patch_size = self.config.patch_size
258
+ interpolation_scale = (
259
+ self.config.interpolation_scale
260
+ if self.config.interpolation_scale is not None
261
+ else max(self.config.sample_size // 64, 1)
262
+ )
263
+ self.pos_embed = PatchEmbed(
264
+ height=self.config.sample_size,
265
+ width=self.config.sample_size,
266
+ patch_size=self.config.patch_size,
267
+ in_channels=self.in_channels,
268
+ embed_dim=self.inner_dim,
269
+ interpolation_scale=interpolation_scale,
270
+ )
271
+
272
+ self.transformer_blocks = nn.ModuleList(
273
+ [
274
+ BasicTransformerBlock(
275
+ self.inner_dim,
276
+ self.config.num_attention_heads,
277
+ self.config.attention_head_dim,
278
+ dropout=self.config.dropout,
279
+ cross_attention_dim=self.config.cross_attention_dim,
280
+ activation_fn=self.config.activation_fn,
281
+ num_embeds_ada_norm=self.config.num_embeds_ada_norm,
282
+ attention_bias=self.config.attention_bias,
283
+ only_cross_attention=self.config.only_cross_attention,
284
+ double_self_attention=self.config.double_self_attention,
285
+ upcast_attention=self.config.upcast_attention,
286
+ norm_type=norm_type,
287
+ norm_elementwise_affine=self.config.norm_elementwise_affine,
288
+ norm_eps=self.config.norm_eps,
289
+ attention_type=self.config.attention_type,
290
+ )
291
+ for _ in range(self.config.num_layers)
292
+ ]
293
+ )
294
+
295
+ if self.config.norm_type != "ada_norm_single":
296
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
297
+ self.proj_out_1 = nn.Linear(self.inner_dim, 2 * self.inner_dim)
298
+ self.proj_out_2 = nn.Linear(
299
+ self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
300
+ )
301
+ elif self.config.norm_type == "ada_norm_single":
302
+ self.norm_out = nn.LayerNorm(self.inner_dim, elementwise_affine=False, eps=1e-6)
303
+ self.scale_shift_table = nn.Parameter(torch.randn(2, self.inner_dim) / self.inner_dim ** 0.5)
304
+ self.proj_out = nn.Linear(
305
+ self.inner_dim, self.config.patch_size * self.config.patch_size * self.out_channels
306
+ )
307
+
308
+ # PixArt-Alpha blocks.
309
+ self.adaln_single = None
310
+ if self.config.norm_type == "ada_norm_single":
311
+ # TODO(Sayak, PVP) clean this, for now we use sample size to determine whether to use
312
+ # additional conditions until we find better name
313
+ self.adaln_single = AdaLayerNormSingle(
314
+ self.inner_dim, use_additional_conditions=self.use_additional_conditions
315
+ )
316
+
317
+ self.caption_projection = None
318
+ if self.caption_channels is not None:
319
+ self.caption_projection = PixArtAlphaTextProjection(
320
+ in_features=self.caption_channels, hidden_size=self.inner_dim
321
+ )
322
+
323
+ def _set_gradient_checkpointing(self, module, value=False):
324
+ if hasattr(module, "gradient_checkpointing"):
325
+ module.gradient_checkpointing = value
326
+
327
+ def forward(
328
+ self,
329
+ hidden_states: torch.Tensor,
330
+ encoder_hidden_states: Optional[torch.Tensor] = None,
331
+ timestep: Optional[torch.LongTensor] = None,
332
+ added_cond_kwargs: Dict[str, torch.Tensor] = None,
333
+ class_labels: Optional[torch.LongTensor] = None,
334
+ cross_attention_kwargs: Dict[str, Any] = None,
335
+ attention_mask: Optional[torch.Tensor] = None,
336
+ encoder_attention_mask: Optional[torch.Tensor] = None,
337
+ return_dict: bool = True,
338
+ ):
339
+ """
340
+ The [`Transformer2DModel`] forward method.
341
+
342
+ Args:
343
+ hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.Tensor` of shape `(batch size, channel, height, width)` if continuous):
344
+ Input `hidden_states`.
345
+ encoder_hidden_states ( `torch.Tensor` of shape `(batch size, sequence len, embed dims)`, *optional*):
346
+ Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
347
+ self-attention.
348
+ timestep ( `torch.LongTensor`, *optional*):
349
+ Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
350
+ class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
351
+ Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
352
+ `AdaLayerZeroNorm`.
353
+ cross_attention_kwargs ( `Dict[str, Any]`, *optional*):
354
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
355
+ `self.processor` in
356
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
357
+ attention_mask ( `torch.Tensor`, *optional*):
358
+ An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
359
+ is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
360
+ negative values to the attention scores corresponding to "discard" tokens.
361
+ encoder_attention_mask ( `torch.Tensor`, *optional*):
362
+ Cross-attention mask applied to `encoder_hidden_states`. Two formats supported:
363
+
364
+ * Mask `(batch, sequence_length)` True = keep, False = discard.
365
+ * Bias `(batch, 1, sequence_length)` 0 = keep, -10000 = discard.
366
+
367
+ If `ndim == 2`: will be interpreted as a mask, then converted into a bias consistent with the format
368
+ above. This bias will be added to the cross-attention scores.
369
+ return_dict (`bool`, *optional*, defaults to `True`):
370
+ Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
371
+ tuple.
372
+
373
+ Returns:
374
+ If `return_dict` is True, an [`~models.transformers.transformer_2d.Transformer2DModelOutput`] is returned,
375
+ otherwise a `tuple` where the first element is the sample tensor.
376
+ """
377
+ if cross_attention_kwargs is not None:
378
+ if cross_attention_kwargs.get("scale", None) is not None:
379
+ logger.warning("Passing `scale` to `cross_attention_kwargs` is deprecated. `scale` will be ignored.")
380
+ # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
381
+ # we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
382
+ # we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
383
+ # expects mask of shape:
384
+ # [batch, key_tokens]
385
+ # adds singleton query_tokens dimension:
386
+ # [batch, 1, key_tokens]
387
+ # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
388
+ # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn)
389
+ # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
390
+ if attention_mask is not None and attention_mask.ndim == 2:
391
+ # assume that mask is expressed as:
392
+ # (1 = keep, 0 = discard)
393
+ # convert mask into a bias that can be added to attention scores:
394
+ # (keep = +0, discard = -10000.0)
395
+ attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
396
+ attention_mask = attention_mask.unsqueeze(1)
397
+
398
+ # convert encoder_attention_mask to a bias the same way we do for attention_mask
399
+ if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
400
+ encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
401
+ encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
402
+
403
+ # 1. Input
404
+ if self.is_input_continuous:
405
+ batch_size, _, height, width = hidden_states.shape
406
+ residual = hidden_states
407
+ hidden_states, inner_dim = self._operate_on_continuous_inputs(hidden_states)
408
+ elif self.is_input_vectorized:
409
+ hidden_states = self.latent_image_embedding(hidden_states)
410
+ elif self.is_input_patches:
411
+ height, width = hidden_states.shape[-2] // self.patch_size, hidden_states.shape[-1] // self.patch_size
412
+ hidden_states, encoder_hidden_states, timestep, embedded_timestep = self._operate_on_patched_inputs(
413
+ hidden_states, encoder_hidden_states, timestep, added_cond_kwargs
414
+ )
415
+
416
+ # 2. Blocks
417
+ for block in self.transformer_blocks:
418
+ if self.training and self.gradient_checkpointing:
419
+
420
+ def create_custom_forward(module, return_dict=None):
421
+ def custom_forward(*inputs):
422
+ if return_dict is not None:
423
+ return module(*inputs, return_dict=return_dict)
424
+ else:
425
+ return module(*inputs)
426
+
427
+ return custom_forward
428
+
429
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
430
+ hidden_states = torch.utils.checkpoint.checkpoint(
431
+ create_custom_forward(block),
432
+ hidden_states,
433
+ attention_mask,
434
+ encoder_hidden_states,
435
+ encoder_attention_mask,
436
+ timestep,
437
+ cross_attention_kwargs,
438
+ class_labels,
439
+ **ckpt_kwargs,
440
+ )
441
+ else:
442
+ hidden_states = block(
443
+ hidden_states=hidden_states,
444
+ attention_mask=attention_mask,
445
+ encoder_hidden_states=encoder_hidden_states,
446
+ encoder_attention_mask=encoder_attention_mask,
447
+ timestep=timestep,
448
+ cross_attention_kwargs=cross_attention_kwargs,
449
+ class_labels=class_labels,
450
+ )
451
+
452
+ # 3. Output
453
+ if self.is_input_continuous:
454
+ output = self._get_output_for_continuous_inputs(
455
+ hidden_states=hidden_states,
456
+ residual=residual,
457
+ batch_size=batch_size,
458
+ height=height,
459
+ width=width,
460
+ inner_dim=inner_dim,
461
+ )
462
+ elif self.is_input_vectorized:
463
+ output = self._get_output_for_vectorized_inputs(hidden_states)
464
+ elif self.is_input_patches:
465
+ output = self._get_output_for_patched_inputs(
466
+ hidden_states=hidden_states,
467
+ timestep=timestep,
468
+ class_labels=class_labels,
469
+ embedded_timestep=embedded_timestep,
470
+ height=height,
471
+ width=width,
472
+ )
473
+
474
+ if not return_dict:
475
+ return (output,)
476
+
477
+ return Transformer2DModelOutput(sample=output)
478
+
479
+ def _operate_on_continuous_inputs(self, hidden_states):
480
+ batch, _, height, width = hidden_states.shape
481
+ hidden_states = self.norm(hidden_states)
482
+
483
+ if not self.use_linear_projection:
484
+ hidden_states = self.proj_in(hidden_states)
485
+ inner_dim = hidden_states.shape[1]
486
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
487
+ else:
488
+ inner_dim = hidden_states.shape[1]
489
+ hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
490
+ hidden_states = self.proj_in(hidden_states)
491
+
492
+ return hidden_states, inner_dim
493
+
494
+ def _operate_on_patched_inputs(self, hidden_states, encoder_hidden_states, timestep, added_cond_kwargs):
495
+ batch_size = hidden_states.shape[0]
496
+ hidden_states = self.pos_embed(hidden_states)
497
+ embedded_timestep = None
498
+
499
+ if self.adaln_single is not None:
500
+ if self.use_additional_conditions and added_cond_kwargs is None:
501
+ raise ValueError(
502
+ "`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`."
503
+ )
504
+ timestep, embedded_timestep = self.adaln_single(
505
+ timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
506
+ )
507
+
508
+ if self.caption_projection is not None:
509
+ encoder_hidden_states = self.caption_projection(encoder_hidden_states)
510
+ encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
511
+
512
+ return hidden_states, encoder_hidden_states, timestep, embedded_timestep
513
+
514
+ def _get_output_for_continuous_inputs(self, hidden_states, residual, batch_size, height, width, inner_dim):
515
+ if not self.use_linear_projection:
516
+ hidden_states = (
517
+ hidden_states.reshape(batch_size, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
518
+ )
519
+ hidden_states = self.proj_out(hidden_states)
520
+ else:
521
+ hidden_states = self.proj_out(hidden_states)
522
+ hidden_states = (
523
+ hidden_states.reshape(batch_size, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
524
+ )
525
+
526
+ output = hidden_states + residual
527
+ return output
528
+
529
+ def _get_output_for_vectorized_inputs(self, hidden_states):
530
+ hidden_states = self.norm_out(hidden_states)
531
+ logits = self.out(hidden_states)
532
+ # (batch, self.num_vector_embeds - 1, self.num_latent_pixels)
533
+ logits = logits.permute(0, 2, 1)
534
+ # log(p(x_0))
535
+ output = F.log_softmax(logits.double(), dim=1).float()
536
+ return output
537
+
538
+ def _get_output_for_patched_inputs(
539
+ self, hidden_states, timestep, class_labels, embedded_timestep, height=None, width=None
540
+ ):
541
+ if self.config.norm_type != "ada_norm_single":
542
+ conditioning = self.transformer_blocks[0].norm1.emb(
543
+ timestep, class_labels, hidden_dtype=hidden_states.dtype
544
+ )
545
+ shift, scale = self.proj_out_1(F.silu(conditioning)).chunk(2, dim=1)
546
+ hidden_states = self.norm_out(hidden_states) * (1 + scale[:, None]) + shift[:, None]
547
+ hidden_states = self.proj_out_2(hidden_states)
548
+ elif self.config.norm_type == "ada_norm_single":
549
+ shift, scale = (self.scale_shift_table[None] + embedded_timestep[:, None]).chunk(2, dim=1)
550
+ hidden_states = self.norm_out(hidden_states)
551
+ # Modulation
552
+ hidden_states = hidden_states * (1 + scale) + shift
553
+ hidden_states = self.proj_out(hidden_states)
554
+ hidden_states = hidden_states.squeeze(1)
555
+
556
+ # unpatchify
557
+ if self.adaln_single is None:
558
+ height = width = int(hidden_states.shape[1] ** 0.5)
559
+ hidden_states = hidden_states.reshape(
560
+ shape=(-1, height, width, self.patch_size, self.patch_size, self.out_channels)
561
+ )
562
+ hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
563
+ output = hidden_states.reshape(
564
+ shape=(-1, self.out_channels, height * self.patch_size, width * self.patch_size)
565
+ )
566
+ return output
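This file is essentially the upstream diffusers `Transformer2DModel`, re-pointed at OnlyFlow's `BasicTransformerBlock`. A hypothetical smoke test of the continuous-input path, assuming that block keeps a diffusers-compatible signature; all sizes are illustrative:

import torch

model = Transformer2DModel(
    num_attention_heads=8, attention_head_dim=40,
    in_channels=320, norm_num_groups=32, cross_attention_dim=768,
)
x = torch.randn(1, 320, 32, 32)   # latent feature map
ctx = torch.randn(1, 77, 768)     # text-encoder hidden states
out = model(x, encoder_hidden_states=ctx).sample
assert out.shape == x.shape       # the continuous path projects back to the input resolution and adds the residual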
onlyflow/models/unet.py ADDED
The diff for this file is too large to render. See raw diff
 
onlyflow/pipelines/pipeline_animation.py ADDED
@@ -0,0 +1,497 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+
4
+ # TODO: rebase on diffusers/pipelines/animatediff/pipeline_animatediff.py
5
+
6
+ import copy
7
+ from dataclasses import dataclass
8
+ from typing import Callable, Optional, Dict, Any
9
+ from typing import List, Union
10
+
11
+ import PIL.Image
12
+ import numpy as np
13
+ import torch
14
+ from diffusers import AnimateDiffPipeline
15
+ from diffusers.image_processor import PipelineImageInput
16
+ from diffusers.models import AutoencoderKL
17
+ from diffusers.pipelines.animatediff import AnimateDiffPipelineOutput
18
+ from diffusers.pipelines.animatediff.pipeline_animatediff import EXAMPLE_DOC_STRING
19
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
20
+ from diffusers.schedulers import (
21
+ DDIMScheduler,
22
+ DPMSolverMultistepScheduler,
23
+ EulerAncestralDiscreteScheduler,
24
+ EulerDiscreteScheduler,
25
+ LMSDiscreteScheduler,
26
+ PNDMScheduler,
27
+ )
28
+ from diffusers.utils import BaseOutput
29
+ from diffusers.utils import deprecate, logging, replace_example_docstring
30
+ from einops import rearrange
31
+ from transformers import CLIPTextModel, CLIPTokenizer
32
+
33
+ from onlyflow.models.flow_adaptor import FlowEncoder
34
+ from onlyflow.models.unet import UNetMotionModel
35
+
36
+ logger = logging.get_logger(__name__)
37
+
38
+
39
+ @dataclass
40
+ class AnimateDiffPipelineOutput(BaseOutput):
41
+ frames_no_flow: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
42
+ frames_flow: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
43
+
44
+
45
+ class FlowCtrlPipeline(AnimateDiffPipeline, DiffusionPipeline):
46
+ _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
47
+
48
+ def __init__(self,
49
+ vae: AutoencoderKL,
50
+ text_encoder: CLIPTextModel,
51
+ tokenizer: CLIPTokenizer,
52
+ unet: UNetMotionModel,
53
+ scheduler: Union[
54
+ DDIMScheduler,
55
+ PNDMScheduler,
56
+ LMSDiscreteScheduler,
57
+ EulerDiscreteScheduler,
58
+ EulerAncestralDiscreteScheduler,
59
+ DPMSolverMultistepScheduler],
60
+ flow_encoder: FlowEncoder,
61
+ feature_extractor=None,
62
+ image_encoder=None,
63
+ motion_adapter=None,
64
+ ):
65
+
66
+ super().__init__(
67
+ vae=vae,
68
+ text_encoder=text_encoder,
69
+ tokenizer=tokenizer,
70
+ unet=unet,
71
+ motion_adapter=motion_adapter,
72
+ scheduler=scheduler,
73
+ feature_extractor=feature_extractor,
74
+ image_encoder=image_encoder,
75
+ )
76
+
77
+ # deepcopy the scheduler
78
+ self.scheduler_no_flow = copy.deepcopy(scheduler)
79
+
80
+ self.unet = unet
81
+
82
+ self.register_modules(
83
+ flow_encoder=flow_encoder
84
+ )
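For orientation, a hypothetical assembly of the pipeline. The component names below are placeholders: the SD text encoder, tokenizer, VAE, scheduler, and the OnlyFlow `UNetMotionModel` / `FlowEncoder` weights are assumed to have been loaded elsewhere, so this is a sketch of the call shape rather than a runnable recipe.

# pipe = FlowCtrlPipeline(
#     vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
#     unet=unet, scheduler=scheduler, flow_encoder=flow_encoder,
#     motion_adapter=motion_adapter,
# )
# result = pipe(
#     prompt="a boat drifting on a lake",
#     flow_embedding=flow,              # (batch, 2, frames, height, width) optical flow
#     num_frames=16, num_inference_steps=25,
# )
# result.frames_flow and result.frames_no_flow hold the flow-conditioned and unconditioned videos.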
85
+
86
+ @torch.no_grad()
87
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
88
+ def __call__(
89
+ self,
90
+ prompt: Union[str, List[str]] = None,
91
+ flow_embedding: torch.FloatTensor = None,
92
+
93
+ num_frames: Optional[int] = 16,
94
+ height: Optional[int] = None,
95
+ width: Optional[int] = None,
96
+
97
+ num_inference_steps: int = 50,
98
+ guidance_scale: float = 7.5,
99
+ negative_prompt: Optional[Union[str, List[str]]] = None,
100
+ eta: float = 0.0,
101
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
102
+ latents: Optional[torch.Tensor] = None,
103
+
104
+ prompt_embeds: Optional[torch.Tensor] = None,
105
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
106
+ ip_adapter_image: Optional[PipelineImageInput] = None,
107
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
108
+
109
+ output_type: Optional[str] = "pt",
110
+ return_dict: bool = True,
111
+
112
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
113
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
114
+
115
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
116
+ motion_cross_attention_kwargs: Optional[Dict[str, Any]] = None,
117
+
118
+ clip_skip: Optional[int] = None,
119
+ decode_chunk_size: int = 16,
120
+
121
+ val_scale_factor_spatial: float = 1.,
122
+ val_scale_factor_temporal: float = 1.,
123
+
124
+ generate_no_flow: bool = False,
125
+
126
+ **kwargs,
127
+ ):
128
+ r"""
129
+ The call function to the pipeline for generation.
130
+
131
+ Args:
132
+ prompt (`str` or `List[str]`, *optional*):
133
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
134
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
135
+ The height in pixels of the generated video.
136
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
137
+ The width in pixels of the generated video.
138
+ num_frames (`int`, *optional*, defaults to 16):
139
+ The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
140
+ amounts to 2 seconds of video.
141
+ num_inference_steps (`int`, *optional*, defaults to 50):
142
+ The number of denoising steps. More denoising steps usually lead to higher quality videos at the
143
+ expense of slower inference.
144
+ guidance_scale (`float`, *optional*, defaults to 7.5):
145
+ A higher guidance scale value encourages the model to generate images closely linked to the text
146
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
147
+ negative_prompt (`str` or `List[str]`, *optional*):
148
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
149
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
150
+ eta (`float`, *optional*, defaults to 0.0):
151
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
152
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
153
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
154
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
155
+ generation deterministic.
156
+ latents (`torch.Tensor`, *optional*):
157
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
158
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
159
+ tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
160
+ `(batch_size, num_channel, num_frames, height, width)`.
161
+ prompt_embeds (`torch.Tensor`, *optional*):
162
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
163
+ provided, text embeddings are generated from the `prompt` input argument.
164
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
165
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
166
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
167
+ ip_adapter_image: (`PipelineImageInput`, *optional*):
168
+ Optional image input to work with IP Adapters.
169
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
170
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
171
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
172
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
173
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
174
+ output_type (`str`, *optional*, defaults to `"pt"`):
175
+ The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
176
+ return_dict (`bool`, *optional*, defaults to `True`):
177
+ Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
178
+ of a plain tuple.
179
+ cross_attention_kwargs (`dict`, *optional*):
180
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
181
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
182
+ clip_skip (`int`, *optional*):
183
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
184
+ the output of the pre-final layer will be used for computing the prompt embeddings.
185
+ callback_on_step_end (`Callable`, *optional*):
186
+ A function that calls at the end of each denoising steps during the inference. The function is called
187
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
188
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
189
+ `callback_on_step_end_tensor_inputs`.
190
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
191
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
192
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
193
+ `._callback_tensor_inputs` attribute of your pipeline class.
194
+ decode_chunk_size (`int`, defaults to `16`):
195
+ The number of frames to decode at a time when calling `decode_latents` method.
196
+
197
+ Examples:
198
+
199
+ Returns:
200
+ [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] or `tuple`:
201
+ If `return_dict` is `True`, [`~pipelines.animatediff.pipeline_output.AnimateDiffPipelineOutput`] is
202
+ returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
203
+ """
204
+
205
+ callback = kwargs.pop("callback", None)
206
+ callback_steps = kwargs.pop("callback_steps", None)
207
+
208
+ if callback is not None:
209
+ deprecate(
210
+ "callback",
211
+ "1.0.0",
212
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
213
+ )
214
+ if callback_steps is not None:
215
+ deprecate(
216
+ "callback_steps",
217
+ "1.0.0",
218
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
219
+ )
220
+
221
+ # 0. Default height and width to unet
222
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
223
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
224
+
225
+ num_videos_per_prompt = 1
226
+
227
+ # 1. Check inputs. Raise error if not correct
228
+ self.check_inputs(
229
+ prompt,
230
+ height,
231
+ width,
232
+ callback_steps,
233
+ negative_prompt,
234
+ prompt_embeds,
235
+ negative_prompt_embeds,
236
+ ip_adapter_image,
237
+ ip_adapter_image_embeds,
238
+ callback_on_step_end_tensor_inputs,
239
+ )
240
+
241
+ self._guidance_scale = guidance_scale
242
+ self._clip_skip = clip_skip
243
+ self._cross_attention_kwargs = cross_attention_kwargs
244
+
245
+ # 2. Define call parameters
246
+ if prompt is not None and isinstance(prompt, str):
247
+ batch_size = 1
248
+ elif prompt is not None and isinstance(prompt, list):
249
+ batch_size = len(prompt)
250
+ else:
251
+ batch_size = prompt_embeds.shape[0]
252
+
253
+ device = self._execution_device  # respects model CPU offload hooks
254
+
255
+ # 3. Encode input prompt
256
+ text_encoder_lora_scale = (
257
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
258
+ )
259
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
260
+ prompt,
261
+ device,
262
+ num_videos_per_prompt,
263
+ self.do_classifier_free_guidance,
264
+ negative_prompt,
265
+ prompt_embeds=prompt_embeds,
266
+ negative_prompt_embeds=negative_prompt_embeds,
267
+ lora_scale=text_encoder_lora_scale,
268
+ clip_skip=self.clip_skip,
269
+ )
270
+ # For classifier free guidance, we need to do two forward passes.
271
+ # Here we concatenate the unconditional and text embeddings into a single batch
272
+ # to avoid doing two forward passes
273
+ if self.do_classifier_free_guidance:
274
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
275
+
276
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
277
+ image_embeds = self.prepare_ip_adapter_image_embeds(
278
+ ip_adapter_image,
279
+ ip_adapter_image_embeds,
280
+ device,
281
+ batch_size * num_videos_per_prompt,
282
+ self.do_classifier_free_guidance,
283
+ )
284
+
285
+ # 4. Prepare timesteps
286
+ single_model_length = num_frames
287
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
288
+ timesteps = self.scheduler.timesteps
289
+
290
+ # 5. Prepare latent variables
291
+ num_channels_latents = self.unet.config.in_channels
292
+ latents = self.prepare_latents(
293
+ batch_size * num_videos_per_prompt,
294
+ num_channels_latents,
295
+ num_frames,
296
+ height,
297
+ width,
298
+ prompt_embeds.dtype,
299
+ device,
300
+ generator,
301
+ latents,
302
+ )
303
+
304
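+ # clone the initial noise so the optional flow-free baseline starts from exactly the same latents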
+ if generate_no_flow:
305
+ latents_no_flow = latents.clone()
306
+
307
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
308
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
309
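+ # encode the raw optical flow into multi-scale features; the flow encoder returns (b*f, c, h, w) maps that are reshaped to (b, c, f, h, w)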
+ if isinstance(flow_embedding, list):
310
+ assert all([x.ndim == 5 for x in flow_embedding])
311
+ bs = flow_embedding[0].shape[0]
312
+ flow_embedding_features = []
313
+ for pe in flow_embedding:
314
+ flow_embedding_feature = self.flow_encoder(pe)
315
+ flow_embedding_feature = [rearrange(x, '(b f) c h w -> b c f h w', b=bs) for x in
316
+ flow_embedding_feature]
317
+ flow_embedding_features.append(flow_embedding_feature)
318
+ else:
319
+ bs = flow_embedding.shape[0]
320
+ assert flow_embedding.ndim == 5
321
+ flow_embedding_features = self.flow_encoder(flow_embedding)  # multi-scale features shaped (b*f, c, h, w)
322
+ flow_embedding_features = [rearrange(x, '(b f) c h w -> b c f h w', b=bs)
323
+ for x in flow_embedding_features]
324
+
325
+ # 7. Add image embeds for IP-Adapter
326
+ added_cond_kwargs = {
327
+ "image_embeds": image_embeds} if ip_adapter_image is not None or ip_adapter_image_embeds is not None else None
328
+
329
+ num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
330
+ for free_init_iter in range(num_free_init_iters):
331
+ if self.free_init_enabled:
332
+ latents, timesteps = self._apply_free_init(
333
+ latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
334
+ )
335
+ if generate_no_flow:
336
+ latents_no_flow = latents.clone()
337
+
338
+ self._num_timesteps = len(timesteps)
339
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
340
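+ # duplicate the flow features along the batch dimension so they match the [uncond, cond] latents used for classifier-free guidance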
+ if isinstance(flow_embedding_features[0], list):
341
+ flow_embedding_features = [[torch.cat([x, x], dim=0) for x in flow_embedding_feature]
342
+ for flow_embedding_feature in flow_embedding_features] \
343
+ if self.do_classifier_free_guidance else flow_embedding_features
344
+ else:
345
+ flow_embedding_features = [torch.cat([x, x], dim=0) for x in flow_embedding_features] \
346
+ if self.do_classifier_free_guidance else flow_embedding_features # [2b c f h w]
347
+
348
+ # 8. Denoising loop
349
+ with self.progress_bar(total=self._num_timesteps) as progress_bar:
350
+ for i, t in enumerate(timesteps):
351
+
352
+ # expand the latents if we are doing classifier free guidance
353
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
354
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
355
+
356
+ if added_cond_kwargs is not None:
357
+ added_cond_kwargs.update({"flow_embedding_features": flow_embedding_features})
358
+ else:
359
+ added_cond_kwargs = {"flow_embedding_features": flow_embedding_features}
360
+
361
+ if cross_attention_kwargs is not None:
362
+ cross_attention_kwargs.update({"flow_scale": val_scale_factor_spatial})
363
+ else:
364
+ cross_attention_kwargs = {"flow_scale": val_scale_factor_spatial}
365
+
366
+ if motion_cross_attention_kwargs is not None:
367
+ motion_cross_attention_kwargs.update({"flow_scale": val_scale_factor_temporal})
368
+ else:
369
+ motion_cross_attention_kwargs = {"flow_scale": val_scale_factor_temporal}
370
+
371
+ # predict the noise residual
372
+ noise_pred = self.unet(
373
+ latent_model_input,
374
+ t,
375
+ encoder_hidden_states=prompt_embeds,
376
+ cross_attention_kwargs=cross_attention_kwargs,
377
+ motion_cross_attention_kwargs=motion_cross_attention_kwargs,
378
+ added_cond_kwargs=added_cond_kwargs,
379
+ ).sample
380
+
381
+ del latent_model_input
382
+
383
+ # perform guidance
384
+ if self.do_classifier_free_guidance:
385
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
386
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
387
+ del noise_pred_uncond, noise_pred_text
388
+
389
+ # compute the previous noisy sample x_t -> x_t-1
390
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
391
+ del noise_pred
392
+
393
+ if callback_on_step_end is not None:
394
+ callback_kwargs = {}
395
+ for k in callback_on_step_end_tensor_inputs:
396
+ callback_kwargs[k] = locals()[k]
397
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
398
+
399
+ latents = callback_outputs.pop("latents", latents)
400
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
401
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
402
+
403
+ # call the callback, if provided
404
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
405
+ progress_bar.update()
406
+ if callback is not None and i % callback_steps == 0:
407
+ callback(i, t, latents)
408
+
409
+ # 8b. Denoising loop for the flow-free baseline (flow_scale forced to 0)
410
+ if generate_no_flow:
411
+ with self.progress_bar(total=self._num_timesteps) as progress_bar:
412
+ for i, t in enumerate(timesteps):
413
+
414
+ # expand the latents if we are doing classifier free guidance
415
+ latent_model_input_no_flow = torch.cat(
416
+ [latents_no_flow] * 2) if self.do_classifier_free_guidance else latents_no_flow
417
+ latent_model_input_no_flow = self.scheduler.scale_model_input(latent_model_input_no_flow, t)
418
+
419
+ if added_cond_kwargs is not None:
420
+ added_cond_kwargs.update({"flow_embedding_features": flow_embedding_features})
421
+ else:
422
+ added_cond_kwargs = {"flow_embedding_features": flow_embedding_features}
423
+
424
+ if cross_attention_kwargs is not None:
425
+ cross_attention_kwargs.update({"flow_scale": 0.})
426
+ else:
427
+ cross_attention_kwargs = {"flow_scale": 0.}
428
+
429
+ if motion_cross_attention_kwargs is not None:
430
+ motion_cross_attention_kwargs.update({"flow_scale": 0.})
431
+ else:
432
+ motion_cross_attention_kwargs = {"flow_scale": 0.}
433
+
434
+ noise_pred_no_flow = self.unet(
435
+ latent_model_input_no_flow,
436
+ t,
437
+ encoder_hidden_states=prompt_embeds,
438
+ cross_attention_kwargs=cross_attention_kwargs,
439
+ motion_cross_attention_kwargs=motion_cross_attention_kwargs,
440
+ added_cond_kwargs=added_cond_kwargs,
441
+ ).sample
442
+
443
+ del latent_model_input_no_flow
444
+
445
+ # perform guidance
446
+ if self.do_classifier_free_guidance:
447
+ noise_pred_no_flow_uncond, noise_pred_no_flow_text = noise_pred_no_flow.chunk(2)
448
+ noise_pred_no_flow = noise_pred_no_flow_uncond + guidance_scale * (
449
+ noise_pred_no_flow_text - noise_pred_no_flow_uncond)
450
+ del noise_pred_no_flow_uncond, noise_pred_no_flow_text
451
+
452
+ # compute the previous noisy sample x_t -> x_t-1
453
+ latents_no_flow = self.scheduler.step(noise_pred_no_flow, t, latents_no_flow,
454
+ **extra_step_kwargs).prev_sample
455
+ del noise_pred_no_flow
456
+
457
+ if callback_on_step_end is not None:
458
+ callback_kwargs = {}
459
+ for k in callback_on_step_end_tensor_inputs:
460
+ # expose the no-flow latents under the "latents" key so the callback operates on this loop's tensors
+ callback_kwargs[k] = latents_no_flow if k == "latents" else locals()[k]
461
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
462
+
463
+ latents_no_flow = callback_outputs.pop("latents", latents_no_flow)
464
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
465
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds",
466
+ negative_prompt_embeds)
467
+
468
+ # call the callback, if provided
469
+ if i == len(timesteps) - 1 or (
470
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
471
+ progress_bar.update()
472
+ if callback is not None and i % callback_steps == 0:
473
+ callback(i, t, latents_no_flow)
474
+
475
+ # 9. Post processing
476
+ if output_type == "latent":
477
+ video = latents
478
+ if generate_no_flow:
479
+ video_no_flow = latents_no_flow
480
+ else:
481
+ video_tensor = self.decode_latents(latents, decode_chunk_size)
482
+ video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
483
+
484
+ if generate_no_flow:
485
+ video_tensor_no_flow = self.decode_latents(latents_no_flow, decode_chunk_size)
486
+ video_no_flow = self.video_processor.postprocess_video(video=video_tensor_no_flow,
487
+ output_type=output_type)
488
+
489
+ # 10. Offload all models
490
+ self.maybe_free_model_hooks()
491
+
492
+ video_no_flow = None if not generate_no_flow else video_no_flow
493
+
494
+ if not return_dict:
495
+ return (video, video_no_flow)
496
+
497
+ return AnimateDiffPipelineOutput(frames_flow=video, frames_no_flow=video_no_flow)
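
A minimal, hypothetical usage sketch of the `__call__` above. `pipe` (an already-built instance of the flow-conditioned pipeline in this file), the checkpoint it was assembled from, and the random `flow` tensor are all assumptions for illustration; in practice the flow conditioning would come from `tools/optical_flow.get_optical_flow`.

```python
import torch

# Placeholder optical-flow conditioning shaped (batch, 2, frames, height, width);
# the pipeline only checks that it is 5-D before running it through the flow encoder.
flow = torch.randn(1, 2, 16, 256, 384, device="cuda", dtype=torch.float16)

out = pipe(                                  # `pipe`: hypothetical pipeline instance
    prompt="a ship sailing on a stormy sea",
    flow_embedding=flow,
    num_frames=16,
    num_inference_steps=25,
    guidance_scale=7.5,
    val_scale_factor_spatial=1.0,            # flow strength in the spatial attention layers
    val_scale_factor_temporal=1.0,           # flow strength in the motion (temporal) modules
    generate_no_flow=True,                   # also denoise a flow-free baseline from the same noise
    generator=torch.Generator("cuda").manual_seed(42),
)
video_flow, video_no_flow = out.frames_flow, out.frames_no_flow
```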
onlyflow/pipelines/pipeline_animation_long.py ADDED
@@ -0,0 +1,555 @@
1
+ # Adapted from https://github.com/showlab/Tune-A-Video/blob/main/tuneavideo/pipelines/pipeline_tuneavideo.py
2
+
3
+
4
+ # TODO: rebase on diffusers/pipelines/animatediff/pipeline_animatediff.py
5
+
6
+ import copy
7
+ import gc
8
+ from dataclasses import dataclass
9
+ from typing import Callable, Optional, Dict, Any, Tuple
10
+ from typing import List, Union
11
+
12
+ import PIL.Image
13
+ import numpy as np
14
+ import torch
15
+ from diffusers import AnimateDiffPipeline
16
+ from diffusers.image_processor import PipelineImageInput
17
+ from diffusers.models import AutoencoderKL
18
+ from diffusers.models.attention import FreeNoiseTransformerBlock
19
+ from diffusers.pipelines.animatediff.pipeline_animatediff import EXAMPLE_DOC_STRING
20
+ from diffusers.pipelines.free_noise_utils import AnimateDiffFreeNoiseMixin, SplitInferenceModule
21
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
22
+ from diffusers.schedulers import (
23
+ DDIMScheduler,
24
+ DPMSolverMultistepScheduler,
25
+ EulerAncestralDiscreteScheduler,
26
+ EulerDiscreteScheduler,
27
+ LMSDiscreteScheduler,
28
+ PNDMScheduler,
29
+ )
30
+ from diffusers.utils import BaseOutput
31
+ from diffusers.utils import deprecate, logging, replace_example_docstring
32
+ from einops import rearrange
33
+ from transformers import CLIPTextModel, CLIPTokenizer
34
+
35
+ from onlyflow.models.flow_adaptor import FlowEncoder
36
+ from onlyflow.models.unet import UNetMotionModel, AnimateDiffTransformer3D, \
37
+ CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion, CrossAttnUpBlockMotion
38
+ from ..models.attention import BasicTransformerBlock
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+ @dataclass
43
+ class FlowCtrlPipelineOutput(BaseOutput):
44
+ r"""
45
+ Output class for the OnlyFlow `FlowCtrlPipeline`.
46
+
47
+ Args:
48
+ frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
49
+ List of video outputs. It can be a nested list of length `batch_size`, with each sub-list containing
50
+ denoised PIL image sequences of length `num_frames`. It can also be a NumPy array or Torch tensor of shape
52
+ `(batch_size, num_frames, channels, height, width)`
53
+ """
54
+
55
+ frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]]]
56
+
57
+
58
+ class FlowCtrlPipeline(AnimateDiffPipeline):
59
+ model_cpu_offload_seq = "text_encoder->flow_encoder->image_encoder->unet->vae"
60
+ _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
61
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
62
+
63
+ def __init__(self,
64
+ vae: AutoencoderKL,
65
+ text_encoder: CLIPTextModel,
66
+ tokenizer: CLIPTokenizer,
67
+ unet: UNetMotionModel,
68
+ scheduler: Union[
69
+ DDIMScheduler,
70
+ PNDMScheduler,
71
+ LMSDiscreteScheduler,
72
+ EulerDiscreteScheduler,
73
+ EulerAncestralDiscreteScheduler,
74
+ DPMSolverMultistepScheduler],
75
+ flow_encoder: FlowEncoder,
76
+ feature_extractor=None,
77
+ image_encoder=None,
78
+ motion_adapter=None,
79
+ ):
80
+
81
+ super().__init__(
82
+ vae=vae,
83
+ text_encoder=text_encoder,
84
+ tokenizer=tokenizer,
85
+ unet=unet,
86
+ motion_adapter=motion_adapter,
87
+ scheduler=scheduler,
88
+ feature_extractor=feature_extractor,
89
+ image_encoder=image_encoder,
90
+ )
91
+ self.register_modules(
92
+ flow_encoder=flow_encoder
93
+ )
94
+
95
+ def _enable_split_inference_motion_modules_(
96
+ self, motion_modules: List[AnimateDiffTransformer3D], spatial_split_size: int
97
+ ) -> None:
98
+ for motion_module in motion_modules:
99
+ motion_module.proj_in = SplitInferenceModule(motion_module.proj_in, spatial_split_size, 0, ["input"])
100
+
101
+ for i in range(len(motion_module.transformer_blocks)):
102
+ motion_module.transformer_blocks[i] = SplitInferenceModule(
103
+ motion_module.transformer_blocks[i],
104
+ spatial_split_size,
105
+ 0,
106
+ ["hidden_states", "encoder_hidden_states", "cross_attention_kwargs"],
107
+ )
108
+
109
+ motion_module.proj_out = SplitInferenceModule(motion_module.proj_out, spatial_split_size, 0, ["input"])
110
+
111
+
112
+ def _enable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion, CrossAttnUpBlockMotion]):
113
+ r"""Helper function to enable FreeNoise in transformer blocks."""
114
+
115
+ for motion_module in block.motion_modules:
116
+ num_transformer_blocks = len(motion_module.transformer_blocks)
117
+
118
+ for i in range(num_transformer_blocks):
119
+ if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
120
+ motion_module.transformer_blocks[i].set_free_noise_properties(
121
+ self._free_noise_context_length,
122
+ self._free_noise_context_stride,
123
+ self._free_noise_weighting_scheme,
124
+ )
125
+ else:
126
+ basic_transfomer_block = motion_module.transformer_blocks[i]
127
+
128
+ motion_module.transformer_blocks[i] = FreeNoiseTransformerBlock(
129
+ dim=basic_transfomer_block.dim,
130
+ num_attention_heads=basic_transfomer_block.num_attention_heads,
131
+ attention_head_dim=basic_transfomer_block.attention_head_dim,
132
+ dropout=basic_transfomer_block.dropout,
133
+ cross_attention_dim=basic_transfomer_block.cross_attention_dim,
134
+ activation_fn=basic_transfomer_block.activation_fn,
135
+ attention_bias=basic_transfomer_block.attention_bias,
136
+ only_cross_attention=basic_transfomer_block.only_cross_attention,
137
+ double_self_attention=basic_transfomer_block.double_self_attention,
138
+ positional_embeddings=basic_transfomer_block.positional_embeddings,
139
+ num_positional_embeddings=basic_transfomer_block.num_positional_embeddings,
140
+ context_length=self._free_noise_context_length,
141
+ context_stride=self._free_noise_context_stride,
142
+ weighting_scheme=self._free_noise_weighting_scheme,
143
+ ).to(device=self._execution_device, dtype=self.dtype)
144
+
145
+ # reuse the existing attention modules (and their processors) instead of the freshly created ones; the weights are copied via load_state_dict below
146
+ motion_module.transformer_blocks[i].attn1 = basic_transfomer_block.attn1
147
+ motion_module.transformer_blocks[i].attn2 = basic_transfomer_block.attn2
148
+
149
+ motion_module.transformer_blocks[i].load_state_dict(
150
+ basic_transfomer_block.state_dict(), strict=True
151
+ )
152
+ motion_module.transformer_blocks[i].set_chunk_feed_forward(
153
+ basic_transfomer_block._chunk_size, basic_transfomer_block._chunk_dim
154
+ )
155
+
156
+ def _disable_free_noise_in_block(self, block: Union[CrossAttnDownBlockMotion, DownBlockMotion, UpBlockMotion, CrossAttnUpBlockMotion]):
157
+ r"""Helper function to disable FreeNoise in transformer blocks."""
158
+
159
+ for motion_module in block.motion_modules:
160
+ num_transformer_blocks = len(motion_module.transformer_blocks)
161
+
162
+ for i in range(num_transformer_blocks):
163
+ if isinstance(motion_module.transformer_blocks[i], FreeNoiseTransformerBlock):
164
+ free_noise_transfomer_block = motion_module.transformer_blocks[i]
165
+
166
+ motion_module.transformer_blocks[i] = BasicTransformerBlock(
167
+ dim=free_noise_transfomer_block.dim,
168
+ num_attention_heads=free_noise_transfomer_block.num_attention_heads,
169
+ attention_head_dim=free_noise_transfomer_block.attention_head_dim,
170
+ dropout=free_noise_transfomer_block.dropout,
171
+ cross_attention_dim=free_noise_transfomer_block.cross_attention_dim,
172
+ activation_fn=free_noise_transfomer_block.activation_fn,
173
+ attention_bias=free_noise_transfomer_block.attention_bias,
174
+ only_cross_attention=free_noise_transfomer_block.only_cross_attention,
175
+ double_self_attention=free_noise_transfomer_block.double_self_attention,
176
+ positional_embeddings=free_noise_transfomer_block.positional_embeddings,
177
+ num_positional_embeddings=free_noise_transfomer_block.num_positional_embeddings,
178
+ ).to(device=self._execution_device, dtype=self.dtype)
179
+
180
+ motion_module.transformer_blocks[i].load_state_dict(
181
+ free_noise_transfomer_block.state_dict(), strict=True
182
+ )
183
+ motion_module.transformer_blocks[i].set_chunk_feed_forward(
184
+ free_noise_transfomer_block._chunk_size, free_noise_transfomer_block._chunk_dim
185
+ )
186
+
187
+
188
+ @torch.no_grad()
189
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
190
+ def __call__(
191
+ self,
192
+ prompt: Union[str, List[str]] = None,
193
+ optical_flow: torch.FloatTensor = None,
194
+
195
+ num_frames: Optional[int] = 16,
196
+ height: Optional[int] = None,
197
+ width: Optional[int] = None,
198
+
199
+ num_inference_steps: int = 50,
200
+ guidance_scale: float = 7.5,
201
+ negative_prompt: Optional[Union[str, List[str]]] = None,
202
+ eta: float = 0.0,
203
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
204
+ latents: Optional[torch.Tensor] = None,
205
+
206
+ prompt_embeds: Optional[torch.Tensor] = None,
207
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
208
+ ip_adapter_image: Optional[PipelineImageInput] = None,
209
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
210
+
211
+ output_type: Optional[str] = "pt",
212
+ return_dict: bool = True,
213
+
214
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
215
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
216
+
217
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
218
+ motion_cross_attention_kwargs: Optional[Dict[str, Any]] = None,
219
+
220
+ clip_skip: Optional[int] = None,
221
+ decode_chunk_size: int = 16,
222
+
223
+ val_scale_factor_spatial: float = 0.,
224
+ val_scale_factor_temporal: float = 0.,
225
+
226
+ **kwargs,
227
+ ):
228
+ r"""
229
+ The call function to the pipeline for generation.
230
+
231
+ Args:
232
+ prompt (`str` or `List[str]`, *optional*):
233
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
234
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
235
+ The height in pixels of the generated video.
236
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
237
+ The width in pixels of the generated video.
238
+ num_frames (`int`, *optional*, defaults to 16):
239
+ The number of video frames that are generated. Defaults to 16 frames, which at 8 frames per second
240
+ amounts to 2 seconds of video.
241
+ num_inference_steps (`int`, *optional*, defaults to 50):
242
+ The number of denoising steps. More denoising steps usually lead to higher quality videos at the
243
+ expense of slower inference.
244
+ guidance_scale (`float`, *optional*, defaults to 7.5):
245
+ A higher guidance scale value encourages the model to generate images closely linked to the text
246
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
247
+ negative_prompt (`str` or `List[str]`, *optional*):
248
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
249
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
250
+ eta (`float`, *optional*, defaults to 0.0):
251
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
252
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
253
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
254
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
255
+ generation deterministic.
256
+ latents (`torch.Tensor`, *optional*):
257
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video
258
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
259
+ tensor is generated by sampling using the supplied random `generator`. Latents should be of shape
260
+ `(batch_size, num_channel, num_frames, height, width)`.
261
+ prompt_embeds (`torch.Tensor`, *optional*):
262
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
263
+ provided, text embeddings are generated from the `prompt` input argument.
264
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
265
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
266
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
267
+ ip_adapter_image: (`PipelineImageInput`, *optional*):
268
+ Optional image input to work with IP Adapters.
269
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
270
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
271
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
272
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
273
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
274
+ output_type (`str`, *optional*, defaults to `"pt"`):
275
+ The output format of the generated video. Choose between `torch.Tensor`, `PIL.Image` or `np.array`.
276
+ return_dict (`bool`, *optional*, defaults to `True`):
277
+ Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead
278
+ of a plain tuple.
279
+ cross_attention_kwargs (`dict`, *optional*):
280
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
281
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
282
+ clip_skip (`int`, *optional*):
283
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
284
+ the output of the pre-final layer will be used for computing the prompt embeddings.
285
+ callback_on_step_end (`Callable`, *optional*):
286
+ A function that calls at the end of each denoising steps during the inference. The function is called
287
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
288
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
289
+ `callback_on_step_end_tensor_inputs`.
290
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
291
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
292
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
293
+ `._callback_tensor_inputs` attribute of your pipeline class.
294
+ decode_chunk_size (`int`, defaults to `16`):
295
+ The number of frames to decode at a time when calling `decode_latents` method.
296
+
297
+ Examples:
298
+
299
+ Returns:
300
+ [`FlowCtrlPipelineOutput`] or `tuple`:
301
+ If `return_dict` is `True`, a [`FlowCtrlPipelineOutput`] is
302
+ returned, otherwise a `tuple` is returned where the first element is a list with the generated frames.
303
+ """
304
+
305
+ callback = kwargs.pop("callback", None)
306
+ callback_steps = kwargs.pop("callback_steps", None)
307
+
308
+ if callback is not None:
309
+ deprecate(
310
+ "callback",
311
+ "1.0.0",
312
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
313
+ )
314
+ if callback_steps is not None:
315
+ deprecate(
316
+ "callback_steps",
317
+ "1.0.0",
318
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
319
+ )
320
+
321
+ # 0. Default height and width to unet
322
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
323
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
324
+
325
+ num_videos_per_prompt = 1
326
+
327
+ # 1. Check inputs. Raise error if not correct
328
+ self.check_inputs(
329
+ prompt,
330
+ height,
331
+ width,
332
+ callback_steps,
333
+ negative_prompt,
334
+ prompt_embeds,
335
+ negative_prompt_embeds,
336
+ ip_adapter_image,
337
+ ip_adapter_image_embeds,
338
+ callback_on_step_end_tensor_inputs,
339
+ )
340
+
341
+ self._guidance_scale = guidance_scale
342
+ self._clip_skip = clip_skip
343
+ self._cross_attention_kwargs = cross_attention_kwargs
344
+ self._interrupt = False
345
+
346
+ # 2. Define call parameters
347
+ if prompt is not None and isinstance(prompt, (str, dict)):
348
+ batch_size = 1
349
+ elif prompt is not None and isinstance(prompt, list):
350
+ batch_size = len(prompt)
351
+ else:
352
+ batch_size = prompt_embeds.shape[0]
353
+
354
+ device = self._execution_device
355
+
356
+ # 3. Encode input prompt
357
+ text_encoder_lora_scale = (
358
+ self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
359
+ )
360
+ if self.free_noise_enabled:
361
+ prompt_embeds, negative_prompt_embeds = self._encode_prompt_free_noise(
362
+ prompt=prompt,
363
+ num_frames=num_frames,
364
+ device=device,
365
+ num_videos_per_prompt=num_videos_per_prompt,
366
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
367
+ negative_prompt=negative_prompt,
368
+ prompt_embeds=prompt_embeds,
369
+ negative_prompt_embeds=negative_prompt_embeds,
370
+ lora_scale=text_encoder_lora_scale,
371
+ clip_skip=self.clip_skip,
372
+ )
373
+ else:
374
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
375
+ prompt,
376
+ device,
377
+ num_videos_per_prompt,
378
+ self.do_classifier_free_guidance,
379
+ negative_prompt,
380
+ prompt_embeds=prompt_embeds,
381
+ negative_prompt_embeds=negative_prompt_embeds,
382
+ lora_scale=text_encoder_lora_scale,
383
+ clip_skip=self.clip_skip,
384
+ )
385
+
386
+ # For classifier free guidance, we need to do two forward passes.
387
+ # Here we concatenate the unconditional and text embeddings into a single batch
388
+ # to avoid doing two forward passes
389
+ if self.do_classifier_free_guidance:
390
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
391
+
392
+ prompt_embeds = prompt_embeds.repeat_interleave(repeats=num_frames, dim=0)
393
+
394
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
395
+ image_embeds = self.prepare_ip_adapter_image_embeds(
396
+ ip_adapter_image,
397
+ ip_adapter_image_embeds,
398
+ device,
399
+ batch_size * num_videos_per_prompt,
400
+ self.do_classifier_free_guidance,
401
+ )
402
+
403
+ # 4. Prepare timesteps
404
+ self.scheduler.set_timesteps(num_inference_steps, device=device)
405
+ timesteps = self.scheduler.timesteps
406
+
407
+ # 5. Prepare latent variables
408
+ num_channels_latents = self.unet.config.in_channels
409
+ latents = self.prepare_latents(
410
+ batch_size * num_videos_per_prompt,
411
+ num_channels_latents,
412
+ num_frames,
413
+ height,
414
+ width,
415
+ prompt_embeds.dtype,
416
+ device,
417
+ generator,
418
+ latents,
419
+ )
420
+
421
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
422
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
423
+
424
+ if torch.cuda.is_available():
425
+ torch.cuda.empty_cache()
426
+ torch.cuda.reset_peak_memory_stats()
427
+ torch.cuda.synchronize()
428
+ assert optical_flow.ndim == 5
429
+ bs = optical_flow.shape[0]
430
+ if self.free_noise_enabled:
431
+ length = optical_flow.shape[2]
432
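+ # probe the flow encoder on the first 16 frames (presumably one FreeNoise context window) just to get the per-level feature shapes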
+ flow_embedding_features = [
433
+ torch.zeros((bs, length, *test_size.shape[1:]), device=self._execution_device)
434
+ for test_size in self.flow_encoder(optical_flow[:,:,:16].to(self._execution_device))
435
+ ]
436
+ weight_factor = torch.zeros(length, device=self._execution_device)
437
+ for start_idx in range(0, length, self._free_noise_context_stride):
438
+ weight_factor[start_idx:start_idx + self._free_noise_context_length] += 1.0
439
+ window_features = self.flow_encoder(optical_flow[:, :, start_idx:start_idx + self._free_noise_context_length].to(self._execution_device))
440
+ for flow_emb, window_feature in zip(flow_embedding_features, window_features):
441
+ flow_emb[:, start_idx:start_idx + self._free_noise_context_length] += rearrange(window_feature, '(b f) c h w -> b f c h w', b=bs).to(self._execution_device)
442
+
443
+ flow_embedding_features = [flow_emb / weight_factor[None,:,None,None,None] for flow_emb in flow_embedding_features]
444
+ flow_embedding_features = [rearrange(x, 'b f c h w -> b c f h w') for x in flow_embedding_features]
445
+ else:
446
+ flow_embedding_features = self.flow_encoder(optical_flow.to(self._execution_device))  # takes (b, c, f, h, w) flow, returns multi-scale (b*f, c, h, w) features
447
+ flow_embedding_features = [rearrange(x, '(b f) c h w -> b c f h w', b=bs).to(self._execution_device)
448
+ for x in flow_embedding_features]
449
+
450
+ del optical_flow
451
+ gc.collect()
452
+ if torch.cuda.is_available():
453
+ torch.cuda.empty_cache()
454
+ torch.cuda.reset_peak_memory_stats()
455
+ torch.cuda.synchronize()
456
+
457
+ # 7. Add image embeds for IP-Adapter
458
+ added_cond_kwargs = (
459
+ {"image_embeds": image_embeds}
460
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None
461
+ else None
462
+ )
463
+
464
+ num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
465
+ for free_init_iter in range(num_free_init_iters):
466
+ if self.free_init_enabled:
467
+ latents, timesteps = self._apply_free_init(
468
+ latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
469
+ )
470
+
471
+ self._num_timesteps = len(timesteps)
472
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
473
+
474
+ if isinstance(flow_embedding_features[0], list):
475
+ flow_embedding_features = [[torch.cat([x, x], dim=0) for x in flow_embedding_feature]
476
+ for flow_embedding_feature in flow_embedding_features] \
477
+ if self.do_classifier_free_guidance else flow_embedding_features
478
+ else:
479
+ flow_embedding_features = [torch.cat([x, x], dim=0) for x in flow_embedding_features] \
480
+ if self.do_classifier_free_guidance else flow_embedding_features # [2b c f h w]
481
+
482
+ # 8. Denoising loop
483
+ with self.progress_bar(total=self._num_timesteps) as progress_bar:
484
+ for i, t in enumerate(timesteps):
485
+ if self.interrupt:
486
+ continue
487
+
488
+ # expand the latents if we are doing classifier free guidance
489
+ latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
490
+ latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
491
+
492
+ if added_cond_kwargs is not None:
493
+ added_cond_kwargs.update({"flow_embedding_features": flow_embedding_features})
494
+ else:
495
+ added_cond_kwargs = {"flow_embedding_features": flow_embedding_features}
496
+
497
+ if cross_attention_kwargs is not None:
498
+ cross_attention_kwargs.update({"flow_scale": val_scale_factor_spatial})
499
+ else:
500
+ cross_attention_kwargs = {"flow_scale": val_scale_factor_spatial}
501
+
502
+ if motion_cross_attention_kwargs is not None:
503
+ motion_cross_attention_kwargs.update({"flow_scale": val_scale_factor_temporal})
504
+ else:
505
+ motion_cross_attention_kwargs = {"flow_scale": val_scale_factor_temporal}
506
+
507
+ # predict the noise residual
508
+
509
+ noise_pred = self.unet(
510
+ latent_model_input,
511
+ t,
512
+ encoder_hidden_states=prompt_embeds,
513
+ cross_attention_kwargs=cross_attention_kwargs,
514
+ motion_cross_attention_kwargs=motion_cross_attention_kwargs,
515
+ added_cond_kwargs=added_cond_kwargs,
516
+ ).sample
517
+
518
+ # perform guidance
519
+ if self.do_classifier_free_guidance:
520
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
521
+ noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
522
+
523
+ # compute the previous noisy sample x_t -> x_t-1
524
+ latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
525
+
526
+ if callback_on_step_end is not None:
527
+ callback_kwargs = {}
528
+ for k in callback_on_step_end_tensor_inputs:
529
+ callback_kwargs[k] = locals()[k]
530
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
531
+
532
+ latents = callback_outputs.pop("latents", latents)
533
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
534
+ negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
535
+
536
+ # call the callback, if provided
537
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
538
+ progress_bar.update()
539
+ if callback is not None and i % callback_steps == 0:
540
+ callback(i, t, latents)
541
+
542
+ # 9. Post processing
543
+ if output_type == "latent":
544
+ video = latents
545
+ else:
546
+ video_tensor = self.decode_latents(latents, decode_chunk_size)
547
+ video = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type)
548
+
549
+ # 10. Offload all models
550
+ self.maybe_free_model_hooks()
551
+
552
+ if not return_dict:
553
+ return (video,)
554
+
555
+ return FlowCtrlPipelineOutput(frames=video)
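
A minimal, hypothetical sketch of driving the FreeNoise path of `FlowCtrlPipeline.__call__` for a clip longer than 16 frames. `pipe` (an assembled `FlowCtrlPipeline`), the prompt, and the random flow tensor are assumptions for illustration; `enable_free_noise` is the standard diffusers AnimateDiff mixin method, assuming a diffusers version that provides it.

```python
import torch

# Window the motion modules with FreeNoise so a 64-frame clip fits in memory;
# the pipeline reuses the same context length/stride to window the flow encoder.
pipe.enable_free_noise(context_length=16, context_stride=4)

num_frames = 64
# Raw optical flow for the whole clip, shaped (batch, 2, num_frames, height, width).
optical_flow = torch.randn(1, 2, num_frames, 256, 384, device=pipe.device, dtype=pipe.dtype)

result = pipe(
    prompt="a drone shot flying over a foggy forest",
    optical_flow=optical_flow,
    num_frames=num_frames,
    num_inference_steps=25,
    val_scale_factor_spatial=1.0,
    val_scale_factor_temporal=1.0,
    output_type="pil",
)
frames = result.frames  # FlowCtrlPipelineOutput.frames
```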
onlyflow/utils/util.py ADDED
@@ -0,0 +1,140 @@
1
+ import atexit
2
+ import functools
3
+ import importlib
4
+ import io
5
+ import logging
6
+ import os
7
+ import sys
8
+
9
+ import imageio
10
+ import numpy as np
11
+ import torch
12
+ from termcolor import colored
13
+
14
+
15
+ def instantiate_from_config(config, **additional_kwargs):
16
+ if not "target" in config:
17
+ if config == '__is_first_stage__':
18
+ return None
19
+ elif config == "__is_unconditional__":
20
+ return None
21
+ raise KeyError("Expected key `target` to instantiate.")
22
+
23
+ additional_kwargs.update(config.get("kwargs", dict()))
24
+ return get_obj_from_str(config["target"])(**additional_kwargs)
25
+
26
+
27
+ def get_obj_from_str(string, reload=False):
28
+ module, cls = string.rsplit(".", 1)
29
+ if reload:
30
+ module_imp = importlib.import_module(module)
31
+ importlib.reload(module_imp)
32
+ return getattr(importlib.import_module(module, package=None), cls)
33
+
34
+
35
+ def get_video(videos: torch.Tensor, path: str, rescale=False, fps=8):
36
+ if rescale:
37
+ videos = (videos + 1.0) / 2.0 # -1,1 -> 0,1
38
+ videos = (videos * 255).numpy().astype(np.uint8)
39
+ videos = np.transpose(videos, axes=(1, 2, 3, 0))
40
+
41
+ binary_object = io.BytesIO()
42
+
43
+ imageio.mimsave(binary_object, list(videos), fps=fps, format='gif')
44
+
45
+ return binary_object
46
+
47
+
48
+ # Logger utils are copied from detectron2
49
+ class _ColorfulFormatter(logging.Formatter):
50
+ def __init__(self, *args, **kwargs):
51
+ self._root_name = kwargs.pop("root_name") + "."
52
+ self._abbrev_name = kwargs.pop("abbrev_name", "")
53
+ if len(self._abbrev_name):
54
+ self._abbrev_name = self._abbrev_name + "."
55
+ super(_ColorfulFormatter, self).__init__(*args, **kwargs)
56
+
57
+ def formatMessage(self, record):
58
+ record.name = record.name.replace(self._root_name, self._abbrev_name)
59
+ log = super(_ColorfulFormatter, self).formatMessage(record)
60
+ if record.levelno == logging.WARNING:
61
+ prefix = colored("WARNING", "red", attrs=["blink"])
62
+ elif record.levelno == logging.ERROR or record.levelno == logging.CRITICAL:
63
+ prefix = colored("ERROR", "red", attrs=["blink", "underline"])
64
+ else:
65
+ return log
66
+ return prefix + " " + log
67
+
68
+
69
+ # cache the opened file object, so that different calls to `setup_logger`
70
+ # with the same file name can safely write to the same file.
71
+ @functools.lru_cache(maxsize=None)
72
+ def _cached_log_stream(filename):
73
+ # use 1K buffer if writing to cloud storage
74
+ stream = open(filename, "a", buffering=1024 if "://" in filename else -1)  # avoid shadowing the imported `io` module
75
+ atexit.register(stream.close)
76
+ return stream
77
+
78
+
79
+ @functools.lru_cache()
80
+ def setup_logger(output, distributed_rank, color=True, name='AnimateDiff', abbrev_name=None):
81
+ logger = logging.getLogger(name)
82
+ logger.setLevel(logging.DEBUG)
83
+ logger.propagate = False
84
+
85
+ if abbrev_name is None:
86
+ abbrev_name = 'AD'
87
+ plain_formatter = logging.Formatter(
88
+ "[%(asctime)s] %(name)s:%(lineno)d %(levelname)s: %(message)s", datefmt="%m/%d %H:%M:%S"
89
+ )
90
+
91
+ # stdout logging: master only
92
+ if distributed_rank == 0:
93
+ ch = logging.StreamHandler(stream=sys.stdout)
94
+ ch.setLevel(logging.DEBUG)
95
+ if color:
96
+ formatter = _ColorfulFormatter(
97
+ colored("[%(asctime)s %(name)s:%(lineno)d]: ", "green") + "%(message)s",
98
+ datefmt="%m/%d %H:%M:%S",
99
+ root_name=name,
100
+ abbrev_name=str(abbrev_name),
101
+ )
102
+ else:
103
+ formatter = plain_formatter
104
+ ch.setFormatter(formatter)
105
+ logger.addHandler(ch)
106
+
107
+ # file logging: all workers
108
+ if output is not None:
109
+ filename = os.path.join(output, "ranks_logs", f"log.{distributed_rank}.txt")
110
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
111
+ fh = logging.StreamHandler(_cached_log_stream(filename))
112
+ fh.setLevel(logging.DEBUG)
113
+ fh.setFormatter(plain_formatter)
114
+ logger.addHandler(fh)
115
+
116
+ return logger
117
+
118
+
119
+ def format_time(elapsed_time):
120
+ # Time thresholds
121
+ minute = 60
122
+ hour = 60 * minute
123
+ day = 24 * hour
124
+
125
+ days, remainder = divmod(elapsed_time, day)
126
+ hours, remainder = divmod(remainder, hour)
127
+ minutes, seconds = divmod(remainder, minute)
128
+
129
+ formatted_time = ""
130
+
131
+ if days > 0:
132
+ formatted_time += f"{int(days)} days "
133
+ if hours > 0:
134
+ formatted_time += f"{int(hours)} hours "
135
+ if minutes > 0:
136
+ formatted_time += f"{int(minutes)} minutes "
137
+ if seconds > 0:
138
+ formatted_time += f"{seconds:.2f} seconds"
139
+
140
+ return formatted_time.strip()
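
A small, self-contained illustration of the config and time helpers above; the `torch.nn.Linear` target is an arbitrary example class, not something the repository necessarily instantiates this way.

```python
from onlyflow.utils.util import format_time, instantiate_from_config

# `kwargs` from the config dict are merged with any extra keyword arguments.
cfg = {"target": "torch.nn.Linear", "kwargs": {"in_features": 8, "out_features": 4}}
layer = instantiate_from_config(cfg, bias=False)
print(type(layer).__name__)  # Linear

print(format_time(3725.5))   # "1 hours 2 minutes 5.50 seconds"
```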
requirements.txt ADDED
@@ -0,0 +1,11 @@
1
+ torch
2
+ torchvision
3
+ diffusers
4
+ transformers
5
+ accelerate
6
+ git+https://github.com/obvious-research/diffusers.git
7
+ numpy
8
+ einops
9
+ imageio
10
+ omegaconf
11
+ av==12.0.0
tools/optical_flow.py ADDED
@@ -0,0 +1,22 @@
1
+ import torch
2
+ from einops import rearrange
3
+
4
+ @torch.no_grad()
5
+ def get_optical_flow(raft_model, pixel_values, video_length, encode_chunk_size=48, num_flow_updates=14):
6
+ imgs_1 = pixel_values[:, :-1]
7
+ imgs_2 = pixel_values[:, 1:]
8
+ imgs_1 = rearrange(imgs_1, "b f c h w -> (b f) c h w")
9
+ imgs_2 = rearrange(imgs_2, "b f c h w -> (b f) c h w")
10
+
11
+ flow_embedding = []
12
+
13
+ for i in range(0, imgs_1.shape[0], encode_chunk_size):
14
+ imgs_1_chunk = imgs_1[i:i + encode_chunk_size]
15
+ imgs_2_chunk = imgs_2[i:i + encode_chunk_size]
16
+ flow_embedding_chunk = raft_model(imgs_1_chunk, imgs_2_chunk, num_flow_updates)[-1]
17
+ flow_embedding.append(flow_embedding_chunk)
18
+
19
+ flow_embedding = torch.cat(flow_embedding).contiguous()
20
+ # note: `video_length` must equal the number of flow maps per clip, i.e. the number of input frames minus one
+ flow_embedding = rearrange(flow_embedding, "(b f) c h w -> b c f h w", f=video_length)
21
+
22
+ return flow_embedding
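
A hypothetical end-to-end example of the helper above with torchvision's pretrained RAFT; the clip here is random data purely for shape illustration.

```python
import torch
from torchvision.models.optical_flow import Raft_Large_Weights, raft_large

from tools.optical_flow import get_optical_flow

device = "cuda" if torch.cuda.is_available() else "cpu"
raft = raft_large(weights=Raft_Large_Weights.DEFAULT).eval().to(device)

# One clip of 17 frames with values in [-1, 1]; RAFT expects spatial sizes divisible by 8.
frames = torch.rand(1, 17, 3, 256, 384, device=device) * 2 - 1

# 17 frames give 16 consecutive flow maps, so video_length is the frame count minus one.
flow = get_optical_flow(raft, frames, video_length=16)
print(flow.shape)  # torch.Size([1, 2, 16, 256, 384])
```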