# File size: 27,925 Bytes
#
# 5c31d1f

from copy import deepcopy
from dataclasses import dataclass
from diffusers import StableDiffusionXLPipeline
from diffusers.image_processor import PipelineImageInput
from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img\
    import rescale_noise_cfg, retrieve_latents, retrieve_timesteps
from diffusers.utils import BaseOutput
from diffusers.utils.torch_utils import randn_tensor
import numpy as np
from PIL import Image
import torch
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from utils.utils import batch_dict_to_tensor, batch_tensor_to_dict, noise_prev, noise_t2t
from utils.sdxl import register_attr

###
# Code from genforce/ctrl-x/ctrl_x/pipelines/pipeline_sdxl.py

# Canonical ordering of the sub-batches fed to the U-Net in a single forward pass.
# Each name is used as a dictionary key when assembling/splitting the batched tensors;
# the pipeline works on a copy of this list and removes entries that are not needed
# (e.g. "structure_uncond" when a structure image is supplied).
BATCH_ORDER = [
    "structure_uncond", "appearance_uncond", "uncond", "structure_cond", "appearance_cond", "cond",
]

def get_last_control_i(control_schedule, num_inference_steps):
    """Return the step indices after which structure / appearance control ends.

    Args:
        control_schedule: Mapping of block name to either a dict of layers or,
            for a block given directly as a list, a single layer. Each layer is
            indexable: entries 0 and 1 together hold the structure-control
            fractions and entry 2 holds the appearance-control fractions.
            ``None`` means no schedule was provided.
        num_inference_steps: Total number of denoising steps.

    Returns:
        Tuple ``(structure_i, appearance_i)``: the largest fraction of each kind
        (at least 0.0) multiplied by ``num_inference_steps`` and rounded. With
        no schedule both values equal ``num_inference_steps``.
    """
    if control_schedule is None:
        return num_inference_steps, num_inference_steps

    # Seed with 0.0 so the result never drops below zero.
    structure_peak = 0.0
    appearance_peak = 0.0
    for block in control_schedule.values():
        # A bare list (the mid block) is treated as a block with one layer.
        layers = [block] if isinstance(block, list) else block.values()
        for layer in layers:
            structure_fractions = layer[0] + layer[1]
            appearance_fractions = layer[2]
            structure_peak = max(structure_peak, max(structure_fractions, default=0.0))
            appearance_peak = max(appearance_peak, max(appearance_fractions, default=0.0))

    return (
        round(num_inference_steps * structure_peak),
        round(num_inference_steps * appearance_peak),
    )

@dataclass
class CtrlXStableDiffusionXLPipelineOutput(BaseOutput):
    """Output container of ``CtrlXStableDiffusionXLPipeline``.

    ``structures`` and ``appearances`` must be declared with ``:`` annotations to
    become dataclass fields; with ``=`` they were plain class attributes and the
    generated ``__init__`` rejected them as keyword arguments. They default to
    ``None`` so constructing the output with only ``images`` keeps working.
    """

    # Generated result (decoded images, or latents when output_type == "latent").
    images: Union[List[Image.Image], np.ndarray]
    # Structure branch result (decoded images or latents).
    structures: Optional[Union[List[Image.Image], np.ndarray]] = None
    # Appearance branch result (decoded images or latents).
    appearances: Optional[Union[List[Image.Image], np.ndarray]] = None

class CtrlXStableDiffusionXLPipeline(StableDiffusionXLPipeline):
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,  # TODO: Support prompt_2 and negative_prompt_2
        structure_prompt: Optional[Union[str, List[str]]] = None,
        appearance_prompt: Optional[Union[str, List[str]]] = None,
        structure_image: Optional[PipelineImageInput] = None,
        appearance_image: Optional[PipelineImageInput] = None,
        num_inference_steps: int = 50,
        timesteps: List[int] = None,
        negative_prompt: Optional[Union[str, List[str]]] = None,
        positive_prompt: Optional[Union[str, List[str]]] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        guidance_scale: float = 5.0,
        structure_guidance_scale: Optional[float] = None,
        appearance_guidance_scale: Optional[float] = None,
        num_images_per_prompt: Optional[int] = 1,
        eta: float = 0.0,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        structure_latents: Optional[torch.Tensor] = None,
        appearance_latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,  # Positive prompt is concatenated with prompt, so no embeddings
        structure_prompt_embeds: Optional[torch.Tensor] = None,
        appearance_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        structure_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        appearance_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
        control_schedule: Optional[Dict] = None,
        self_recurrence_schedule: Optional[List[int]] = None,  # Per-step number of self-recurrence passes
        decode_structure: Optional[bool] = True,
        decode_appearance: Optional[bool] = True,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        original_size: Tuple[int, int] = None,
        crops_coords_top_left: Tuple[int, int] = (0, 0),
        target_size: Tuple[int, int] = None,
        clip_skip: Optional[int] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
        """Run the pipeline and return the generated, structure and appearance results.

        Args:
            prompt: Text prompt for the output image. Required unless
                ``prompt_embeds`` is given.
            structure_prompt / appearance_prompt: Prompts for the structure and
                appearance branches. When empty or ``None`` the branch reuses the
                main prompt's embeddings.
            structure_image / appearance_image: Optional reference images. When
                given, the matching ``*_uncond`` entry is dropped from the batch
                and an empty negative prompt is used for that branch.
            num_inference_steps, timesteps: Forwarded to ``retrieve_timesteps``.
            negative_prompt: Negative prompt for the main prompt, and for a branch
                that has no reference image.
            positive_prompt: NOTE(review): accepted but not used in this method.
            height, width: Output size; default to
                ``default_sample_size * vae_scale_factor``.
            guidance_scale: Classifier-free guidance scale for the output.
            structure_guidance_scale / appearance_guidance_scale: Per-branch
                guidance scales; ``None`` falls back to ``guidance_scale``.
            latents / structure_latents / appearance_latents: Optional starting
                noise. A branch without its own latents starts from the main ones.
            *_prompt_embeds / *_pooled_prompt_embeds: Optional pre-computed
                embeddings forwarded to ``encode_prompt``.
            control_schedule: Forwarded to ``cfg_loop``.
            self_recurrence_schedule: Forwarded to ``cfg_loop``; an empty list is
                treated like ``None`` (no recurrence).
            decode_structure / decode_appearance: Whether to VAE-decode the
                respective latents; otherwise the latents are returned.
            output_type: ``"latent"`` returns latents without decoding.
            return_dict: When False (and not in latent mode) a plain tuple
                ``(image, structure, appearance)`` is returned.
            callback_on_step_end, callback_on_step_end_tensor_inputs:
                NOTE(review): accepted but not forwarded to the denoising loop.
            **kwargs: ``callback`` and ``callback_steps`` are popped and forwarded.

        Returns:
            ``CtrlXStableDiffusionXLPipelineOutput`` or a 3-tuple, see ``return_dict``.

        Raises:
            ValueError: If neither ``prompt`` nor ``prompt_embeds`` is given.
        """
        callback = kwargs.pop("callback", None)
        callback_steps = kwargs.pop("callback_steps", None)
        self._guidance_scale = guidance_scale
        # The structure/appearance guidance-scale properties read these attributes;
        # they were never assigned, so the per-branch scales were silently unusable.
        self._structure_guidance_scale = structure_guidance_scale
        self._appearance_guidance_scale = appearance_guidance_scale

        # An empty schedule means "no recurrence"; cfg_loop builds the zero schedule for None.
        if self_recurrence_schedule is not None and len(self_recurrence_schedule) == 0:
            self_recurrence_schedule = None

        # 0. Default height and width to U-Net
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor
        original_size = original_size or (height, width)
        target_size = target_size or (height, width)

        # 2. Only a batch size of 1 is supported
        batch_size = 1
        if isinstance(prompt, list):
            assert len(prompt) == batch_size
        if prompt_embeds is not None:
            assert prompt_embeds.shape[0] == batch_size

        device = self._execution_device

        # 3. Encode input prompt
        text_encoder_lora_scale = (
            cross_attention_kwargs.get("scale", None)
            if cross_attention_kwargs is not None else None
        )

        if prompt is None and prompt_embeds is None:
            raise ValueError("Either `prompt` or `prompt_embeds` must be provided.")

        # 3-3.2 Encode input, structure, appearance prompt
        # Each entry: (text, negative text, embeds, negative embeds, pooled embeds, negative pooled embeds)
        prompt_specs = [
            (
                prompt, negative_prompt,
                prompt_embeds, negative_prompt_embeds,
                pooled_prompt_embeds, negative_pooled_prompt_embeds,
            ),
            (
                structure_prompt, negative_prompt if structure_image is None else "",
                structure_prompt_embeds, None,
                structure_pooled_prompt_embeds, None,
            ),
            (
                appearance_prompt, negative_prompt if appearance_image is None else "",
                appearance_prompt_embeds, None,
                appearance_pooled_prompt_embeds, None,
            ),
        ]
        prompt_embeds_list = []
        add_text_embeds_list = []
        for spec_index, spec in enumerate(prompt_specs):
            text, negative_text, embeds, negative_embeds, pooled, negative_pooled = spec
            has_text = text is not None and text != ""

            # The main prompt (index 0) is always encoded: the other two fall back to it.
            if spec_index == 0 or has_text:
                (
                    cond_embeds,
                    uncond_embeds,
                    cond_pooled,
                    uncond_pooled,
                ) = self.encode_prompt(
                    prompt=text,
                    prompt_2=None,
                    device=device,
                    num_images_per_prompt=num_images_per_prompt,
                    do_classifier_free_guidance=True,
                    negative_prompt=negative_text,
                    negative_prompt_2=None,
                    prompt_embeds=embeds,
                    negative_prompt_embeds=negative_embeds,
                    pooled_prompt_embeds=pooled,
                    negative_pooled_prompt_embeds=negative_pooled,
                    lora_scale=text_encoder_lora_scale,
                    clip_skip=clip_skip,
                )
                # Unconditional embeddings first, conditional second (indexed as [0:1] / [1:2] later).
                prompt_embeds_list.append(torch.cat([uncond_embeds, cond_embeds], dim=0).to(device))
                add_text_embeds_list.append(torch.cat([uncond_pooled, cond_pooled], dim=0).to(device))
            else:
                prompt_embeds_list.append(prompt_embeds_list[0])
                add_text_embeds_list.append(add_text_embeds_list[0])

        # 3.3. Prepare added time ids & embeddings
        if self.text_encoder_2 is None:
            # Use the pooled embeddings actually computed above; the `pooled_prompt_embeds`
            # argument may be None.
            text_encoder_projection_dim = int(add_text_embeds_list[0].shape[-1])
        else:
            text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

        add_time_ids = self._get_add_time_ids(
            original_size,
            crops_coords_top_left,
            target_size,
            dtype=self.dtype,
            text_encoder_projection_dim=text_encoder_projection_dim,
        )
        negative_add_time_ids = add_time_ids
        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0).to(device)

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels

        # No image here, so only the (scaled) starting noise is of interest.
        latents, _ = self.prepare_latents(
            None, batch_size, num_images_per_prompt, num_channels_latents, height, width,
            self.dtype, device, generator, latents
        )
        latents_ = [structure_latents, appearance_latents]
        clean_latents_ = []
        for image_index, image_ in enumerate([structure_image, appearance_image]):
            if image_ is not None:
                # With an image, only the encoded clean latents are of interest.
                _, clean_latent = self.prepare_latents(
                    image_, batch_size, num_images_per_prompt, num_channels_latents, height, width,
                    self.dtype, device, generator, latents_[image_index]
                )
                clean_latents_.append(clean_latent)
            else:
                clean_latents_.append(None)
            if latents_[image_index] is None:
                latents_[image_index] = latents
        latents_ = [latents] + latents_

        # 6. Prepare extra step kwargs
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

        # 7.1 Apply denoising_end
        if hasattr(self, 'denoising_end') and self.denoising_end is not None and 0.0 < float(self.denoising_end) < 1.0:
            discrete_timestep_cutoff = int(
                round(
                    self.scheduler.config.num_train_timesteps
                    - (self.denoising_end * self.scheduler.config.num_train_timesteps)
                )
            )
            num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
            timesteps = timesteps[:num_inference_steps]

        # 7.2 Optionally get guidance scale embedding
        timestep_cond = None
        assert self.unet.config.time_cond_proj_dim is None

        # 7.3 Get batch order
        batch_order = deepcopy(BATCH_ORDER)
        if structure_image is not None:  # If image is provided, not generating, so no CFG needed
            batch_order.remove("structure_uncond")
        if appearance_image is not None:
            batch_order.remove("appearance_uncond")

        baked_latents = self.cfg_loop(batch_order,
                                      prompt_embeds_list,
                                      add_text_embeds_list,
                                      add_time_ids,
                                      latents_,
                                      clean_latents_,
                                      num_inference_steps,
                                      num_warmup_steps,
                                      extra_step_kwargs,
                                      timesteps,
                                      timestep_cond=timestep_cond,
                                      control_schedule=control_schedule,
                                      self_recurrence_schedule=self_recurrence_schedule,
                                      guidance_rescale=guidance_rescale,
                                      callback=callback,
                                      callback_steps=callback_steps,
                                      cross_attention_kwargs=cross_attention_kwargs)
        latents, structure_latents, appearance_latents = baked_latents

        # For passing important information onto the refiner
        self.refiner_args = {"latents": latents.detach(), "prompt": prompt, "negative_prompt": negative_prompt}

        if not output_type == "latent":
            # Remember the VAE dtype so it can be restored after a forced upcast.
            vae_dtype_before_upcast = self.vae.dtype

            # Make sure the VAE is in float32 mode, as it overflows in float16
            if self.vae.config.force_upcast:
                self.upcast_vae()
                vae_dtype = next(iter(self.vae.post_quant_conv.parameters())).dtype
                latents = latents.to(vae_dtype)
                structure_latents = structure_latents.to(vae_dtype)
                appearance_latents = appearance_latents.to(vae_dtype)

            image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
            image = self.image_processor.postprocess(image, output_type=output_type)
            if decode_structure:
                structure = self.vae.decode(structure_latents / self.vae.config.scaling_factor, return_dict=False)[0]
                structure = self.image_processor.postprocess(structure, output_type=output_type)
            else:
                structure = structure_latents
            if decode_appearance:
                appearance = self.vae.decode(appearance_latents / self.vae.config.scaling_factor, return_dict=False)[0]
                appearance = self.image_processor.postprocess(appearance, output_type=output_type)
            else:
                appearance = appearance_latents

            # Restore the VAE's original dtype instead of unconditionally forcing fp16.
            if self.vae.config.force_upcast:
                self.vae.to(dtype=vae_dtype_before_upcast)
        else:
            return CtrlXStableDiffusionXLPipelineOutput(
                images=latents, structures=structure_latents, appearances=appearance_latents
            )

        # Offload all models
        self.maybe_free_model_hooks()

        if not return_dict:
            return image, structure, appearance

        return CtrlXStableDiffusionXLPipelineOutput(images=image, structures=structure, appearances=appearance)

    def cfg_loop(self,
                 batch_order,
                 prompt_embeds_list,
                 add_text_embeds_list,
                 add_time_ids,
                 latents_,
                 clean_latents_,
                 num_inference_steps,
                 num_warmup_steps,
                 extra_step_kwargs,
                 timesteps,
                 timestep_cond=None,
                 control_schedule=None,
                 self_recurrence_schedule=None,
                 guidance_rescale=0.0,
                 callback=None,
                 callback_steps=None,
                 callback_on_step_end=None,
                 callback_on_step_end_tensor_inputs=None,
                 cross_attention_kwargs=None):
        """Denoise the output, structure and appearance latents with classifier-free guidance.

        Args:
            batch_order: Names of the sub-batches to run through the U-Net each
                step. Not modified; a local copy is pruned as control ends.
            prompt_embeds_list: ``[main, structure, appearance]`` prompt
                embeddings; row 0 is used as unconditional, row 1 as conditional.
            add_text_embeds_list: Pooled text embeddings, laid out the same way.
            add_time_ids: Time ids; row 0 unconditional, row 1 conditional.
            latents_: ``[latents, structure_latents, appearance_latents]``.
            clean_latents_: ``[clean_structure_latents, clean_appearance_latents]``;
                an entry is ``None`` when that branch has no reference image.
            num_inference_steps: Number of steps (progress bar total and schedule scaling).
            num_warmup_steps: Steps before progress updates follow ``scheduler.order``.
            extra_step_kwargs: Extra keyword arguments for ``scheduler.step``.
            timesteps: Timesteps to iterate over.
            timestep_cond: Forwarded to the U-Net.
            control_schedule: Passed to ``get_last_control_i`` to find when
                structure/appearance control stops.
            self_recurrence_schedule: Number of extra recurrence passes per step.
                ``None``, empty, or missing trailing entries mean no recurrence.
            guidance_rescale: When > 0, ``rescale_noise_cfg`` is applied.
            callback, callback_steps: Legacy callback, called as
                ``callback(step_idx, t, latents)``; ``callback_steps=None`` is
                treated as 1.
            callback_on_step_end: Not supported; must be ``None``.
            callback_on_step_end_tensor_inputs: Unused.
            cross_attention_kwargs: Forwarded to the U-Net.

        Returns:
            Tuple ``(latents, structure_latents, appearance_latents)``. Branches
            with clean latents return those clean latents.
        """
        # Work on a copy so pruning entries below never leaks into the caller's list.
        batch_order = list(batch_order)

        prompt_embeds, structure_prompt_embeds, appearance_prompt_embeds = prompt_embeds_list
        add_text_embeds, structure_add_text_embeds, appearance_add_text_embeds = add_text_embeds_list
        latents, structure_latents, appearance_latents = latents_
        clean_structure_latents, clean_appearance_latents = clean_latents_
        structure_control_stop_i, appearance_control_stop_i = get_last_control_i(control_schedule, num_inference_steps)

        # Missing or empty schedule means no self-recurrence at any step.
        if self_recurrence_schedule is None or len(self_recurrence_schedule) == 0:
            self_recurrence_schedule = [0] * len(timesteps)

        # The legacy callback interval defaults to every step; `i % None` would raise.
        if callback_steps is None:
            callback_steps = 1

        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                if hasattr(self, 'interrupt') and self.interrupt:
                    continue

                if i == structure_control_stop_i:  # If not generating structure/appearance, drop after last control
                    if "structure_uncond" not in batch_order:
                        batch_order.remove("structure_cond")
                if i == appearance_control_stop_i:
                    if "appearance_uncond" not in batch_order:
                        batch_order.remove("appearance_cond")

                register_attr(self, t=t.item(), do_control=True, batch_order=batch_order)

                # With CFG.
                latent_model_input = self.scheduler.scale_model_input(latents, t)
                structure_latent_model_input = self.scheduler.scale_model_input(structure_latents, t)
                appearance_latent_model_input = self.scheduler.scale_model_input(appearance_latents, t)

                # Per-sub-batch inputs; only those named in batch_order are concatenated below.
                all_latent_model_input = {
                    "structure_uncond": structure_latent_model_input[0:1],
                    "appearance_uncond": appearance_latent_model_input[0:1],
                    "uncond": latent_model_input[0:1],
                    "structure_cond": structure_latent_model_input[0:1],
                    "appearance_cond": appearance_latent_model_input[0:1],
                    "cond": latent_model_input[0:1],
                }
                all_prompt_embeds = {
                    "structure_uncond": structure_prompt_embeds[0:1],
                    "appearance_uncond": appearance_prompt_embeds[0:1],
                    "uncond": prompt_embeds[0:1],
                    "structure_cond": structure_prompt_embeds[1:2],
                    "appearance_cond": appearance_prompt_embeds[1:2],
                    "cond": prompt_embeds[1:2],
                }
                all_add_text_embeds = {
                    "structure_uncond": structure_add_text_embeds[0:1],
                    "appearance_uncond": appearance_add_text_embeds[0:1],
                    "uncond": add_text_embeds[0:1],
                    "structure_cond": structure_add_text_embeds[1:2],
                    "appearance_cond": appearance_add_text_embeds[1:2],
                    "cond": add_text_embeds[1:2],
                }
                all_time_ids = {
                    "structure_uncond": add_time_ids[0:1],
                    "appearance_uncond": add_time_ids[0:1],
                    "uncond": add_time_ids[0:1],
                    "structure_cond": add_time_ids[1:2],
                    "appearance_cond": add_time_ids[1:2],
                    "cond": add_time_ids[1:2],
                }

                concat_latent_model_input = batch_dict_to_tensor(all_latent_model_input, batch_order)
                concat_prompt_embeds = batch_dict_to_tensor(all_prompt_embeds, batch_order)
                concat_add_text_embeds = batch_dict_to_tensor(all_add_text_embeds, batch_order)
                concat_add_time_ids = batch_dict_to_tensor(all_time_ids, batch_order)

                # Predict the noise residual
                added_cond_kwargs = {"text_embeds": concat_add_text_embeds, "time_ids": concat_add_time_ids}

                concat_noise_pred = self.unet(
                    concat_latent_model_input,
                    t,
                    encoder_hidden_states=concat_prompt_embeds,
                    timestep_cond=timestep_cond,
                    cross_attention_kwargs=cross_attention_kwargs,
                    added_cond_kwargs=added_cond_kwargs,
                ).sample
                all_noise_pred = batch_tensor_to_dict(concat_noise_pred, batch_order)

                # Classifier-free guidance
                noise_pred = all_noise_pred["uncond"] +\
                    self.guidance_scale * (all_noise_pred["cond"] - all_noise_pred["uncond"])

                # Once a branch has been dropped from the batch, reuse the output prediction
                # so the three-way scheduler step below keeps its shape.
                structure_noise_pred = all_noise_pred["structure_cond"]\
                    if "structure_cond" in batch_order else noise_pred
                if "structure_uncond" in all_noise_pred:
                    structure_noise_pred = all_noise_pred["structure_uncond"] +\
                        self.structure_guidance_scale * (structure_noise_pred - all_noise_pred["structure_uncond"])

                appearance_noise_pred = all_noise_pred["appearance_cond"]\
                    if "appearance_cond" in batch_order else noise_pred
                if "appearance_uncond" in all_noise_pred:
                    appearance_noise_pred = all_noise_pred["appearance_uncond"] +\
                        self.appearance_guidance_scale * (appearance_noise_pred - all_noise_pred["appearance_uncond"])

                if guidance_rescale > 0.0:
                    noise_pred = rescale_noise_cfg(
                        noise_pred, all_noise_pred["cond"], guidance_rescale=guidance_rescale
                    )
                    if "structure_uncond" in all_noise_pred:
                        structure_noise_pred = rescale_noise_cfg(
                            structure_noise_pred, all_noise_pred["structure_cond"],
                            guidance_rescale=guidance_rescale
                        )
                    if "appearance_uncond" in all_noise_pred:
                        appearance_noise_pred = rescale_noise_cfg(
                            appearance_noise_pred, all_noise_pred["appearance_cond"],
                            guidance_rescale=guidance_rescale
                        )

                # Compute the previous noisy sample x_t -> x_t-1
                concat_noise_pred = torch.cat(
                    [structure_noise_pred, appearance_noise_pred, noise_pred], dim=0,
                )
                concat_latents = torch.cat(
                    [structure_latents, appearance_latents, latents], dim=0,
                )
                structure_latents, appearance_latents, latents = self.scheduler.step(
                    concat_noise_pred, t, concat_latents, **extra_step_kwargs,
                ).prev_sample.chunk(3)

                # With a reference image, the branch latents are re-derived from the clean latents
                # (via noise_prev) rather than taken from the scheduler step.
                if clean_structure_latents is not None:
                    structure_latents = noise_prev(self.scheduler, t, clean_structure_latents)
                if clean_appearance_latents is not None:
                    appearance_latents = noise_prev(self.scheduler, t, clean_appearance_latents)

                # Self-recurrence (steps beyond the end of the schedule get none)
                num_recurrences = self_recurrence_schedule[i] if i < len(self_recurrence_schedule) else 0
                for _ in range(num_recurrences):
                    if hasattr(self.scheduler, "_step_index"):  # For fancier schedulers
                        self.scheduler._step_index -= 1  # TODO: Does this actually work?

                    t_prev = 0 if i + 1 >= num_inference_steps else timesteps[i + 1]
                    latents = noise_t2t(self.scheduler, t_prev, t, latents)
                    latent_model_input = torch.cat([latents] * 2)

                    register_attr(self, t=t.item(), do_control=False, batch_order=["uncond", "cond"])

                    # Predict the noise residual
                    added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
                    noise_pred_uncond, noise_pred_ = self.unet(
                        latent_model_input,
                        t,
                        encoder_hidden_states=prompt_embeds,
                        timestep_cond=timestep_cond,
                        cross_attention_kwargs=cross_attention_kwargs,
                        added_cond_kwargs=added_cond_kwargs,
                    ).sample.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_ - noise_pred_uncond)

                    if guidance_rescale > 0.0:
                        noise_pred = rescale_noise_cfg(noise_pred, noise_pred_, guidance_rescale=guidance_rescale)

                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

                # Callbacks
                assert callback_on_step_end is None

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()
                    if callback is not None and i % callback_steps == 0:
                        step_idx = i // getattr(self.scheduler, "order", 1)
                        callback(step_idx, t, latents)

        # "Reconstruction"
        if clean_structure_latents is not None:
            structure_latents = clean_structure_latents
        if clean_appearance_latents is not None:
            appearance_latents = clean_appearance_latents

        return latents, structure_latents, appearance_latents

    @property
    def appearance_guidance_scale(self):
        """Guidance scale for the appearance branch.

        Returns ``self._appearance_guidance_scale`` when it is set to a value,
        otherwise falls back to ``self._guidance_scale``. A never-assigned
        attribute is treated like ``None`` instead of raising ``AttributeError``.
        """
        scale = getattr(self, "_appearance_guidance_scale", None)
        return self._guidance_scale if scale is None else scale

    @property
    def structure_guidance_scale(self):
        """Guidance scale for the structure branch.

        Returns ``self._structure_guidance_scale`` when it is set to a value,
        otherwise falls back to ``self._guidance_scale``. A never-assigned
        attribute is treated like ``None`` instead of raising ``AttributeError``.
        """
        scale = getattr(self, "_structure_guidance_scale", None)
        return self._guidance_scale if scale is None else scale

    def prepare_latents(self, image, batch_size, num_images_per_prompt, num_channels_latents, height, width,
                        dtype, device, generator=None, noise=None):
        """Prepare starting noise and, if an image is given, its clean VAE latents.

        Args:
            image: ``None``, a ``torch.Tensor``, a ``PIL.Image.Image`` or a list.
                Non-tensor inputs are converted with ``self.image_processor.preprocess``.
                A tensor with 4 channels is taken to be latents already.
            batch_size: Prompt batch size; multiplied by ``num_images_per_prompt``.
            num_images_per_prompt: Images per prompt.
            num_channels_latents: Channel count of newly sampled noise.
            height, width: Pixel size; divided by ``vae_scale_factor`` for the noise shape.
            dtype, device: Target dtype/device of the tensors.
            generator: Optional generator or list of generators (one per batch element).
            noise: Optional pre-made noise; only moved to ``device`` and not rescaled.

        Returns:
            Tuple ``(noise, init_latents)``; ``init_latents`` is ``None`` when
            ``image`` is ``None``.

        Raises:
            ValueError: For an unsupported ``image`` type, a generator list whose
                length differs from the effective batch size, or an image batch
                that cannot be tiled to the effective batch size.
        """
        batch_size = batch_size * num_images_per_prompt

        if noise is None:
            shape = (
                batch_size,
                num_channels_latents,
                height // self.vae_scale_factor,
                width // self.vae_scale_factor
            )
            noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            noise = noise * self.scheduler.init_noise_sigma  # Starting noise, need to scale
        else:
            noise = noise.to(device)

        if image is None:
            return noise, None

        if not isinstance(image, (torch.Tensor, Image.Image, list)):
            raise ValueError(
                f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}"
            )

        # Offload text encoder if `enable_model_cpu_offload` was enabled
        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
            if self.text_encoder_2 is not None:  # The second text encoder may be absent
                self.text_encoder_2.to("cpu")
            torch.cuda.empty_cache()

        # PIL images and lists pass the type check above but have no `.to`;
        # convert them to a tensor first. Tensors are left untouched.
        if not isinstance(image, torch.Tensor):
            image = self.image_processor.preprocess(image, height=height, width=width)

        image = image.to(device=device, dtype=dtype)

        if image.shape[1] == 4:  # Image already in latents form
            init_latents = image

        else:
            # Make sure the VAE is in float32 mode, as it overflows in float16
            if self.vae.config.force_upcast:
                image = image.to(torch.float32)
                self.vae.to(torch.float32)

            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )
            elif isinstance(generator, list):
                init_latents = [
                    retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i])
                    for i in range(batch_size)
                ]
                init_latents = torch.cat(init_latents, dim=0)
            else:
                init_latents = retrieve_latents(self.vae.encode(image), generator=generator)

            if self.vae.config.force_upcast:
                self.vae.to(dtype)

            init_latents = init_latents.to(dtype)
            init_latents = self.vae.config.scaling_factor * init_latents

        num_init_latents = init_latents.shape[0]
        if batch_size > num_init_latents:
            if batch_size % num_init_latents != 0:
                raise ValueError(
                    f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts."
                )
            # Tile init_latents up to the effective batch size
            additional_image_per_prompt = batch_size // num_init_latents
            init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0)
        else:
            init_latents = torch.cat([init_latents], dim=0)

        return noise, init_latents