QY-H00 committed on
Commit 0320907
1 Parent(s): c824a93
README.md CHANGED
@@ -1,10 +1,10 @@
  ---
- title: AID V2
- emoji: 🏃
- colorFrom: green
- colorTo: indigo
+ title: PAID
+ emoji: 🏢
+ colorFrom: pink
+ colorTo: red
  sdk: gradio
- sdk_version: 5.1.0
+ sdk_version: 4.22.0
  app_file: app.py
  pinned: false
  ---
app.py ADDED
@@ -0,0 +1,517 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import pandas as pd
7
+ import torch
8
+ from PIL import Image
9
+
10
+ from pipeline_interpolated_sd import InterpolationStableDiffusionPipeline
11
+ from pipeline_interpolated_sdxl import InterpolationStableDiffusionXLPipeline
12
+ from prior import BetaPriorPipeline
13
+
14
+
15
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
16
+
17
+ title = r"""
18
+ <h1 align="center">PAID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion</h1>
19
+ """
20
+
21
+ description = r"""
22
+ <b>Official 🤗 Gradio demo</b> for <a href='https://github.com/QY-H00/attention-interpolation-diffusion/tree/public' target='_blank'><b>PAID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion</b></a>.<br>
23
+ How to use:<br>
24
+ 1. Input prompt 1, prompt 2 and the negative prompt.
+ 2. For <b>Compositional Generation</b>: also input the guidance prompt, then pick the result you are satisfied with!
+ 3. For <b>Image Morphing</b>: input image prompt 1 and image prompt 2, and choose IP-Adapter.
+ 4. For <b>Scale Control</b>: use the same text for prompt 1 and prompt 2, leave image prompt 1 blank, upload image prompt 2, then choose IP-Adapter or IP-Composition-Adapter.
+ 5. <b>Note that the SD series with an exploration size of 10 takes around 120 seconds; the XL series with an exploration size of 5 takes around 5 minutes 30 seconds.</b>
+ 6. Click the <b>Generate</b> button to begin generating images.
+ 7. Enjoy! 😊"""
31
+
32
+ article = r"""
33
+ ---
34
+ ✒️ **Citation**
35
+ <br>
36
+ If you found this demo/our paper useful, please consider citing:
37
+ ```bibtex
38
+ @article{he2024aid,
39
+ title={AID: Attention Interpolation of Text-to-Image Diffusion},
40
+ author={He, Qiyuan and Wang, Jinghao and Liu, Ziwei and Yao, Angela},
41
+ journal={arXiv preprint arXiv:2403.17924},
42
+ year={2024}
43
+ }
44
+ ```
45
+ 📧 **Contact**
46
+ <br>
47
+ If you have any questions, please feel free to open an issue in our <a href='https://github.com/QY-H00/attention-interpolation-diffusion/tree/public' target='_blank'><b>Github Repo</b></a> or reach out to us directly at <b>[email protected]</b>.
48
+ """
49
+
50
+ MAX_SEED = np.iinfo(np.int32).max
51
+ CACHE_EXAMPLES = False
52
+ USE_TORCH_COMPILE = False
53
+ ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD") == "1"
54
+ PREVIEW_IMAGES = False
55
+
56
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
57
+ pipeline = InterpolationStableDiffusionPipeline.from_pretrained(
58
+ "SG161222/Realistic_Vision_V4.0_noVAE",
59
+ torch_dtype=torch.float16
60
+ )
61
+ pipeline.to(device, dtype=torch.float16)
62
+
63
+
64
+ def change_model_fn(model_name: str) -> None:
65
+ global device
66
+ name_mapping = {
67
+ "AOM3": "hogiahien/aom3",
+ "SD1.4-512": "CompVis/stable-diffusion-v1-4",
+ "SD1.5-512": "stable-diffusion-v1-5/stable-diffusion-v1-5",
69
+ "SD2.1-768": "stabilityai/stable-diffusion-2-1",
70
+ "RealVis-v4.0": "SG161222/Realistic_Vision_V4.0_noVAE",
71
+ "SDXL-1024": "stabilityai/stable-diffusion-xl-base-1.0",
72
+ "Playground-XL-v2": "playgroundai/playground-v2.5-1024px-aesthetic",
73
+ "Juggernaut-XL-v9": "RunDiffusion/Juggernaut-XL-v9"
74
+ }
75
+ dtype = torch.float16  # float16 on both CPU and GPU, matching the global pipeline above
79
+ if "XL" not in model_name:
80
+ globals()["pipeline"] = InterpolationStableDiffusionPipeline.from_pretrained(
81
+ name_mapping[model_name], torch_dtype=dtype
82
+ )
83
+ globals()["pipeline"].to(device, dtype=torch.float16)
84
+ else:
85
+ globals()["pipeline"] = InterpolationStableDiffusionXLPipeline.from_pretrained(
86
+ name_mapping[model_name], torch_dtype=dtype
87
+ )
88
+ globals()["pipeline"].to(device)
89
+
90
+
91
+ def change_adapter_fn(adapter_name: str) -> None:
92
+ global pipeline
93
+ if adapter_name == "IP-Adapter":
94
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
95
+ pipeline.load_aid_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
96
+ else:
97
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", "", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
98
+ elif adapter_name == "IP-Composition-Adapter":
99
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
100
+ pipeline.load_aid_ip_adapter("ostris/ip-composition-adapter", subfolder="", weight_name="ip_plus_composition_sd15.safetensors")
101
+ else:
102
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", subfolder="", weight_name="ip_plus_composition_sdxl.safetensors")
103
+ else:
104
+ pipeline.load_aid()
105
+
106
+
107
+ def save_image(img, index):
108
+ unique_name = f"{index}.png"
109
+ img = Image.fromarray(img)
110
+ img.save(unique_name)
111
+ return unique_name
112
+
113
+
114
+ def get_example() -> list[list[str | float | int ]]:
115
+ case = [
116
+ [
117
+ "A statue",
118
+ "A dragon",
119
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
120
+ "",
121
+ None,
122
+ None,
123
+ 50,
124
+ 10,
125
+ 5,
126
+ 5.0,
127
+ 0.5,
128
+ "RealVis-v4.0",
129
+ "None",
130
+ 0,
131
+ True,
132
+ ],
133
+ [
134
+ "A photo of a statue",
135
+ "Het meisje met de parel, by Vermeer",
136
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
137
+ "",
138
+ Image.open("asset/statue.jpg"),
139
+ Image.open("asset/vermeer.jpg"),
140
+ 50,
141
+ 10,
142
+ 5,
143
+ 5.0,
144
+ 0.5,
145
+ "RealVis-v4.0",
146
+ "IP-Adapter",
147
+ 0,
148
+ True,
149
+ ],
150
+ [
151
+ "A boy is smiling",
152
+ "A boy is smiling",
153
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
154
+ "",
155
+ None,
156
+ Image.open("asset/vermeer.jpg"),
157
+ 50,
158
+ 10,
159
+ 5,
160
+ 5.0,
161
+ 0.5,
162
+ "RealVis-v4.0",
163
+ "IP-Composition-Adapter",
164
+ 0,
165
+ True,
166
+ ],
167
+ [
168
+ "masterpiece, best quality, very aesthetic, absurdres, A dog",
169
+ "masterpiece, best quality, very aesthetic, absurdres, A car",
170
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
171
+ "masterpiece, best quality, very aesthetic, absurdres, the toy, named 'Dog-Car', is designed as a dog figure with car wheels instead of feet",
172
+ None,
173
+ None,
174
+ 50,
175
+ 5,
176
+ 5,
177
+ 5.0,
178
+ 0.5,
179
+ "RealVis-v4.0",
180
+ "None",
181
+ 1002,
182
+ True
183
+ ],
184
+ [
185
+ "masterpiece, best quality, very aesthetic, absurdres, A dog",
186
+ "masterpiece, best quality, very aesthetic, absurdres, A car",
187
+ "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
188
+ "masterpiece, best quality, very aesthetic, absurdres, a dog is driving a car",
189
+ None,
190
+ None,
191
+ 28,
192
+ 5,
193
+ 5,
194
+ 5.0,
195
+ 0.5,
196
+ "Playground-XL-v2",
197
+ "None",
198
+ 1002,
199
+ True
200
+ ]
201
+ # [
202
+ # "masterpiece, best quality, very aesthetic, absurdres, A cat is smiling, face portrait",
203
+ # "masterpiece, best quality, very aesthetic, absurdres, A beautiful lady, face portrait",
204
+ # "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
205
+ # None,
206
+ # None,
207
+ # None,
208
+ # 28,
209
+ # 7,
210
+ # 5,
211
+ # 5.0,
212
+ # 1.0,
213
+ # "Playground-XL-v2"
214
+ # ],
215
+ # [
216
+ # "masterpiece, best quality, very aesthetic, absurdres, A dog",
217
+ # "masterpiece, best quality, very aesthetic, absurdres, A car",
218
+ # "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
219
+ # "masterpiece, best quality, very aesthetic, absurdres, the toy, named 'Dog-Car', is designed as a dog figure with car wheels instead of feet",
220
+ # None,
221
+ # None,
222
+ # 28,
223
+ # 5,
224
+ # 5,
225
+ # 5.0,
226
+ # 0.5,
227
+ # "Playground-XL-v2"
228
+ # ],
229
+
230
+ ]
231
+ return case
232
+
233
+
234
+ def change_generate_button_fn(enable: int) -> gr.Button:
235
+ if enable == 0:
236
+ return gr.Button(interactive=False, value="Switching Model...")
237
+ else:
238
+ return gr.Button(interactive=True, value="Generate")
239
+
240
+
241
+ def dynamic_gallery_fn(interpolation_size: int):
242
+ return gr.Gallery(
243
+ label="Result", show_label=False, rows=1, columns=interpolation_size
244
+ )
245
+
246
+
247
+ @torch.no_grad()
248
+ def generate(
249
+ prompt1,
250
+ prompt2,
251
+ negative_prompt,
252
+ guide_prompt=None,
253
+ image_prompt1=None,
254
+ image_prompt2=None,
255
+ num_inference_steps=28,
256
+ exploration_size=16,
257
+ interpolation_size=7,
258
+ guidance_scale=5.0,
259
+ warmup_ratio=0.5,
260
+ seed=0,
261
+ same_latent=True,
262
+ ) -> np.ndarray:
263
+ global pipeline
264
+ global adapter_choice
265
+ beta_pipe = BetaPriorPipeline(pipeline)
266
+ if guide_prompt == "":
267
+ guide_prompt = None
268
+ generator = (
269
+ torch.cuda.manual_seed(seed)
270
+ if torch.cuda.is_available()
271
+ else torch.manual_seed(seed)
272
+ )
273
+ size = pipeline.unet.config.sample_size
274
+ latent1 = torch.randn((1, 4, size, size), device=device, dtype=pipeline.unet.dtype, generator=generator)
+ if same_latent:
+ latent2 = latent1.clone()
+ else:
+ latent2 = torch.randn((1, 4, size, size), device=device, dtype=pipeline.unet.dtype, generator=generator)
279
+
280
+ if image_prompt1 is None and image_prompt2 is None:
281
+ pipeline.load_aid()
282
+ elif (image_prompt1 is not None and image_prompt2 is not None):
283
+ if adapter_choice.value == "IP-Adapter":
284
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
285
+ pipeline.load_aid_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
286
+ else:
287
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", "", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
288
+ elif adapter_choice.value == "IP-Composition-Adapter":
289
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
290
+ pipeline.load_aid_ip_adapter("ostris/ip-composition-adapter", subfolder="", weight_name="ip_plus_composition_sd15.safetensors")
291
+ else:
292
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", subfolder="", weight_name="ip_plus_composition_sdxl.safetensors")
293
+ elif (image_prompt1 is None and image_prompt2 is not None):
294
+ if adapter_choice.value == "IP-Adapter":
295
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
296
+ pipeline.load_aid_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin", early="scale_control")
297
+ else:
298
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", "", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors", early="scale_control")
299
+ elif adapter_choice.value == "IP-Composition-Adapter":
300
+ if isinstance(pipeline, InterpolationStableDiffusionPipeline):
301
+ pipeline.load_aid_ip_adapter("ostris/ip-composition-adapter", subfolder="", weight_name="ip_plus_composition_sd15.safetensors", early="scale_control")
302
+ else:
303
+ pipeline.load_aid_ip_adapter("ozzygt/sdxl-ip-adapter", subfolder="", weight_name="ip_plus_composition_sdxl.safetensors", early="scale_control")
304
+ else:
305
+ raise ValueError("To use scale control, please provide only the right image; to use image morphing, please provide images on both sides.")
306
+ images = beta_pipe.generate_interpolation(
307
+ gr.Progress(),
308
+ prompt1,
309
+ prompt2,
310
+ negative_prompt,
311
+ latent1,
312
+ latent2,
313
+ num_inference_steps,
314
+ image_start=image_prompt1,
315
+ image_end=image_prompt2,
316
+ exploration_size=exploration_size,
317
+ interpolation_size=interpolation_size,
318
+ output_type="np",
319
+ guide_prompt=guide_prompt,
320
+ guidance_scale=guidance_scale,
321
+ warmup_ratio=warmup_ratio
322
+ )
323
+ return images
324
+
325
+
326
+ interpolation_size = None
327
+
328
+ with gr.Blocks(css="style.css") as demo:
329
+ gr.Markdown(title)
330
+ gr.Markdown(description)
331
+ with gr.Row(elem_classes="grid-container"):
332
+ with gr.Group():
333
+ with gr.Column(elem_classes="grid-item"):  # left column
334
+ prompt1 = gr.Text(
335
+ label="Prompt 1",
336
+ max_lines=3,
337
+ placeholder="Enter the First Prompt",
338
+ interactive=True,
339
+ value="A photo of a cat",
340
+ )
341
+ prompt2 = gr.Text(
342
+ label="Prompt 2",
343
+ max_lines=3,
344
+ placeholder="Enter the Second Prompt",
345
+ interactive=True,
346
+ value="A photo of a beautiful lady",
347
+ )
348
+ negative_prompt = gr.Text(
349
+ label="Negative prompt",
350
+ max_lines=3,
351
+ placeholder="Enter a Negative Prompt",
352
+ interactive=True,
353
+ value="nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
354
+ )
355
+ guidance_prompt = gr.Text(
356
+ label="Guidance prompt (Optional)",
357
+ max_lines=3,
358
+ placeholder="Enter a Guidance Prompt",
359
+ interactive=True,
360
+ value="",
361
+ )
362
+
363
+ with gr.Group():
364
+ with gr.Column(elem_classes="grid-item"):  # right column
365
+ with gr.Row(elem_classes="flex-grow"):
366
+ image_prompt1 = gr.Image(label="Image Prompt 1 (Optional)", interactive=True, height=236, width=235)
367
+ image_prompt2 = gr.Image(label="Image Prompt 2 (Optional)", interactive=True, height=236, width=235)
368
+ with gr.Row(elem_classes="flex-grow"):
369
+ model_choice = gr.Dropdown(
370
+ ["RealVis-v4.0", "SD1.4-512", "SD1.5-512", "SD2.1-768", "AOM3", "SDXL-1024", "Playground-XL-v2", "Juggernaut-XL-v9"],
371
+ label="Model",
372
+ value="RealVis-v4.0",
373
+ interactive=True,
374
+ info="All models run in float16; SD2.1 does not support IP-Adapter; the XL series takes longer",
375
+ )
376
+ adapter_choice = gr.Dropdown(
377
+ ["None", "IP-Adapter", "IP-Composition-Adapter"],
378
+ label="IP-Adapter",
379
+ value="None",
380
+ interactive=True,
381
+ info="Only set to IP-Adapter or IP-Composition-Adapter when using an image prompt",
382
+ )
383
+
384
+ with gr.Group():
385
+ result = gr.Gallery(label="Result", show_label=False, rows=1, columns=3)
386
+ generate_button = gr.Button(value="Generate", variant="primary")
387
+
388
+ with gr.Accordion("Advanced options", open=True):
389
+ with gr.Group():
390
+ with gr.Row():
391
+ with gr.Column():
392
+ interpolation_size = gr.Slider(
393
+ label="Interpolation Size",
394
+ minimum=3,
395
+ maximum=7,
396
+ step=1,
397
+ value=5,
398
+ info="Interpolation size includes the start and end images",
399
+ )
400
+ exploration_size = gr.Slider(
401
+ label="Exploration Size",
402
+ minimum=7,
403
+ maximum=16,
404
+ step=1,
405
+ value=10,
406
+ info="Exploration size has to be larger than interpolation size",
407
+ )
408
+ with gr.Row():
409
+ with gr.Column():
410
+ warmup_ratio = gr.Slider(
411
+ label="Warmup Ratio",
412
+ minimum=0.02,
413
+ maximum=1,
414
+ step=0.01,
415
+ value=0.5,
416
+ interactive=True,
417
+ )
418
+ guidance_scale = gr.Slider(
419
+ label="Guidance Scale",
420
+ minimum=0,
421
+ maximum=20,
422
+ step=0.1,
423
+ value=5.0,
424
+ interactive=True,
425
+ )
426
+ num_inference_steps = gr.Slider(
427
+ label="Inference Steps",
428
+ minimum=25,
429
+ maximum=50,
430
+ step=1,
431
+ value=50,
432
+ interactive=True,
433
+ )
434
+ with gr.Column():
435
+ seed = gr.Slider(
436
+ label="Seed",
437
+ minimum=0,
438
+ maximum=MAX_SEED,
439
+ step=1,
440
+ value=0,
441
+ )
442
+ same_latent = gr.Checkbox(
443
+ label="Same latent",
444
+ value=False,
445
+ info="Use the same latent for start and end images",
446
+ show_label=True,
447
+ )
448
+
449
+ gr.Examples(
450
+ examples=get_example(),
451
+ inputs=[
452
+ prompt1,
453
+ prompt2,
454
+ negative_prompt,
455
+ guidance_prompt,
456
+ image_prompt1,
457
+ image_prompt2,
458
+ num_inference_steps,
459
+ exploration_size,
460
+ interpolation_size,
461
+ guidance_scale,
462
+ warmup_ratio,
463
+ model_choice,
464
+ adapter_choice,
465
+ seed,
466
+ same_latent,
467
+ ],
468
+ cache_examples=CACHE_EXAMPLES,
469
+ )
470
+
471
+ model_choice.change(
472
+ fn=change_generate_button_fn,
473
+ inputs=gr.Number(0, visible=False),
474
+ outputs=generate_button,
475
+ ).then(fn=change_model_fn, inputs=model_choice).then(
476
+ fn=change_generate_button_fn,
477
+ inputs=gr.Number(1, visible=False),
478
+ outputs=generate_button,
479
+ )
480
+
481
+ adapter_choice.change(
482
+ fn=change_generate_button_fn,
483
+ inputs=gr.Number(0, visible=False),
484
+ outputs=generate_button,
485
+ ).then(fn=change_adapter_fn, inputs=[adapter_choice]).then(
486
+ fn=change_generate_button_fn,
487
+ inputs=gr.Number(1, visible=False),
488
+ outputs=generate_button,
489
+ )
490
+
491
+ inputs = [
492
+ prompt1,
493
+ prompt2,
494
+ negative_prompt,
495
+ guidance_prompt,
496
+ image_prompt1,
497
+ image_prompt2,
498
+ num_inference_steps,
499
+ exploration_size,
500
+ interpolation_size,
501
+ guidance_scale,
502
+ warmup_ratio,
503
+ seed,
504
+ same_latent,
505
+ ]
506
+ generate_button.click(
507
+ fn=dynamic_gallery_fn,
508
+ inputs=interpolation_size,
509
+ outputs=result,
510
+ ).then(
511
+ fn=generate,
512
+ inputs=inputs,
513
+ outputs=result,
514
+ )
515
+ gr.Markdown(article)
516
+
517
+ demo.launch()
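For readers skimming the diff, the sketch below condenses the generation path that app.py wires through Gradio into a plain script. It is a minimal sketch, not a documented API: it only reuses calls that literally appear in app.py (`from_pretrained`, `load_aid`, `BetaPriorPipeline.generate_interpolation`), the prompt values are placeholders, and the `gr.Progress()` argument is kept solely to mirror the demo's call signature (outside a Gradio event it may be inert).

```python
# Condensed, non-UI sketch of the generation path used in app.py above.
# Assumptions: a CUDA GPU and this repo's modules on PYTHONPATH.
import gradio as gr
import torch

from pipeline_interpolated_sd import InterpolationStableDiffusionPipeline
from prior import BetaPriorPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = InterpolationStableDiffusionPipeline.from_pretrained(
    "SG161222/Realistic_Vision_V4.0_noVAE", torch_dtype=torch.float16
)
pipe.to(device, dtype=torch.float16)
pipe.load_aid()  # text-only interpolation, i.e. no IP-Adapter branch

beta_pipe = BetaPriorPipeline(pipe)
size = pipe.unet.config.sample_size
generator = torch.Generator(device=device).manual_seed(0)
latent1 = torch.randn((1, 4, size, size), device=device, dtype=pipe.unet.dtype, generator=generator)
latent2 = latent1.clone()  # the "same latent" setting from the UI

images = beta_pipe.generate_interpolation(
    gr.Progress(),          # app.py passes the Gradio progress tracker here
    "A statue",             # prompt 1
    "A dragon",             # prompt 2
    "lowres, bad quality",  # negative prompt (placeholder)
    latent1,
    latent2,
    50,                     # num_inference_steps
    image_start=None,
    image_end=None,
    exploration_size=10,
    interpolation_size=5,
    output_type="np",
    guide_prompt=None,
    guidance_scale=5.0,
    warmup_ratio=0.5,
)
print(images.shape if hasattr(images, "shape") else len(images))
```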
asset/statue.jpg ADDED
asset/vermeer.jpg ADDED
interpolation.py ADDED
@@ -0,0 +1,918 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from torch import FloatTensor, LongTensor, Size, Tensor
5
+ from torch import nn as nn
6
+
7
+ from prior import generate_beta_tensor
8
+
9
+
10
+ class InterpolatedAttnProcessor(nn.Module):
11
+ def __init__(
12
+ self,
13
+ t: Optional[float] = None,
14
+ size: int = 7,
15
+ is_fused: bool = False,
16
+ alpha: float = 1,
17
+ beta: float = 1,
18
+ ):
19
+ super().__init__()
20
+ if t is None:
21
+ ts = generate_beta_tensor(size, alpha=alpha, beta=beta)
22
+ ts[0], ts[-1] = 0, 1
23
+ else:
24
+ assert t > 0 and t < 1, "t must be between 0 and 1"
25
+ ts = [0, t, 1]
26
+ ts = torch.tensor(ts)
27
+ size = 3
28
+
29
+ self.size = size
30
+ self.coef = ts
31
+ self.is_fused = is_fused
32
+ self.activated = True
33
+
34
+ def deactivate(self):
35
+ self.activated = False
36
+
37
+ def activate(self, t):
38
+ self.activated = True
39
+ assert t > 0 and t < 1, "t must be between 0 and 1"
40
+ ts = [0, t, 1]
41
+ ts = torch.tensor(ts)
42
+ self.coef = ts
43
+
44
+ def load_end_point(self, key_begin, value_begin, key_end, value_end):
45
+ self.key_begin = key_begin
46
+ self.value_begin = value_begin
47
+ self.key_end = key_end
48
+ self.value_end = value_end
49
+
50
+
51
+ class ScaleControlIPAttnProcessor(InterpolatedAttnProcessor):
52
+ r"""
53
+ Personalized processor for controlling the impact of the image prompt via attention interpolation.
54
+ """
55
+
56
+ def __init__(
57
+ self,
58
+ t: Optional[float] = None,
59
+ size: int = 7,
60
+ is_fused: bool = False,
61
+ alpha: float = 1,
62
+ beta: float = 1,
63
+ ip_attn: Optional[nn.Module] = None,
64
+ ):
65
+ """
66
+ t: float, interpolation point between 0 and 1, if specified, size is set to 3
67
+ """
68
+ super().__init__(t=t, size=size, is_fused=is_fused, alpha=alpha, beta=beta)
69
+
70
+ self.num_tokens = (
71
+ ip_attn.num_tokens if hasattr(ip_attn, "num_tokens") else (16,)
72
+ )
73
+ self.scale = ip_attn.scale if hasattr(ip_attn, "scale") else None
74
+ self.ip_attn = ip_attn
75
+
76
+ def __call__(
77
+ self,
78
+ attn,
79
+ hidden_states: torch.FloatTensor,
80
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
81
+ attention_mask: Optional[torch.FloatTensor] = None,
82
+ temb: Optional[torch.FloatTensor] = None,
83
+ ) -> torch.Tensor:
84
+ residual = hidden_states
85
+
86
+ if encoder_hidden_states is None:
87
+ encoder_hidden_states = hidden_states
88
+ ip_hidden_states = None
89
+ else:
90
+ if isinstance(encoder_hidden_states, tuple):
91
+ encoder_hidden_states, ip_hidden_states = encoder_hidden_states
92
+ else:
93
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
94
+ encoder_hidden_states, ip_hidden_states = (
95
+ encoder_hidden_states[:, :end_pos, :],
96
+ [encoder_hidden_states[:, end_pos:, :]],
97
+ )
98
+
99
+ if attn.spatial_norm is not None:
100
+ hidden_states = attn.spatial_norm(hidden_states, temb)
101
+
102
+ input_ndim = hidden_states.ndim
103
+
104
+ if input_ndim == 4:
105
+ batch_size, channel, height, width = hidden_states.shape
106
+ hidden_states = hidden_states.view(
107
+ batch_size, channel, height * width
108
+ ).transpose(1, 2)
109
+
110
+ batch_size, sequence_length, _ = (
111
+ hidden_states.shape
112
+ if encoder_hidden_states is None
113
+ else encoder_hidden_states.shape
114
+ )
115
+ attention_mask = attn.prepare_attention_mask(
116
+ attention_mask, sequence_length, batch_size
117
+ )
118
+
119
+ if attn.group_norm is not None:
120
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
121
+ 1, 2
122
+ )
123
+
124
+ query = attn.to_q(hidden_states)
125
+ query = attn.head_to_batch_dim(query)
126
+
127
+ key = attn.to_k(encoder_hidden_states)
128
+ value = attn.to_v(encoder_hidden_states)
129
+
130
+ if not self.activated:
131
+ key = attn.head_to_batch_dim(key)
132
+ value = attn.head_to_batch_dim(value)
133
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
134
+ hidden_states = torch.bmm(attention_probs, value)
135
+ hidden_states = attn.batch_to_head_dim(hidden_states)
136
+ if ip_hidden_states is not None:
137
+ key = self.ip_attn.to_k_ip[0](ip_hidden_states[0][6:9])
138
+ value = self.ip_attn.to_v_ip[0](ip_hidden_states[0][6:9])
139
+ key = attn.head_to_batch_dim(key)
140
+ value = attn.head_to_batch_dim(value)
141
+ ip_attention_probs = attn.get_attention_scores(
142
+ query, key, attention_mask
143
+ )
144
+ ip_hidden_states = torch.bmm(ip_attention_probs, value)
145
+ ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
146
+ hidden_states = (
147
+ hidden_states
148
+ + self.coef.reshape(-1, 1, 1).to(key.device, key.dtype)
149
+ * ip_hidden_states
150
+ )
151
+ else:
152
+ key_begin = key[0:1].expand(3, *key.shape[1:])
153
+ key_end = key[-1:].expand(3, *key.shape[1:])
154
+ value_begin = value[0:1].expand(3, *value.shape[1:])
155
+ value_end = value[-1:].expand(3, *value.shape[1:])
156
+ key_begin = attn.head_to_batch_dim(key_begin)
157
+ value_begin = attn.head_to_batch_dim(value_begin)
158
+ key_end = attn.head_to_batch_dim(key_end)
159
+ value_end = attn.head_to_batch_dim(value_end)
160
+
161
+ if self.is_fused:
162
+ key = attn.head_to_batch_dim(key)
163
+ value = attn.head_to_batch_dim(value)
164
+ key_end = torch.cat([key, key_end], dim=-2)
165
+ value_end = torch.cat([value, value_end], dim=-2)
166
+ key_begin = torch.cat([key, key_begin], dim=-2)
167
+ value_begin = torch.cat([value, value_begin], dim=-2)
168
+
169
+ attention_probs_end = attn.get_attention_scores(
170
+ query, key_end, attention_mask
171
+ )
172
+ hidden_states_end = torch.bmm(attention_probs_end, value_end)
173
+ hidden_states_end = attn.batch_to_head_dim(hidden_states_end)
174
+ attention_probs_begin = attn.get_attention_scores(
175
+ query, key_begin, attention_mask
176
+ )
177
+ hidden_states_begin = torch.bmm(attention_probs_begin, value_begin)
178
+ hidden_states_begin = attn.batch_to_head_dim(hidden_states_begin)
179
+
180
+ # Apply outer interpolation on attention
181
+ coef = self.coef.reshape(-1, 1, 1)
182
+ coef = coef.to(key.device, key.dtype)
183
+ hidden_states = (1 - coef) * hidden_states_begin + coef * hidden_states_end
184
+
185
+ # for ip-adapter
186
+ if ip_hidden_states is not None:
187
+ key = self.ip_attn.to_k_ip[0](ip_hidden_states[0][6:9])
188
+ value = self.ip_attn.to_v_ip[0](ip_hidden_states[0][6:9])
189
+ key = attn.head_to_batch_dim(key)
190
+ value = attn.head_to_batch_dim(value)
191
+ ip_attention_probs = attn.get_attention_scores(
192
+ query, key, attention_mask
193
+ )
194
+ ip_hidden_states = torch.bmm(ip_attention_probs, value)
195
+ ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
196
+ hidden_states = hidden_states + coef * ip_hidden_states
197
+
198
+ hidden_states = attn.to_out[0](hidden_states)
199
+ hidden_states = attn.to_out[1](hidden_states)
200
+
201
+ if input_ndim == 4:
202
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
203
+ batch_size, channel, height, width
204
+ )
205
+
206
+ if attn.residual_connection:
207
+ hidden_states = hidden_states + residual
208
+
209
+ hidden_states = hidden_states / attn.rescale_output_factor
210
+
211
+ return hidden_states
212
+
213
+
214
+ class OuterInterpolatedIPAttnProcessor(InterpolatedAttnProcessor):
215
+ r"""
216
+ Personalized processor for performing outer attention interpolation.
217
+ Combined with IP-Adapter attention processor.
218
+ """
219
+
220
+ def __init__(
221
+ self,
222
+ t: Optional[float] = None,
223
+ size: int = 7,
224
+ is_fused: bool = False,
225
+ alpha: float = 1,
226
+ beta: float = 1,
227
+ ip_attn: Optional[nn.Module] = None,
228
+ ):
229
+ """
230
+ t: float, interpolation point between 0 and 1, if specified, size is set to 3
231
+ """
232
+ super().__init__(t=t, size=size, is_fused=is_fused, alpha=alpha, beta=beta)
233
+
234
+ self.num_tokens = (
235
+ ip_attn.num_tokens if hasattr(ip_attn, "num_tokens") else (16,)
236
+ )
237
+ self.scale = ip_attn.scale if hasattr(ip_attn, "scale") else None
238
+ self.ip_attn = ip_attn
239
+
240
+ def __call__(
241
+ self,
242
+ attn,
243
+ hidden_states: torch.FloatTensor,
244
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
245
+ attention_mask: Optional[torch.FloatTensor] = None,
246
+ temb: Optional[torch.FloatTensor] = None,
247
+ ) -> torch.Tensor:
248
+ if not self.activated:
249
+ return self.ip_attn(
250
+ attn, hidden_states, encoder_hidden_states, attention_mask, temb
251
+ )
252
+
253
+ residual = hidden_states
254
+
255
+ if encoder_hidden_states is None:
256
+ encoder_hidden_states = hidden_states
257
+ ip_hidden_states = None
258
+ else:
259
+ if isinstance(encoder_hidden_states, tuple):
260
+ encoder_hidden_states, ip_hidden_states = encoder_hidden_states
261
+ else:
262
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
263
+ encoder_hidden_states, ip_hidden_states = (
264
+ encoder_hidden_states[:, :end_pos, :],
265
+ [encoder_hidden_states[:, end_pos:, :]],
266
+ )
267
+
268
+ if attn.spatial_norm is not None:
269
+ hidden_states = attn.spatial_norm(hidden_states, temb)
270
+
271
+ input_ndim = hidden_states.ndim
272
+
273
+ if input_ndim == 4:
274
+ batch_size, channel, height, width = hidden_states.shape
275
+ hidden_states = hidden_states.view(
276
+ batch_size, channel, height * width
277
+ ).transpose(1, 2)
278
+
279
+ batch_size, sequence_length, _ = (
280
+ hidden_states.shape
281
+ if encoder_hidden_states is None
282
+ else encoder_hidden_states.shape
283
+ )
284
+ attention_mask = attn.prepare_attention_mask(
285
+ attention_mask, sequence_length, batch_size
286
+ )
287
+
288
+ if attn.group_norm is not None:
289
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
290
+ 1, 2
291
+ )
292
+
293
+ query = attn.to_q(hidden_states)
294
+ query = attn.head_to_batch_dim(query)
295
+
296
+ key = attn.to_k(encoder_hidden_states)
297
+ value = attn.to_v(encoder_hidden_states)
298
+
299
+ # Specify the first and last key and value
300
+ key_begin = key[0:1].expand(3, *key.shape[1:])
301
+ key_end = key[-1:].expand(3, *key.shape[1:])
302
+ value_begin = value[0:1].expand(3, *value.shape[1:])
303
+ value_end = value[-1:].expand(3, *value.shape[1:])
304
+ key_begin = attn.head_to_batch_dim(key_begin)
305
+ value_begin = attn.head_to_batch_dim(value_begin)
306
+ key_end = attn.head_to_batch_dim(key_end)
307
+ value_end = attn.head_to_batch_dim(value_end)
308
+
309
+ # Fused with self-attention
310
+ if self.is_fused:
311
+ key = attn.head_to_batch_dim(key)
312
+ value = attn.head_to_batch_dim(value)
313
+ key_end = torch.cat([key, key_end], dim=-2)
314
+ value_end = torch.cat([value, value_end], dim=-2)
315
+ key_begin = torch.cat([key, key_begin], dim=-2)
316
+ value_begin = torch.cat([value, value_begin], dim=-2)
317
+
318
+ attention_probs_end = attn.get_attention_scores(query, key_end, attention_mask)
319
+ hidden_states_end = torch.bmm(attention_probs_end, value_end)
320
+ hidden_states_end = attn.batch_to_head_dim(hidden_states_end)
321
+
322
+ attention_probs_begin = attn.get_attention_scores(
323
+ query, key_begin, attention_mask
324
+ )
325
+ hidden_states_begin = torch.bmm(attention_probs_begin, value_begin)
326
+ hidden_states_begin = attn.batch_to_head_dim(hidden_states_begin)
327
+
328
+ # for ip-adapter
329
+ if ip_hidden_states is not None:
330
+ key = self.ip_attn.to_k_ip[0](ip_hidden_states[0][::3])
331
+ value = self.ip_attn.to_v_ip[0](ip_hidden_states[0][::3])
332
+
333
+ # Specify the first and last key and value
334
+ key_begin = key[0:1].expand(3, *key.shape[1:])
335
+ key_end = key[-1:].expand(3, *key.shape[1:])
336
+ value_begin = value[0:1].expand(3, *value.shape[1:])
337
+ value_end = value[-1:].expand(3, *value.shape[1:])
338
+ key_begin = attn.head_to_batch_dim(key_begin)
339
+ value_begin = attn.head_to_batch_dim(value_begin)
340
+ key_end = attn.head_to_batch_dim(key_end)
341
+ value_end = attn.head_to_batch_dim(value_end)
342
+
343
+ # Fused with self-attention
344
+ if self.is_fused:
345
+ key = attn.head_to_batch_dim(key)
346
+ value = attn.head_to_batch_dim(value)
347
+ key_end = torch.cat([key, key_end], dim=-2)
348
+ value_end = torch.cat([value, value_end], dim=-2)
349
+ key_begin = torch.cat([key, key_begin], dim=-2)
350
+ value_begin = torch.cat([value, value_begin], dim=-2)
351
+
352
+ ip_attention_probs_end = attn.get_attention_scores(
353
+ query, key_end, attention_mask
354
+ )
355
+ ip_hidden_states_end = torch.bmm(ip_attention_probs_end, value_end)
356
+ ip_hidden_states_end = attn.batch_to_head_dim(ip_hidden_states_end)
357
+
358
+ ip_attention_probs_begin = attn.get_attention_scores(
359
+ query, key_begin, attention_mask
360
+ )
361
+ ip_hidden_states_begin = torch.bmm(ip_attention_probs_begin, value_begin)
362
+ ip_hidden_states_begin = attn.batch_to_head_dim(ip_hidden_states_begin)
363
+
364
+ hidden_states_begin = (
365
+ hidden_states_begin + self.scale[0] * ip_hidden_states_begin
366
+ )
367
+ hidden_states_end = hidden_states_end + self.scale[0] * ip_hidden_states_end
368
+
369
+ # Apply outer interpolation on attention
370
+ coef = self.coef.reshape(-1, 1, 1)
371
+ coef = coef.to(key.device, key.dtype)
372
+ hidden_states = (1 - coef) * hidden_states_begin + coef * hidden_states_end
373
+
374
+ hidden_states = attn.to_out[0](hidden_states)
375
+ hidden_states = attn.to_out[1](hidden_states)
376
+
377
+ if input_ndim == 4:
378
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
379
+ batch_size, channel, height, width
380
+ )
381
+
382
+ if attn.residual_connection:
383
+ hidden_states = hidden_states + residual
384
+
385
+ hidden_states = hidden_states / attn.rescale_output_factor
386
+
387
+ return hidden_states
388
+
389
+
390
+ class InnerInterpolatedIPAttnProcessor(InterpolatedAttnProcessor):
391
+ r"""
392
+ Personalized processor for performing inner attention interpolation.
393
+
394
+ With IP-Adapter.
395
+ """
396
+
397
+ def __init__(
398
+ self,
399
+ t: Optional[float] = None,
400
+ size: int = 7,
401
+ is_fused: bool = False,
402
+ alpha: float = 1,
403
+ beta: float = 1,
404
+ ip_attn: Optional[nn.Module] = None,
405
+ ):
406
+ """
407
+ t: float, interpolation point between 0 and 1, if specified, size is set to 3
408
+ """
409
+ super().__init__(t=t, size=size, is_fused=is_fused, alpha=alpha, beta=beta)
410
+
411
+ self.num_tokens = (
412
+ ip_attn.num_tokens if hasattr(ip_attn, "num_tokens") else (16,)
413
+ )
414
+ self.scale = ip_attn.scale if hasattr(ip_attn, "scale") else None
415
+ self.ip_attn = ip_attn
416
+
417
+ def __call__(
418
+ self,
419
+ attn,
420
+ hidden_states: torch.FloatTensor,
421
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
422
+ attention_mask: Optional[torch.FloatTensor] = None,
423
+ temb: Optional[torch.FloatTensor] = None,
424
+ ) -> torch.Tensor:
425
+ if not self.activated:
426
+ return self.ip_attn(
427
+ attn, hidden_states, encoder_hidden_states, attention_mask, temb
428
+ )
429
+
430
+ residual = hidden_states
431
+
432
+ if encoder_hidden_states is None:
433
+ encoder_hidden_states = hidden_states
434
+ ip_hidden_states = None
435
+ else:
436
+ if isinstance(encoder_hidden_states, tuple):
437
+ encoder_hidden_states, ip_hidden_states = encoder_hidden_states
438
+ else:
439
+ end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
440
+ encoder_hidden_states, ip_hidden_states = (
441
+ encoder_hidden_states[:, :end_pos, :],
442
+ [encoder_hidden_states[:, end_pos:, :]],
443
+ )
444
+
445
+ if attn.spatial_norm is not None:
446
+ hidden_states = attn.spatial_norm(hidden_states, temb)
447
+
448
+ input_ndim = hidden_states.ndim
449
+
450
+ if input_ndim == 4:
451
+ batch_size, channel, height, width = hidden_states.shape
452
+ hidden_states = hidden_states.view(
453
+ batch_size, channel, height * width
454
+ ).transpose(1, 2)
455
+
456
+ batch_size, sequence_length, _ = (
457
+ hidden_states.shape
458
+ if encoder_hidden_states is None
459
+ else encoder_hidden_states.shape
460
+ )
461
+ attention_mask = attn.prepare_attention_mask(
462
+ attention_mask, sequence_length, batch_size
463
+ )
464
+
465
+ if attn.group_norm is not None:
466
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
467
+ 1, 2
468
+ )
469
+
470
+ query = attn.to_q(hidden_states)
471
+ query = attn.head_to_batch_dim(query)
472
+
473
+ key = attn.to_k(encoder_hidden_states)
474
+ value = attn.to_v(encoder_hidden_states)
475
+
476
+ # Specify the first and last key and value
477
+ key_begin = key[0:1].expand(3, *key.shape[1:])
478
+ key_end = key[-1:].expand(3, *key.shape[1:])
479
+ value_begin = value[0:1].expand(3, *value.shape[1:])
480
+ value_end = value[-1:].expand(3, *value.shape[1:])
481
+
482
+ coef = self.coef.reshape(-1, 1, 1)
483
+ coef = coef.to(key.device, key.dtype)
484
+ key_cross = (1 - coef) * key_begin + coef * key_end
485
+ value_cross = (1 - coef) * value_begin + coef * value_end
486
+ key_cross = attn.head_to_batch_dim(key_cross)
487
+ value_cross = attn.head_to_batch_dim(value_cross)
488
+
489
+ # Fused with self-attention
490
+ if self.is_fused:
491
+ key = attn.head_to_batch_dim(key)
492
+ value = attn.head_to_batch_dim(value)
493
+ key_cross = torch.cat([key, key_cross], dim=-2)
494
+ value_cross = torch.cat([value, value_cross], dim=-2)
495
+
496
+ attention_probs = attn.get_attention_scores(query, key_cross, attention_mask)
497
+ hidden_states = torch.bmm(attention_probs, value_cross)
498
+ hidden_states = attn.batch_to_head_dim(hidden_states)
499
+
500
+ # for ip-adapter
501
+ if ip_hidden_states is not None:
502
+ key = self.ip_attn.to_k_ip[0](ip_hidden_states[0][::3])
503
+ value = self.ip_attn.to_v_ip[0](ip_hidden_states[0][::3])
504
+ key = key.squeeze()
505
+ value = value.squeeze()
506
+
507
+ # Specify the first and last key and value
508
+ key_begin = key[0:1].expand(3, *key.shape[1:])
509
+ key_end = key[-1:].expand(3, *key.shape[1:])
510
+ value_begin = value[0:1].expand(3, *value.shape[1:])
511
+ value_end = value[-1:].expand(3, *value.shape[1:])
512
+ key_cross = (1 - coef) * key_begin + coef * key_end
513
+ value_cross = (1 - coef) * value_begin + coef * value_end
514
+
515
+ key_cross = attn.head_to_batch_dim(key_cross)
516
+ value_cross = attn.head_to_batch_dim(value_cross)
517
+
518
+ # Fused with self-attention
519
+ if self.is_fused:
520
+ key = attn.head_to_batch_dim(key)
521
+ value = attn.head_to_batch_dim(value)
522
+ key_cross = torch.cat([key, key_cross], dim=-2)
523
+ value_cross = torch.cat([value, value_cross], dim=-2)
524
+
525
+ attention_probs = attn.get_attention_scores(query, key, attention_mask)
526
+
527
+ ip_hidden_states = torch.bmm(attention_probs, value)
528
+ ip_hidden_states = attn.batch_to_head_dim(ip_hidden_states)
529
+
530
+ hidden_states = hidden_states + self.scale[0] * ip_hidden_states
531
+
532
+ hidden_states = attn.to_out[0](hidden_states)
533
+ hidden_states = attn.to_out[1](hidden_states)
534
+
535
+ if input_ndim == 4:
536
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
537
+ batch_size, channel, height, width
538
+ )
539
+
540
+ if attn.residual_connection:
541
+ hidden_states = hidden_states + residual
542
+
543
+ hidden_states = hidden_states / attn.rescale_output_factor
544
+
545
+ return hidden_states
546
+
547
+
548
+ class OuterInterpolatedAttnProcessor(InterpolatedAttnProcessor):
549
+ r"""
550
+ Personalized processor for performing outer attention interpolation.
551
+
552
+ The attention output of the interpolated image is obtained by interpolating the attention outputs of the two endpoints:
+ (1 - t) * Attn(Q_t, K_1, V_1) + t * Attn(Q_t, K_m, V_m);
+ If fused with self-attention:
+ (1 - t) * Attn(Q_t, [K_1, K_t], [V_1, V_t]) + t * Attn(Q_t, [K_m, K_t], [V_m, V_t]);
556
+ """
557
+
558
+ def __init__(
559
+ self,
560
+ t: Optional[float] = None,
561
+ size: int = 7,
562
+ is_fused: bool = False,
563
+ alpha: float = 1,
564
+ beta: float = 1,
565
+ original_attn: Optional[nn.Module] = None,
566
+ ):
567
+ """
568
+ t: float, interpolation point between 0 and 1, if specified, size is set to 3
569
+ """
570
+ super().__init__(t=t, size=size, is_fused=is_fused, alpha=alpha, beta=beta)
571
+ self.original_attn = original_attn
572
+
573
+ def __call__(
574
+ self,
575
+ attn,
576
+ hidden_states: torch.FloatTensor,
577
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
578
+ attention_mask: Optional[torch.FloatTensor] = None,
579
+ temb: Optional[torch.FloatTensor] = None,
580
+ ) -> torch.Tensor:
581
+ if not self.activated:
582
+ return self.original_attn(
583
+ attn, hidden_states, encoder_hidden_states, attention_mask, temb
584
+ )
585
+
586
+ residual = hidden_states
587
+
588
+ if attn.spatial_norm is not None:
589
+ hidden_states = attn.spatial_norm(hidden_states, temb)
590
+
591
+ input_ndim = hidden_states.ndim
592
+
593
+ if input_ndim == 4:
594
+ batch_size, channel, height, width = hidden_states.shape
595
+ hidden_states = hidden_states.view(
596
+ batch_size, channel, height * width
597
+ ).transpose(1, 2)
598
+
599
+ batch_size, sequence_length, _ = (
600
+ hidden_states.shape
601
+ if encoder_hidden_states is None
602
+ else encoder_hidden_states.shape
603
+ )
604
+ attention_mask = attn.prepare_attention_mask(
605
+ attention_mask, sequence_length, batch_size
606
+ )
607
+
608
+ if attn.group_norm is not None:
609
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
610
+ 1, 2
611
+ )
612
+
613
+ query = attn.to_q(hidden_states)
614
+ query = attn.head_to_batch_dim(query)
615
+
616
+ if encoder_hidden_states is None:
617
+ encoder_hidden_states = hidden_states
618
+ elif attn.norm_cross:
619
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
620
+ encoder_hidden_states
621
+ )
622
+
623
+ key = attn.to_k(encoder_hidden_states)
624
+ value = attn.to_v(encoder_hidden_states)
625
+
626
+ # Specify the first and last key and value
627
+ key_begin = key[0:1]
628
+ key_end = key[-1:]
629
+ value_begin = value[0:1]
630
+ value_end = value[-1:]
631
+
632
+ key_begin = torch.cat([key_begin] * (self.size))
633
+ key_end = torch.cat([key_end] * (self.size))
634
+ value_begin = torch.cat([value_begin] * (self.size))
635
+ value_end = torch.cat([value_end] * (self.size))
636
+
637
+ key_begin = attn.head_to_batch_dim(key_begin)
638
+ value_begin = attn.head_to_batch_dim(value_begin)
639
+ key_end = attn.head_to_batch_dim(key_end)
640
+ value_end = attn.head_to_batch_dim(value_end)
641
+
642
+ # Fused with self-attention
643
+ if self.is_fused:
644
+ key = attn.head_to_batch_dim(key)
645
+ value = attn.head_to_batch_dim(value)
646
+ key_end = torch.cat([key, key_end], dim=-2)
647
+ value_end = torch.cat([value, value_end], dim=-2)
648
+ key_begin = torch.cat([key, key_begin], dim=-2)
649
+ value_begin = torch.cat([value, value_begin], dim=-2)
650
+
651
+ attention_probs_end = attn.get_attention_scores(query, key_end, attention_mask)
652
+ hidden_states_end = torch.bmm(attention_probs_end, value_end)
653
+ hidden_states_end = attn.batch_to_head_dim(hidden_states_end)
654
+
655
+ attention_probs_begin = attn.get_attention_scores(
656
+ query, key_begin, attention_mask
657
+ )
658
+ hidden_states_begin = torch.bmm(attention_probs_begin, value_begin)
659
+ hidden_states_begin = attn.batch_to_head_dim(hidden_states_begin)
660
+
661
+ # Apply outer interpolation on attention
662
+ coef = self.coef.reshape(-1, 1, 1)
663
+ coef = coef.to(key.device, key.dtype)
664
+ hidden_states = (1 - coef) * hidden_states_begin + coef * hidden_states_end
665
+
666
+ hidden_states = attn.to_out[0](hidden_states)
667
+ hidden_states = attn.to_out[1](hidden_states)
668
+
669
+ if input_ndim == 4:
670
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
671
+ batch_size, channel, height, width
672
+ )
673
+
674
+ if attn.residual_connection:
675
+ hidden_states = hidden_states + residual
676
+
677
+ hidden_states = hidden_states / attn.rescale_output_factor
678
+
679
+ return hidden_states
680
+
681
+
682
+ class InnerInterpolatedAttnProcessor(InterpolatedAttnProcessor):
683
+ r"""
684
+ Personalized processor for performing inner attention interpolation.
685
+
686
+ The attention output of the interpolated image is obtained by interpolating the keys and values of the two endpoints before attention:
+ Attn(Q_t, (1 - t) * K_1 + t * K_m, (1 - t) * V_1 + t * V_m);
+ If fused with self-attention:
+ Attn(Q_t, [(1 - t) * K_1 + t * K_m, K_t], [(1 - t) * V_1 + t * V_m, V_t]);
690
+ """
691
+
692
+ def __init__(
693
+ self,
694
+ t: Optional[float] = None,
695
+ size: int = 7,
696
+ is_fused: bool = False,
697
+ alpha: float = 1,
698
+ beta: float = 1,
699
+ original_attn: Optional[nn.Module] = None,
700
+ ):
701
+ """
702
+ t: float, interpolation point between 0 and 1, if specified, size is set to 3
703
+ """
704
+ super().__init__(t=t, size=size, is_fused=is_fused, alpha=alpha, beta=beta)
705
+ self.original_attn = original_attn
706
+
707
+ def __call__(
708
+ self,
709
+ attn,
710
+ hidden_states: torch.FloatTensor,
711
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
712
+ attention_mask: Optional[torch.FloatTensor] = None,
713
+ temb: Optional[torch.FloatTensor] = None,
714
+ ) -> torch.Tensor:
715
+ if not self.activated:
716
+ return self.original_attn(
717
+ attn, hidden_states, encoder_hidden_states, attention_mask, temb
718
+ )
719
+
720
+ residual = hidden_states
721
+
722
+ if attn.spatial_norm is not None:
723
+ hidden_states = attn.spatial_norm(hidden_states, temb)
724
+
725
+ input_ndim = hidden_states.ndim
726
+
727
+ if input_ndim == 4:
728
+ batch_size, channel, height, width = hidden_states.shape
729
+ hidden_states = hidden_states.view(
730
+ batch_size, channel, height * width
731
+ ).transpose(1, 2)
732
+
733
+ batch_size, sequence_length, _ = (
734
+ hidden_states.shape
735
+ if encoder_hidden_states is None
736
+ else encoder_hidden_states.shape
737
+ )
738
+ attention_mask = attn.prepare_attention_mask(
739
+ attention_mask, sequence_length, batch_size
740
+ )
741
+
742
+ if attn.group_norm is not None:
743
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
744
+ 1, 2
745
+ )
746
+
747
+ query = attn.to_q(hidden_states)
748
+ query = attn.head_to_batch_dim(query)
749
+
750
+ if encoder_hidden_states is None:
751
+ encoder_hidden_states = hidden_states
752
+ elif attn.norm_cross:
753
+ encoder_hidden_states = attn.norm_encoder_hidden_states(
754
+ encoder_hidden_states
755
+ )
756
+
757
+ key = attn.to_k(encoder_hidden_states)
758
+ value = attn.to_v(encoder_hidden_states)
759
+
760
+ # Specify the first and last key and value
761
+ key_start = key[0:1]
762
+ key_end = key[-1:]
763
+ value_start = value[0:1]
764
+ value_end = value[-1:]
765
+
766
+ key_start = torch.cat([key_start] * (self.size))
767
+ key_end = torch.cat([key_end] * (self.size))
768
+ value_start = torch.cat([value_start] * (self.size))
769
+ value_end = torch.cat([value_end] * (self.size))
770
+
771
+ # Apply inner interpolation on attention
772
+ coef = self.coef.reshape(-1, 1, 1)
773
+ coef = coef.to(key.device, key.dtype)
774
+ key_cross = (1 - coef) * key_start + coef * key_end
775
+ value_cross = (1 - coef) * value_start + coef * value_end
776
+
777
+ key_cross = attn.head_to_batch_dim(key_cross)
778
+ value_cross = attn.head_to_batch_dim(value_cross)
779
+
780
+ # Fused with self-attention
781
+ if self.is_fused:
782
+ key = attn.head_to_batch_dim(key)
783
+ value = attn.head_to_batch_dim(value)
784
+ key_cross = torch.cat([key, key_cross], dim=-2)
785
+ value_cross = torch.cat([value, value_cross], dim=-2)
786
+
787
+ attention_probs = attn.get_attention_scores(query, key_cross, attention_mask)
788
+
789
+ hidden_states = torch.bmm(attention_probs, value_cross)
790
+ hidden_states = attn.batch_to_head_dim(hidden_states)
791
+ hidden_states = attn.to_out[0](hidden_states)
792
+ hidden_states = attn.to_out[1](hidden_states)
793
+
794
+ if input_ndim == 4:
795
+ hidden_states = hidden_states.transpose(-1, -2).reshape(
796
+ batch_size, channel, height, width
797
+ )
798
+
799
+ if attn.residual_connection:
800
+ hidden_states = hidden_states + residual
801
+
802
+ hidden_states = hidden_states / attn.rescale_output_factor
803
+
804
+ return hidden_states
805
+
806
+
807
+ def linear_interpolation(
808
+ l1: FloatTensor, l2: FloatTensor, ts: Optional[FloatTensor] = None, size: int = 5
809
+ ) -> FloatTensor:
810
+ """
811
+ Linear interpolation
812
+
813
+ Args:
814
+ l1: Starting vector: (1, *)
815
+ l2: Final vector: (1, *)
816
+ ts: FloatTensor, interpolation points between 0 and 1
817
+ size: int, number of interpolation points including l1 and l2
818
+
819
+ Returns:
820
+ Interpolated vectors: (size, *)
821
+ """
822
+ assert l1.shape == l2.shape, "shapes of l1 and l2 must match"
823
+
824
+ res = []
825
+ if ts is not None:
826
+ for t in ts:
827
+ li = torch.lerp(l1, l2, t)
828
+ res.append(li)
829
+ else:
830
+ for i in range(size):
831
+ t = i / (size - 1)
832
+ li = torch.lerp(l1, l2, t)
833
+ res.append(li)
834
+ res = torch.cat(res, dim=0)
835
+ return res
836
+
837
+
838
+ def spherical_interpolation(l1: FloatTensor, l2: FloatTensor, size=5) -> FloatTensor:
839
+ """
840
+ Spherical interpolation
841
+
842
+ Args:
843
+ l1: Starting vector: (1, *)
844
+ l2: Final vector: (1, *)
845
+ size: int, number of interpolation points including l1 and l2
846
+
847
+ Returns:
848
+ Interpolated vectors: (size, *)
849
+ """
850
+ assert l1.shape == l2.shape, "shapes of l1 and l2 must match"
851
+
852
+ res = []
853
+ for i in range(size):
854
+ t = i / (size - 1)
855
+ li = slerp(l1, l2, t)
856
+ res.append(li)
857
+ res = torch.cat(res, dim=0)
858
+ return res
859
+
860
+
861
+ def slerp(v0: FloatTensor, v1: FloatTensor, t, threshold=0.9995):
862
+ """
863
+ Spherical linear interpolation
864
+ Args:
865
+ v0: Starting vector
866
+ v1: Final vector
867
+ t: Float value between 0.0 and 1.0
868
+ threshold: Threshold for considering the two vectors as
869
+ colinear. Not recommended to alter this.
870
+ Returns:
871
+ Interpolation vector between v0 and v1
872
+ """
873
+ assert v0.shape == v1.shape, "shapes of v0 and v1 must match"
874
+
875
+ # Normalize the vectors to get the directions and angles
876
+ v0_norm: FloatTensor = torch.norm(v0, dim=-1)
877
+ v1_norm: FloatTensor = torch.norm(v1, dim=-1)
878
+
879
+ v0_normed: FloatTensor = v0 / v0_norm.unsqueeze(-1)
880
+ v1_normed: FloatTensor = v1 / v1_norm.unsqueeze(-1)
881
+
882
+ # Dot product with the normalized vectors
883
+ dot: FloatTensor = (v0_normed * v1_normed).sum(-1)
884
+ dot_mag: FloatTensor = dot.abs()
885
+
886
+ # if dp is NaN, it's because the v0 or v1 row was filled with 0s
887
+ # If absolute value of dot product is almost 1, vectors are ~colinear, so use torch.lerp
888
+ gotta_lerp: LongTensor = dot_mag.isnan() | (dot_mag > threshold)
889
+ can_slerp: LongTensor = ~gotta_lerp
890
+
891
+ t_batch_dim_count: int = max(0, t.dim() - v0.dim()) if isinstance(t, Tensor) else 0
892
+ t_batch_dims: Size = (
893
+ t.shape[:t_batch_dim_count] if isinstance(t, Tensor) else Size([])
894
+ )
895
+ out: FloatTensor = torch.zeros_like(v0.expand(*t_batch_dims, *[-1] * v0.dim()))
896
+
897
+ # if no elements are lerpable, our vectors become 0-dimensional, preventing broadcasting
898
+ if gotta_lerp.any():
899
+ lerped: FloatTensor = torch.lerp(v0, v1, t)
900
+
901
+ out: FloatTensor = lerped.where(gotta_lerp.unsqueeze(-1), out)
902
+
903
+ # if no elements are slerpable, our vectors become 0-dimensional, preventing broadcasting
904
+ if can_slerp.any():
905
+ # Calculate initial angle between v0 and v1
906
+ theta_0: FloatTensor = dot.arccos().unsqueeze(-1)
907
+ sin_theta_0: FloatTensor = theta_0.sin()
908
+ # Angle at timestep t
909
+ theta_t: FloatTensor = theta_0 * t
910
+ sin_theta_t: FloatTensor = theta_t.sin()
911
+ # Finish the slerp algorithm
912
+ s0: FloatTensor = (theta_0 - theta_t).sin() / sin_theta_0
913
+ s1: FloatTensor = sin_theta_t / sin_theta_0
914
+ slerped: FloatTensor = s0 * v0 + s1 * v1
915
+
916
+ out: FloatTensor = slerped.where(can_slerp.unsqueeze(-1), out)
917
+
918
+ return out
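Since `linear_interpolation`, `spherical_interpolation` and `slerp` above are self-contained tensor utilities, they can be sanity-checked in isolation. The sketch below is illustrative only: the shapes are arbitrary and it simply exercises the `(size, *)` output contract stated in the docstrings.

```python
# Toy check of the interpolation helpers defined above; shapes are arbitrary.
import torch

from interpolation import linear_interpolation, spherical_interpolation

l1 = torch.randn(1, 8)  # starting vector, shape (1, *)
l2 = torch.randn(1, 8)  # final vector, same shape

lerp_path = linear_interpolation(l1, l2, size=5)      # (5, 8)
slerp_path = spherical_interpolation(l1, l2, size=5)  # (5, 8)

# Both paths keep the endpoints (up to float tolerance).
assert torch.allclose(lerp_path[0], l1[0]) and torch.allclose(lerp_path[-1], l2[0])
assert torch.allclose(slerp_path[0], l1[0], atol=1e-5)
print(lerp_path.shape, slerp_path.shape)
```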
pipeline_interpolated_sd.py ADDED
@@ -0,0 +1,1963 @@
1
+ # Copyright 2024 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import torch
19
+ from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
20
+ from diffusers.configuration_utils import FrozenDict
21
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
22
+ from diffusers.loaders import (
23
+ FromSingleFileMixin,
24
+ IPAdapterMixin,
25
+ TextualInversionLoaderMixin,
26
+ )
27
+ from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
28
+ from diffusers.models.attention_processor import (
29
+ FusedAttnProcessor2_0,
30
+ )
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ from diffusers.pipelines.stable_diffusion.pipeline_output import (
33
+ StableDiffusionPipelineOutput,
34
+ )
35
+ from diffusers.pipelines.stable_diffusion.safety_checker import (
36
+ StableDiffusionSafetyChecker,
37
+ )
38
+ from diffusers.schedulers import KarrasDiffusionSchedulers
39
+ from diffusers.utils import (
40
+ deprecate,
41
+ is_torch_xla_available,
42
+ logging,
43
+ replace_example_docstring,
44
+ )
45
+ from diffusers.utils.torch_utils import randn_tensor
46
+ from packaging import version
47
+
48
+ from interpolation import (
49
+ InnerInterpolatedAttnProcessor,
50
+ InnerInterpolatedIPAttnProcessor,
51
+ OuterInterpolatedAttnProcessor,
52
+ OuterInterpolatedIPAttnProcessor,
53
+ ScaleControlIPAttnProcessor,
54
+ slerp,
55
+ )
56
+ from transformers import (
57
+ CLIPImageProcessor,
58
+ CLIPTextModel,
59
+ CLIPTokenizer,
60
+ CLIPVisionModelWithProjection,
61
+ )
62
+
63
+
64
+ if is_torch_xla_available():
65
+ import torch_xla.core.xla_model as xm # type: ignore
66
+
67
+ XLA_AVAILABLE = True
68
+ else:
69
+ XLA_AVAILABLE = False
70
+
71
+
72
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
73
+
74
+ EXAMPLE_DOC_STRING = """
75
+ Examples:
76
+ ```py
77
+ >>> import torch
78
+ >>> from diffusers import StableDiffusionPipeline
79
+
80
+ >>> pipe = StableDiffusionPipeline.from_pretrained(
81
+ ... "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
82
+ ... )
83
+ >>> pipe = pipe.to("cuda")
84
+
85
+ >>> prompt = "a photo of an astronaut riding a horse on mars"
86
+ >>> image = pipe(prompt).images[0]
87
+ ```
88
+ """
89
+
90
+
91
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.rescale_noise_cfg
92
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
93
+ """
94
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
95
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
96
+ """
97
+ std_text = noise_pred_text.std(
98
+ dim=list(range(1, noise_pred_text.ndim)), keepdim=True
99
+ )
100
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
101
+ # rescale the results from guidance (fixes overexposure)
102
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
103
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
104
+ noise_cfg = (
105
+ guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
106
+ )
107
+ return noise_cfg
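A quick sanity check of the rescaling above on dummy tensors, assuming the function is in scope; shapes and values are illustrative only.

```python
import torch

noise_pred_uncond = torch.randn(2, 4, 8, 8)   # unconditional branch
noise_pred_text = torch.randn(2, 4, 8, 8)     # text-conditioned branch
noise_cfg = noise_pred_uncond + 7.5 * (noise_pred_text - noise_pred_uncond)  # standard CFG combine

out = rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.7)
# The per-sample std of `out` now lies between std(noise_cfg) and std(noise_pred_text),
# which is what tames the overexposure described in the docstring.
assert out.shape == noise_cfg.shape
```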
108
+
109
+
110
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
111
+ def retrieve_timesteps(
112
+ scheduler,
113
+ num_inference_steps: Optional[int] = None,
114
+ device: Optional[Union[str, torch.device]] = None,
115
+ timesteps: Optional[List[int]] = None,
+ sigmas: Optional[List[float]] = None,
116
+ **kwargs,
117
+ ):
118
+ """
119
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
120
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
121
+
122
+ Args:
123
+ scheduler (`SchedulerMixin`):
124
+ The scheduler to get timesteps from.
125
+ num_inference_steps (`int`):
126
+ The number of diffusion steps used when generating samples with a pre-trained model. If used,
127
+ `timesteps` must be `None`.
128
+ device (`str` or `torch.device`, *optional*):
129
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
130
+ timesteps (`List[int]`, *optional*):
131
+ Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default
132
+ timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps`
133
+ must be `None`.
134
+
135
+ Returns:
136
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
137
+ second element is the number of inference steps.
138
+ """
139
+ if timesteps is not None:
140
+ accepts_timesteps = "timesteps" in set(
141
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
142
+ )
143
+ if not accepts_timesteps:
144
+ raise ValueError(
145
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
146
+ f" timestep schedules. Please check whether you are using the correct scheduler."
147
+ )
148
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
149
+ timesteps = scheduler.timesteps
150
+ num_inference_steps = len(timesteps)
151
+ elif sigmas is not None:
+ accept_sigmas = "sigmas" in set(
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
+ )
+ if not accept_sigmas:
+ raise ValueError(
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
+ )
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
+ timesteps = scheduler.timesteps
+ num_inference_steps = len(timesteps)
+ else:
152
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
153
+ timesteps = scheduler.timesteps
154
+ return timesteps, num_inference_steps
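A minimal usage sketch of the helper above, assuming it is in scope (e.g. imported from this module). It uses a default-configured DDIM scheduler; passing explicit `timesteps` or `sigmas` instead only works for schedulers whose `set_timesteps` accepts those arguments, otherwise the helper raises a `ValueError`.

```python
from diffusers import DDIMScheduler

scheduler = DDIMScheduler()  # default config, 1000 training timesteps
timesteps, n = retrieve_timesteps(scheduler, num_inference_steps=30, device="cpu")
assert n == 30 and len(timesteps) == 30  # a descending 30-step schedule
```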
155
+
156
+
157
+ class StableDiffusionMixin:
158
+ r"""
159
+ Helper for DiffusionPipeline with a VAE and UNet (mainly for latent diffusion models such as Stable Diffusion).
160
+ """
161
+
162
+ def enable_vae_slicing(self):
163
+ r"""
164
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
165
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
166
+ """
167
+ self.vae.enable_slicing()
168
+
169
+ def disable_vae_slicing(self):
170
+ r"""
171
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
172
+ computing decoding in one step.
173
+ """
174
+ self.vae.disable_slicing()
175
+
176
+ def enable_vae_tiling(self):
177
+ r"""
178
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
179
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
180
+ processing larger images.
181
+ """
182
+ self.vae.enable_tiling()
183
+
184
+ def disable_vae_tiling(self):
185
+ r"""
186
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
187
+ computing decoding in one step.
188
+ """
189
+ self.vae.disable_tiling()
190
+
191
+ def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
192
+ r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
193
+
194
+ The suffixes after the scaling factors represent the stages where they are being applied.
195
+
196
+ Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
197
+ that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
198
+
199
+ Args:
200
+ s1 (`float`):
201
+ Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
202
+ mitigate "oversmoothing effect" in the enhanced denoising process.
203
+ s2 (`float`):
204
+ Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
205
+ mitigate "oversmoothing effect" in the enhanced denoising process.
206
+ b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
207
+ b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
208
+ """
209
+ if not hasattr(self, "unet"):
210
+ raise ValueError("The pipeline must have `unet` for using FreeU.")
211
+ self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
212
+
213
+ def disable_freeu(self):
214
+ """Disables the FreeU mechanism if enabled."""
215
+ self.unet.disable_freeu()
216
+
217
+ def fuse_qkv_projections(self, unet: bool = True, vae: bool = True):
218
+ """
219
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query,
220
+ key, value) are fused. For cross-attention modules, key and value projection matrices are fused.
221
+
222
+ <Tip warning={true}>
223
+
224
+ This API is 🧪 experimental.
225
+
226
+ </Tip>
227
+
228
+ Args:
229
+ unet (`bool`, defaults to `True`): To apply fusion on the UNet.
230
+ vae (`bool`, defaults to `True`): To apply fusion on the VAE.
231
+ """
232
+ self.fusing_unet = False
233
+ self.fusing_vae = False
234
+
235
+ if unet:
236
+ self.fusing_unet = True
237
+ self.unet.fuse_qkv_projections()
238
+ self.unet.set_attn_processor(FusedAttnProcessor2_0())
239
+
240
+ if vae:
241
+ if not isinstance(self.vae, AutoencoderKL):
242
+ raise ValueError(
243
+ "`fuse_qkv_projections()` is only supported for the VAE of type `AutoencoderKL`."
244
+ )
245
+
246
+ self.fusing_vae = True
247
+ self.vae.fuse_qkv_projections()
248
+ self.vae.set_attn_processor(FusedAttnProcessor2_0())
249
+
250
+ def unfuse_qkv_projections(self, unet: bool = True, vae: bool = True):
251
+ """Disable QKV projection fusion if enabled.
252
+
253
+ <Tip warning={true}>
254
+
255
+ This API is 🧪 experimental.
256
+
257
+ </Tip>
258
+
259
+ Args:
260
+ unet (`bool`, defaults to `True`): To apply fusion on the UNet.
261
+ vae (`bool`, defaults to `True`): To apply fusion on the VAE.
262
+
263
+ """
264
+ if unet:
265
+ if not self.fusing_unet:
266
+ logger.warning(
267
+ "The UNet was not initially fused for QKV projections. Doing nothing."
268
+ )
269
+ else:
270
+ self.unet.unfuse_qkv_projections()
271
+ self.fusing_unet = False
272
+
273
+ if vae:
274
+ if not self.fusing_vae:
275
+ logger.warning(
276
+ "The VAE was not initially fused for QKV projections. Doing nothing."
277
+ )
278
+ else:
279
+ self.vae.unfuse_qkv_projections()
280
+ self.fusing_vae = False
281
+
282
+
283
+ class InterpolationStableDiffusionPipeline(
284
+ DiffusionPipeline,
285
+ StableDiffusionMixin,
286
+ TextualInversionLoaderMixin,
287
+ IPAdapterMixin,
288
+ FromSingleFileMixin,
289
+ ):
290
+ r"""
291
+ Pipeline for text-to-image generation with attention interpolation (AID) using Stable Diffusion.
292
+
293
+ This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
294
+ implemented for all pipelines (downloading, saving, running on a particular device, etc.).
295
+
296
+ The pipeline also inherits the following loading methods:
297
+ - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
298
+ - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
299
+ - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
300
+ - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
301
+ - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
302
+
303
+ Args:
304
+ vae ([`AutoencoderKL`]):
305
+ Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
306
+ text_encoder ([`~transformers.CLIPTextModel`]):
307
+ Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
308
+ tokenizer ([`~transformers.CLIPTokenizer`]):
309
+ A `CLIPTokenizer` to tokenize text.
310
+ unet ([`UNet2DConditionModel`]):
311
+ A `UNet2DConditionModel` to denoise the encoded image latents.
312
+ scheduler ([`SchedulerMixin`]):
313
+ A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
314
+ [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
315
+ safety_checker ([`StableDiffusionSafetyChecker`]):
316
+ Classification module that estimates whether generated images could be considered offensive or harmful.
317
+ Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
318
+ about a model's potential harms.
319
+ feature_extractor ([`~transformers.CLIPImageProcessor`]):
320
+ A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
321
+ """
322
+
323
+ model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
324
+ _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
325
+ _exclude_from_cpu_offload = ["safety_checker"]
326
+ _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
327
+
328
+ def __init__(
329
+ self,
330
+ vae: AutoencoderKL,
331
+ text_encoder: CLIPTextModel,
332
+ tokenizer: CLIPTokenizer,
333
+ unet: UNet2DConditionModel,
334
+ scheduler: KarrasDiffusionSchedulers,
335
+ safety_checker: StableDiffusionSafetyChecker,
336
+ feature_extractor: CLIPImageProcessor,
337
+ image_encoder: CLIPVisionModelWithProjection = None,
338
+ requires_safety_checker: bool = True,
339
+ ):
340
+ super().__init__()
341
+
342
+ if (
343
+ hasattr(scheduler.config, "steps_offset")
344
+ and scheduler.config.steps_offset != 1
345
+ ):
346
+ deprecation_message = (
347
+ f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
348
+ f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
349
+ "to update the config accordingly as leaving `steps_offset` might lead to incorrect results"
350
+ " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
351
+ " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
352
+ " file"
353
+ )
354
+ deprecate(
355
+ "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False
356
+ )
357
+ new_config = dict(scheduler.config)
358
+ new_config["steps_offset"] = 1
359
+ scheduler._internal_dict = FrozenDict(new_config)
360
+
361
+ if (
362
+ hasattr(scheduler.config, "clip_sample")
363
+ and scheduler.config.clip_sample is True
364
+ ):
365
+ deprecation_message = (
366
+ f"The configuration file of this scheduler: {scheduler} has set `clip_sample` to True."
367
+ " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
368
+ " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
369
+ " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
370
+ " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
371
+ )
372
+ deprecate(
373
+ "clip_sample not set", "1.0.0", deprecation_message, standard_warn=False
374
+ )
375
+ new_config = dict(scheduler.config)
376
+ new_config["clip_sample"] = False
377
+ scheduler._internal_dict = FrozenDict(new_config)
378
+
379
+ if safety_checker is None and requires_safety_checker:
380
+ logger.warning(
381
+ f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
382
+ " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
383
+ " results in services or applications open to the public. Both the diffusers team and Hugging Face"
384
+ " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
385
+ " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
386
+ " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
387
+ )
388
+
389
+ if safety_checker is not None and feature_extractor is None:
390
+ raise ValueError(
391
+ f"Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
392
+ " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
393
+ )
394
+
395
+ is_unet_version_less_0_9_0 = hasattr(
396
+ unet.config, "_diffusers_version"
397
+ ) and version.parse(
398
+ version.parse(unet.config._diffusers_version).base_version
399
+ ) < version.parse(
400
+ "0.9.0.dev0"
401
+ )
402
+ is_unet_sample_size_less_64 = (
403
+ hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
404
+ )
405
+ if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
406
+ deprecation_message = (
407
+ "The configuration file of the unet has set the default `sample_size` to smaller than"
408
+ " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
409
+ " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
410
+ " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
411
+ " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
412
+ " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
413
+ " in the config might lead to incorrect results in future versions. If you have downloaded this"
414
+ " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
415
+ " the `unet/config.json` file"
416
+ )
417
+ deprecate(
418
+ "sample_size<64", "1.0.0", deprecation_message, standard_warn=False
419
+ )
420
+ new_config = dict(unet.config)
421
+ new_config["sample_size"] = 64
422
+ unet._internal_dict = FrozenDict(new_config)
423
+
424
+ self.register_modules(
425
+ vae=vae,
426
+ text_encoder=text_encoder,
427
+ tokenizer=tokenizer,
428
+ unet=unet,
429
+ scheduler=scheduler,
430
+ safety_checker=safety_checker,
431
+ feature_extractor=feature_extractor,
432
+ image_encoder=image_encoder,
433
+ )
434
+ self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
435
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
436
+ self.register_to_config(requires_safety_checker=requires_safety_checker)
437
+
438
+ self.load_aid()
439
+
440
+ def _encode_prompt(
441
+ self,
442
+ prompt,
443
+ device,
444
+ num_images_per_prompt,
445
+ do_classifier_free_guidance,
446
+ negative_prompt=None,
447
+ prompt_embeds: Optional[torch.Tensor] = None,
448
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
449
+ lora_scale: Optional[float] = None,
450
+ **kwargs,
451
+ ):
452
+ deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
453
+ deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
454
+
455
+ prompt_embeds_tuple = self.encode_prompt(
456
+ prompt=prompt,
457
+ device=device,
458
+ num_images_per_prompt=num_images_per_prompt,
459
+ do_classifier_free_guidance=do_classifier_free_guidance,
460
+ negative_prompt=negative_prompt,
461
+ prompt_embeds=prompt_embeds,
462
+ negative_prompt_embeds=negative_prompt_embeds,
463
+ lora_scale=lora_scale,
464
+ **kwargs,
465
+ )
466
+
467
+ # concatenate for backwards comp
468
+ prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
469
+
470
+ return prompt_embeds
471
+
472
+ def encode_prompt(
473
+ self,
474
+ prompt,
475
+ device,
476
+ num_images_per_prompt,
477
+ do_classifier_free_guidance,
478
+ negative_prompt=None,
479
+ prompt_embeds: Optional[torch.Tensor] = None,
480
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
481
+ lora_scale: Optional[float] = None,
482
+ clip_skip: Optional[int] = None,
483
+ ):
484
+ r"""
485
+ Encodes the prompt into text encoder hidden states.
486
+
487
+ Args:
488
+ prompt (`str` or `List[str]`, *optional*):
489
+ prompt to be encoded
490
+ device: (`torch.device`):
491
+ torch device
492
+ num_images_per_prompt (`int`):
493
+ number of images that should be generated per prompt
494
+ do_classifier_free_guidance (`bool`):
495
+ whether to use classifier free guidance or not
496
+ negative_prompt (`str` or `List[str]`, *optional*):
497
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
498
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
499
+ less than `1`).
500
+ prompt_embeds (`torch.Tensor`, *optional*):
501
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
502
+ provided, text embeddings will be generated from `prompt` input argument.
503
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
504
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
505
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
506
+ argument.
507
+ lora_scale (`float`, *optional*):
508
+ A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
509
+ clip_skip (`int`, *optional*):
510
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
511
+ the output of the pre-final layer will be used for computing the prompt embeddings.
512
+ """
513
+
514
+ if prompt is not None and isinstance(prompt, str):
515
+ batch_size = 1
516
+ elif prompt is not None and isinstance(prompt, list):
517
+ batch_size = len(prompt)
518
+ else:
519
+ batch_size = prompt_embeds.shape[0]
520
+
521
+ if prompt_embeds is None:
522
+ # textual inversion: process multi-vector tokens if necessary
523
+ if isinstance(self, TextualInversionLoaderMixin):
524
+ prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
525
+
526
+ text_inputs = self.tokenizer(
527
+ prompt,
528
+ padding="max_length",
529
+ max_length=self.tokenizer.model_max_length,
530
+ truncation=True,
531
+ return_tensors="pt",
532
+ )
533
+ text_input_ids = text_inputs.input_ids
534
+ untruncated_ids = self.tokenizer(
535
+ prompt, padding="longest", return_tensors="pt"
536
+ ).input_ids
537
+
538
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[
539
+ -1
540
+ ] and not torch.equal(text_input_ids, untruncated_ids):
541
+ removed_text = self.tokenizer.batch_decode(
542
+ untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
543
+ )
544
+ logger.warning(
545
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
546
+ f" {self.tokenizer.model_max_length} tokens: {removed_text}"
547
+ )
548
+
549
+ if (
550
+ hasattr(self.text_encoder.config, "use_attention_mask")
551
+ and self.text_encoder.config.use_attention_mask
552
+ ):
553
+ attention_mask = text_inputs.attention_mask.to(device)
554
+ else:
555
+ attention_mask = None
556
+
557
+ if clip_skip is None:
558
+ prompt_embeds = self.text_encoder(
559
+ text_input_ids.to(device), attention_mask=attention_mask
560
+ )
561
+ prompt_embeds = prompt_embeds[0]
562
+ else:
563
+ prompt_embeds = self.text_encoder(
564
+ text_input_ids.to(device),
565
+ attention_mask=attention_mask,
566
+ output_hidden_states=True,
567
+ )
568
+ # Access the `hidden_states` first, that contains a tuple of
569
+ # all the hidden states from the encoder layers. Then index into
570
+ # the tuple to access the hidden states from the desired layer.
571
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
572
+ # We also need to apply the final LayerNorm here to not mess with the
573
+ # representations. The `last_hidden_states` that we typically use for
574
+ # obtaining the final prompt representations passes through the LayerNorm
575
+ # layer.
576
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(
577
+ prompt_embeds
578
+ )
579
+
580
+ if self.text_encoder is not None:
581
+ prompt_embeds_dtype = self.text_encoder.dtype
582
+ elif self.unet is not None:
583
+ prompt_embeds_dtype = self.unet.dtype
584
+ else:
585
+ prompt_embeds_dtype = prompt_embeds.dtype
586
+
587
+ prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
588
+
589
+ bs_embed, seq_len, _ = prompt_embeds.shape
590
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
591
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
592
+ prompt_embeds = prompt_embeds.view(
593
+ bs_embed * num_images_per_prompt, seq_len, -1
594
+ )
595
+
596
+ # get unconditional embeddings for classifier free guidance
597
+ if do_classifier_free_guidance and negative_prompt_embeds is None:
598
+ uncond_tokens: List[str]
599
+ if negative_prompt is None:
600
+ uncond_tokens = [""] * batch_size
601
+ elif prompt is not None and type(prompt) is not type(negative_prompt):
602
+ raise TypeError(
603
+ f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
604
+ f" {type(prompt)}."
605
+ )
606
+ elif isinstance(negative_prompt, str):
607
+ uncond_tokens = [negative_prompt]
608
+ elif batch_size != len(negative_prompt):
609
+ raise ValueError(
610
+ f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
611
+ f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
612
+ " the batch size of `prompt`."
613
+ )
614
+ else:
615
+ uncond_tokens = negative_prompt
616
+
617
+ # textual inversion: process multi-vector tokens if necessary
618
+ if isinstance(self, TextualInversionLoaderMixin):
619
+ uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
620
+
621
+ max_length = prompt_embeds.shape[1]
622
+ uncond_input = self.tokenizer(
623
+ uncond_tokens,
624
+ padding="max_length",
625
+ max_length=max_length,
626
+ truncation=True,
627
+ return_tensors="pt",
628
+ )
629
+
630
+ if (
631
+ hasattr(self.text_encoder.config, "use_attention_mask")
632
+ and self.text_encoder.config.use_attention_mask
633
+ ):
634
+ attention_mask = uncond_input.attention_mask.to(device)
635
+ else:
636
+ attention_mask = None
637
+
638
+ negative_prompt_embeds = self.text_encoder(
639
+ uncond_input.input_ids.to(device),
640
+ attention_mask=attention_mask,
641
+ )
642
+ negative_prompt_embeds = negative_prompt_embeds[0]
643
+
644
+ if do_classifier_free_guidance:
645
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
646
+ seq_len = negative_prompt_embeds.shape[1]
647
+
648
+ negative_prompt_embeds = negative_prompt_embeds.to(
649
+ dtype=prompt_embeds_dtype, device=device
650
+ )
651
+
652
+ negative_prompt_embeds = negative_prompt_embeds.repeat(
653
+ 1, num_images_per_prompt, 1
654
+ )
655
+ negative_prompt_embeds = negative_prompt_embeds.view(
656
+ batch_size * num_images_per_prompt, seq_len, -1
657
+ )
658
+
659
+ return prompt_embeds, negative_prompt_embeds
660
+
661
+ def encode_image(
662
+ self, image, device, num_images_per_prompt, output_hidden_states=None
663
+ ):
664
+ dtype = next(self.image_encoder.parameters()).dtype
665
+
666
+ if not isinstance(image, torch.Tensor):
667
+ image = self.feature_extractor(image, return_tensors="pt").pixel_values
668
+
669
+ image = image.to(device=device, dtype=dtype)
670
+ if output_hidden_states:
671
+ image_enc_hidden_states = self.image_encoder(
672
+ image, output_hidden_states=True
673
+ ).hidden_states[-2]
674
+ image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(
675
+ num_images_per_prompt, dim=0
676
+ )
677
+ uncond_image_enc_hidden_states = self.image_encoder(
678
+ torch.zeros_like(image), output_hidden_states=True
679
+ ).hidden_states[-2]
680
+ uncond_image_enc_hidden_states = (
681
+ uncond_image_enc_hidden_states.repeat_interleave(
682
+ num_images_per_prompt, dim=0
683
+ )
684
+ )
685
+ return image_enc_hidden_states, uncond_image_enc_hidden_states
686
+ else:
687
+ image_embeds = self.image_encoder(image).image_embeds
688
+ image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
689
+ uncond_image_embeds = torch.zeros_like(image_embeds)
690
+
691
+ return image_embeds, uncond_image_embeds
692
+
693
+ def prepare_ip_adapter_image_embeds(
694
+ self,
695
+ ip_adapter_image,
696
+ ip_adapter_image_embeds,
697
+ device,
698
+ num_images_per_prompt,
699
+ do_classifier_free_guidance,
700
+ ):
701
+ image_embeds = []
702
+ if do_classifier_free_guidance:
703
+ negative_image_embeds = []
704
+ if ip_adapter_image_embeds is None:
705
+ if not isinstance(ip_adapter_image, list):
706
+ ip_adapter_image = [ip_adapter_image]
707
+
708
+ if len(ip_adapter_image) != len(
709
+ self.unet.encoder_hid_proj.image_projection_layers
710
+ ):
711
+ raise ValueError(
712
+ f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
713
+ )
714
+
715
+ for single_ip_adapter_image, image_proj_layer in zip(
716
+ ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
717
+ ):
718
+ output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
719
+ single_image_embeds, single_negative_image_embeds = self.encode_image(
720
+ single_ip_adapter_image, device, 1, output_hidden_state
721
+ )
722
+
723
+ image_embeds.append(single_image_embeds[None, :])
724
+ if do_classifier_free_guidance:
725
+ negative_image_embeds.append(single_negative_image_embeds[None, :])
726
+ else:
727
+ for single_image_embeds in ip_adapter_image_embeds:
728
+ if do_classifier_free_guidance:
729
+ single_negative_image_embeds, single_image_embeds = (
730
+ single_image_embeds.chunk(2)
731
+ )
732
+ negative_image_embeds.append(single_negative_image_embeds)
733
+ image_embeds.append(single_image_embeds)
734
+
735
+ ip_adapter_image_embeds = []
736
+ for i, single_image_embeds in enumerate(image_embeds):
737
+ single_image_embeds = torch.cat(
738
+ [single_image_embeds] * num_images_per_prompt, dim=0
739
+ )
740
+ if do_classifier_free_guidance:
741
+ single_negative_image_embeds = torch.cat(
742
+ [negative_image_embeds[i]] * num_images_per_prompt, dim=0
743
+ )
744
+ single_image_embeds = torch.cat(
745
+ [single_negative_image_embeds, single_image_embeds], dim=0
746
+ )
747
+
748
+ single_image_embeds = single_image_embeds.to(device=device)
749
+ ip_adapter_image_embeds.append(single_image_embeds)
750
+
751
+ return ip_adapter_image_embeds
752
+
753
+ def run_safety_checker(self, image, device, dtype):
754
+ if self.safety_checker is None:
755
+ has_nsfw_concept = None
756
+ else:
757
+ if torch.is_tensor(image):
758
+ feature_extractor_input = self.image_processor.postprocess(
759
+ image, output_type="pil"
760
+ )
761
+ else:
762
+ feature_extractor_input = self.image_processor.numpy_to_pil(image)
763
+ safety_checker_input = self.feature_extractor(
764
+ feature_extractor_input, return_tensors="pt"
765
+ ).to(device)
766
+ image, has_nsfw_concept = self.safety_checker(
767
+ images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
768
+ )
769
+ return image, has_nsfw_concept
770
+
771
+ def decode_latents(self, latents):
772
+ deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
773
+ deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
774
+
775
+ latents = 1 / self.vae.config.scaling_factor * latents
776
+ image = self.vae.decode(latents, return_dict=False)[0]
777
+ image = (image / 2 + 0.5).clamp(0, 1)
778
+ # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
779
+ image = image.cpu().permute(0, 2, 3, 1).float().numpy()
780
+ return image
781
+
782
+ def prepare_extra_step_kwargs(self, generator, eta):
783
+ # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
784
+ # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
785
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
786
+ # and should be between [0, 1]
787
+
788
+ accepts_eta = "eta" in set(
789
+ inspect.signature(self.scheduler.step).parameters.keys()
790
+ )
791
+ extra_step_kwargs = {}
792
+ if accepts_eta:
793
+ extra_step_kwargs["eta"] = eta
794
+
795
+ # check if the scheduler accepts generator
796
+ accepts_generator = "generator" in set(
797
+ inspect.signature(self.scheduler.step).parameters.keys()
798
+ )
799
+ if accepts_generator:
800
+ extra_step_kwargs["generator"] = generator
801
+ return extra_step_kwargs
802
+
803
+ def check_inputs(
804
+ self,
805
+ prompt,
806
+ height,
807
+ width,
808
+ callback_steps,
809
+ negative_prompt=None,
810
+ prompt_embeds=None,
811
+ negative_prompt_embeds=None,
812
+ ip_adapter_image=None,
813
+ ip_adapter_image_embeds=None,
814
+ callback_on_step_end_tensor_inputs=None,
815
+ ):
816
+ if height % 8 != 0 or width % 8 != 0:
817
+ raise ValueError(
818
+ f"`height` and `width` have to be divisible by 8 but are {height} and {width}."
819
+ )
820
+
821
+ if callback_steps is not None and (
822
+ not isinstance(callback_steps, int) or callback_steps <= 0
823
+ ):
824
+ raise ValueError(
825
+ f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
826
+ f" {type(callback_steps)}."
827
+ )
828
+ if callback_on_step_end_tensor_inputs is not None and not all(
829
+ k in self._callback_tensor_inputs
830
+ for k in callback_on_step_end_tensor_inputs
831
+ ):
832
+ raise ValueError(
833
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
834
+ )
835
+
836
+ if prompt is not None and prompt_embeds is not None:
837
+ raise ValueError(
838
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
839
+ " only forward one of the two."
840
+ )
841
+ elif prompt is None and prompt_embeds is None:
842
+ raise ValueError(
843
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
844
+ )
845
+ elif prompt is not None and (
846
+ not isinstance(prompt, str) and not isinstance(prompt, list)
847
+ ):
848
+ raise ValueError(
849
+ f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
850
+ )
851
+
852
+ if negative_prompt is not None and negative_prompt_embeds is not None:
853
+ raise ValueError(
854
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
855
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
856
+ )
857
+
858
+ if prompt_embeds is not None and negative_prompt_embeds is not None:
859
+ if prompt_embeds.shape != negative_prompt_embeds.shape:
860
+ raise ValueError(
861
+ "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
862
+ f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
863
+ f" {negative_prompt_embeds.shape}."
864
+ )
865
+
866
+ if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
867
+ raise ValueError(
868
+ "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
869
+ )
870
+
871
+ if ip_adapter_image_embeds is not None:
872
+ if not isinstance(ip_adapter_image_embeds, list):
873
+ raise ValueError(
874
+ f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
875
+ )
876
+ elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
877
+ raise ValueError(
878
+ f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
879
+ )
880
+
881
+ def prepare_latents(
882
+ self,
883
+ batch_size,
884
+ num_channels_latents,
885
+ height,
886
+ width,
887
+ dtype,
888
+ device,
889
+ generator,
890
+ latents=None,
891
+ ):
892
+ shape = (
893
+ batch_size,
894
+ num_channels_latents,
895
+ int(height) // self.vae_scale_factor,
896
+ int(width) // self.vae_scale_factor,
897
+ )
898
+ if isinstance(generator, list) and len(generator) != batch_size:
899
+ raise ValueError(
900
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
901
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
902
+ )
903
+
904
+ if latents is None:
905
+ latents = randn_tensor(
906
+ shape, generator=generator, device=device, dtype=dtype
907
+ )
908
+ else:
909
+ latents = latents.to(device)
910
+
911
+ # scale the initial noise by the standard deviation required by the scheduler
912
+ latents = latents * self.scheduler.init_noise_sigma
913
+ return latents
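As a concrete instance of the shape computation above, a standalone sketch with plain tensors; note that `init_noise_sigma` is 1.0 for DDIM/PNDM-style schedulers and larger for Karras-style ones, so the final scaling is a no-op only in the former case.

```python
import torch

batch_size, num_channels_latents, vae_scale_factor = 2, 4, 8
height = width = 512
shape = (
    batch_size,
    num_channels_latents,
    height // vae_scale_factor,
    width // vae_scale_factor,
)
latents = torch.randn(shape)   # stand-in for randn_tensor(...)
latents = latents * 1.0        # times scheduler.init_noise_sigma (1.0 here)
assert latents.shape == (2, 4, 64, 64)
```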
914
+
915
+ # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
916
+ def get_guidance_scale_embedding(
917
+ self,
918
+ w: torch.Tensor,
919
+ embedding_dim: int = 512,
920
+ dtype: torch.dtype = torch.float32,
921
+ ) -> torch.Tensor:
922
+ """
923
+ See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
924
+
925
+ Args:
926
+ w (`torch.Tensor`):
927
+ Generate embedding vectors with a specified guidance scale to subsequently enrich timestep embeddings.
928
+ embedding_dim (`int`, *optional*, defaults to 512):
929
+ Dimension of the embeddings to generate.
930
+ dtype (`torch.dtype`, *optional*, defaults to `torch.float32`):
931
+ Data type of the generated embeddings.
932
+
933
+ Returns:
934
+ `torch.Tensor`: Embedding vectors with shape `(len(w), embedding_dim)`.
935
+ """
936
+ assert len(w.shape) == 1
937
+ w = w * 1000.0
938
+
939
+ half_dim = embedding_dim // 2
940
+ emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
941
+ emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
942
+ emb = w.to(dtype)[:, None] * emb[None, :]
943
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
944
+ if embedding_dim % 2 == 1: # zero pad
945
+ emb = torch.nn.functional.pad(emb, (0, 1))
946
+ assert emb.shape == (w.shape[0], embedding_dim)
947
+ return emb
948
+
949
+ # load interpolated attention processor
950
+ def load_aid(
951
+ self, t: Optional[float] = 0.5, is_fused: bool = True, atype="fused_outer"
952
+ ):
953
+ attn_procs = {}
954
+ for name in self.unet.attn_processors.keys():
955
+ if not name.startswith("encoder"):
956
+ if atype == "fused_outer":
957
+ attn_procs[name] = OuterInterpolatedAttnProcessor(
958
+ t=t,
959
+ is_fused=is_fused,
960
+ original_attn=self.unet.attn_processors[name],
961
+ )
962
+ elif atype == "fused_inner":
963
+ attn_procs[name] = InnerInterpolatedAttnProcessor(
964
+ t=t,
965
+ is_fused=is_fused,
966
+ original_attn=self.unet.attn_processors[name],
967
+ )
968
+ else:
969
+ attn_procs[name] = self.unet.attn_processors[name]
970
+ self.unet.set_attn_processor(attn_procs)
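A minimal sketch of installing the interpolated attention processors, assuming the standard SD 1.5 checkpoint layout and a CUDA device; this is not necessarily how app.py wires the pipeline up.

```python
import torch
from pipeline_interpolated_sd import InterpolationStableDiffusionPipeline

pipe = InterpolationStableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

# __init__ already calls load_aid() with the defaults (fused outer interpolation at t=0.5);
# calling it again swaps in a different processor variant and blend point.
pipe.load_aid(t=0.3, is_fused=True, atype="fused_inner")
```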
971
+
972
+ # load customized ip_adapter
973
+ def load_aid_ip_adapter(
974
+ self,
975
+ pretrained_model_name_or_path_or_dict: Union[
976
+ str, List[str], Dict[str, torch.Tensor]
977
+ ],
978
+ subfolder: Union[str, List[str]],
979
+ weight_name: Union[str, List[str]],
980
+ t: Optional[float] = 0.5,
981
+ is_fused: bool = True,
982
+ image_encoder_folder: Optional[str] = "image_encoder",
983
+ early="fused_outer",
984
+ **kwargs,
985
+ ):
986
+ self.load_ip_adapter(
987
+ pretrained_model_name_or_path_or_dict=pretrained_model_name_or_path_or_dict,
988
+ subfolder=subfolder,
989
+ weight_name=weight_name,
990
+ image_encoder_folder=image_encoder_folder,
991
+ **kwargs,
992
+ )
993
+ attn_procs = {}
994
+ for name in self.unet.attn_processors.keys():
995
+ if not name.startswith("encoder"):
996
+ if early == "fused_outer":
997
+ attn_procs[name] = OuterInterpolatedIPAttnProcessor(
998
+ t=t, is_fused=is_fused, ip_attn=self.unet.attn_processors[name]
999
+ )
1000
+ elif early == "fused_inner":
1001
+ attn_procs[name] = InnerInterpolatedIPAttnProcessor(
1002
+ t=t, is_fused=is_fused, ip_attn=self.unet.attn_processors[name]
1003
+ )
1004
+ elif early == "scale_control":
1005
+ attn_procs[name] = ScaleControlIPAttnProcessor(
1006
+ t=t, is_fused=is_fused, ip_attn=self.unet.attn_processors[name]
1007
+ )
1008
+ else:
1009
+ attn_procs[name] = self.unet.attn_processors[name]
1010
+ self.unet.set_attn_processor(attn_procs)
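For image-conditioned interpolation, the wrapper above first delegates to diffusers' `load_ip_adapter` and then swaps in the IP-aware processors. A hedged sketch continuing the `pipe` example above; the `h94/IP-Adapter` checkpoint layout is a common community choice and an assumption here, not something this file prescribes.

```python
pipe.load_aid_ip_adapter(
    "h94/IP-Adapter",               # assumed community IP-Adapter repo for SD 1.5
    subfolder="models",
    weight_name="ip-adapter_sd15.bin",
    t=0.5,
    is_fused=True,
    early="scale_control",          # or "fused_outer" / "fused_inner"
)
```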
1011
+
1012
+ def activate_aid(self, it: float):
1013
+ for name in self.unet.attn_processors.keys():
1014
+ if not name.startswith("encoder"):
1015
+ self.unet.attn_processors[name].activate(it)
1016
+
1017
+ def deactivate_aid(self):
1018
+ for name in self.unet.attn_processors.keys():
1019
+ if not name.startswith("encoder"):
1020
+ self.unet.attn_processors[name].deactivate()
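`activate_aid` pushes an interpolation coefficient into every non-encoder attention processor, and `deactivate_aid` calls each processor's `deactivate()` hook (presumably restoring plain attention). A hedged toggling sketch, continuing the `pipe` example above:

```python
for it in (0.25, 0.5, 0.75):   # interpolation coefficients between the two endpoints
    pipe.activate_aid(it)      # enable interpolated attention at this coefficient
    # ... run a denoising pass here, e.g. via interpolate_single(...) ...
    pipe.deactivate_aid()      # switch the processors back to ordinary attention
```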
1021
+
1022
+ @property
1023
+ def guidance_scale(self):
1024
+ return self._guidance_scale
1025
+
1026
+ @property
1027
+ def guidance_rescale(self):
1028
+ return self._guidance_rescale
1029
+
1030
+ @property
1031
+ def clip_skip(self):
1032
+ return self._clip_skip
1033
+
1034
+ # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
1035
+ # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
1036
+ # corresponds to doing no classifier free guidance.
1037
+ @property
1038
+ def do_classifier_free_guidance(self):
1039
+ return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
1040
+
1041
+ @property
1042
+ def cross_attention_kwargs(self):
1043
+ return self._cross_attention_kwargs
1044
+
1045
+ @property
1046
+ def num_timesteps(self):
1047
+ return self._num_timesteps
1048
+
1049
+ @property
1050
+ def interrupt(self):
1051
+ return self._interrupt
1052
+
1053
+ @torch.no_grad()
1054
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
1055
+ def __call__(
1056
+ self,
1057
+ prompt: Union[str, List[str]] = None,
1058
+ height: Optional[int] = None,
1059
+ width: Optional[int] = None,
1060
+ num_inference_steps: int = 50,
1061
+ timesteps: List[int] = None,
1062
+ sigmas: List[float] = None,
1063
+ guidance_scale: float = 7.5,
1064
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1065
+ num_images_per_prompt: Optional[int] = 1,
1066
+ eta: float = 0.0,
1067
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
1068
+ latents: Optional[torch.Tensor] = None,
1069
+ prompt_embeds: Optional[torch.Tensor] = None,
1070
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
1071
+ ip_adapter_image: Optional[PipelineImageInput] = None,
1072
+ ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
1073
+ output_type: Optional[str] = "pil",
1074
+ return_dict: bool = True,
1075
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1076
+ guidance_rescale: float = 0.0,
1077
+ clip_skip: Optional[int] = None,
1078
+ callback_on_step_end: Optional[
1079
+ Union[
1080
+ Callable[[int, int, Dict], None],
1081
+ PipelineCallback,
1082
+ MultiPipelineCallbacks,
1083
+ ]
1084
+ ] = None,
1085
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
1086
+ **kwargs,
1087
+ ):
1088
+ r"""
1089
+ The call function to the pipeline for generation.
1090
+
1091
+ Args:
1092
+ prompt (`str` or `List[str]`, *optional*):
1093
+ The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
1094
+ height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
1095
+ The height in pixels of the generated image.
1096
+ width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
1097
+ The width in pixels of the generated image.
1098
+ num_inference_steps (`int`, *optional*, defaults to 50):
1099
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1100
+ expense of slower inference.
1101
+ timesteps (`List[int]`, *optional*):
1102
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
1103
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
1104
+ passed will be used. Must be in descending order.
1105
+ sigmas (`List[float]`, *optional*):
1106
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
1107
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
1108
+ will be used.
1109
+ guidance_scale (`float`, *optional*, defaults to 7.5):
1110
+ A higher guidance scale value encourages the model to generate images closely linked to the text
1111
+ `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
1112
+ negative_prompt (`str` or `List[str]`, *optional*):
1113
+ The prompt or prompts to guide what to not include in image generation. If not defined, you need to
1114
+ pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
1115
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1116
+ The number of images to generate per prompt.
1117
+ eta (`float`, *optional*, defaults to 0.0):
1118
+ Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
1119
+ to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
1120
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
1121
+ A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
1122
+ generation deterministic.
1123
+ latents (`torch.Tensor`, *optional*):
1124
+ Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
1125
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1126
+ tensor is generated by sampling using the supplied random `generator`.
1127
+ prompt_embeds (`torch.Tensor`, *optional*):
1128
+ Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
1129
+ provided, text embeddings are generated from the `prompt` input argument.
1130
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
1131
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
1132
+ not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
1133
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1134
+ ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
1135
+ Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the number of
1136
+ IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
1137
+ contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
1138
+ provided, embeddings are computed from the `ip_adapter_image` input argument.
1139
+ output_type (`str`, *optional*, defaults to `"pil"`):
1140
+ The output format of the generated image. Choose between `PIL.Image` or `np.array`.
1141
+ return_dict (`bool`, *optional*, defaults to `True`):
1142
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
1143
+ plain tuple.
1144
+ cross_attention_kwargs (`dict`, *optional*):
1145
+ A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
1146
+ [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1147
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
1148
+ Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
1149
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
1150
+ using zero terminal SNR.
1151
+ clip_skip (`int`, *optional*):
1152
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
1153
+ the output of the pre-final layer will be used for computing the prompt embeddings.
1154
+ callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
1155
+ A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
1156
+ each denoising step during inference, with the following arguments: `callback_on_step_end(self:
1157
+ DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
1158
+ list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
1159
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1160
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1161
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1162
+ `._callback_tensor_inputs` attribute of your pipeline class.
1163
+
1164
+ Examples:
1165
+
1166
+ Returns:
1167
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1168
+ If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
1169
+ otherwise a `tuple` is returned where the first element is a list with the generated images and the
1170
+ second element is a list of `bool`s indicating whether the corresponding generated image contains
1171
+ "not-safe-for-work" (nsfw) content.
1172
+ """
1173
+
1174
+ callback = kwargs.pop("callback", None)
1175
+ callback_steps = kwargs.pop("callback_steps", None)
1176
+
1177
+ if callback is not None:
1178
+ deprecate(
1179
+ "callback",
1180
+ "1.0.0",
1181
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1182
+ )
1183
+ if callback_steps is not None:
1184
+ deprecate(
1185
+ "callback_steps",
1186
+ "1.0.0",
1187
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
1188
+ )
1189
+
1190
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
1191
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
1192
+
1193
+ # 0. Default height and width to unet
1194
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
1195
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
1196
+ # to deal with lora scaling and other possible forward hooks
1197
+
1198
+ # 1. Check inputs. Raise error if not correct
1199
+ self.check_inputs(
1200
+ prompt,
1201
+ height,
1202
+ width,
1203
+ callback_steps,
1204
+ negative_prompt,
1205
+ prompt_embeds,
1206
+ negative_prompt_embeds,
1207
+ ip_adapter_image,
1208
+ ip_adapter_image_embeds,
1209
+ callback_on_step_end_tensor_inputs,
1210
+ )
1211
+
1212
+ self._guidance_scale = guidance_scale
1213
+ self._guidance_rescale = guidance_rescale
1214
+ self._clip_skip = clip_skip
1215
+ self._cross_attention_kwargs = cross_attention_kwargs
1216
+ self._interrupt = False
1217
+
1218
+ # 2. Define call parameters
1219
+ if prompt is not None and isinstance(prompt, str):
1220
+ batch_size = 1
1221
+ elif prompt is not None and isinstance(prompt, list):
1222
+ batch_size = len(prompt)
1223
+ else:
1224
+ batch_size = prompt_embeds.shape[0]
1225
+
1226
+ device = self._execution_device
1227
+
1228
+ # 3. Encode input prompt
1229
+ lora_scale = (
1230
+ self.cross_attention_kwargs.get("scale", None)
1231
+ if self.cross_attention_kwargs is not None
1232
+ else None
1233
+ )
1234
+
1235
+ prompt_embeds, negative_prompt_embeds = self.encode_prompt(
1236
+ prompt,
1237
+ device,
1238
+ num_images_per_prompt,
1239
+ self.do_classifier_free_guidance,
1240
+ negative_prompt,
1241
+ prompt_embeds=prompt_embeds,
1242
+ negative_prompt_embeds=negative_prompt_embeds,
1243
+ lora_scale=lora_scale,
1244
+ clip_skip=self.clip_skip,
1245
+ )
1246
+
1247
+ # For classifier free guidance, we need to do two forward passes.
1248
+ # Here we concatenate the unconditional and text embeddings into a single batch
1249
+ # to avoid doing two forward passes
1250
+ if self.do_classifier_free_guidance:
1251
+ prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
1252
+
1253
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1254
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1255
+ ip_adapter_image,
1256
+ ip_adapter_image_embeds,
1257
+ device,
1258
+ batch_size * num_images_per_prompt,
1259
+ self.do_classifier_free_guidance,
1260
+ )
1261
+
1262
+ # 4. Prepare timesteps
1263
+ timesteps, num_inference_steps = retrieve_timesteps(
1264
+ self.scheduler, num_inference_steps, device, timesteps, sigmas
1265
+ )
1266
+
1267
+ # 5. Prepare latent variables
1268
+ num_channels_latents = self.unet.config.in_channels
1269
+ latents = self.prepare_latents(
1270
+ batch_size * num_images_per_prompt,
1271
+ num_channels_latents,
1272
+ height,
1273
+ width,
1274
+ prompt_embeds.dtype,
1275
+ device,
1276
+ generator,
1277
+ latents,
1278
+ )
1279
+
1280
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1281
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1282
+
1283
+ # 6.1 Add image embeds for IP-Adapter
1284
+ added_cond_kwargs = (
1285
+ {"image_embeds": image_embeds}
1286
+ if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
1287
+ else None
1288
+ )
1289
+
1290
+ # 6.2 Optionally get Guidance Scale Embedding
1291
+ timestep_cond = None
1292
+ if self.unet.config.time_cond_proj_dim is not None:
1293
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
1294
+ batch_size * num_images_per_prompt
1295
+ )
1296
+ timestep_cond = self.get_guidance_scale_embedding(
1297
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1298
+ ).to(device=device, dtype=latents.dtype)
1299
+
1300
+ # 7. Denoising loop
1301
+ num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
1302
+ self._num_timesteps = len(timesteps)
1303
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1304
+ for i, t in enumerate(timesteps):
1305
+ if self.interrupt:
1306
+ continue
1307
+
1308
+ # expand the latents if we are doing classifier free guidance
1309
+ latent_model_input = (
1310
+ torch.cat([latents] * 2)
1311
+ if self.do_classifier_free_guidance
1312
+ else latents
1313
+ )
1314
+ latent_model_input = self.scheduler.scale_model_input(
1315
+ latent_model_input, t
1316
+ )
1317
+
1318
+ # predict the noise residual
1319
+ noise_pred = self.unet(
1320
+ latent_model_input,
1321
+ t,
1322
+ encoder_hidden_states=prompt_embeds,
1323
+ timestep_cond=timestep_cond,
1324
+ cross_attention_kwargs=self.cross_attention_kwargs,
1325
+ added_cond_kwargs=added_cond_kwargs,
1326
+ return_dict=False,
1327
+ )[0]
1328
+
1329
+ # perform guidance
1330
+ if self.do_classifier_free_guidance:
1331
+ noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
1332
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
1333
+ noise_pred_text - noise_pred_uncond
1334
+ )
1335
+
1336
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1337
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1338
+ noise_pred = rescale_noise_cfg(
1339
+ noise_pred,
1340
+ noise_pred_text,
1341
+ guidance_rescale=self.guidance_rescale,
1342
+ )
1343
+
1344
+ # compute the previous noisy sample x_t -> x_t-1
1345
+ latents = self.scheduler.step(
1346
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1347
+ )[0]
1348
+
1349
+ if callback_on_step_end is not None:
1350
+ callback_kwargs = {}
1351
+ for k in callback_on_step_end_tensor_inputs:
1352
+ callback_kwargs[k] = locals()[k]
1353
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1354
+
1355
+ latents = callback_outputs.pop("latents", latents)
1356
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1357
+ negative_prompt_embeds = callback_outputs.pop(
1358
+ "negative_prompt_embeds", negative_prompt_embeds
1359
+ )
1360
+
1361
+ # call the callback, if provided
1362
+ if i == len(timesteps) - 1 or (
1363
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
1364
+ ):
1365
+ progress_bar.update()
1366
+ if callback is not None and i % callback_steps == 0:
1367
+ step_idx = i // getattr(self.scheduler, "order", 1)
1368
+ callback(step_idx, t, latents)
1369
+
1370
+ if XLA_AVAILABLE:
1371
+ xm.mark_step()
1372
+
1373
+ if not output_type == "latent":
1374
+ image = self.vae.decode(
1375
+ latents / self.vae.config.scaling_factor,
1376
+ return_dict=False,
1377
+ generator=generator,
1378
+ )[0]
1379
+ image, has_nsfw_concept = self.run_safety_checker(
1380
+ image, device, prompt_embeds.dtype
1381
+ )
1382
+ else:
1383
+ image = latents
1384
+ has_nsfw_concept = None
1385
+
1386
+ if has_nsfw_concept is None:
1387
+ do_denormalize = [True] * image.shape[0]
1388
+ else:
1389
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1390
+
1391
+ image = self.image_processor.postprocess(
1392
+ image, output_type=output_type, do_denormalize=do_denormalize
1393
+ )
1394
+
1395
+ # Offload all models
1396
+ self.maybe_free_model_hooks()
1397
+
1398
+ if not return_dict:
1399
+ return (image, has_nsfw_concept)
1400
+
1401
+ return StableDiffusionPipelineOutput(
1402
+ images=image, nsfw_content_detected=has_nsfw_concept
1403
+ )
1404
+
1405
+ @torch.no_grad()
1406
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
1407
+ def interpolate_single(
1408
+ self,
1409
+ it: float = 0.5,
1410
+ prompt_start: Optional[str] = None,
1411
+ prompt_end: Optional[str] = None,
1412
+ latent_start: Optional[torch.FloatTensor] = None,
1413
+ latent_end: Optional[torch.FloatTensor] = None,
1414
+ image_start: Optional[PipelineImageInput] = None,
1415
+ image_end: Optional[PipelineImageInput] = None,
1416
+ guide_prompt: Optional[str] = None,
1417
+ warmup_ratio: float = 0.5,
1418
+ is_fused: bool = True,
1419
+ atype: str = "outer",
1420
+ init: str = "linear",
1421
+ height: Optional[int] = None,
1422
+ width: Optional[int] = None,
1423
+ num_inference_steps: int = 50,
1424
+ timesteps: List[int] = None,
1425
+ sigmas: List[float] = None,
1426
+ guidance_scale: float = 7.5,
1427
+ negative_prompt: Optional[Union[str, List[str]]] = None,
1428
+ num_images_per_prompt: Optional[int] = 1,
1429
+ eta: float = 0.0,
1430
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
1431
+ latents: Optional[torch.FloatTensor] = None,
1432
+ prompt_embeds: Optional[torch.FloatTensor] = None,
1433
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
1434
+ ip_adapter_image: Optional[PipelineImageInput] = None,
1435
+ ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
1436
+ output_type: Optional[str] = "pil",
1437
+ return_dict: bool = True,
1438
+ cross_attention_kwargs: Optional[Dict[str, Any]] = None,
1439
+ guidance_rescale: float = 0.0,
1440
+ clip_skip: Optional[int] = None,
1441
+ callback_on_step_end: Optional[
1442
+ Union[
1443
+ Callable[[int, int, Dict], None],
1444
+ PipelineCallback,
1445
+ MultiPipelineCallbacks,
1446
+ ]
1447
+ ] = None,
1448
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
1449
+ **kwargs,
1450
+ ):
1451
+ r"""
1452
+ Function invoked when calling the pipeline for generation.
1453
+
1454
+ Args:
1455
+ it (`float`, *optional*, defaults to 0.5):
1456
+ The interpolation coefficient in `[0, 1]`; `0` corresponds to the first endpoint and `1` to the second.
1457
+ prompt_start (`str`, *optional*):
1458
+ The prompt describing the first endpoint of the interpolation.
1459
+ prompt_end (`str`, *optional*):
1460
+ The prompt describing the second endpoint. If `guide_prompt` is given it steers the interpolated sample; otherwise the prompt embeddings themselves are interpolated.
1461
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1462
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
1463
+ Anything below 512 pixels won't work well for
1464
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
1465
+ and checkpoints that are not specifically fine-tuned on low resolutions.
1466
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
1467
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
1468
+ Anything below 512 pixels won't work well for
1469
+ [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
1470
+ and checkpoints that are not specifically fine-tuned on low resolutions.
1471
+ num_inference_steps (`int`, *optional*, defaults to 50):
1472
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
1473
+ expense of slower inference.
1474
+ timesteps (`List[int]`, *optional*):
1475
+ Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
1476
+ in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
1477
+ passed will be used. Must be in descending order.
1478
+ denoising_end (`float`, *optional*):
1479
+ When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
1480
+ completed before it is intentionally prematurely terminated. As a result, the returned sample will
1481
+ still retain a substantial amount of noise as determined by the discrete timesteps selected by the
1482
+ scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
1483
+ "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
1484
+ Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
1485
+ guidance_scale (`float`, *optional*, defaults to 5.0):
1486
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
1487
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
1488
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1489
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
1490
+ usually at the expense of lower image quality.
1491
+ negative_prompt (`str` or `List[str]`, *optional*):
1492
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
1493
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
1494
+ less than `1`).
1495
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
1496
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
1497
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
1498
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
1499
+ The number of images to generate per prompt.
1500
+ eta (`float`, *optional*, defaults to 0.0):
1501
+ Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
1502
+ [`schedulers.DDIMScheduler`], will be ignored for others.
1503
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
1504
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
1505
+ to make generation deterministic.
1506
+ latents (`torch.FloatTensor`, *optional*):
1507
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
1508
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
1509
+ tensor will be generated by sampling using the supplied random `generator`.
1510
+ prompt_embeds (`torch.FloatTensor`, *optional*):
1511
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
1512
+ provided, text embeddings will be generated from `prompt` input argument.
1513
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
1514
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1515
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
1516
+ argument.
1517
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1518
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
1519
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
1520
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
1521
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
1522
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
1523
+ input argument.
1524
+ ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
1525
+ ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
1526
+ Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
1527
+ Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
1528
+ if `do_classifier_free_guidance` is set to `True`.
1529
+ If not provided, embeddings are computed from the `ip_adapter_image` input argument.
1530
+ output_type (`str`, *optional*, defaults to `"pil"`):
1531
+ The output format of the generated image. Choose between
1532
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
1533
+ return_dict (`bool`, *optional*, defaults to `True`):
1534
+ Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead
1535
+ of a plain tuple.
1536
+ cross_attention_kwargs (`dict`, *optional*):
1537
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
1538
+ `self.processor` in
1539
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
1540
+ guidance_rescale (`float`, *optional*, defaults to 0.0):
1541
+ Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
1542
+ Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
1543
+ [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
1544
+ Guidance rescale factor should fix overexposure when using zero terminal SNR.
1545
+ original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1546
+ If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
1547
+ `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
1548
+ explained in section 2.2 of
1549
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1550
+ crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1551
+ `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
1552
+ `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
1553
+ `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
1554
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1555
+ target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1556
+ For most cases, `target_size` should be set to the desired height and width of the generated image. If
1557
+ not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
1558
+ section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
1559
+ negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1560
+ To negatively condition the generation process based on a specific image resolution. Part of SDXL's
1561
+ micro-conditioning as explained in section 2.2 of
1562
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1563
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1564
+ negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
1565
+ To negatively condition the generation process based on specific crop coordinates. Part of SDXL's
1566
+ micro-conditioning as explained in section 2.2 of
1567
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1568
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1569
+ negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
1570
+ To negatively condition the generation process based on a target image resolution. It should be the same
1571
+ as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
1572
+ [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
1573
+ information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
1574
+ callback_on_step_end (`Callable`, *optional*):
1575
+ A function that is called at the end of each denoising step during inference. The function is called
1576
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
1577
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
1578
+ `callback_on_step_end_tensor_inputs`.
1579
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
1580
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
1581
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
1582
+ `._callback_tensor_inputs` attribute of your pipeline class.
1583
+
1584
+ Examples:
1585
+
1586
+ Returns:
1587
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
1588
+ [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a
1589
+ `tuple`. When returning a tuple, the first element is a list with the generated images.
1590
+ """
1591
+
1592
+ callback = kwargs.pop("callback", None)
1593
+ callback_steps = kwargs.pop("callback_steps", None)
1594
+
1595
+ if callback is not None:
1596
+ deprecate(
1597
+ "callback",
1598
+ "1.0.0",
1599
+ "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
1600
+ )
1601
+ if callback_steps is not None:
1602
+ deprecate(
1603
+ "callback_steps",
1604
+ "1.0.0",
1605
+ "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
1606
+ )
1607
+
1608
+ if image_start is not None and image_end is None:
1609
+ # throw error
1610
+ raise ValueError(
1611
+ "Please provide both `image_start` and `image_end` to interpolate, or only `image_end` to control the scale."
1612
+ )
1613
+
1614
+ if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
1615
+ callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
1616
+
1617
+ # 0. Default height and width to unet
1618
+ height = height or self.unet.config.sample_size * self.vae_scale_factor
1619
+ width = width or self.unet.config.sample_size * self.vae_scale_factor
1620
+
1621
+ # 1. Check inputs. Raise error if not correct
1622
+ self.check_inputs(
1623
+ prompt_start,
1624
+ height,
1625
+ width,
1626
+ callback_steps,
1627
+ negative_prompt,
1628
+ prompt_embeds,
1629
+ negative_prompt_embeds,
1630
+ ip_adapter_image,
1631
+ ip_adapter_image_embeds,
1632
+ callback_on_step_end_tensor_inputs,
1633
+ )
1634
+
1635
+ self._guidance_scale = guidance_scale
1636
+ self._guidance_rescale = guidance_rescale
1637
+ self._clip_skip = clip_skip
1638
+ self._cross_attention_kwargs = cross_attention_kwargs
1639
+ self._interrupt = False
1640
+
1641
+ # 2. Define call parameters
1642
+ batch_size = 3 # [Source A, Interpolated, Source B]
1643
+
1644
+ device = self._execution_device
1645
+
1646
+ # 3. Encode input prompt
1647
+ lora_scale = (
1648
+ self.cross_attention_kwargs.get("scale", None)
1649
+ if self.cross_attention_kwargs is not None
1650
+ else None
1651
+ )
1652
+
1653
+ (prompt_embeds_start, negative_prompt_embeds_start) = self.encode_prompt(
1654
+ prompt=prompt_start,
1655
+ device=device,
1656
+ num_images_per_prompt=num_images_per_prompt,
1657
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1658
+ negative_prompt=negative_prompt,
1659
+ prompt_embeds=prompt_embeds,
1660
+ negative_prompt_embeds=negative_prompt_embeds,
1661
+ lora_scale=lora_scale,
1662
+ clip_skip=self.clip_skip,
1663
+ )
1664
+
1665
+ (prompt_embeds_end, negative_prompt_embeds_end) = self.encode_prompt(
1666
+ prompt=prompt_end,
1667
+ device=device,
1668
+ num_images_per_prompt=num_images_per_prompt,
1669
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1670
+ negative_prompt=negative_prompt,
1671
+ prompt_embeds=prompt_embeds,
1672
+ negative_prompt_embeds=negative_prompt_embeds,
1673
+ lora_scale=lora_scale,
1674
+ clip_skip=self.clip_skip,
1675
+ )
1676
+
1677
+ if guide_prompt is not None:
1678
+ (prompt_embeds_target, negative_prompt_embeds_target) = self.encode_prompt(
1679
+ prompt=guide_prompt,
1680
+ device=device,
1681
+ num_images_per_prompt=num_images_per_prompt,
1682
+ do_classifier_free_guidance=self.do_classifier_free_guidance,
1683
+ negative_prompt=negative_prompt,
1684
+ prompt_embeds=prompt_embeds,
1685
+ negative_prompt_embeds=negative_prompt_embeds,
1686
+ lora_scale=lora_scale,
1687
+ clip_skip=self.clip_skip,
1688
+ )
1689
+ else:
1690
+ if init == "linear":
1691
+ prompt_embeds_target = torch.lerp(
1692
+ prompt_embeds_start, prompt_embeds_end, it
1693
+ )
1694
+ negative_prompt_embeds_target = torch.lerp(
1695
+ negative_prompt_embeds_start, negative_prompt_embeds_end, it
1696
+ )
1697
+ else:
1698
+ prompt_embeds_target = slerp(prompt_embeds_start, prompt_embeds_end, it)
1699
+ negative_prompt_embeds_target = slerp(
1700
+ negative_prompt_embeds_start, negative_prompt_embeds_end, it
1701
+ )
1702
+
1703
+ prompt_embeds = torch.cat(
1704
+ [prompt_embeds_start, prompt_embeds_target, prompt_embeds_end], dim=0
1705
+ ).to(device=device)
1706
+ negative_prompt_embeds = torch.cat(
1707
+ [
1708
+ negative_prompt_embeds_start,
1709
+ negative_prompt_embeds_target,
1710
+ negative_prompt_embeds_end,
1711
+ ],
1712
+ dim=0,
1713
+ ).to(device=device)
1714
+
1715
+ # 4. Prepare timesteps
1716
+ timesteps, num_inference_steps = retrieve_timesteps(
1717
+ self.scheduler, num_inference_steps, device, timesteps
1718
+ )
1719
+
1720
+ # 5. Prepare latent variables
1721
+ num_channels_latents = self.unet.config.in_channels
1722
+ latent_start = self.prepare_latents(
1723
+ 1,
1724
+ num_channels_latents,
1725
+ height,
1726
+ width,
1727
+ prompt_embeds.dtype,
1728
+ device,
1729
+ generator,
1730
+ latent_start,
1731
+ )
1732
+
1733
+ latent_end = self.prepare_latents(
1734
+ 1,
1735
+ num_channels_latents,
1736
+ height,
1737
+ width,
1738
+ prompt_embeds.dtype,
1739
+ device,
1740
+ generator,
1741
+ latent_end,
1742
+ )
1743
+
1744
+ latent_target = slerp(latent_start, latent_end, it)
1745
+ latents = torch.cat([latent_start, latent_target, latent_end], dim=0).to(
1746
+ device=device
1747
+ )
1748
+
1749
+ # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
1750
+ extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
1751
+
1752
+ if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
1753
+ image_embeds = self.prepare_ip_adapter_image_embeds(
1754
+ ip_adapter_image,
1755
+ ip_adapter_image_embeds,
1756
+ device,
1757
+ 3,
1758
+ self.do_classifier_free_guidance,
1759
+ )
1760
+
1761
+ # 6.1 Prepare image embeddings for interpolation
1762
+ if image_end is not None:
1763
+ image_embeds_end = self.prepare_ip_adapter_image_embeds(
1764
+ image_end,
1765
+ None,
1766
+ device,
1767
+ 3,
1768
+ self.do_classifier_free_guidance,
1769
+ )
1770
+ negative_image_embeds_end, image_embeds_end = image_embeds_end[0].chunk(2)
1771
+
1772
+ if image_start is None:
1773
+ image_embeds_start = negative_image_embeds_end
1774
+ negative_image_embeds_start = negative_image_embeds_end
1775
+ else:
1776
+ image_embeds_start = self.prepare_ip_adapter_image_embeds(
1777
+ image_start,
1778
+ None,
1779
+ device,
1780
+ 3,
1781
+ self.do_classifier_free_guidance,
1782
+ )
1783
+ negative_image_embeds_start, image_embeds_start = image_embeds_start[
1784
+ 0
1785
+ ].chunk(2)
1786
+
1787
+ if init == "linear":
1788
+ image_embeds_target = torch.lerp(
1789
+ image_embeds_start, image_embeds_end, it
1790
+ )
1791
+ negative_image_embeds_target = torch.lerp(
1792
+ negative_image_embeds_start, negative_image_embeds_end, it
1793
+ )
1794
+ else:
1795
+ image_embeds_target = slerp(image_embeds_start, image_embeds_end, it)
1796
+ negative_image_embeds_target = slerp(
1797
+ negative_image_embeds_start, negative_image_embeds_end, it
1798
+ )
1799
+
1800
+ image_embeds = torch.cat(
1801
+ [image_embeds_start, image_embeds_target, image_embeds_end], dim=0
1802
+ ).to(device=device)
1803
+
1804
+ negative_image_embeds = torch.cat(
1805
+ [
1806
+ negative_image_embeds_start,
1807
+ negative_image_embeds_target,
1808
+ negative_image_embeds_end,
1809
+ ],
1810
+ dim=0,
1811
+ ).to(device=device)
1812
+
1813
+ image_embeds = [image_embeds]
1814
+ negative_image_embeds = [negative_image_embeds]
1815
+
1816
+ # 6.2 Optionally get Guidance Scale Embedding
1817
+ timestep_cond = None
1818
+ if self.unet.config.time_cond_proj_dim is not None:
1819
+ guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(
1820
+ batch_size * num_images_per_prompt
1821
+ )
1822
+ timestep_cond = self.get_guidance_scale_embedding(
1823
+ guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
1824
+ ).to(device=device, dtype=latents.dtype)
1825
+
1826
+ # 7. Denoising loop
1827
+ num_warmup_steps = max(
1828
+ len(timesteps) - num_inference_steps * self.scheduler.order, 0
1829
+ )
1830
+
1831
+ warmup_steps = int(num_inference_steps * warmup_ratio)
1832
+ self._num_timesteps = len(timesteps)
1833
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
1834
+ for i, t in enumerate(timesteps):
1835
+ if self.interrupt:
1836
+ continue
1837
+
1838
+ # the latents are not expanded here; the conditional and unconditional passes are run separately below
1839
+ latent_model_input = latents
1840
+ latent_model_input = self.scheduler.scale_model_input(
1841
+ latent_model_input, t
1842
+ )
1843
+
1844
+ # Set the interpolated attention processor
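+ # AID attention interpolation is only active for the first `warmup_ratio`
+ # fraction of the denoising steps; the remaining steps fall back to the
+ # standard attention processors so each sample refines independently.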
1845
+ if i < warmup_steps:
1846
+ self.activate_aid(it)
1847
+ else:
1848
+ self.deactivate_aid()
1849
+
1850
+ # predict the noise residual for conditional noise
1851
+ if (
1852
+ (image_start is not None or image_end is not None)
1853
+ or ip_adapter_image is not None
1854
+ or ip_adapter_image_embeds is not None
1855
+ ):
1856
+ added_cond_kwargs = {"image_embeds": image_embeds}
1857
+ else:
1858
+ added_cond_kwargs = None
1859
+ noise_pred_text = self.unet(
1860
+ latent_model_input,
1861
+ t,
1862
+ encoder_hidden_states=prompt_embeds,
1863
+ timestep_cond=timestep_cond,
1864
+ cross_attention_kwargs=self.cross_attention_kwargs,
1865
+ added_cond_kwargs=added_cond_kwargs,
1866
+ return_dict=False,
1867
+ )[0]
1868
+
1869
+ # Set back to the usual attention processor (if using image_embeds, don't do this)
1870
+ self.deactivate_aid()
1871
+
1872
+ # predict the noise residual for negative noise
1873
+ if (
1874
+ (image_start is not None or image_end is not None)
1875
+ or ip_adapter_image is not None
1876
+ or ip_adapter_image_embeds is not None
1877
+ ):
1878
+ added_cond_kwargs = {"image_embeds": negative_image_embeds}
1879
+ else:
1880
+ added_cond_kwargs = None
1881
+ noise_pred_uncond = self.unet(
1882
+ latent_model_input,
1883
+ t,
1884
+ encoder_hidden_states=negative_prompt_embeds,
1885
+ timestep_cond=timestep_cond,
1886
+ cross_attention_kwargs=self.cross_attention_kwargs,
1887
+ added_cond_kwargs=added_cond_kwargs,
1888
+ return_dict=False,
1889
+ )[0]
1890
+
1891
+ # perform guidance
1892
+ noise_pred = noise_pred_uncond + self.guidance_scale * (
1893
+ noise_pred_text - noise_pred_uncond
1894
+ )
1895
+
1896
+ if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
1897
+ # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
1898
+ noise_pred = rescale_noise_cfg(
1899
+ noise_pred,
1900
+ noise_pred_text,
1901
+ guidance_rescale=self.guidance_rescale,
1902
+ )
1903
+
1904
+ # compute the previous noisy sample x_t -> x_t-1
1905
+ latents = self.scheduler.step(
1906
+ noise_pred, t, latents, **extra_step_kwargs, return_dict=False
1907
+ )[0]
1908
+
1909
+ if callback_on_step_end is not None:
1910
+ callback_kwargs = {}
1911
+ for k in callback_on_step_end_tensor_inputs:
1912
+ callback_kwargs[k] = locals()[k]
1913
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1914
+
1915
+ latents = callback_outputs.pop("latents", latents)
1916
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1917
+ negative_prompt_embeds = callback_outputs.pop(
1918
+ "negative_prompt_embeds", negative_prompt_embeds
1919
+ )
1920
+
1921
+ # call the callback, if provided
1922
+ if i == len(timesteps) - 1 or (
1923
+ (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
1924
+ ):
1925
+ progress_bar.update()
1926
+ if callback is not None and i % callback_steps == 0:
1927
+ step_idx = i // getattr(self.scheduler, "order", 1)
1928
+ callback(step_idx, t, latents)
1929
+
1930
+ if XLA_AVAILABLE:
1931
+ xm.mark_step()
1932
+
1933
+ if not output_type == "latent":
1934
+ image = self.vae.decode(
1935
+ latents / self.vae.config.scaling_factor,
1936
+ return_dict=False,
1937
+ generator=generator,
1938
+ )[0]
1939
+ image, has_nsfw_concept = self.run_safety_checker(
1940
+ image, device, prompt_embeds.dtype
1941
+ )
1942
+ else:
1943
+ image = latents
1944
+ has_nsfw_concept = None
1945
+
1946
+ if has_nsfw_concept is None:
1947
+ do_denormalize = [True] * image.shape[0]
1948
+ else:
1949
+ do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
1950
+
1951
+ image = self.image_processor.postprocess(
1952
+ image, output_type=output_type, do_denormalize=do_denormalize
1953
+ )
1954
+
1955
+ # Offload all models
1956
+ self.maybe_free_model_hooks()
1957
+
1958
+ if not return_dict:
1959
+ return (image, has_nsfw_concept)
1960
+
1961
+ return StableDiffusionPipelineOutput(
1962
+ images=image, nsfw_content_detected=has_nsfw_concept
1963
+ )
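Note: `interpolate_single` above relies on a `slerp` helper that is not part of this diff (it is imported elsewhere in the repository). As a reference, here is a minimal sketch of a spherical linear interpolation consistent with how it is called here; this is an illustrative assumption, not necessarily the repository's exact implementation.

```python
import torch

def slerp(v0: torch.Tensor, v1: torch.Tensor, t: float, eps: float = 1e-7) -> torch.Tensor:
    """Spherical linear interpolation between two tensors, treated as flat vectors."""
    v0_flat, v1_flat = v0.reshape(-1), v1.reshape(-1)
    # Cosine of the angle between the two (normalized) vectors
    cos_theta = torch.clamp(
        torch.dot(v0_flat / v0_flat.norm(), v1_flat / v1_flat.norm()), -1.0, 1.0
    )
    theta = torch.acos(cos_theta)
    if theta.abs() < eps:
        # Nearly parallel vectors: fall back to plain linear interpolation
        return torch.lerp(v0, v1, t)
    sin_theta = torch.sin(theta)
    return (torch.sin((1.0 - t) * theta) / sin_theta) * v0 + (
        torch.sin(t * theta) / sin_theta
    ) * v1
```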
pipeline_interpolated_sdxl.py ADDED
The diff for this file is too large to render. See raw diff
 
prior.py ADDED
@@ -0,0 +1,506 @@
1
+ import numpy as np
2
+ import torch
3
+ from bayes_opt import BayesianOptimization, SequentialDomainReductionTransformer
4
+ from lpips import LPIPS
5
+ from scipy.optimize import curve_fit
6
+ from scipy.stats import beta as beta_distribution
7
+
8
+ from transformers import CLIPImageProcessor, CLIPModel
9
+ from utils import compute_lpips, compute_smoothness_and_consistency
10
+
11
+
12
+ class BetaPriorPipeline:
13
+ def __init__(self, pipe, model_ID="openai/clip-vit-base-patch32"):
14
+ self.model = CLIPModel.from_pretrained(model_ID)
15
+ self.preprocess = CLIPImageProcessor.from_pretrained(model_ID)
16
+ self.pipe = pipe
17
+
18
+ def _compute_clip(self, embedding_a, embedding_b):
19
+ similarity_score = torch.nn.functional.cosine_similarity(
20
+ embedding_a, embedding_b
21
+ )
22
+ return 1 - similarity_score[0]
23
+
24
+ def _get_feature(self, image):
25
+ with torch.no_grad():
26
+ if isinstance(image, np.ndarray):
27
+ image = self.preprocess(
28
+ image, return_tensors="pt", do_rescale=False
29
+ ).pixel_values
30
+ else:
31
+ image = self.preprocess(image, return_tensors="pt").pixel_values
32
+ embedding = self.model.get_image_features(image)
33
+ return embedding
34
+
35
+ def _update_alpha_beta(self, xs, ds):
36
+ uniform_point = []
37
+ ds_sum = sum(ds)
38
+ for i in range(len(ds)):
39
+ uniform_point.append(ds[i] / ds_sum)
40
+ uniform_point = [0] + uniform_point
41
+ uniform_points = np.cumsum(uniform_point)
42
+
43
+ xs = np.asarray(xs)
44
+ uniform_points = np.asarray(uniform_points)
45
+
46
+ def beta_cdf(x, alpha, beta_param):
47
+ return beta_distribution.cdf(x, alpha, beta_param)
48
+
49
+ initial_guess = [1.0, 1.0]
50
+ bounds = ([1e-6, 1e-6], [np.inf, np.inf])
51
+ params, covariance = curve_fit(
52
+ beta_cdf, xs, uniform_points, p0=initial_guess, bounds=bounds
53
+ )
54
+
55
+ fitted_alpha, fitted_beta = params
56
+ return fitted_alpha, fitted_beta
57
+
58
+ def _add_next_point(
59
+ self,
60
+ ds,
61
+ xs,
62
+ images,
63
+ features,
64
+ alpha,
65
+ beta_param,
66
+ prompt_start,
67
+ prompt_end,
68
+ negative_prompt,
69
+ latent_start,
70
+ latent_end,
71
+ num_inference_steps,
72
+ uniform=False,
73
+ **kwargs,
74
+ ):
75
+ idx = np.argmax(ds)
76
+ A = xs[idx]
77
+ B = xs[idx + 1]
78
+ F_A = beta_distribution.cdf(A, alpha, beta_param)
79
+ F_B = beta_distribution.cdf(B, alpha, beta_param)
80
+
81
+ # Compute the target CDF for t
82
+ F_t = (F_A + F_B) / 2
83
+
84
+ # Compute the value of t using the inverse CDF (percent point function)
85
+ t = beta_distribution.ppf(F_t, alpha, beta_param)
86
+
87
+ if uniform:
88
+ idx = np.argmax(np.array(xs) - np.array([0] + xs[:-1])) - 1
89
+ t = (xs[idx] + xs[idx + 1]) / 2
90
+
91
+ if t < 0 or t > 1:
92
+ return xs, False
93
+
94
+ ims = self.pipe.interpolate_single(
95
+ t,
96
+ prompt_start=prompt_start,
97
+ prompt_end=prompt_end,
98
+ negative_prompt=negative_prompt,
99
+ latent_start=latent_start,
100
+ latent_end=latent_end,
101
+ early="fused_outer",
102
+ num_inference_steps=num_inference_steps,
103
+ **kwargs,
104
+ )
105
+
106
+ added_image = ims.images[1]
107
+ added_feature = self._get_feature(added_image)
108
+ d1 = self._compute_clip(features[idx], added_feature)
109
+ d2 = self._compute_clip(features[idx + 1], added_feature)
110
+
111
+ images.insert(idx + 1, ims.images[1])
112
+ features.insert(idx + 1, added_feature)
113
+ xs.insert(idx + 1, t)
114
+ del ds[idx]
115
+ ds.insert(idx, d1)
116
+ ds.insert(idx + 1, d2)
117
+ return xs, True
118
+
119
+ def explore_with_beta(
120
+ self,
121
+ progress,
122
+ prompt_start,
123
+ prompt_end,
124
+ negative_prompt,
125
+ latent_start,
126
+ latent_end,
127
+ num_inference_steps=28,
128
+ exploration_size=16,
129
+ init_alpha=3,
130
+ init_beta=3,
131
+ uniform=False,
132
+ **kwargs,
133
+ ):
134
+ xs = [0.0, 0.5, 1.0]
135
+ images = self.pipe.interpolate_single(
136
+ 0.5,
137
+ prompt_start=prompt_start,
138
+ prompt_end=prompt_end,
139
+ negative_prompt=negative_prompt,
140
+ latent_start=latent_start,
141
+ latent_end=latent_end,
142
+ early="fused_outer",
143
+ num_inference_steps=num_inference_steps,
144
+ **kwargs,
145
+ )
146
+ images = images.images
147
+ images = [images[0], images[1], images[2]]
148
+ features = [self._get_feature(image) for image in images]
149
+ ds = [
150
+ self._compute_clip(features[0], features[1]),
151
+ self._compute_clip(features[1], features[2]),
152
+ ]
153
+ alpha = init_alpha
154
+ beta_param = init_beta
155
+ print(
156
+ "Alpha:",
157
+ alpha,
158
+ "| Beta:",
159
+ beta_param,
160
+ "| Current Coefs:",
161
+ xs,
162
+ "| Current Distances:",
163
+ ds,
164
+ )
165
+ progress(3, desc="Exploration")
166
+ for i in progress.tqdm(range(3, exploration_size)):
167
+ xs, flag = self._add_next_point(
168
+ ds,
169
+ xs,
170
+ images,
171
+ features,
172
+ alpha,
173
+ beta_param,
174
+ prompt_start,
175
+ prompt_end,
176
+ negative_prompt,
177
+ latent_start,
178
+ latent_end,
179
+ num_inference_steps,
180
+ uniform=uniform,
181
+ **kwargs,
182
+ )
183
+ if not flag:
184
+ break
185
+ alpha, beta_param = self._update_alpha_beta(xs, ds)
186
+ if uniform:
187
+ alpha = 1
188
+ beta_param = 1
189
+ print(f"--------Exploration: {len(xs)} / {exploration_size}--------")
190
+ print(
191
+ "Alpha:",
192
+ alpha,
193
+ "| Beta:",
194
+ beta_param,
195
+ "| Current Coefs:",
196
+ xs,
197
+ "| Current Distances:",
198
+ ds,
199
+ )
200
+
201
+ return images, features, ds, xs, alpha, beta_param
202
+
203
+ def extract_uniform_points(self, ds, interpolation_size):
204
+ expected_dis = sum(ds) / (interpolation_size - 1)
205
+ current_sum = 0
206
+ output_idxs = [0]
207
+ for idx, d in enumerate(ds):
208
+ current_sum += d
209
+ if current_sum >= expected_dis:
210
+ output_idxs.append(idx)
211
+ current_sum = 0
212
+ return output_idxs
213
+
214
+ def extract_uniform_points_plus(self, features, interpolation_size):
215
+ weights = -1 * np.ones((len(features), len(features)))
216
+ for i in range(len(features)):
217
+ for j in range(i + 1, len(features)):
218
+ weights[i][j] = self._compute_clip(features[i], features[j])
219
+ m = len(features)
220
+ n = interpolation_size
221
+ _, best_path = self.find_minimal_spread_and_path(n, m, weights)
222
+ print("Optimal smooth path:", best_path)
223
+ return best_path
224
+
225
+ def find_minimal_spread_and_path(self, n, m, weights):
226
+ # Collect all unique edge weights, excluding non-existent edges (-1)
227
+ W = sorted(
228
+ {
229
+ weights[i][j]
230
+ for i in range(m - 1)
231
+ for j in range(i + 1, m)
232
+ if weights[i][j] != -1
233
+ }
234
+ )
235
+ min_weight = W[0]
236
+ max_weight = W[-1]
237
+
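+ # Binary search over the allowed weight spread D = w_max - w_min:
+ # shrink [low, high] until the smallest D is found for which an
+ # n-node path exists whose edge weights all fit inside a window of width D.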
238
+ low = 0.0
239
+ high = max_weight - min_weight
240
+ epsilon = 1e-6 # Desired precision
241
+
242
+ best_D = None
243
+ best_path = None
244
+
245
+ while high - low > epsilon:
246
+ D = (low + high) / 2
247
+ result = self.is_path_possible(D, n, m, weights, W)
248
+ if result is not None:
249
+ # A valid path is found
250
+ high = D
251
+ best_D = D
252
+ best_path = result
253
+ else:
254
+ low = D
255
+
256
+ return best_D, best_path
257
+
258
+ def is_path_possible(self, D, n, m, weights, W):
259
+ for w_min in W:
260
+ w_max = w_min + D
261
+ if w_max > W[-1]:
262
+ break
263
+
264
+ # Dynamic Programming to check for a valid path
265
+ dp = [[None] * (n + 1) for _ in range(m)]
266
+ dp[0][1] = (
267
+ float("-inf"),
268
+ float("inf"),
269
+ [0],
270
+ ) # Start from x1 with path length 1
271
+
272
+ for l in range(1, n):
273
+ for i in range(m):
274
+ if dp[i][l] is not None:
275
+ max_w, min_w, path = dp[i][l]
276
+ for j in range(i + 1, m):
277
+ w = weights[i][j]
278
+ if w != -1 and w_min <= w <= w_max:
279
+ # Update max and min weights along the path
280
+ new_max_w = max(max_w, w)
281
+ new_min_w = min(min_w, w)
282
+ new_diff = new_max_w - new_min_w
283
+ if new_diff <= D:
284
+ dp_j_l_plus_1 = dp[j][l + 1]
285
+ if dp_j_l_plus_1 is None or new_diff < (
286
+ dp_j_l_plus_1[0] - dp_j_l_plus_1[1]
287
+ ):
288
+ dp[j][l + 1] = (
289
+ new_max_w,
290
+ new_min_w,
291
+ path + [j],
292
+ )
293
+
294
+ if dp[m - 1][n] is not None:
295
+ # Reconstruct the path
296
+ _, _, path = dp[m - 1][n]
297
+ return path # Return the path if found
298
+
299
+ return None # Return None if no valid path is found
300
+
301
+ def generate_interpolation(
302
+ self,
303
+ progress,
304
+ prompt_start,
305
+ prompt_end,
306
+ negative_prompt,
307
+ latent_start,
308
+ latent_end,
309
+ num_inference_steps=28,
310
+ exploration_size=16,
311
+ init_alpha=3,
312
+ init_beta=3,
313
+ interpolation_size=7,
314
+ uniform=False,
315
+ **kwargs,
316
+ ):
317
+ images, features, ds, xs, alpha, beta_param = self.explore_with_beta(
318
+ progress,
319
+ prompt_start,
320
+ prompt_end,
321
+ negative_prompt,
322
+ latent_start,
323
+ latent_end,
324
+ num_inference_steps,
325
+ exploration_size,
326
+ init_alpha,
327
+ init_beta,
328
+ uniform=uniform,
329
+ **kwargs,
330
+ )
331
+ # output_idx = self.extract_uniform_points(ds, interpolation_size)
332
+ output_idx = self.extract_uniform_points_plus(features, interpolation_size)
333
+ output_images = []
334
+ for idx in output_idx:
335
+ output_images.append(images[idx])
336
+
337
+ # for call_back
338
+ self.images = images
339
+ self.ds = ds
340
+ self.xs = xs
341
+ self.alpha = alpha
342
+ self.beta_param = beta_param
343
+
344
+ return output_images
345
+
346
+
347
+ def bayesian_prior_selection(
348
+ interpolation_pipe,
349
+ latent1: torch.FloatTensor,
350
+ latent2: torch.FloatTensor,
351
+ prompt1: str,
352
+ prompt2: str,
353
+ lpips_model: LPIPS,
354
+ guide_prompt: str | None = None,
355
+ negative_prompt: str = "",
356
+ size: int = 3,
357
+ num_inference_steps: int = 25,
358
+ warmup_ratio: float = 1,
359
+ early: str = "vfused",
360
+ late: str = "self",
361
+ target_score: float = 0.9,
362
+ n_iter: int = 15,
363
+ p_min: float | None = None,
364
+ p_max: float | None = None,
365
+ ) -> tuple:
366
+ """
367
+ Select the alpha and beta parameters for the interpolation using Bayesian optimization.
368
+
369
+ Args:
370
+ interpolation_pipe (any): The interpolation pipeline.
371
+ latent1 (torch.FloatTensor): The first source latent vector.
372
+ latent2 (torch.FloatTensor): The second source latent vector.
373
+ prompt1 (str): The first source prompt.
374
+ prompt2 (str): The second source prompt.
375
+ lpips_model (any): The LPIPS model used to compute perceptual distances.
376
+ guide_prompt (str | None, optional): The guide prompt for the interpolation, if any. Defaults to None.
377
+ negative_prompt (str, optional): The negative prompt for the interpolation, default to empty string. Defaults to "".
378
+ size (int, optional): The size of the interpolation sequence. Defaults to 3.
379
+ num_inference_steps (int, optional): The number of inference steps. Defaults to 25.
380
+ warmup_ratio (float, optional): The warmup ratio. Defaults to 1.
381
+ early (str, optional): The early fusion method. Defaults to "vfused".
382
+ late (str, optional): The late fusion method. Defaults to "self".
383
+ target_score (float, optional): The target score. Defaults to 0.9.
384
+ n_iter (int, optional): The maximum number of iterations. Defaults to 15.
385
+ p_min (float, optional): The minimum value of alpha and beta. Defaults to None.
386
+ p_max (float, optional): The maximum value of alpha and beta. Defaults to None.
387
+ Returns:
388
+ tuple: A tuple containing the selected alpha and beta parameters.
389
+ """
390
+
391
+ def get_smoothness(alpha, beta):
392
+ """
393
+ Black-box objective function of Bayesian Optimization.
394
+ Get the smoothness of the interpolated sequence with the given alpha and beta.
395
+ """
396
+ if alpha < beta and large_alpha_prior:
397
+ return 0
398
+ if alpha > beta and not large_alpha_prior:
399
+ return 0
400
+ if alpha == beta:
401
+ return init_smoothness
402
+ interpolation_sequence = interpolation_pipe.interpolate_save_gpu(
403
+ latent1,
404
+ latent2,
405
+ prompt1,
406
+ prompt2,
407
+ guide_prompt=guide_prompt,
408
+ negative_prompt=negative_prompt,
409
+ size=size,
410
+ num_inference_steps=num_inference_steps,
411
+ warmup_ratio=warmup_ratio,
412
+ early=early,
413
+ late=late,
414
+ alpha=alpha,
415
+ beta=beta,
416
+ )
417
+ smoothness, _, _ = compute_smoothness_and_consistency(
418
+ interpolation_sequence, lpips_model
419
+ )
420
+ return smoothness
421
+
422
+ # Add prior into selection of alpha and beta
423
+ # We firstly compute the interpolated images with t=0.5
424
+ images = interpolation_pipe.interpolate_single(
425
+ 0.5,
426
+ latent1,
427
+ latent2,
428
+ prompt1,
429
+ prompt2,
430
+ guide_prompt=guide_prompt,
431
+ negative_prompt=negative_prompt,
432
+ num_inference_steps=num_inference_steps,
433
+ warmup_ratio=warmup_ratio,
434
+ early=early,
435
+ late=late,
436
+ )
437
+ # We compute the perceptual distances of the interpolated images (t=0.5) to the source image
438
+ distances = compute_lpips(images, lpips_model)
439
+ # We compute the init_smoothness as the smoothness when alpha=beta to avoid recomputation
440
+ init_smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
441
+ # If perceptual distance to the first source image is smaller, alpha should be larger than beta
442
+ large_alpha_prior = distances[0] < distances[1]
443
+
444
+ # Bayesian optimization configuration
445
+ num_warmup_steps = warmup_ratio * num_inference_steps
446
+ if p_min is None:
447
+ p_min = 1
448
+ if p_max is None:
449
+ p_max = num_warmup_steps
450
+ pbounds = {"alpha": (p_min, p_max), "beta": (p_min, p_max)}
451
+ bounds_transformer = SequentialDomainReductionTransformer(minimum_window=0.1)
452
+ optimizer = BayesianOptimization(
453
+ f=get_smoothness,
454
+ pbounds=pbounds,
455
+ random_state=1,
456
+ bounds_transformer=bounds_transformer,
457
+ allow_duplicate_points=True,
458
+ )
459
+ alpha_init = [p_min, (p_min + p_max) / 2, p_max]
460
+ beta_init = [p_min, (p_min + p_max) / 2, p_max]
461
+
462
+ # Initial probing
463
+ for alpha in alpha_init:
464
+ for beta in beta_init:
465
+ optimizer.probe(params={"alpha": alpha, "beta": beta}, lazy=False)
466
+ latest_result = optimizer.res[-1] # Get the last result
467
+ latest_score = latest_result["target"]
468
+ if latest_score >= target_score:
469
+ return alpha, beta
470
+
471
+ # Start optimization
472
+ for _ in range(n_iter): # Max iterations
473
+ optimizer.maximize(init_points=0, n_iter=1) # One iteration at a time
474
+ max_score = optimizer.max["target"] # Get the highest score so far
475
+ if max_score >= target_score:
476
+ print(f"Stopping early, target of {target_score} reached.")
477
+ break # Exit the loop if target is reached or exceeded
478
+
479
+ results = optimizer.max
480
+ alpha = results["params"]["alpha"]
481
+ beta = results["params"]["beta"]
482
+ return alpha, beta
483
+
484
+
485
+ def generate_beta_tensor(
486
+ size: int, alpha: float = 3, beta: float = 3
487
+ ) -> torch.FloatTensor:
488
+ """
489
+ Assume the size is n.
490
+ Generates a PyTorch tensor of values [x0, x1, ..., xn-1] for the Beta distribution
491
+ where each xi satisfies F(xi) = i/(n-1) for the CDF F of the Beta distribution.
492
+
493
+ Args:
494
+ size (int): The number of values to generate.
495
+ alpha (float): The alpha parameter of the Beta distribution.
496
+ beta (float): The beta parameter of the Beta distribution.
497
+
498
+ Returns:
499
+ torch.Tensor: A tensor of the inverse CDF values of the Beta distribution.
500
+ """
501
+ # Generating the inverse CDF values
502
+ prob_values = [i / (size - 1) for i in range(size)]
503
+ inverse_cdf_values = beta_distribution.ppf(prob_values, alpha, beta)
504
+
505
+ # Converting to a PyTorch tensor
506
+ return torch.tensor(inverse_cdf_values, dtype=torch.float32)
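As a quick illustration of how `generate_beta_tensor` spaces interpolation coefficients, the standalone snippet below (values approximate) inverts the Beta CDF at evenly spaced probabilities; with `alpha == beta` the coefficients are symmetric around 0.5:

```python
import numpy as np
from scipy.stats import beta as beta_distribution

size, alpha, beta_param = 7, 3.0, 3.0
prob_values = np.linspace(0.0, 1.0, size)              # i / (size - 1)
ts = beta_distribution.ppf(prob_values, alpha, beta_param)
print(np.round(ts, 2))
# Approximately [0.   0.3  0.41 0.5  0.59 0.7  1.  ] for alpha = beta = 3,
# i.e. coefficients cluster around the midpoint, with larger jumps near the endpoints.
```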
requirements.txt ADDED
@@ -0,0 +1,66 @@
1
+ absl-py==2.1.0
2
+ accelerate==0.27.2
3
+ addict==2.4.0
4
+ antlr4-python3-runtime==4.9.3
5
+ bayesian-optimization==1.4.3
6
+ clean-fid==0.1.35
7
+ clip @ git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33
8
+ colorama==0.4.6
9
+ contourpy==1.2.0
10
+ cycler==0.12.1
11
+ diffusers==0.27.1
12
+ einops==0.7.0
13
+ facexlib==0.3.0
14
+ filterpy==1.4.5
15
+ fonttools==4.49.0
16
+ fsspec==2024.2.0
17
+ ftfy==6.1.3
18
+ future==1.0.0
19
+ grpcio==1.62.0
20
+ huggingface-hub==0.20.3
21
+ imageio==2.34.0
22
+ imgaug==0.4.0
23
+ joblib==1.3.2
24
+ kiwisolver==1.4.5
25
+ lazy_loader==0.3
26
+ llvmlite==0.42.0
27
+ lmdb==1.4.1
28
+ lpips==0.1.4
29
+ Markdown==3.5.2
30
+ matplotlib==3.8.3
31
+ mkl-service==2.4.0
32
+ numba==0.59.0
33
+ numpy==1.24.4
34
+ omegaconf==2.3.0
35
+ openai-clip==1.0.1
36
+ opencv-python==4.9.0.80
37
+ pandas==2.2.0
38
+ protobuf==4.25.3
39
+ pyiqa==0.1.10
40
+ pyparsing==3.1.1
41
+ python-dateutil==2.8.2
42
+ pytorch-fid==0.3.0
43
+ pytz==2024.1
44
+ regex==2023.12.25
45
+ safetensors==0.4.2
46
+ scikit-image==0.22.0
47
+ scikit-learn==1.4.1.post1
48
+ scipy==1.9.1
49
+ shapely==2.0.3
50
+ tensorboard==2.16.2
51
+ tensorboard-data-server==0.7.2
52
+ threadpoolctl==3.3.0
53
+ tifffile==2024.2.12
54
+ timm==0.9.16
55
+ tokenizers==0.15.2
56
+ tomli==2.0.1
57
+ torch==2.1.0
58
+ torchmetrics
59
+ torchaudio==2.1.0
60
+ torchvision==0.16.0
61
+ tqdm==4.66.2
62
+ transformers==4.38.2
63
+ triton==2.1.0
64
+ tzdata==2024.1
65
+ Werkzeug==3.0.1
66
+ yapf==0.40.2
style.css ADDED
@@ -0,0 +1,95 @@
1
+ h1 {
2
+ text-align: center;
3
+ justify-content: center;
4
+ }
5
+
6
+ [role="tabpanel"] {
7
+ border: 0
8
+ }
9
+
10
+ #duplicate-button {
11
+ margin: auto;
12
+ color: #fff;
13
+ background: #1565c0;
14
+ border-radius: 100vh;
15
+ }
16
+
17
+ .gradio-container {
18
+ max-width: 690px ! important;
19
+ }
20
+
21
+ .equal-height {
22
+ display: flex;
23
+ flex: 1;
24
+ }
25
+
26
+ .grid-container {
27
+ display: grid;
28
+ grid-template-columns: 1fr 1fr; /* two columns of equal width */
29
+ gap: 20px;
30
+ height: 100%; /* make sure the container takes the full height */
31
+ }
32
+
33
+ .grid-item {
34
+ display: flex;
35
+ flex-direction: column;
36
+ height: 100%;
37
+ }
38
+
39
+ .flex-grow {
40
+ flex-grow: 1; /* let this element take up the remaining height */
41
+ display: flex;
42
+ flex-direction: column;
43
+ }
44
+
45
+ #share-btn-container {
46
+ padding-left: 0.5rem !important;
47
+ padding-right: 0.5rem !important;
48
+ background-color: #000000;
49
+ justify-content: center;
50
+ align-items: center;
51
+ border-radius: 9999px !important;
52
+ max-width: 13rem;
53
+ margin-left: auto;
54
+ margin-top: 0.35em;
55
+ }
56
+
57
+ div#share-btn-container>div {
58
+ flex-direction: row;
59
+ background: black;
60
+ align-items: center
61
+ }
62
+
63
+ #share-btn-container:hover {
64
+ background-color: #060606
65
+ }
66
+
67
+ #share-btn {
68
+ all: initial;
69
+ color: #ffffff;
70
+ font-weight: 600;
71
+ cursor: pointer;
72
+ font-family: 'IBM Plex Sans', sans-serif;
73
+ margin-left: 0.5rem !important;
74
+ padding-top: 0.5rem !important;
75
+ padding-bottom: 0.5rem !important;
76
+ right: 0;
77
+ font-size: 15px;
78
+ }
79
+
80
+ #share-btn * {
81
+ all: unset
82
+ }
83
+
84
+ #share-btn-container div:nth-child(-n+2) {
85
+ width: auto !important;
86
+ min-height: 0px !important;
87
+ }
88
+
89
+ #share-btn-container .wrap {
90
+ display: none !important
91
+ }
92
+
93
+ #share-btn-container.hidden {
94
+ display: none !important
95
+ }
utils.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ from typing import Optional
3
+
4
+ import matplotlib.pyplot as plt
5
+ import numpy as np
6
+ import torch
7
+ from lpips import LPIPS
8
+ from PIL import Image
9
+ from torchvision.transforms import Normalize
10
+
11
+
12
+ def show_images_horizontally(
13
+ list_of_files: np.array, output_file: Optional[str] = None, interact: bool = False
14
+ ) -> None:
15
+ """
16
+ Visualize the list of images horizontally and save the figure as PNG.
17
+
18
+ Args:
19
+ list_of_files: The list of images as numpy array with shape (N, H, W, C).
20
+ output_file: The output file path to save the figure as PNG.
21
+ interact: Whether to show the figure interactively (e.g., in a Jupyter notebook) instead of saving it to `output_file`.
22
+ """
23
+ number_of_files = len(list_of_files)
24
+
25
+ heights = [a[0].shape[0] for a in list_of_files]
26
+ widths = [a.shape[1] for a in list_of_files[0]]
27
+
28
+ fig_width = 8.0 # inches
29
+ fig_height = fig_width * sum(heights) / sum(widths)
30
+
31
+ # Create a figure with subplots
32
+ _, axs = plt.subplots(
33
+ 1, number_of_files, figsize=(fig_width * number_of_files, fig_height)
34
+ )
35
+ plt.tight_layout()
36
+ for i in range(number_of_files):
37
+ _image = list_of_files[i]
38
+ axs[i].imshow(_image)
39
+ axs[i].axis("off")
40
+
41
+ # Save the figure as PNG
42
+ if interact:
43
+ plt.show()
44
+ else:
45
+ plt.savefig(output_file, bbox_inches="tight", pad_inches=0.25)
46
+
47
+
48
+ def image_grids(images, rows=None, cols=None):
49
+ if not images:
50
+ raise ValueError("The image list is empty.")
51
+
52
+ n_images = len(images)
53
+ if cols is None:
54
+ cols = int(n_images**0.5)
55
+ if rows is None:
56
+ rows = (n_images + cols - 1) // cols
57
+
58
+ width, height = images[0].size
59
+ grid_width = cols * width
60
+ grid_height = rows * height
61
+
62
+ grid_image = Image.new("RGB", (grid_width, grid_height))
63
+
64
+ for i, image in enumerate(images):
65
+ row, col = divmod(i, cols)
66
+ grid_image.paste(image, (col * width, row * height))
67
+
68
+ return grid_image
69
+
70
+
71
+ def save_image(image: np.array, file_name: str) -> None:
72
+ """
73
+ Save the image as JPG.
74
+
75
+ Args:
76
+ image: The input image as numpy array with shape (H, W, C).
77
+ file_name: The file name to save the image.
78
+ """
79
+ image = Image.fromarray(image)
80
+ image.save(file_name)
81
+
82
+
83
+ def load_and_process_images(load_dir: str) -> np.array:
84
+ """
85
+ Load and process the images into numpy array from the directory.
86
+
87
+ Args:
88
+ load_dir: The directory to load the images.
89
+
90
+ Returns:
91
+ images: The images as numpy array with shape (N, H, W, C).
92
+ """
93
+ images = []
94
+ print(load_dir)
95
+ filenames = sorted(
96
+ os.listdir(load_dir), key=lambda x: int(x.split(".")[0])
97
+ ) # Ensure the files are sorted numerically
98
+ for filename in filenames:
99
+ if filename.endswith(".jpg"):
100
+ img = Image.open(os.path.join(load_dir, filename))
101
+ img_array = (
102
+ np.asarray(img) / 255.0
103
+ ) # Convert to numpy array and scale pixel values to [0, 1]
104
+ images.append(img_array)
105
+ return images
106
+
107
+
108
+ def compute_lpips(images: np.array, lpips_model: LPIPS) -> np.array:
109
+ """
110
+ Compute the LPIPS of the input images.
111
+
112
+ Args:
113
+ images: The input images as numpy array with shape (N, H, W, C).
114
+ lpips_model: The LPIPS model used to compute perceptual distances.
115
+
116
+ Returns:
117
+ distances: The LPIPS of the input images.
118
+ """
119
+ # Get device of lpips_model
120
+ device = next(lpips_model.parameters()).device
121
+ device = str(device)
122
+
123
+ # Change the input images into tensor
124
+ images = torch.tensor(images).to(device).float()
125
+ images = torch.permute(images, (0, 3, 1, 2))
126
+ normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
127
+ images = normalize(images)
128
+
129
+ # Compute the LPIPS between each adjacent input images
130
+ distances = []
131
+ for i in range(images.shape[0]):
132
+ if i == images.shape[0] - 1:
133
+ break
134
+ img1 = images[i].unsqueeze(0)
135
+ img2 = images[i + 1].unsqueeze(0)
136
+ loss = lpips_model(img1, img2)
137
+ distances.append(loss.item())
138
+ distances = np.array(distances)
139
+ return distances
140
+
141
+
142
+ def compute_gini(distances: np.array) -> float:
143
+ """
144
+ Compute the Gini index of the input distances.
145
+
146
+ Args:
147
+ distances: The input distances as numpy array.
148
+
149
+ Returns:
150
+ gini: The Gini index of the input distances.
151
+ """
152
+ if len(distances) < 2:
153
+ return 0.0 # Gini index is 0 for less than two elements
154
+
155
+ # Sort the list of distances
156
+ sorted_distances = sorted(distances)
157
+ n = len(sorted_distances)
158
+ mean_distance = sum(sorted_distances) / n
159
+
160
+ # Compute the sum of absolute differences
161
+ sum_of_differences = 0
162
+ for di in sorted_distances:
163
+ for dj in sorted_distances:
164
+ sum_of_differences += abs(di - dj)
165
+
166
+ # Normalize the sum of differences by the mean and the number of elements
167
+ gini = sum_of_differences / (2 * n * n * mean_distance)
168
+ return gini
169
+
170
+
171
+ def compute_smoothness_and_consistency(images: np.array, lpips_model: LPIPS) -> tuple:
172
+ """
173
+ Compute the smoothness and consistency of the input images.
174
+
175
+ Args:
176
+ images: The input images as numpy array with shape (N, H, W, C).
177
+ lpips_model: The LPIPS model used to compute perceptual distances.
178
+
179
+ Returns:
180
+ smoothness: One minus gini index of LPIPS of consecutive images.
181
+ consistency: The mean LPIPS of consecutive images.
182
+ max_inception_distance: The maximum LPIPS of consecutive images.
183
+ """
184
+ distances = compute_lpips(images, lpips_model)
185
+ smoothness = 1 - compute_gini(distances)
186
+ consistency = np.mean(distances)
187
+ max_inception_distance = np.max(distances)
188
+ return smoothness, consistency, max_inception_distance
189
+
190
+
191
+ def separate_source_and_interpolated_images(images: np.array) -> tuple:
192
+ """
193
+ Separate the input images into source and interpolated images.
194
+ The input source is the start and end of the images, while the interpolated images are the rest.
195
+
196
+ Args:
197
+ images: The input images as numpy array with shape (N, H, W, C).
198
+
199
+ Returns:
200
+ source: The source images as numpy array with shape (2, H, W, C).
201
+ interpolation: The interpolated images as numpy array with shape (N-2, H, W, C).
202
+ """
203
+ # Check if the array has at least two elements
204
+ if len(images) < 2:
205
+ raise ValueError("The input array should have at least two elements.")
206
+
207
+ # Separate the array into two parts
208
+ # First part takes the first and last element
209
+ source = np.array([images[0], images[-1]])
210
+ # Second part takes the rest of the elements
211
+ interpolation = images[1:-1]
212
+ return source, interpolation
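For reference, a tiny standalone illustration of the smoothness and consistency metrics defined above (the distance values are made up):

```python
import numpy as np

# Hypothetical LPIPS distances between consecutive frames of a sequence
distances = np.array([0.10, 0.12, 0.11, 0.40])

n = len(distances)
mean_d = distances.mean()
# Gini index as in compute_gini: normalized mean absolute difference
gini = np.abs(distances[:, None] - distances[None, :]).sum() / (2 * n * n * mean_d)

smoothness = 1 - gini            # closer to 1 -> perceptual change is evenly spread
consistency = mean_d             # average perceptual change between neighbours
max_distance = distances.max()   # the largest single jump
print(round(smoothness, 3), round(consistency, 3), round(max_distance, 3))
```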