Update app.py
app.py
CHANGED
@@ -4,10 +4,19 @@ import torch
 from PIL import Image
 import utils
 import streamlit as st
+import ptp_utils
+import seq_aligner
+import torch.nn.functional as nnf
+from typing import Optional, Union, Tuple, List, Callable, Dict
+import abc
+
+LOW_RESOURCE = False
+MAX_NUM_WORDS = 77
 
 is_colab = utils.is_google_colab()
 
-if True:
+
+if False:
     model_id_or_path = "CompVis/stable-diffusion-v1-4"
     scheduler = DDIMScheduler.from_config(model_id_or_path,
                                           use_auth_token=st.secrets["USER_TOKEN"],
@@ -15,21 +24,233 @@ if True:
     pipe = CycleDiffusionPipeline.from_pretrained(model_id_or_path,
                                                   use_auth_token=st.secrets["USER_TOKEN"],
                                                   scheduler=scheduler)
+    tokenizer = pipe.tokenizer
 
 if torch.cuda.is_available():
     pipe = pipe.to("cuda")
 
-
+device_print = "GPU 🔥" if torch.cuda.is_available() else "CPU 🥶"
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+class LocalBlend:
+
+    def __call__(self, x_t, attention_store):
+        k = 1
+        maps = attention_store["down_cross"][2:4] + attention_store["up_cross"][:3]
+        maps = [item.reshape(self.alpha_layers.shape[0], -1, 1, 16, 16, MAX_NUM_WORDS) for item in maps]
+        maps = torch.cat(maps, dim=1)
+        maps = (maps * self.alpha_layers).sum(-1).mean(1)
+        mask = nnf.max_pool2d(maps, (k * 2 + 1, k * 2 + 1), (1, 1), padding=(k, k))
+        mask = nnf.interpolate(mask, size=(x_t.shape[2:]))
+        mask = mask / mask.max(2, keepdims=True)[0].max(3, keepdims=True)[0]
+        mask = mask.gt(self.threshold)
+        mask = (mask[:1] + mask[1:]).float()
+        x_t = x_t[:1] + mask * (x_t - x_t[:1])
+        return x_t
+
+    def __init__(self, prompts: List[str], words: [List[List[str]]], threshold=.3):
+        alpha_layers = torch.zeros(len(prompts), 1, 1, 1, 1, MAX_NUM_WORDS)
+        for i, (prompt, words_) in enumerate(zip(prompts, words)):
+            if type(words_) is str:
+                words_ = [words_]
+            for word in words_:
+                ind = ptp_utils.get_word_inds(prompt, word, tokenizer)
+                alpha_layers[i, :, :, :, :, ind] = 1
+        self.alpha_layers = alpha_layers.to(device)
+        self.threshold = threshold
+
+
+class AttentionControl(abc.ABC):
+
+    def step_callback(self, x_t):
+        return x_t
+
+    def between_steps(self):
+        return
+
+    @property
+    def num_uncond_att_layers(self):
+        return self.num_att_layers if LOW_RESOURCE else 0
+
+    @abc.abstractmethod
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        raise NotImplementedError
+
+    def __call__(self, attn, is_cross: bool, place_in_unet: str):
+        if self.cur_att_layer >= self.num_uncond_att_layers:
+            if LOW_RESOURCE:
+                attn = self.forward(attn, is_cross, place_in_unet)
+            else:
+                h = attn.shape[0]
+                attn[h // 2:] = self.forward(attn[h // 2:], is_cross, place_in_unet)
+        self.cur_att_layer += 1
+        if self.cur_att_layer == self.num_att_layers + self.num_uncond_att_layers:
+            self.cur_att_layer = 0
+            self.cur_step += 1
+            self.between_steps()
+        return attn
+
+    def reset(self):
+        self.cur_step = 0
+        self.cur_att_layer = 0
+
+    def __init__(self):
+        self.cur_step = 0
+        self.num_att_layers = -1
+        self.cur_att_layer = 0
+
+
+class EmptyControl(AttentionControl):
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        return attn
+
+
+class AttentionStore(AttentionControl):
+
+    @staticmethod
+    def get_empty_store():
+        return {"down_cross": [], "mid_cross": [], "up_cross": [],
+                "down_self": [], "mid_self": [], "up_self": []}
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        key = f"{place_in_unet}_{'cross' if is_cross else 'self'}"
+        if attn.shape[1] <= 32 ** 2:  # avoid memory overhead
+            self.step_store[key].append(attn)
+        return attn
+
+    def between_steps(self):
+        if len(self.attention_store) == 0:
+            self.attention_store = self.step_store
+        else:
+            for key in self.attention_store:
+                for i in range(len(self.attention_store[key])):
+                    self.attention_store[key][i] += self.step_store[key][i]
+        self.step_store = self.get_empty_store()
+
+    def get_average_attention(self):
+        average_attention = {key: [item / self.cur_step for item in self.attention_store[key]] for key in self.attention_store}
+        return average_attention
+
+    def reset(self):
+        super(AttentionStore, self).reset()
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+
+    def __init__(self):
+        super(AttentionStore, self).__init__()
+        self.step_store = self.get_empty_store()
+        self.attention_store = {}
+
+
+class AttentionControlEdit(AttentionStore, abc.ABC):
+
+    def step_callback(self, x_t):
+        if self.local_blend is not None:
+            x_t = self.local_blend(x_t, self.attention_store)
+        return x_t
+
+    def replace_self_attention(self, attn_base, att_replace):
+        if att_replace.shape[2] <= 16 ** 2:
+            return attn_base.unsqueeze(0).expand(att_replace.shape[0], *attn_base.shape)
+        else:
+            return att_replace
+
+    @abc.abstractmethod
+    def replace_cross_attention(self, attn_base, att_replace):
+        raise NotImplementedError
+
+    def forward(self, attn, is_cross: bool, place_in_unet: str):
+        super(AttentionControlEdit, self).forward(attn, is_cross, place_in_unet)
+        if is_cross or (self.num_self_replace[0] <= self.cur_step < self.num_self_replace[1]):
+            h = attn.shape[0] // self.batch_size
+            attn = attn.reshape(self.batch_size, h, *attn.shape[1:])
+            attn_base, attn_repalce = attn[0], attn[1:]
+            if is_cross:
+                alpha_words = self.cross_replace_alpha[self.cur_step]
+                attn_replace_new = self.replace_cross_attention(attn_base, attn_repalce) * alpha_words + (1 - alpha_words) * attn_repalce
+                attn[1:] = attn_replace_new
+            else:
+                attn[1:] = self.replace_self_attention(attn_base, attn_repalce)
+            attn = attn.reshape(self.batch_size * h, *attn.shape[2:])
+        return attn
+
+    def __init__(self, prompts, num_steps: int,
+                 cross_replace_steps: Union[float, Tuple[float, float], Dict[str, Tuple[float, float]]],
+                 self_replace_steps: Union[float, Tuple[float, float]],
+                 local_blend: Optional[LocalBlend]):
+        super(AttentionControlEdit, self).__init__()
+        self.batch_size = len(prompts)
+        self.cross_replace_alpha = ptp_utils.get_time_words_attention_alpha(prompts, num_steps, cross_replace_steps, tokenizer).to(device)
+        if type(self_replace_steps) is float:
+            self_replace_steps = 0, self_replace_steps
+        self.num_self_replace = int(num_steps * self_replace_steps[0]), int(num_steps * self_replace_steps[1])
+        self.local_blend = local_blend
+
+
+class AttentionReplace(AttentionControlEdit):
+
+    def replace_cross_attention(self, attn_base, att_replace):
+        return torch.einsum('hpw,bwn->bhpn', attn_base, self.mapper)
+
+    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
+                 local_blend: Optional[LocalBlend] = None):
+        super(AttentionReplace, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
+        self.mapper = seq_aligner.get_replacement_mapper(prompts, tokenizer).to(device)
+
+
+class AttentionRefine(AttentionControlEdit):
+
+    def replace_cross_attention(self, attn_base, att_replace):
+        attn_base_replace = attn_base[:, :, self.mapper].permute(2, 0, 1, 3)
+        attn_replace = attn_base_replace * self.alphas + att_replace * (1 - self.alphas)
+        return attn_replace
+
+    def __init__(self, prompts, num_steps: int, cross_replace_steps: float, self_replace_steps: float,
+                 local_blend: Optional[LocalBlend] = None):
+        super(AttentionRefine, self).__init__(prompts, num_steps, cross_replace_steps, self_replace_steps, local_blend)
+        self.mapper, alphas = seq_aligner.get_refinement_mapper(prompts, tokenizer)
+        self.mapper, alphas = self.mapper.to(device), alphas.to(device)
+        self.alphas = alphas.reshape(alphas.shape[0], 1, 1, alphas.shape[1])
+
+
+def get_equalizer(text: str, word_select: Union[int, Tuple[int, ...]], values: Union[List[float], Tuple[float, ...]]):
+    if type(word_select) is int or type(word_select) is str:
+        word_select = (word_select,)
+    equalizer = torch.ones(len(values), 77)
+    values = torch.tensor(values, dtype=torch.float32)
+    for word in word_select:
+        inds = ptp_utils.get_word_inds(text, word, tokenizer)
+        equalizer[:, inds] = values
+    return equalizer
 
 
 def inference(source_prompt, target_prompt, source_guidance_scale=1, guidance_scale=5, num_inference_steps=100,
-              width=512, height=512, seed=0, img=None, strength=0.7
+              width=512, height=512, seed=0, img=None, strength=0.7,
+              cross_attention_control=None, cross_replace_steps=0.8, self_replace_steps=0.4):
 
     torch.manual_seed(seed)
 
     ratio = min(height / img.height, width / img.width)
     img = img.resize((int(img.width * ratio), int(img.height * ratio)))
 
+    # create the CAC controller.
+    if cross_attention_control == "replace":
+        controller = AttentionReplace([source_prompt, target_prompt],
+                                      num_inference_steps,
+                                      cross_replace_steps=cross_replace_steps,
+                                      self_replace_steps=self_replace_steps,
+                                      )
+        ptp_utils.register_attention_control(pipe, controller)
+    elif cross_attention_control == "refine":
+        controller = AttentionRefine([source_prompt, target_prompt],
+                                     num_inference_steps,
+                                     cross_replace_steps=cross_replace_steps,
+                                     self_replace_steps=self_replace_steps,
+                                     )
+        ptp_utils.register_attention_control(pipe, controller)
+
     results = pipe(prompt=target_prompt,
                    source_prompt=source_prompt,
                    init_image=img,
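The heart of the new controller code is `AttentionReplace.replace_cross_attention`: the einsum `'hpw,bwn->bhpn'` pushes the source prompt's cross-attention maps (heads x pixels x tokens) through a token-to-token `mapper`, so each target-prompt word inherits the attention of the source word it is aligned with. A minimal sanity check of that contraction; the shapes below are illustrative toy values, not the ones used in the app:

import torch

heads, pixels, tokens = 8, 16 * 16, 77          # toy sizes; the app caps tokens at MAX_NUM_WORDS = 77
attn_base = torch.rand(heads, pixels, tokens)   # cross-attention maps of the source prompt
mapper = torch.eye(tokens).unsqueeze(0)         # batch of one identity mapper: no word is remapped

out = torch.einsum('hpw,bwn->bhpn', attn_base, mapper)
print(out.shape)                                # torch.Size([1, 8, 256, 77])
assert torch.allclose(out[0], attn_base)        # an identity mapper leaves the maps unchanged

In the app, the mapper instead comes from `seq_aligner.get_replacement_mapper(prompts, tokenizer)`, which, as used above, aligns the source and target prompts token by token so that only the swapped words are redirected.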
@@ -64,7 +285,7 @@ with gr.Blocks(css=css) as demo:
                <a href="https://huggingface.co/docs/diffusers/main/en/api/pipelines/cycle_diffusion">🧨 Pipeline doc</a> | <a href="https://arxiv.org/abs/2210.05559">📄 Paper link</a>
                </p>
                <p>You can skip the queue in the colab: <a href="https://colab.research.google.com/gist/ChenWu98/0aa4fe7be80f6b45d3d055df9f14353a/copy-of-fine-tuned-diffusion-gradio.ipynb"><img data-canonical-src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg"></a></p>
-               Running on <b>{
+               Running on <b>{device_print}</b>{(" in a <b>Google Colab</b>." if is_colab else "")}
                </p>
              </div>
        """
@@ -82,42 +303,58 @@ with gr.Blocks(css=css) as demo:
             # ).style(grid=[1], height="auto")
 
         with gr.Column(scale=45):
-            with gr.Tab("
+            with gr.Tab("Edit options"):
                 with gr.Group():
                     with gr.Row():
                         source_prompt = gr.Textbox(label="Source prompt", placeholder="Source prompt describes the input image")
+                        source_guidance_scale = gr.Slider(label="Source guidance scale", value=1, minimum=1, maximum=10)
                     with gr.Row():
                         target_prompt = gr.Textbox(label="Target prompt", placeholder="Target prompt describes the output image")
-
-                    with gr.Row():
-                        source_guidance_scale = gr.Slider(label="Source guidance scale", value=1, minimum=1, maximum=10)
                         guidance_scale = gr.Slider(label="Target guidance scale", value=5, minimum=1, maximum=10)
-
                     with gr.Row():
-                        num_inference_steps = gr.Slider(label="Number of inference steps", value=100, minimum=25, maximum=500, step=1)
                         strength = gr.Slider(label="Strength", value=0.7, minimum=0.5, maximum=1, step=0.01)
 
                     with gr.Row():
+                        generate = gr.Button(value="Edit")
+            with gr.Tab("Basic options"):
+                with gr.Group():
+                    with gr.Row():
+                        num_inference_steps = gr.Slider(label="Number of inference steps", value=100, minimum=25, maximum=500, step=1)
                         width = gr.Slider(label="Width", value=512, minimum=64, maximum=1024, step=8)
                         height = gr.Slider(label="Height", value=512, minimum=64, maximum=1024, step=8)
 
                    with gr.Row():
                        seed = gr.Slider(0, 2147483647, label='Seed', value=0, step=1)
+
+            with gr.Tab("CAC options"):
+                with gr.Group():
                     with gr.Row():
-
+                        cross_attention_control = gr.Radio(label="CAC type", choices=["None", "Replace", "Refine"], value="None")
+                    with gr.Row():
+                        # If not "None", the following two parameters will be used.
+                        cross_replace_steps = gr.Slider(label="Cross replace steps", value=0.8, minimum=0.0, maximum=1, step=0.01)
+                        self_replace_steps = gr.Slider(label="Self replace steps", value=0.4, minimum=0.0, maximum=1, step=0.01)
 
     inputs = [source_prompt, target_prompt, source_guidance_scale, guidance_scale, num_inference_steps,
-              width, height, seed, img, strength
+              width, height, seed, img, strength,
+              cross_attention_control, cross_replace_steps, self_replace_steps]
     generate.click(inference, inputs=inputs, outputs=image_out)
 
     ex = gr.Examples(
        [
-            ["An astronaut riding a horse", "An astronaut riding an elephant", 1, 2, 100,
-            ["
-            ["
-            ["A
+            ["An astronaut riding a horse", "An astronaut riding an elephant", 1, 2, 100, "images/astronaut_horse.png", 0.8, "None", 0, 0],
+            ["An astronaut riding a horse", "An astronaut riding a elephant", 1, 2, 100, "images/astronaut_horse.png", 0.9, "Replace", 0.15, 0.10],
+            ["A black colored car.", "A blue colored car.", 1, 2, 100, "images/black_car.png", 0.85, "None", 0, 0],
+            ["A black colored car.", "A blue colored car.", 1, 5, 100, "images/black_car.png", 0.95, "Replace", 0.8, 0.4],
+            ["A black colored car.", "A red colored car.", 1, 5, 100, "images/black_car.png", 1, "Replace", 0.8, 0.4],
+            ["An aerial view of autumn scene.", "An aerial view of winter scene.", 1, 5, 100, "images/mausoleum.png", 0.9, "None", 0.0, 0.0],
+            ["An aerial view of autumn scene.", "An aerial view of winter scene.", 1, 5, 100, "images/mausoleum.png", 1, "Replace", 0.8, 0.4],
+            ["A green apple and a black backpack on the floor.", "A red apple and a black backpack on the floor.", 1, 7, 100, "images/apple_bag.png", 0.9, "None", 0.0, 0.0],
+            ["A green apple and a black backpack on the floor.", "A red apple and a black backpack on the floor.", 1, 7, 100, "images/apple_bag.png", 0.9, "Replace", 0.8, 0.4],
        ],
-        [source_prompt, target_prompt, source_guidance_scale, guidance_scale, num_inference_steps,
+        [source_prompt, target_prompt, source_guidance_scale, guidance_scale, num_inference_steps,
+         img, strength,
+         cross_attention_control, cross_replace_steps, self_replace_steps],
        image_out, inference, cache_examples=False)
 
     gr.Markdown('''
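For reference, the cross-attention-controlled edit added in this commit can also be exercised outside the Gradio UI once the definitions above (`pipe`, `AttentionReplace`, `ptp_utils`) are in scope. The sketch below is illustrative, not the app's exact code path: the prompts and image path come from the examples list above, while the keyword arguments passed to `pipe(...)` beyond `prompt`, `source_prompt`, and `init_image` (strength, the two guidance scales, and `eta`) are assumptions based on the CycleDiffusionPipeline API of diffusers releases from that period, since the diff truncates that call.

from PIL import Image

source_prompt = "An astronaut riding a horse"
target_prompt = "An astronaut riding an elephant"
img = Image.open("images/astronaut_horse.png").convert("RGB").resize((512, 512))  # example image from the Space

# Inject the source prompt's cross-attention into the target branch for roughly
# the first 80% of the denoising steps (and self-attention for the first 40%).
controller = AttentionReplace([source_prompt, target_prompt],
                              num_steps=100,
                              cross_replace_steps=0.8,
                              self_replace_steps=0.4)
ptp_utils.register_attention_control(pipe, controller)

result = pipe(prompt=target_prompt,
              source_prompt=source_prompt,
              init_image=img,
              num_inference_steps=100,
              strength=0.8,               # the arguments from here on are assumed, not shown in the diff
              guidance_scale=2,
              source_guidance_scale=1,
              eta=0.1).images[0]
result.save("edited.png")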