File size: 8,716 Bytes
a9288bc
 
 
6120a85
a9288bc
6120a85
a9288bc
d1d12e1
e4b358e
cfc4462
e4b358e
cfc4462
b91f87b
a9288bc
 
3188b31
a9288bc
 
86e890a
a9288bc
1bae6ea
a9288bc
 
6120a85
a9288bc
 
6120a85
a9288bc
6120a85
a9288bc
 
6120a85
a9288bc
 
1bae6ea
 
 
 
a9288bc
6120a85
 
a9288bc
 
 
 
 
 
 
 
 
1bae6ea
 
a9288bc
f40b820
 
 
 
 
 
 
 
75de42e
f40b820
 
 
 
 
 
1bae6ea
f40b820
1bae6ea
 
 
f40b820
a9288bc
1bae6ea
a9288bc
 
 
 
 
 
1bae6ea
 
a9288bc
4ca9ea8
 
 
 
 
 
1bae6ea
4ca9ea8
 
 
 
 
 
 
 
 
 
 
 
 
e73c979
4ca9ea8
1bae6ea
4ca9ea8
 
 
 
 
 
 
 
 
 
 
1bae6ea
4ca9ea8
 
1bae6ea
4ca9ea8
1bae6ea
 
 
4ca9ea8
 
 
1bae6ea
4ca9ea8
1bae6ea
 
 
 
 
 
 
4ca9ea8
 
1bae6ea
a9288bc
 
 
 
1bae6ea
 
75de42e
 
 
 
a9288bc
 
1bae6ea
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import spaces
import torch
from diffusers import StableDiffusion3InstructPix2PixPipeline, SD3Transformer2DModel
import gradio as gr
import PIL.Image
import numpy as np
from PIL import Image, ImageOps
import os
import transformers
from transformers.utils.hub import move_cache
# Run the transformers cache-migration helper through both of its import
# paths before any model is loaded.
# NOTE(review): both names presumably resolve to the same function, making
# the second call redundant -- confirm before removing either one.
transformers.utils.move_cache()
move_cache()

# Load the UltraEdit SD3 checkpoint in half precision, then move the whole
# pipeline to the GPU. `pipe` is the module-level pipeline used by generate().
pipe = StableDiffusion3InstructPix2PixPipeline.from_pretrained(
    "BleachNick/SD3_UltraEdit_w_mask",
    torch_dtype=torch.float16,
)
pipe = pipe.to("cuda")


@spaces.GPU(duration=120)
def generate(image_mask, prompt, num_inference_steps=50, image_guidance_scale=1.6, guidance_scale=7.5, seed=255):
    """Run one instruction-guided edit on the uploaded image.

    Args:
        image_mask: Value of the ``gr.ImageMask`` input: a dict whose
            ``"background"`` entry is the uploaded PIL image and whose
            ``"layers"`` entry is a list of drawn layers (the alpha channel
            of the first layer is used as the mask).
        prompt: Text instruction forwarded to the pipeline.
        num_inference_steps: Forwarded to the pipeline after coercion to
            ``int`` (UI sliders may deliver floats).
        image_guidance_scale: Forwarded to the pipeline unchanged.
        guidance_scale: Forwarded to the pipeline unchanged.
        seed: Integer, or a string holding an integer, used to seed torch.

    Returns:
        A tuple ``(edited_image, status_text)`` where ``status_text`` is
        ``"Editing Mode: Free-form"`` when no mask was drawn and
        ``"Editing Mode: Region-based"`` otherwise.

    Raises:
        gr.Error: If no image was uploaded or the seed is not an integer.
    """
    def is_blank_mask(mask_img):
        """Return True when every pixel of the grayscale mask is 0."""
        mask_array = np.array(mask_img.convert('L'))
        return np.all(mask_array == 0)

    # Validate user input up front so the UI shows an actionable message
    # instead of an opaque TypeError / AttributeError / ValueError.
    background = image_mask.get("background") if image_mask else None
    if background is None:
        raise gr.Error("Please upload an image before running the edit.")

    try:
        seed = int(seed)
    except (TypeError, ValueError) as err:
        raise gr.Error("Random Seed must be an integer.") from err
    generator = torch.manual_seed(seed)

    # Both the image and the mask are center-cropped/resized to the same
    # fixed resolution before being handed to the pipeline.
    desired_size = (512, 512)
    img = ImageOps.fit(background.convert("RGB"), desired_size, method=Image.LANCZOS, centering=(0.5, 0.5))

    # The drawn strokes live in the alpha channel of the first layer. A
    # missing layer list is treated exactly like an untouched (blank) mask.
    layers = image_mask.get("layers") or []
    mask_img = None
    if layers:
        mask_img = layers[0].getchannel('A').convert("RGB")
        mask_img = ImageOps.fit(mask_img, desired_size, method=Image.LANCZOS, centering=(0.5, 0.5))

    if mask_img is None or is_blank_mask(mask_img):
        # Nothing was drawn: substitute an all-white mask and report
        # free-form editing.
        mask_img = PIL.Image.new('RGB', img.size, color=(255, 255, 255))
        editing_mode = "Free-form"
    else:
        editing_mode = "Region-based"

    image = pipe(
        prompt,
        image=img,
        mask_img=mask_img,
        num_inference_steps=int(num_inference_steps),
        image_guidance_scale=image_guidance_scale,
        guidance_scale=guidance_scale,
        generator=generator
    ).images[0]

    return image, f"Editing Mode: {editing_mode}"


# Raw example rows. Each row is
#   [[input path, mask path, merged path], prompt, steps,
#    image guidance scale, text guidance scale, seed]
example_lists = [
    [
        [
            'UltraEdit/images/example_images/1-input.png',
            'UltraEdit/images/example_images/1-mask.png',
            'UltraEdit/images/example_images/1-merged.png',
        ],
        "Add a moon in the sky",
        20,
        1.5,
        12.5,
        255,
    ],
    [
        [
            'UltraEdit/images/example_images/1-input.png',
            'UltraEdit/images/example_images/1-input.png',
            'UltraEdit/images/example_images/1-input.png',
        ],
        "Add a moon in the sky",
        20,
        1.5,
        6.5,
        255,
    ],
    [
        [
            'UltraEdit/images/example_images/2-input.png',
            'UltraEdit/images/example_images/2-mask.png',
            'UltraEdit/images/example_images/2-merged.png',
        ],
        "add cherry blossoms",
        20,
        1.5,
        12.5,
        255,
    ],
    [
        [
            'UltraEdit/images/example_images/3-input.png',
            'UltraEdit/images/example_images/3-mask.png',
            'UltraEdit/images/example_images/3-merged.png',
        ],
        "Please dress her in a short purple wedding dress adorned with white floral embroidery.",
        20,
        1.5,
        7.5,
        255,
    ],
    [
        [
            'UltraEdit/images/example_images/4-input.png',
            'UltraEdit/images/example_images/4-mask.png',
            'UltraEdit/images/example_images/4-merged.png',
        ],
        "give her a chief's headdress.",
        20,
        1.5,
        7.5,
        24555,
    ],
]

# Reshape every row so its first element is the editor-style dict
# (background / layers / composite); the remaining five values are carried
# over unchanged and in the same order.
mask_ex_list = [
    [
        {
            'background': input_path,
            'layers': [mask_path, merged_path],
            'composite': merged_path,
        },
        *settings,
    ]
    for (input_path, mask_path, merged_path), *settings in example_lists
]

# Input components, created in the positional order of generate()'s parameters.
image_mask_input = gr.ImageMask(
    sources='upload',
    type="pil",
    label="Input Image: Mask with pen or leave unmasked",
    transforms=(),
    layers=False,
)
prompt_input = gr.Textbox(label="Prompt")
# The lower bound is 1 rather than 0 so the user cannot request a run with no
# inference steps at all; step=1 keeps the slider on whole numbers.
num_inference_steps_input = gr.Slider(
    minimum=1,
    maximum=100,
    value=50,
    step=1,
    label="Number of Inference Steps",
)
image_guidance_scale_input = gr.Slider(
    minimum=0.0,
    maximum=2.5,
    value=1.5,
    label="Image Guidance Scale",
)
guidance_scale_input = gr.Slider(
    minimum=0.0,
    maximum=17.5,
    value=12.5,
    label="Guidance Scale",
)
# The seed is collected as text; generate() converts it to an integer.
seed_input = gr.Textbox(value="255", label="Random Seed")

inputs = [
    image_mask_input,
    prompt_input,
    num_inference_steps_input,
    image_guidance_scale_input,
    guidance_scale_input,
    seed_input,
]
# Output components, matching generate()'s (image, status text) return tuple.
outputs = [gr.Image(label="Generated Image"), gr.Text(label="Editing Mode")]

# HTML header for the demo page: title, author list, affiliations, resource
# links and usage instructions. Passed to gr.Interface as `description`.
article_html = """
<div style="text-align: center; max-width: 1000px; margin: 20px auto; font-family: Arial, sans-serif;">
  <h2 style="font-weight: 900; font-size: 2.5rem; margin-bottom: 0.5rem;">
    🖼️ UltraEdit for Fine-Grained Image Editing
  </h2>
  <div style="margin-bottom: 1rem;">
    <h3 style="font-weight: 500; font-size: 1.25rem; margin: 0;"></h3>
    <p style="font-weight: 400; font-size: 1rem; margin: 0.5rem 0;">
      Haozhe Zhao<sup>1*</sup>, Xiaojian Ma<sup>2*</sup>, Liang Chen<sup>1</sup>, Shuzheng Si<sup>1</sup>, Rujie Wu<sup>1</sup>,
      Kaikai An<sup>1</sup>, Peiyu Yu<sup>3</sup>, Minjia Zhang<sup>4</sup>, Qing Li<sup>2</sup>, Baobao Chang<sup>2</sup>
    </p>
    <p style="font-weight: 400; font-size: 1rem; margin: 0;">
      <sup>1</sup>Peking University, <sup>2</sup>BIGAI, <sup>3</sup>UCLA, <sup>4</sup>UIUC
    </p>
  </div>
  <div style="margin: 1rem 0; display: flex; justify-content: center; gap: 1.5rem; flex-wrap: wrap;">
    <a href="https://huggingface.co/datasets/BleachNick/UltraEdit" style="display: flex; align-items: center; text-decoration: none; color: blue; font-weight: bold; gap: 0.5rem;">
      <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Dataset_4M" style="height: 20px; vertical-align: middle;"> Dataset
    </a>
    <a href="https://huggingface.co/datasets/BleachNick/UltraEdit_500k" style="display: flex; align-items: center; text-decoration: none; color: blue; font-weight: bold; gap: 0.5rem;">
      <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Dataset_500k" style="height: 20px; vertical-align: middle;"> Dataset_500k
    </a>
    <a href="https://ultra-editing.github.io/" style="display: flex; align-items: center; text-decoration: none; color: blue; font-weight: bold; gap: 0.5rem;">
      <span style="font-size: 20px; vertical-align: middle;">🔗</span> Page
    </a>
    <a href="https://huggingface.co/BleachNick/SD3_UltraEdit_w_mask" style="display: flex; align-items: center; text-decoration: none; color: blue; font-weight: bold; gap: 0.5rem;">
      <img src="https://huggingface.co/front/assets/huggingface_logo-noborder.svg" alt="Checkpoint" style="height: 20px; vertical-align: middle;"> Checkpoint
    </a>
    <a href="https://github.com/HaozheZhao/UltraEdit" style="display: flex; align-items: center; text-decoration: none; color: blue; font-weight: bold; gap: 0.5rem;">
      <img src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub" style="height: 20px; vertical-align: middle;"> GitHub
    </a>
  </div>
  <div style="text-align: left; margin: 0 auto; font-size: 1rem; line-height: 1.5;">
    <p>
      <b>UltraEdit</b> is a dataset designed for fine-grained, instruction-based image editing. It contains over 4 million free-form image editing samples and more than 100,000 region-based image editing samples, automatically generated with real images as anchors.
    </p>
    <p>
      This demo allows you to perform image editing using the <a href="https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers" style="color: blue; text-decoration: none;">Stable Diffusion 3</a> model trained with this extensive dataset. It supports both free-form (without mask) and region-based (with mask) image editing. Use the sliders to adjust the inference steps and guidance scales, and provide a seed for reproducibility. The image guidance scale of 1.5 and text guidance scale of 7.5 / 12.5 is a good start for free-form/region-based image editing.
    </p>
    <p>
     <b>Usage Instructions:</b> You need to upload the images and prompts for editing. Use the pen tool to mark the areas you want to edit. If no region is marked, it will resort to free-form editing.
     </p>
  </div>
</div>
"""
# HTML footer listing the known limitations of the demo. Passed to
# gr.Interface as `article`.
html = '''
  <div style="text-align: left; margin-top: 2rem; font-size: 0.85rem; color: gray;">
    <b>Limitations:</b>
    <ul>
      <li>We have not conducted any NSFW checks;</li>
      <li>Due to the bias of the generated models, the model performance is still weak when dealing with high-frequency information such as <b>human facial expressions or text in the images</b>;</li>
      <li>We unified the free-form and region-based image editing by adding an extra channel of the mask image to the dataset. When doing free-form image editing, the network receives a blank mask.</li>
      <li>The generation result is sensitive to the guidance scale. For text guidance, based on experience, free-form image editing will perform better with a relatively low guidance score (7.5 or lower), while region-based image editing will perform better with a higher guidance score.</li>
    </ul>
  </div>
'''

# Wire generate() to the input/output components. `article_html` is the
# page description, `html` is the footer article, and `mask_ex_list` supplies
# the example rows (with example caching switched on and live mode off).
demo = gr.Interface(
    generate,
    inputs,
    outputs,
    examples=mask_ex_list,
    cache_examples=True,
    live=False,
    description=article_html,
    article=html,
)

# Enable the request queue, then start the app.
demo.queue()
demo.launch()