Spaces · Running on Zero

Update app.py
app.py CHANGED
@@ -1,165 +1,183 @@
Removed from the previous version of app.py:

- The `SiamLayoutSD3Transformer2DModel`, `CreatiLayoutSD3Pipeline`, and `bbox_visualization`/`scale_boxes` imports sat at the very top of the file, together with `import os` and `import pandas as pd`.
- The transformer and pipeline were built eagerly at module level, using `transformer_additional_kwargs = dict(attention_type="layout", strict=True)` and ending with `print("pipeline is loaded.")`.
- `process_image_and_text` parsed the Dataframe inputs through pandas (`box_detail_phrases_list.values.tolist()`, `boxes.astype(float).values.tolist()`) before calling the pipeline.
- The Generate button was wired inline via `gr.Button("Generate").click(...)`, and `gr.Examples` additionally received `fn=process_image_and_text`, `outputs=[bbox_visualization_img, output_image]`, and `cache_examples=True`.
- `get_samples()` carried a third example, dropped in this commit:

```python
{
    "global_caption": "This is a wedding photo taken in a photography studio, showing a newlywed couple sitting on a brown leather sofa in a modern indoor setting. The groom is dressed in a pink suit, paired with a pink tie and white shirt, while the bride is wearing a white wedding dress with a long veil. They are sitting on a brown leather sofa, with a wooden table in front of them, on which a bouquet of flowers is placed. The background is a bar with a staircase and a wall decorated with lights, creating a warm and romantic atmosphere.",
    "region_caption_list": [
        "A floral arrangement consisting of roses, carnations, and eucalyptus leaves on a wooden surface.",
        "A white wedding dress with off-the-shoulder ruffles and a long, sheer veil.",
        "A polished wooden table with visible grain and knots.",
        "A close-up of a dark brown leather sofa with tufted upholstery and button details.",
        "A man in a pink suit with a white shirt and red tie, sitting on a leather armchair.",
        "A person in a suit seated on a leather armchair near a wooden staircase with books and bottles.",
        "Bride in white gown with veil, groom in maroon suit and pink tie, seated on leather armchairs."
    ],
    "region_bboxes_list": [
        [0.09, 0.65, 0.31, 0.93],
        [0.62, 0.25, 0.89, 0.90],
        [0.01, 0.70, 0.78, 0.99],
        [0.76, 0.65, 1.00, 0.99],
        [0.27, 0.32, 0.72, 0.75],
        [0.00, 0.01, 0.52, 0.72],
        [0.27, 0.09, 0.94, 0.89]
    ]
}
```
The updated app.py:

```python
import os
import gradio as gr
import torch
import spaces
from PIL import Image
from huggingface_hub import login
from PIL import ImageDraw
from utils.bbox_visualization import bbox_visualization, scale_boxes

hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN secret.")

login(token=hf_token)

from src.models.transformer_sd3_SiamLayout import SiamLayoutSD3Transformer2DModel
from src.pipeline.pipeline_CreatiLayout import CreatiLayoutSD3Pipeline

pipe = None
try:
    model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
    ckpt_path = "HuiZhang0812/CreatiLayout"

    transformer_additional_kwargs = dict(attention_type="layout", strict=True)
    transformer = SiamLayoutSD3Transformer2DModel.from_pretrained(
        ckpt_path, subfolder="SiamLayout_SD3", torch_dtype=torch.float16, **transformer_additional_kwargs
    )
    pipe = CreatiLayoutSD3Pipeline.from_pretrained(
        model_path, transformer=transformer, torch_dtype=torch.float16
    )
    device = "cuda" if torch.cuda.is_available() else "cpu"
    pipe = pipe.to(device)
    print("pipeline is loaded.")
except Exception as e:
    raise RuntimeError(f"Failed to load real pipeline: {e}") from e


@spaces.GPU
def process_image_and_text(
    global_caption: str,
    box_detail_phrases_list,
    boxes,
    seed: int = 42,
    randomize_seed: bool = False,
    guidance_scale: float = 7.5,
    num_inference_steps: int = 50,
):
    if randomize_seed:
        seed = int(torch.randint(0, 10000, (1,)).item())
    device = "cuda" if torch.cuda.is_available() else "cpu"

    height = 1024
    width = 1024

    phrases = []
    if isinstance(box_detail_phrases_list, list):
        for row in box_detail_phrases_list:
            if row and len(row) >= 1 and row[0] is not None:
                phrases.append(str(row[0]))

    bboxes = []
    if isinstance(boxes, list):
        for row in boxes:
            if row and len(row) == 4 and all(v is not None for v in row):
                try:
                    bboxes.append([float(v) for v in row])
                except Exception:
                    pass

    white_image = Image.new("RGB", (width, height), color=(255, 255, 255))
    vis_input = {"boxes": scale_boxes(bboxes, width, height), "labels": phrases}
    bbox_visualization_img = bbox_visualization(white_image, vis_input)

    result = pipe(
        prompt=global_caption,
        generator=torch.Generator(device=device).manual_seed(int(seed)),
        guidance_scale=float(guidance_scale),
        num_inference_steps=int(num_inference_steps),
        bbox_phrases=phrases,
        bbox_raw=bboxes,
        height=height,
        width=width,
    )
    result_img = result.images[0]
    return bbox_visualization_img, result_img


def get_samples():
    sample_list = [
        {
            "global_caption": "Iron Man holding a drawing board near the sea, sunset sky.",
            "region_caption_list": [
                "Iron Man standing on a rock.",
                "A rugged rock by the sea.",
                "A drawing board with the words 'Creative Layout'.",
                "The serene sea under the setting sun.",
                "The sky from orange to purple."
            ],
            "region_bboxes_list": [
                [0.40, 0.35, 0.55, 0.80],
                [0.35, 0.75, 0.60, 0.95],
                [0.40, 0.45, 0.55, 0.65],
                [0.00, 0.30, 1.00, 0.90],
                [0.00, 0.00, 1.00, 0.30]
            ]
        },
        {
            "global_caption": "Two wooden benches in a park with dappled sunlight.",
            "region_caption_list": [
                "A blue wooden bench.",
                "A green wooden bench.",
                "A stone path with grass.",
                "Thick foliage and trees."
            ],
            "region_bboxes_list": [
                [0.30, 0.44, 0.62, 0.78],
                [0.54, 0.41, 0.75, 0.65],
                [0.00, 0.39, 1.00, 1.00],
                [0.00, 0.00, 1.00, 0.43]
            ]
        }
    ]
    return [[s["global_caption"], [[c] for c in s["region_caption_list"]], s["region_bboxes_list"]] for s in sample_list]


with gr.Blocks() as demo:
    gr.Markdown("# CreatiLayout: Layout-to-Image generation")
    gr.Markdown("""CreatiLayout is a layout-to-image framework for Diffusion Transformer models, offering high-quality and fine-grained controllable generation based on the global description and entity annotations. Users need to provide a global description and the position and description of each entity, as shown in the examples. Please feel free to modify the position and attributes of the entities in the examples (such as size, color, shape, text, portrait, etc.). Here are some inspirations: Iron Man -> Spider Man/Harry Potter/Buzz Lightyear; CreatiLayout -> Hello Friends/Let's Control; drawing board -> round drawing board; Modify the position of the drawing board to (0.4, 0.15, 0.55, 0.35)""")

    with gr.Row():
        with gr.Column():
            global_caption = gr.Textbox(lines=2, label="Global Caption")

            box_detail_phrases_list = gr.Dataframe(
                headers=["Region Captions"],
                datatype="str",
                col_count=(1, "fixed"),
                row_count="dynamic",
                label="Region Captions"
            )
            boxes = gr.Dataframe(
                headers=["x1", "y1", "x2", "y2"],
                datatype="number",
                col_count=(4, "fixed"),
                row_count="dynamic",
                label="Region Bounding Boxes (x_min,y_min,x_max,y_max)"
            )

            with gr.Accordion("Advanced Settings", open=False):
                seed = gr.Slider(0, 10000, step=1, label="Seed", value=42)
                randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                guidance_scale = gr.Slider(1, 30, step=0.5, label="Guidance Scale", value=7.5)
                num_inference_steps = gr.Slider(1, 50, step=1, label="Number of inference steps", value=28)

        with gr.Column():
            bbox_visualization_img = gr.Image(type="pil", label="Bounding Box Visualization")

        with gr.Column():
            output_image = gr.Image(type="pil", label="Generated Image")

    generate_btn = gr.Button("Generate")
    generate_btn.click(
        fn=process_image_and_text,
        inputs=[global_caption, box_detail_phrases_list, boxes, seed, randomize_seed, guidance_scale, num_inference_steps],
        outputs=[bbox_visualization_img, output_image],
        api_name="generate"
    )

    gr.Examples(
        examples=get_samples(),
        inputs=[global_caption, box_detail_phrases_list, boxes],
    )


if __name__ == "__main__":
    demo.launch()
```
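One thing worth flagging in `process_image_and_text`: `gr.Dataframe` passes the callback a pandas DataFrame by default (`type="pandas"`), so the `isinstance(..., list)` guards above would leave `phrases` and `bboxes` empty unless the values already arrive as plain nested lists. A small normalization shim along these lines would accept either form; this is a sketch of mine, not part of the commit, and the helper name is illustrative:

```python
# Hypothetical helper (not in the commit): accept either a pandas DataFrame,
# which gr.Dataframe returns by default, or an already-nested list of rows.
def as_rows(value):
    if hasattr(value, "values"):  # pandas DataFrame from the UI
        return value.values.tolist()
    return value if isinstance(value, list) else []

# Inside process_image_and_text, before building `phrases` and `bboxes`:
#     box_detail_phrases_list = as_rows(box_detail_phrases_list)
#     boxes = as_rows(boxes)
```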
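For a quick check outside the Space UI, the first row from `get_samples()` can be fed straight to `process_image_and_text`, since it already has the `[global_caption, region_caption_rows, region_bboxes]` shape the function expects. A minimal sketch, not part of the commit, assuming the SD3 and CreatiLayout weights are accessible and a GPU with enough memory is available:

```python
# Hypothetical local smoke test; assumes the model weights can be downloaded
# and that the pipeline loaded successfully at import time.
caption, phrase_rows, bbox_rows = get_samples()[0]
layout_vis, generated = process_image_and_text(
    caption, phrase_rows, bbox_rows,
    seed=42, randomize_seed=False,
    guidance_scale=7.5, num_inference_steps=28,
)
layout_vis.save("layout_preview.png")
generated.save("creatilayout_sample.png")
```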