HuiZhang0812 committed on
Commit f057a7e · verified · 1 Parent(s): 51106c1

Update app.py

Files changed (1): app.py (+112 −94)
app.py CHANGED
@@ -1,165 +1,183 @@
+import os
 import gradio as gr
 import torch
 import spaces
-from src.models.transformer_sd3_SiamLayout import SiamLayoutSD3Transformer2DModel
-from src.pipeline.pipeline_CreatiLayout import CreatiLayoutSD3Pipeline
-from utils.bbox_visualization import bbox_visualization,scale_boxes
 from PIL import Image
-import os
-import pandas as pd
 from huggingface_hub import login
+from PIL import ImageDraw
+from utils.bbox_visualization import bbox_visualization,scale_boxes
 
 hf_token = os.getenv("HF_TOKEN")
-
 if hf_token is None:
     raise ValueError("Hugging Face token not found. Please set the HF_TOKEN secret.")
 
 login(token=hf_token)
 
-model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
-ckpt_path = "HuiZhang0812/CreatiLayout"
-
-transformer_additional_kwargs = dict(attention_type="layout",strict=True)
+from src.models.transformer_sd3_SiamLayout import SiamLayoutSD3Transformer2DModel
+from src.pipeline.pipeline_CreatiLayout import CreatiLayoutSD3Pipeline
 
-transformer = SiamLayoutSD3Transformer2DModel.from_pretrained(
-    ckpt_path, subfolder="SiamLayout_SD3", torch_dtype=torch.float16,**transformer_additional_kwargs)
+pipe = None
+try:
+    model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
+    ckpt_path = "HuiZhang0812/CreatiLayout"
 
-pipe = CreatiLayoutSD3Pipeline.from_pretrained(model_path, transformer=transformer, torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
+    transformer_additional_kwargs = dict(attention_type="layout", strict=True)
+    transformer = SiamLayoutSD3Transformer2DModel.from_pretrained(
+        ckpt_path, subfolder="SiamLayout_SD3", torch_dtype=torch.float16, **transformer_additional_kwargs
+    )
+    pipe = CreatiLayoutSD3Pipeline.from_pretrained(
+        model_path, transformer=transformer, torch_dtype=torch.float16
+    )
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    pipe = pipe.to(device)
+    print("pipeline is loaded.")
+except Exception as e:
+    raise RuntimeError(f"Failed to load real pipeline: {e}") from e
 
-print("pipeline is loaded.")
 
 @spaces.GPU
-def process_image_and_text(global_caption, box_detail_phrases_list:pd.DataFrame, boxes:pd.DataFrame,seed: int=42, randomize_seed: bool=False, guidance_scale: float=7.5, num_inference_steps: int=50):
-
+def process_image_and_text(
+    global_caption: str,
+    box_detail_phrases_list,
+    boxes,
+    seed: int = 42,
+    randomize_seed: bool = False,
+    guidance_scale: float = 7.5,
+    num_inference_steps: int = 50,
+):
     if randomize_seed:
-        seed = torch.randint(0, 100, (1,)).item()
+        seed = int(torch.randint(0, 10000, (1,)).item())
+    device = "cuda" if torch.cuda.is_available() else "cpu"
 
     height = 1024
    width = 1024
 
-    box_detail_phrases_list_tmp = box_detail_phrases_list.values.tolist()
-    box_detail_phrases_list_tmp = [c[0] for c in box_detail_phrases_list_tmp]
-    boxes = boxes.astype(float).values.tolist()
+
+    phrases = []
+    if isinstance(box_detail_phrases_list, list):
+        for row in box_detail_phrases_list:
+            if row and len(row) >= 1 and row[0] is not None:
+                phrases.append(str(row[0]))
+
+    bboxes = []
+    if isinstance(boxes, list):
+        for row in boxes:
+            if row and len(row) == 4 and all(v is not None for v in row):
+                try:
+                    bboxes.append([float(v) for v in row])
+                except Exception:
+                    pass
 
-    white_image = Image.new('RGB', (width, height), color='rgb(256,256,256)')
-    show_input = {"boxes":scale_boxes(boxes,width,height),"labels":box_detail_phrases_list_tmp}
-    bbox_visualization_img = bbox_visualization(white_image,show_input)
-
-    result_img = pipe(
+
+    white_image = Image.new("RGB", (width, height), color=(255, 255, 255))
+    vis_input = {"boxes": scale_boxes(bboxes, width, height), "labels": phrases}
+    bbox_visualization_img = bbox_visualization(white_image, vis_input)
+
+
+    result = pipe(
         prompt=global_caption,
-        generator=torch.Generator(device="cuda").manual_seed(seed),
-        guidance_scale=guidance_scale,
-        num_inference_steps=num_inference_steps,
-        bbox_phrases=box_detail_phrases_list_tmp,
-        bbox_raw=boxes,
+        generator=torch.Generator(device=device).manual_seed(int(seed)),
+        guidance_scale=float(guidance_scale),
+        num_inference_steps=int(num_inference_steps),
+        bbox_phrases=phrases,
+        bbox_raw=bboxes,
         height=height,
-        width=width
-    ).images[0]
-
+        width=width,
+    )
+    result_img = result.images[0]
     return bbox_visualization_img, result_img
 
+
+
 def get_samples():
     sample_list = [
-        {
-            "global_caption": "A picturesque scene features Iron Man standing confidently on a rugged rock by the sea, holding a drawing board with his hands. The board displays the words 'Creative Layout' in a playful, hand-drawn font. The serene sea shimmers under the setting sun. The sky is painted with a gradient of warm colors, from deep oranges to soft purples.",
+        {
+            "global_caption": "Iron Man holding a drawing board near the sea, sunset sky.",
             "region_caption_list": [
-                "Iron Man standing confidently on a rugged rock.",
+                "Iron Man standing on a rock.",
                 "A rugged rock by the sea.",
-                "A drawing board with the words \"Creative Layout\" in a playful, hand-drawn font.",
-                "The serene sea shimmers under the setting sun.",
-                "The sky is a shade of deep orange to soft purple."
+                "A drawing board with the words 'Creative Layout'.",
+                "The serene sea under the setting sun.",
+                "The sky from orange to purple."
             ],
             "region_bboxes_list": [
-                [0.40, 0.35, 0.55, 0.80],
-                [0.35, 0.75, 0.60, 0.95],
-                [0.40, 0.45, 0.55, 0.65],
-                [0.00, 0.30, 1.00, 0.90],
+                [0.40, 0.35, 0.55, 0.80],
+                [0.35, 0.75, 0.60, 0.95],
+                [0.40, 0.45, 0.55, 0.65],
+                [0.00, 0.30, 1.00, 0.90],
                 [0.00, 0.00, 1.00, 0.30]
             ]
         },
         {
-            "global_caption": "This is a photo showcasing two wooden benches in a park. The bench on the left is painted in a vibrant blue, while the one on the right is painted in a green. Both are placed on a path paved with stones, surrounded by lush trees and shrubs. The sunlight filters through the leaves, casting dappled shadows on the ground, creating a tranquil and comfortable atmosphere.",
+            "global_caption": "Two wooden benches in a park with dappled sunlight.",
             "region_caption_list": [
-                "A weathered, blue wooden bench with green elements in a natural setting.",
-                "Old, weathered wooden benches with green paint.",
-                "A dirt path in a park with green grass on the sides and two colorful wooden benches.",
-                "Thick, verdant foliage of mature trees in a dense forest."
+                "A blue wooden bench.",
+                "A green wooden bench.",
+                "A stone path with grass.",
+                "Thick foliage and trees."
            ],
             "region_bboxes_list": [
-                [0.30, 0.44, 0.62, 0.78],
-                [0.54, 0.41, 0.75, 0.65],
-                [0.00, 0.39, 1.00, 1.00],
-                [0.00, 0.00, 1.00, 0.43]
-            ]
-        },
-        {
-            "global_caption": "This is a wedding photo taken in a photography studio, showing a newlywed couple sitting on a brown leather sofa in a modern indoor setting. The groom is dressed in a pink suit, paired with a pink tie and white shirt, while the bride is wearing a white wedding dress with a long veil. They are sitting on a brown leather sofa, with a wooden table in front of them, on which a bouquet of flowers is placed. The background is a bar with a staircase and a wall decorated with lights, creating a warm and romantic atmosphere.",
-            "region_caption_list": [
-                "A floral arrangement consisting of roses, carnations, and eucalyptus leaves on a wooden surface.",
-                "A white wedding dress with off-the-shoulder ruffles and a long, sheer veil.",
-                "A polished wooden table with visible grain and knots.",
-                "A close-up of a dark brown leather sofa with tufted upholstery and button details.",
-                "A man in a pink suit with a white shirt and red tie, sitting on a leather armchair.",
-                "A person in a suit seated on a leather armchair near a wooden staircase with books and bottles.",
-                "Bride in white gown with veil, groom in maroon suit and pink tie, seated on leather armchairs."
-            ],
-            "region_bboxes_list": [
-                [0.09, 0.65, 0.31, 0.93],
-                [0.62, 0.25, 0.89, 0.90],
-                [0.01, 0.70, 0.78, 0.99],
-                [0.76, 0.65, 1.00, 0.99],
-                [0.27, 0.32, 0.72, 0.75],
-                [0.00, 0.01, 0.52, 0.72],
-                [0.27, 0.09, 0.94, 0.89]
+                [0.30, 0.44, 0.62, 0.78],
+                [0.54, 0.41, 0.75, 0.65],
+                [0.00, 0.39, 1.00, 1.00],
+                [0.00, 0.00, 1.00, 0.43]
             ]
         }
-
     ]
-    return [[sample["global_caption"], [[caption] for caption in sample["region_caption_list"]], sample["region_bboxes_list"]] for sample in sample_list]
+    return [[s["global_caption"], [[c] for c in s["region_caption_list"]], s["region_bboxes_list"]] for s in sample_list]
 
 
 
 with gr.Blocks() as demo:
     gr.Markdown("# CreatiLayout: Layout-to-Image generation")
     gr.Markdown("""CreatiLayout is a layout-to-image framework for Diffusion Transformer models, offering high-quality and fine-grained controllable generation based on the global description and entity annotations. Users need to provide a global description and the position and description of each entity, as shown in the examples. Please feel free to modify the position and attributes of the entities in the examples (such as size, color, shape, text, portrait, etc.). Here are some inspirations: Iron Man -> Spider Man/Harry Potter/Buzz Lightyear; CreatiLayout -> Hello Friends/Let's Control; drawing board -> round drawing board; Modify the position of the drawing board to (0.4, 0.15, 0.55, 0.35)""")
+
     with gr.Row():
-
         with gr.Column():
             global_caption = gr.Textbox(lines=2, label="Global Caption")
-            box_detail_phrases_list = gr.Dataframe(headers=["Region Captions"], label="Region Captions")
-            boxes = gr.Dataframe(headers=["x1", "y1", "x2", "y2"], label="Region Bounding Boxes (x_min,y_min,x_max,y_max)")
+
+            box_detail_phrases_list = gr.Dataframe(
+                headers=["Region Captions"],
+                datatype="str",
+                col_count=(1, "fixed"),
+                row_count="dynamic",
+                label="Region Captions"
+            )
+            boxes = gr.Dataframe(
+                headers=["x1", "y1", "x2", "y2"],
+                datatype="number",
+                col_count=(4, "fixed"),
+                row_count="dynamic",
+                label="Region Bounding Boxes (x_min,y_min,x_max,y_max)"
+            )
+
             with gr.Accordion("Advanced Settings", open=False):
-                seed = gr.Slider(0, 100, step=1, label="Seed", value=42)
+                seed = gr.Slider(0, 10000, step=1, label="Seed", value=42)
                 randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
                 guidance_scale = gr.Slider(1, 30, step=0.5, label="Guidance Scale", value=7.5)
                 num_inference_steps = gr.Slider(1, 50, step=1, label="Number of inference steps", value=28)
+
         with gr.Column():
             bbox_visualization_img = gr.Image(type="pil", label="Bounding Box Visualization")
-
+
        with gr.Column():
             output_image = gr.Image(type="pil", label="Generated Image")
 
-
-
-    gr.Button("Generate").click(
+    generate_btn = gr.Button("Generate")
+    generate_btn.click(
         fn=process_image_and_text,
         inputs=[global_caption, box_detail_phrases_list, boxes, seed, randomize_seed, guidance_scale, num_inference_steps],
-        outputs=[bbox_visualization_img, output_image]
+        outputs=[bbox_visualization_img, output_image],
+        api_name="generate"
     )
-
-
+
     gr.Examples(
         examples=get_samples(),
         inputs=[global_caption, box_detail_phrases_list, boxes],
-        outputs=[bbox_visualization_img, output_image],
-        fn=process_image_and_text,
-        cache_examples=True
    )
-
-
-
-
 
 if __name__ == "__main__":
     demo.launch()
+
+
+
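
Because the Generate handler is now registered with api_name="generate", the Space can also be driven programmatically. Below is a minimal client-side sketch using gradio_client; the Space id ("HuiZhang0812/CreatiLayout") and the {"headers": ..., "data": ...} payload shape for gr.Dataframe inputs are assumptions that may vary by Gradio version, so check the Space's "Use via API" page for the exact signature.

# Hedged sketch: Space id and Dataframe payload shape are assumptions,
# not confirmed by this commit; verify on the Space's "Use via API" page.
from gradio_client import Client

client = Client("HuiZhang0812/CreatiLayout")  # assumed Space id

# One region caption per row, aligned with the box rows below.
region_captions = {
    "headers": ["Region Captions"],
    "data": [["Iron Man standing on a rock."], ["A rugged rock by the sea."]],
}
# Normalized (x_min, y_min, x_max, y_max) coordinates in [0, 1].
region_boxes = {
    "headers": ["x1", "y1", "x2", "y2"],
    "data": [[0.40, 0.35, 0.55, 0.80], [0.35, 0.75, 0.60, 0.95]],
}

bbox_vis, generated = client.predict(
    "Iron Man on a rock by the sea at sunset.",  # global_caption
    region_captions,   # region captions dataframe
    region_boxes,      # bounding boxes dataframe
    42,                # seed
    False,             # randomize_seed
    7.5,               # guidance_scale
    28,                # num_inference_steps
    api_name="/generate",
)
# gradio_client saves Image outputs locally and returns their file paths.
print(bbox_vis, generated)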