lzy-tony committed on
Commit
d3653d5
·
1 Parent(s): cca14a1

feat: add english ver

Browse files
Files changed (41) hide show
  1. .gitignore +130 -0
  2. README.md +5 -7
  3. app.py +616 -0
  4. assets/Arial.ttf +0 -0
  5. assets/chinese_char.txt +1000 -0
  6. assets/color_idx.json +1 -0
  7. assets/font_idx_512.json +1 -0
  8. assets/multilingual_cn-en_font_idx.json +1 -0
  9. checkpoints/glyph-sdxl/byt5_mapper.pt +3 -0
  10. checkpoints/glyph-sdxl/byt5_model.pt +3 -0
  11. checkpoints/glyph-sdxl/optimizer.bin +3 -0
  12. checkpoints/glyph-sdxl/scaler.pt +3 -0
  13. checkpoints/glyph-sdxl/scheduler.bin +3 -0
  14. checkpoints/glyph-sdxl/unet_inserted_attn.pt +3 -0
  15. checkpoints/glyph-sdxl/unet_lora.pt +3 -0
  16. configs/glyph_multilingual_sdxl_albedo.py +96 -0
  17. configs/glyph_sdxl.py +96 -0
  18. configs/glyph_sdxl_albedo.py +96 -0
  19. demo/constants.py +2 -0
  20. examples/easter.json +43 -0
  21. examples/easter.png +0 -0
  22. examples/new_year.json +54 -0
  23. examples/new_year.png +0 -0
  24. examples/pancake.json +67 -0
  25. examples/pancake.png +0 -0
  26. examples/shower.json +76 -0
  27. examples/shower.png +0 -0
  28. glyph_sdxl/custom_diffusers/__init__.py +2 -0
  29. glyph_sdxl/custom_diffusers/models/__init__.py +3 -0
  30. glyph_sdxl/custom_diffusers/models/cross_attn_insert_transformer_blocks.py +377 -0
  31. glyph_sdxl/custom_diffusers/pipelines/__init__.py +5 -0
  32. glyph_sdxl/custom_diffusers/pipelines/pipeline_stable_diffusion_glyph_xl.py +922 -0
  33. glyph_sdxl/modules/__init__.py +7 -0
  34. glyph_sdxl/modules/byt5_block_byt5_mapper.py +151 -0
  35. glyph_sdxl/modules/simple_byt5_mapper.py +16 -0
  36. glyph_sdxl/utils/__init__.py +23 -0
  37. glyph_sdxl/utils/constants.py +5 -0
  38. glyph_sdxl/utils/format_prompt.py +113 -0
  39. glyph_sdxl/utils/load_pretrained_byt5.py +60 -0
  40. glyph_sdxl/utils/parse_config.py +17 -0
  41. requirements.txt +10 -0
.gitignore ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ *.egg-info/
24
+ .installed.cfg
25
+ *.egg
26
+ MANIFEST
27
+
28
+ # PyInstaller
29
+ # Usually these files are written by a python script from a template
30
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
31
+ *.manifest
32
+ *.spec
33
+
34
+ # Installer logs
35
+ pip-log.txt
36
+ pip-delete-this-directory.txt
37
+
38
+ # Unit test / coverage reports
39
+ htmlcov/
40
+ .tox/
41
+ .coverage
42
+ .coverage.*
43
+ .cache
44
+ nosetests.xml
45
+ coverage.xml
46
+ *.cover
47
+ .hypothesis/
48
+ .pytest_cache/
49
+
50
+ # Translations
51
+ *.mo
52
+ *.pot
53
+
54
+ # Django stuff:
55
+ *.log
56
+ local_settings.py
57
+ db.sqlite3
58
+
59
+ # Flask stuff:
60
+ instance/
61
+ .webassets-cache
62
+
63
+ # Scrapy stuff:
64
+ .scrapy
65
+
66
+ # Sphinx documentation
67
+ docs/en/_build/
68
+ docs/zh_cn/_build/
69
+
70
+ # PyBuilder
71
+ target/
72
+
73
+ # Jupyter Notebook
74
+ .ipynb_checkpoints
75
+
76
+ # pyenv
77
+ .python-version
78
+
79
+ # celery beat schedule file
80
+ celerybeat-schedule
81
+
82
+ # SageMath parsed files
83
+ *.sage.py
84
+
85
+ # Environments
86
+ .env
87
+ .venv
88
+ env/
89
+ venv/
90
+ ENV/
91
+ env.bak/
92
+ venv.bak/
93
+ .DS_Store
94
+
95
+ # Spyder project settings
96
+ .spyderproject
97
+ .spyproject
98
+
99
+ # Rope project settings
100
+ .ropeproject
101
+
102
+ # mkdocs documentation
103
+ /site
104
+
105
+ # mypy
106
+ .mypy_cache/
107
+
108
+ data
109
+ .vscode
110
+ .vscode/settings.json
111
+ .idea
112
+
113
+ # custom
114
+ *.pkl
115
+ *.pkl.json
116
+ *.log.json
117
+ work_dirs/
118
+
119
+ # Pytorch
120
+
121
+ weights
122
+ wandb
123
+ temp
124
+ test.py
125
+ debug
126
+ *.html
127
+ htmls
128
+ debug.png
129
+
130
+ canva.fonts.json
README.md CHANGED
@@ -1,12 +1,10 @@
1
  ---
2
- title: Glyph SDXL
3
- emoji: 🚀
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.31.0
8
  app_file: app.py
9
  pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Glyph-SDXL-debug
3
+ emoji: 🖼️🖌️
4
+ colorFrom: yellow
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 4.27.0
8
  app_file: app.py
9
  pinned: false
10
  ---
 
 
app.py ADDED
@@ -0,0 +1,616 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ import webcolors
4
+ import spaces
5
+ import gradio as gr
6
+ import os.path as osp
7
+ from PIL import Image, ImageDraw, ImageFont
8
+
9
+ import torch
10
+ from diffusers import UNet2DConditionModel, AutoencoderKL
11
+ from diffusers.models.attention import BasicTransformerBlock
12
+ from peft import LoraConfig
13
+ from peft.utils import set_peft_model_state_dict
14
+ from transformers import PretrainedConfig
15
+
16
+ from diffusers import DPMSolverMultistepScheduler
17
+
18
+ from glyph_sdxl.utils import (
19
+ parse_config,
20
+ UNET_CKPT_NAME,
21
+ huggingface_cache_dir,
22
+ load_byt5_and_byt5_tokenizer,
23
+ BYT5_MAPPER_CKPT_NAME,
24
+ INSERTED_ATTN_CKPT_NAME,
25
+ BYT5_CKPT_NAME,
26
+ PromptFormat,
27
+ )
28
+ from glyph_sdxl.custom_diffusers import (
29
+ StableDiffusionGlyphXLPipeline,
30
+ CrossAttnInsertBasicTransformerBlock,
31
+ )
32
+ from glyph_sdxl.modules import T5EncoderBlockByT5Mapper
33
+
34
# Registry of ByT5 mapper classes keyed by class name; the entry to use is
# selected later with `config.byt5_mapper_type`.
byt5_mapper_dict = {
    mapper.__name__: mapper for mapper in (T5EncoderBlockByT5Mapper,)
}

from demo.constants import MAX_TEXT_BOX


# Intro/usage banner rendered through gr.HTML. This is an f-string, so the
# doubled braces in the inline <style> rule come out as single literal braces.
# The <style> element is now closed with </style> (it used to be closed with a
# stray </p>, leaving the element unterminated).
html = f"""<h1>Glyph-ByT5: A Customized Text Encoder for Accurate Visual Text Rendering</h1>
<h2><a href='https://glyph-byt5.github.io/'>Project Page</a> | <a href='https://arxiv.org/abs/2403.09622'>arXiv Paper</a> | <a href=''>Github</a> | <a href=''>Cite our work</a> if our ideas inspire you.</h2>
<p><b>Try some examples at the bottom of the page to get started!</b></p>
<p><b>Usage:</b></p>
<p>1. <b>Select bounding boxes</b> on the canvas on the left <b>by clicking twice</b>. </p>
<p>2. Click "Redo" if you want to cancel last point, "Undo" for clearing the canvas. </p>
<p>3. <b>Click "I've finished my layout!"</b> to start choosing specific prompts, colors and font-types. </p>
<p>4. Enter a <b>design prompt</b> for the background image. Optionally, you can choose to specify the design categories and tags (separated by a comma). </p>
<p>5. For each text box, <b>enter the text prompts in the text box</b> on the left, and <b>select colors and font-types from the drop boxes</b> on the right. </p>
<p>6. <b>Click on "I've finished my texts, colors and styles, generate!"</b> to start generating! </p>
<style>.btn {{flex-grow: unset !important;}}</style>
"""


# Stylesheet handed to gr.Blocks(css=...). It must contain CSS rules only, so
# the stray "<style>" token that used to trail the rules has been removed.
css = '''
#color-bg{display:flex;justify-content: center;align-items: center;}
.color-bg-item{width: 100%; height: 32px}
#main_button{width:100%}
'''

# Click state machine for the layout canvas:
#   state == 0 -> the next click starts a new box (first corner)
#   state == 1 -> the next click finishes the box started by the previous click
state = 0
# Layout drawn so far: each entry is either [x, y] (a lone first corner) or
# [x0, y0, x1, y1] (a finished box), in canvas pixel coordinates.
# NOTE(review): both values are module-level globals mutated by the callbacks,
# so they are shared by every caller of this process.
stack = []
# Font used to draw the box index labels on the canvas.
font = ImageFont.truetype("assets/Arial.ttf", 20)

# Device the models are moved to inside generate_image.
device = "cuda"
66
+
67
def import_model_class_from_model_name_or_path(
    pretrained_model_name_or_path: str, revision: str, subfolder: str = "text_encoder",
):
    """Return the text-encoder class named in a pretrained model's config.

    Loads the config stored under ``subfolder`` of the given model and looks
    at the first entry of its ``architectures`` list.

    Args:
        pretrained_model_name_or_path: Model identifier or local path passed
            to ``PretrainedConfig.from_pretrained``.
        revision: Model revision passed to ``PretrainedConfig.from_pretrained``.
        subfolder: Subfolder holding the text-encoder config.

    Returns:
        ``CLIPTextModel`` or ``CLIPTextModelWithProjection`` (the class itself,
        not an instance).

    Raises:
        ValueError: If the first architecture is neither of the two above.
    """
    encoder_config = PretrainedConfig.from_pretrained(
        pretrained_model_name_or_path,
        subfolder=subfolder,
        revision=revision,
    )
    architecture = encoder_config.architectures[0]

    # The transformers classes are imported lazily, only for the branch taken.
    if architecture == "CLIPTextModel":
        from transformers import CLIPTextModel

        return CLIPTextModel

    if architecture == "CLIPTextModelWithProjection":
        from transformers import CLIPTextModelWithProjection

        return CLIPTextModelWithProjection

    raise ValueError(f"{architecture} is not supported.")
87
+
88
# ---------------------------------------------------------------------------
# Model setup, executed once at import time.
# ---------------------------------------------------------------------------

# Run configuration and the directory holding the fine-tuned checkpoints.
config = parse_config('configs/glyph_sdxl_albedo.py')
ckpt_dir = 'checkpoints/glyph-sdxl'

# Resolve the classes of the two text encoders, then load their weights.
text_encoder_cls_one = import_model_class_from_model_name_or_path(
    config.pretrained_model_name_or_path, config.revision,
)
text_encoder_cls_two = import_model_class_from_model_name_or_path(
    config.pretrained_model_name_or_path, config.revision, subfolder="text_encoder_2",
)
text_encoder_one = text_encoder_cls_one.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="text_encoder",
    revision=config.revision,
    cache_dir=huggingface_cache_dir,
)
text_encoder_two = text_encoder_cls_two.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="text_encoder_2",
    revision=config.revision,
    cache_dir=huggingface_cache_dir,
)

unet = UNet2DConditionModel.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="unet",
    revision=config.revision,
    cache_dir=huggingface_cache_dir,
)

# Use the VAE bundled with the base model unless the config names another one;
# only the bundled VAE lives in a "vae" subfolder.
vae_path = (
    config.pretrained_model_name_or_path
    if config.pretrained_vae_model_name_or_path is None
    else config.pretrained_vae_model_name_or_path
)
vae = AutoencoderKL.from_pretrained(
    vae_path,
    subfolder="vae" if config.pretrained_vae_model_name_or_path is None else None,
    revision=config.revision,
    cache_dir=huggingface_cache_dir,
)

byt5_model, byt5_tokenizer = load_byt5_and_byt5_tokenizer(
    **config.byt5_config,
    huggingface_cache_dir=huggingface_cache_dir,
)

# Map the configured dtype name onto a torch dtype; anything other than
# "fp16"/"bf16" keeps full precision.
inference_dtype = torch.float32
if config.inference_dtype == "fp16":
    inference_dtype = torch.float16
elif config.inference_dtype == "bf16":
    inference_dtype = torch.bfloat16

# Swap every transformer block listed in config.attn_block_to_modify for a
# CrossAttnInsertBasicTransformerBlock built from it, recording the names of
# the parameters that belong to the newly inserted modules.
inserted_new_modules_para_set = set()
for name, module in unet.named_modules():
    if isinstance(module, BasicTransformerBlock) and name in config.attn_block_to_modify:
        # Walk the dotted module path down to the parent that owns the block.
        parent_module = unet
        for n in name.split(".")[:-1]:
            parent_module = getattr(parent_module, n)
        new_block = CrossAttnInsertBasicTransformerBlock.from_transformer_block(
            module,
            byt5_model.config.d_model
            if config.byt5_mapper_config.sdxl_channels is None
            else config.byt5_mapper_config.sdxl_channels,
        )
        new_block.requires_grad_(False)
        for inserted_module_name, inserted_module in zip(
            new_block.get_inserted_modules_names(),
            new_block.get_inserted_modules()
        ):
            inserted_module.requires_grad_(True)
            for para_name, para in inserted_module.named_parameters():
                para_key = name + '.' + inserted_module_name + '.' + para_name
                assert para_key not in inserted_new_modules_para_set
                inserted_new_modules_para_set.add(para_key)
        for origin_module in new_block.get_origin_modules():
            origin_module.to(dtype=inference_dtype)
        # Registering under the same attribute name replaces the old block.
        parent_module.register_module(name.split(".")[-1], new_block)
        print(f"inserted cross attn block to {name}")

byt5_mapper = byt5_mapper_dict[config.byt5_mapper_type](
    byt5_model.config,
    **config.byt5_mapper_config,
)

# Attach a LoRA adapter to the self- and cross-attention projections of the
# UNet so that the LoRA checkpoint below has somewhere to load into.
unet_lora_target_modules = [
    "attn1.to_k", "attn1.to_q", "attn1.to_v", "attn1.to_out.0",
    "attn2.to_k", "attn2.to_q", "attn2.to_v", "attn2.to_out.0",
]
unet_lora_config = LoraConfig(
    r=config.unet_lora_rank,
    lora_alpha=config.unet_lora_rank,
    init_lora_weights="gaussian",
    target_modules=unet_lora_target_modules,
)
unet.add_adapter(unet_lora_config)

# Load the fine-tuned weights; every checkpoint is read onto the CPU first.
unet_lora_layers_para = torch.load(osp.join(ckpt_dir, UNET_CKPT_NAME), map_location='cpu')
incompatible_keys = set_peft_model_state_dict(unet, unet_lora_layers_para, adapter_name="default")
if getattr(incompatible_keys, 'unexpected_keys', []) == []:
    print("loaded unet_lora_layers_para")
else:
    print(f"unet_lora_layers has unexpected_keys: {getattr(incompatible_keys, 'unexpected_keys', None)}")

inserted_attn_module_paras = torch.load(osp.join(ckpt_dir, INSERTED_ATTN_CKPT_NAME), map_location='cpu')
# strict=False: this checkpoint only holds the inserted attention parameters,
# so missing keys are expected; unexpected keys are not.
missing_keys, unexpected_keys = unet.load_state_dict(inserted_attn_module_paras, strict=False)
assert len(unexpected_keys) == 0, unexpected_keys

byt5_mapper_para = torch.load(osp.join(ckpt_dir, BYT5_MAPPER_CKPT_NAME), map_location='cpu')
byt5_mapper.load_state_dict(byt5_mapper_para)

byt5_model_para = torch.load(osp.join(ckpt_dir, BYT5_CKPT_NAME), map_location='cpu')
byt5_model.load_state_dict(byt5_model_para)

# Assemble the generation pipeline from the components prepared above.
pipeline = StableDiffusionGlyphXLPipeline.from_pretrained(
    config.pretrained_model_name_or_path,
    vae=vae,
    text_encoder=text_encoder_one,
    text_encoder_2=text_encoder_two,
    byt5_text_encoder=byt5_model,
    byt5_tokenizer=byt5_tokenizer,
    byt5_mapper=byt5_mapper,
    unet=unet,
    byt5_max_length=config.byt5_max_length,
    revision=config.revision,
    torch_dtype=inference_dtype,
    safety_checker=None,
    cache_dir=huggingface_cache_dir,
)

pipeline.scheduler = DPMSolverMultistepScheduler.from_pretrained(
    config.pretrained_model_name_or_path,
    subfolder="scheduler",
    use_karras_sigmas=True,
)

prompt_format = PromptFormat()
217
+
218
def _render_clicked_layout(layout):
    """Draw ``layout`` on a fresh white 1024x1024 canvas and return the image.

    Two-element entries are drawn as a red dot, four-element entries as a red
    rectangle outline; each gets its 1-based index as a label.
    """
    canvas = Image.new('RGB', (1024, 1024), (255, 255, 255))
    painter = ImageDraw.Draw(canvas)
    red = (255, 0, 0)
    dot_radius = 4

    for label, entry in enumerate(layout, start=1):
        if len(entry) == 2:
            px, py = entry
            painter.text((px + 2, py), str(label), font=font, fill=red)
            painter.ellipse(
                ((px - dot_radius, py - dot_radius), (px + dot_radius, py + dot_radius)),
                fill='red',
            )
        elif len(entry) == 4:
            ax, ay, bx, by = entry
            # The two corners may have been clicked in any order.
            left, right = min(ax, bx), max(ax, bx)
            top, bottom = min(ay, by), max(ay, by)
            painter.text((left + 2, top), str(label), font=font, fill=red)
            painter.rectangle((left, top, right, bottom), outline=red)

    return canvas


def get_pixels(
    box_sketch_template,
    evt: gr.SelectData
):
    """Handle a click on the layout canvas and return the redrawn canvas.

    The first click of a pair stores the clicked point; the second click
    turns that point into a box by appending the new coordinates to it.

    Args:
        box_sketch_template: Current canvas value supplied by Gradio; it is
            not read, the canvas is always redrawn from scratch.
        evt: Select event; ``evt.index`` is used as the clicked ``(x, y)``.

    Returns:
        A new 1024x1024 PIL image showing every point/box recorded so far.
    """
    global state
    global stack

    clicked = evt.index

    if state != 0:
        # Second click: complete the box opened by the previous click.
        start_x, start_y = stack.pop()
        stack.append([start_x, start_y, clicked[0], clicked[1]])
        state = 0
    else:
        # First click: remember the corner and wait for the opposite one.
        stack.append(clicked)
        state = 1

    print(stack)

    return _render_clicked_layout(stack)
265
+
266
def exe_redo(
    box_sketch_template
):
    """Cancel the most recent click and return the redrawn canvas.

    If the last entry is a lone point it is removed; if it is a finished box
    its second corner is dropped, turning it back into a lone point.

    Clicking the button on an empty canvas is now a no-op. Previously it
    raised ``IndexError`` on ``stack[-1]`` after ``state`` had already been
    flipped, which left the click state machine expecting a second corner
    for a box that did not exist.

    Args:
        box_sketch_template: Current canvas value supplied by Gradio; it is
            not read, the canvas is always redrawn from scratch.

    Returns:
        A new 1024x1024 PIL image showing the remaining points/boxes.
    """
    global state
    global stack

    if stack:
        last_entry = stack[-1]
        if len(last_entry) == 2:
            stack = stack[:-1]
        else:
            x, y, _, _ = last_entry
            stack = stack[:-1] + [[x, y]]

    # Derive the click state from the layout itself so the two can never
    # disagree: a trailing lone point means a second corner is still pending.
    state = 1 if stack and len(stack[-1]) == 2 else 0

    box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255))
    draw = ImageDraw.Draw(box_sketch_template)
    text_color = (255, 0, 0)
    r = 4  # radius of the dot marking a lone point

    for i, text_position in enumerate(stack):
        if len(text_position) == 2:
            x, y = text_position
            draw.text((x + 2, y), str(i + 1), font=font, fill=text_color)
            draw.ellipse(((x - r, y - r), (x + r, y + r)), fill='red')
        elif len(text_position) == 4:
            x0, y0, x1, y1 = text_position
            # The two corners may have been clicked in any order.
            x0, x1 = min(x0, x1), max(x0, x1)
            y0, y1 = min(y0, y1), max(y0, y1)
            draw.text((x0 + 2, y0), str(i + 1), font=font, fill=text_color)
            draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0))

    return box_sketch_template
307
+
308
def exe_undo(
    box_sketch_template
):
    """Clear the whole layout and return a blank canvas.

    Args:
        box_sketch_template: Current canvas value supplied by Gradio; unused.

    Returns:
        A new, entirely white 1024x1024 PIL image.
    """
    global state, stack

    # Forget every recorded point/box and wait for a first corner again.
    stack = []
    state = 0

    return Image.new('RGB', (1024, 1024), (255, 255, 255))
319
+
320
def process_box():
    """Reveal the settings panel with one row per finished box.

    Returns one update for the settings column followed by
    ``MAX_TEXT_BOX + 1`` row updates: row 0 (background prompt) is always
    shown, rows ``1..k`` are shown for the ``k`` finished boxes.

    Only finished boxes (four coordinates) are counted, matching what
    ``generate_image`` consumes, and the count is capped at ``MAX_TEXT_BOX``.
    Previously drawing more boxes than there are rows raised ``IndexError``,
    and a half-drawn box got a row that generation then ignored.
    """
    finished_boxes = sum(1 for entry in stack if len(entry) == 4)
    visible_rows = min(finished_boxes, MAX_TEXT_BOX) + 1

    visibilities = [
        gr.update(visible=row < visible_rows) for row in range(MAX_TEXT_BOX + 1)
    ]

    return [gr.update(visible=True), *visibilities]
332
+
333
@spaces.GPU
def generate_image(bg_prompt, bg_class, bg_tags, seed, *conditions):
    """Generate an image from the background prompt and the drawn text boxes.

    Args:
        bg_prompt: Background description; must be non-empty.
        bg_class: Optional design type, prepended to the prompt as
            ``"<class>. <prompt>"``.
        bg_tags: Optional tags, appended to the prompt as ``" Tags: <tags>"``.
        seed: Random seed; ``-1`` leaves the generator unseeded.
        *conditions: Flat widget values laid out as ``MAX_TEXT_BOX`` text
            prompts, then ``MAX_TEXT_BOX`` colors, then ``MAX_TEXT_BOX`` font
            types.

    Returns:
        The first image produced by the pipeline.

    Raises:
        gr.Error: If the background prompt is empty, a box is missing its
            text, color or font, or more than ``MAX_TEXT_BOX`` boxes were drawn.

    Fixes over the previous version:
      * an empty layout no longer raises ``IndexError`` on ``stack[-1]``;
      * box corners are normalised before computing ``x, y, w, h``, so a box
        drawn right-to-left or bottom-to-top no longer yields a negative size;
      * drawing more than ``MAX_TEXT_BOX`` boxes is reported instead of
        silently reading colors/fonts from the wrong widget slots;
      * the seed is coerced to ``int`` before seeding the generator.
    """
    print(conditions)

    # 0. load model to cuda
    global pipeline
    if config.pretrained_vae_model_name_or_path is None:
        vae.to(device, dtype=torch.float32)
    else:
        vae.to(device, dtype=inference_dtype)
    text_encoder_one.to(device, dtype=inference_dtype)
    text_encoder_two.to(device, dtype=inference_dtype)
    byt5_model.to(device)
    unet.to(device, dtype=inference_dtype)
    pipeline = pipeline.to(device)

    # 1. parse input
    # Only finished boxes take part; a trailing lone point is ignored.
    boxes = [entry for entry in stack if len(entry) == 4]
    num_boxes = len(boxes)
    if num_boxes > MAX_TEXT_BOX:
        raise gr.Error(
            f"Too many text boxes: {num_boxes} drawn, at most {MAX_TEXT_BOX} supported!"
        )

    prompts = list(conditions[:num_boxes])
    colors = list(conditions[MAX_TEXT_BOX:MAX_TEXT_BOX + num_boxes])
    font_type = list(conditions[MAX_TEXT_BOX * 2:MAX_TEXT_BOX * 2 + num_boxes])

    # 2. input check
    bboxes = []
    styles = []
    if bg_prompt == "" or bg_prompt is None:
        raise gr.Error("Empty background prompt!")
    for i, (box, prompt, color, style) in enumerate(zip(boxes, prompts, colors, font_type)):
        if prompt == "" or prompt is None:
            raise gr.Error(f"Invalid prompt for text box {i + 1} !")
        if color is None:
            raise gr.Error(f"Invalid color for text box {i + 1} !")
        if style is None:
            raise gr.Error(f"Invalid style for text box {i + 1} !")

        # Corners can be clicked in any order; normalise to top-left + size,
        # expressed as fractions of the 1024-pixel canvas.
        x0, x1 = min(box[0], box[2]), max(box[0], box[2])
        y0, y1 = min(box[1], box[3]), max(box[1], box[3])
        bboxes.append(
            [
                x0 / 1024,
                y0 / 1024,
                (x1 - x0) / 1024,
                (y1 - y0) / 1024,
            ]
        )
        styles.append(
            {
                'color': webcolors.name_to_hex(color),
                'font-family': style,
            }
        )

    # 3. format input
    if bg_class != "" and bg_class is not None:
        bg_prompt = bg_class + ". " + bg_prompt
    if bg_tags != "" and bg_tags is not None:
        bg_prompt += " Tags: " + bg_tags
    text_prompt = prompt_format.format_prompt(prompts, styles)

    print(bg_prompt)
    print(text_prompt)

    # 4. inference
    generator = torch.Generator(device=device)
    if seed != -1:
        # manual_seed needs an int; the slider value may arrive as a float.
        generator = generator.manual_seed(int(seed))
    with torch.cuda.amp.autocast():
        image = pipeline(
            prompt=bg_prompt,
            text_prompt=text_prompt,
            texts=prompts,
            bboxes=bboxes,
            num_inference_steps=50,
            generator=generator,
            text_attn_mask=None,
        ).images[0]
    return image
415
+
416
def process_example(bg_prompt, bg_class, bg_tags, color_str, style_str, text_str, box_str, seed):
    """Load one example into the UI: layout, per-box settings and seed.

    Args:
        bg_prompt, bg_class, bg_tags: Example background fields; not used
            here (they are example inputs only).
        color_str: Comma-separated color names, one per box.
        style_str: Comma-separated font names, one per box.
        text_str: Box texts joined by ``"**********"``.
        box_str: Boxes as ``"[x, y, w, h]"`` groups separated by ``";"``,
            with values given as fractions of the canvas size.
        seed: Seed to put back into the seed slider.

    Returns:
        A list with: the settings-column update, the redrawn canvas, the
        seed, ``MAX_TEXT_BOX + 1`` row visibility updates, then the colors,
        fonts and texts, each padded to ``MAX_TEXT_BOX`` entries.

    Raises:
        ValueError: If a box does not contain four numeric values.

    Changes over the previous version: the coordinates are parsed with
    ``float`` instead of ``eval`` (the text comes from a textbox value and must
    not be executed), the debug prints are gone, the unreachable
    "single point" drawing branch is removed (every entry built here is a full
    box), and the number of visible rows is capped at ``MAX_TEXT_BOX``.
    """
    global stack
    global state

    colors = [color.strip() for color in color_str.split(",")]
    styles = [style.strip() for style in style_str.split(",")]
    prompts = text_str.split("**********")
    # Pad to the fixed number of widgets the outputs are wired to.
    colors += [None] * (MAX_TEXT_BOX - len(colors))
    styles += [None] * (MAX_TEXT_BOX - len(styles))
    prompts += [""] * (MAX_TEXT_BOX - len(prompts))

    state = 0
    stack = []
    for box in box_str.split(";"):
        # Strip the surrounding brackets, then scale fractions to pixels.
        fields = box.strip()[1:-1].split(",")
        x, y, w, h = (float(field.strip()) * 1024 for field in fields[:4])
        stack.append([int(x), int(y), int(x + w + 0.5), int(y + h + 0.5)])

    visible_rows = min(len(stack), MAX_TEXT_BOX) + 1
    visibilities = [
        gr.update(visible=row < visible_rows) for row in range(MAX_TEXT_BOX + 1)
    ]

    box_sketch_template = Image.new('RGB', (1024, 1024), (255, 255, 255))
    draw = ImageDraw.Draw(box_sketch_template)
    text_color = (255, 0, 0)

    for i, (x0, y0, x1, y1) in enumerate(stack):
        x0, x1 = min(x0, x1), max(x0, x1)
        y0, y1 = min(y0, y1), max(y0, y1)
        draw.text((x0 + 2, y0), str(i + 1), font=font, fill=text_color)
        draw.rectangle((x0, y0, x1, y1), outline=(255, 0, 0))

    return [
        gr.update(visible=True), box_sketch_template, seed, *visibilities, *colors, *styles, *prompts,
    ]
481
+
482
def main():
    """Build the Gradio demo and launch it.

    Reads the color and font vocabularies from the ``assets`` JSON files,
    lays out the canvas, the per-box settings rows, the output image and the
    examples, wires the callbacks, then starts the app.

    Changes over the previous version: the tags textbox now has its own label
    (it used to repeat the "Design type" label of the field next to it), and
    the JSON files are opened with an explicit UTF-8 encoding.

    NOTE(review): the nesting of the layout blocks below was reconstructed,
    because the source this was taken from had lost its indentation — confirm
    against the running UI.
    """
    # load configs
    with open('assets/color_idx.json', 'r', encoding='utf-8') as f:
        color_idx_dict = json.load(f)
    color_idx_list = list(color_idx_dict)
    with open('assets/font_idx_512.json', 'r', encoding='utf-8') as f:
        font_idx_dict = json.load(f)
    font_idx_list = list(font_idx_dict)

    with gr.Blocks(
        title="Glyph-ByT5: A Customized Text Encoder for Accurate Visual Text Rendering",
        css=css,
    ) as demo:
        gr.HTML(html)
        with gr.Row():
            with gr.Column(elem_id="main-image"):
                # Layout canvas: not editable, clicks are handled by get_pixels.
                box_sketch_template = gr.Image(
                    value=Image.new('RGB', (1024, 1024), (255, 255, 255)),
                    sources=[],
                    interactive=False,
                )

                box_sketch_template.select(get_pixels, [box_sketch_template], [box_sketch_template])

                with gr.Row():
                    redo = gr.Button(value='Redo - Cancel last point')
                    undo = gr.Button(value='Undo - Clear the canvas')
                redo.click(exe_redo, [box_sketch_template], [box_sketch_template])
                undo.click(exe_undo, [box_sketch_template], [box_sketch_template])

                button_layout = gr.Button("(1) I've finished my layout!", elem_id="main_button", interactive=True)

                prompts = []
                colors = []
                styles = []
                # Row 0 holds the background fields; rows 1..MAX_TEXT_BOX hold
                # the text / color / font widgets of one box each.
                color_row = [None] * (MAX_TEXT_BOX + 1)
                with gr.Column(visible=False) as post_box:
                    for n in range(MAX_TEXT_BOX + 1):
                        if n == 0:
                            with gr.Row(visible=True) as color_row[n]:
                                bg_prompt = gr.Textbox(label="Design prompt for the background image", value="")
                                bg_class = gr.Textbox(label="Design type for the background image (optional)", value="")
                                bg_tags = gr.Textbox(label="Design tags for the background image, separated by commas (optional)", value="")
                        else:
                            with gr.Row(visible=False) as color_row[n]:
                                prompts.append(gr.Textbox(label="Prompt for box "+str(n)))
                                colors.append(gr.Dropdown(
                                    label="Color for box "+str(n),
                                    choices=color_idx_list,
                                ))
                                styles.append(gr.Dropdown(
                                    label="Font type for box "+str(n),
                                    choices=font_idx_list,
                                ))

                    seed_ = gr.Slider(label="Seed", minimum=-1, maximum=999999999, value=-1, step=1)
                    button_generate = gr.Button("(2) I've finished my texts, colors and styles, generate!", elem_id="main_button", interactive=True)

                button_layout.click(process_box, inputs=[], outputs=[post_box, *color_row], queue=False)

            with gr.Column():
                output_image = gr.Image(label="Output Image", interactive=False)

        # The order prompts + colors + styles is the layout generate_image
        # expects in *conditions.
        button_generate.click(generate_image, inputs=[bg_prompt, bg_class, bg_tags, seed_, *(prompts + colors + styles)], outputs=[output_image], queue=True)

        # examples
        # Hidden textboxes that carry the per-box example data as strings.
        color_str = gr.Textbox(label="Color list", value="", visible=False)
        style_str = gr.Textbox(label="Font type list", value="", visible=False)
        box_str = gr.Textbox(label="Bbox list", value="", visible=False)
        text_str = gr.Textbox(label="Text list", value="", visible=False)

        gr.Examples(
            examples=[
                [
                    'The image features a small bunny rabbit sitting in a basket filled with various flowers. The basket is placed on a yellow background, creating a vibrant and cheerful scene. The flowers surrounding the rabbit come in different sizes and colors, adding to the overall visual appeal of the image. The rabbit appears to be the main focus of the scene, and its presence among the flowers creates a sense of harmony and balance.',
                    'Facebook Post',
                    'green, yellow, minimalist, easter day, happy easter day, easter, happy easter, decoration, happy, egg, spring, selebration, poster, illustration, greeting, season, design, colorful, cute, template',
                    'darkolivegreen, darkolivegreen, darkolivegreen',
                    'Gagalin-Regular, Gagalin-Regular, Brusher-Regular',
                    'MAY ALLYOUR PRAYERS BE ANSWERED**********HAVE A HAPPY**********Easter Day',
                    '[0.08267477203647416, 0.5355623100303951, 0.42857142857142855, 0.07477203647416414]; [0.08389057750759879, 0.1951367781155015, 0.38054711246200607, 0.03768996960486322]; [0.07537993920972644, 0.2601823708206687, 0.49544072948328266, 0.14650455927051673]',
                    1,
                ],
                [
                    'The image features a large gray elephant sitting in a field of flowers, holding a smaller elephant in its arms. The scene is quite serene and picturesque, with the two elephants being the main focus of the image. The field is filled with various flowers, creating a beautiful and vibrant backdrop for the elephants.',
                    'Cards and invitations',
                    'Light green, orange, Illustration, watercolor, playful, Baby shower invitation, baby boy shower invitation, baby boy, welcoming baby boy, koala baby shower invitation, baby shower invitation for baby shower, baby boy invitation, background, playful baby shower card, baby shower, card, newborn, born, Baby Shirt Baby Shower Invitation',
                    'peru, olive, olivedrab, peru, peru, peru',
                    'LilitaOne, Sensei-Medium, Sensei-Medium, LilitaOne, LilitaOne, LilitaOne',
                    "RSVP to +123-456-7890**********Olivia Wilson**********Baby Shower**********Please Join Us For a**********In Honoring**********23 November, 2021 | 03:00 PM Fauget Hotels",
                    '[0.07112462006079028, 0.6462006079027356, 0.3373860182370821, 0.026747720364741642]; [0.07051671732522796, 0.38662613981762917, 0.37264437689969604, 0.059574468085106386]; [0.07234042553191489, 0.15623100303951368, 0.6547112462006079, 0.12401215805471125]; [0.0662613981762918, 0.06747720364741641, 0.3981762917933131, 0.035866261398176294]; [0.07051671732522796, 0.31550151975683893, 0.22006079027355624, 0.03951367781155015]; [0.06990881458966565, 0.48328267477203646, 0.39878419452887537, 0.1094224924012158]',
                    0,
                ],
                [
                    'The image features a white background with a variety of colorful flowers and decorations. There are several pink flowers scattered throughout the scene, with some positioned closer to the top and others near the bottom. A blue flower can also be seen in the middle of the image. The overall composition creates a visually appealing and vibrant display.',
                    'Instagram Posts',
                    'grey, navy, purple, pink, teal, colorful, illustration, happy, celebration, post, party, year, new, event, celebrate, happy new year, new year, countdown, sparkle, firework',
                    'purple, midnightblue, black, black',
                    'Caveat-Regular, Gagalin-Regular, Quicksand-Light, Quicksand-Light',
                    'Happy New Year**********2024**********All THE BEST**********A fresh start to start a change for the better.',
                    '[0.2936170212765957, 0.2887537993920973, 0.40303951367781155, 0.07173252279635259]; [0.24984802431610942, 0.3951367781155015, 0.46200607902735563, 0.17203647416413373]; [0.3951367781155015, 0.1094224924012158, 0.2109422492401216, 0.02796352583586626]; [0.20911854103343466, 0.6127659574468085, 0.5586626139817629, 0.08085106382978724]',
                    1,
                ],
                [
                    'The image features a stack of pancakes with syrup and strawberries on top. The pancakes are arranged in a visually appealing manner, with some pancakes placed on top of each other. The syrup is drizzled generously over the pancakes, and the strawberries are scattered around, adding a touch of color and freshness to the scene. The overall presentation of the pancakes is appetizing and inviting.',
                    'Instagram Posts',
                    'brown, peach, grey, modern, minimalist, simple, colorful, illustration, Instagram post, instagram, post, national pancake day, international pancake day, happy pancake day, pancake day, pancake, sweet, cake, discount, sale',
                    'dimgray, white, darkolivegreen',
                    'MoreSugarRegular, Chewy-Regular, Chewy-Regular',
                    'Get 75% Discount for your first order**********Order Now**********National Pancake Day',
                    '[0.043161094224924014, 0.5963525835866261, 0.2936170212765957, 0.08389057750759879]; [0.12279635258358662, 0.79209726443769, 0.26382978723404255, 0.05167173252279635]; [0.044984802431610946, 0.09787234042553192, 0.4413373860182371, 0.4158054711246201]',
                    1,
                ]
            ],
            inputs=[
                bg_prompt,
                bg_class,
                bg_tags,
                color_str,
                style_str,
                text_str,
                box_str,
                seed_,
            ],
            # Must line up with the list returned by process_example.
            outputs=[post_box, box_sketch_template, seed_, *color_row, *colors, *styles, *prompts],
            fn=process_example,
            run_on_click=True,
            label='Examples',
        )

    demo.queue()
    demo.launch()


if __name__ == "__main__":
    main()
assets/Arial.ttf ADDED
Binary file (276 kB). View file
 
assets/chinese_char.txt ADDED
@@ -0,0 +1,1000 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
+
5
+
6
+
7
+
8
+
9
+
10
+
11
+
12
+
13
+
14
+
15
+
16
+
17
+
18
+
19
+
20
+
21
+
22
+
23
+
24
+
25
+
26
+
27
+
28
+
29
+
30
+
31
+
32
+
33
+
34
+
35
+
36
+
37
+
38
+
39
+
40
+
41
+
42
+
43
+
44
+
45
+
46
+
47
+
48
+
49
+
50
+
51
+
52
+
53
+
54
+
55
+
56
+
57
+
58
+
59
+
60
+
61
+
62
+
63
+
64
+
65
+
66
+
67
+
68
+
69
+
70
+
71
+
72
+
73
+
74
+
75
+
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+
84
+
85
+
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+
94
+
95
+
96
+
97
+
98
+
99
+
100
+
101
+
102
+
103
+
104
+
105
+
106
+
107
+
108
+
109
+
110
+
111
+
112
+
113
+
114
+
115
+
116
+
117
+
118
+
119
+ 使
120
+
121
+
122
+
123
+
124
+
125
+
126
+
127
+
128
+
129
+
130
+
131
+
132
+
133
+
134
+
135
+
136
+
137
+
138
+
139
+
140
+
141
+
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
150
+
151
+
152
+
153
+
154
+
155
+
156
+
157
+
158
+
159
+
160
+
161
+
162
+
163
+
164
+
165
+
166
+
167
+ 西
168
+
169
+
170
+
171
+
172
+
173
+
174
+
175
+
176
+
177
+
178
+
179
+
180
+
181
+
182
+
183
+
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+
193
+
194
+
195
+
196
+
197
+
198
+
199
+
200
+
201
+
202
+
203
+
204
+
205
+
206
+
207
+
208
+
209
+
210
+
211
+
212
+
213
+
214
+
215
+
216
+
217
+
218
+
219
+
220
+
221
+
222
+
223
+
224
+
225
+
226
+
227
+
228
+
229
+
230
+
231
+
232
+
233
+
234
+
235
+
236
+
237
+
238
+
239
+
240
+
241
+
242
+
243
+
244
+
245
+
246
+
247
+
248
+
249
+
250
+
251
+
252
+
253
+
254
+
255
+
256
+
257
+
258
+
259
+
260
+
261
+
262
+
263
+
264
+
265
+
266
+
267
+
268
+
269
+
270
+
271
+ 便
272
+
273
+
274
+
275
+
276
+
277
+
278
+
279
+
280
+
281
+
282
+
283
+
284
+
285
+
286
+
287
+
288
+
289
+
290
+
291
+
292
+
293
+
294
+
295
+
296
+
297
+
298
+
299
+
300
+
301
+
302
+
303
+
304
+
305
+
306
+
307
+
308
+
309
+
310
+
311
+
312
+
313
+
314
+
315
+
316
+
317
+
318
+
319
+
320
+
321
+
322
+
323
+
324
+
325
+
326
+
327
+
328
+
329
+
330
+
331
+
332
+
333
+
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
+
342
+
343
+
344
+
345
+
346
+
347
+
348
+
349
+
350
+
351
+
352
+
353
+
354
+
355
+
356
+
357
+
358
+
359
+
360
+
361
+
362
+
363
+
364
+
365
+
366
+
367
+
368
+
369
+
370
+
371
+
372
+
373
+
374
+
375
+
376
+
377
+
378
+
379
+
380
+
381
+
382
+
383
+
384
+
385
+
386
+
387
+
388
+
389
+
390
+
391
+
392
+
393
+
394
+
395
+
396
+
397
+
398
+
399
+
400
+
401
+
402
+
403
+
404
+
405
+
406
+
407
+
408
+
409
+
410
+
411
+
412
+
413
+
414
+
415
+
416
+
417
+
418
+
419
+
420
+
421
+
422
+
423
+
424
+
425
+
426
+
427
+
428
+
429
+
430
+ 线
431
+
432
+
433
+
434
+
435
+
436
+
437
+
438
+
439
+
440
+
441
+
442
+
443
+
444
+
445
+
446
+
447
+
448
+
449
+
450
+
451
+
452
+
453
+
454
+
455
+
456
+
457
+
458
+
459
+
460
+
461
+
462
+
463
+
464
+
465
+
466
+
467
+
468
+ 广
469
+
470
+
471
+
472
+
473
+
474
+
475
+
476
+
477
+
478
+
479
+
480
+
481
+
482
+
483
+
484
+
485
+
486
+
487
+
488
+
489
+
490
+
491
+
492
+
493
+
494
+
495
+
496
+
497
+
498
+
499
+
500
+
501
+
502
+
503
+
504
+
505
+
506
+
507
+
508
+
509
+
510
+
511
+
512
+
513
+
514
+
515
+
516
+
517
+
518
+
519
+
520
+
521
+
522
+
523
+
524
+
525
+
526
+
527
+
528
+
529
+
530
+
531
+
532
+
533
+
534
+
535
+
536
+
537
+
538
+
539
+
540
+
541
+
542
+
543
+
544
+
545
+
546
+
547
+
548
+
549
+
550
+
551
+
552
+
553
+
554
+
555
+
556
+
557
+
558
+
559
+
560
+
561
+
562
+
563
+
564
+
565
+
566
+
567
+
568
+
569
+
570
+
571
+
572
+
573
+
574
+
575
+
576
+
577
+
578
+
579
+
580
+
581
+
582
+
583
+
584
+
585
+
586
+
587
+
588
+
589
+
590
+
591
+
592
+
593
+
594
+
595
+
596
+
597
+
598
+
599
+
600
+
601
+
602
+
603
+
604
+
605
+
606
+
607
+
608
+
609
+
610
+
611
+
612
+
613
+
614
+
615
+
616
+
617
+
618
+
619
+
620
+
621
+
622
+
623
+
624
+
625
+
626
+
627
+
628
+
629
+
630
+
631
+
632
+
633
+
634
+
635
+
636
+
637
+
638
+
639
+
640
+
641
+
642
+
643
+
644
+
645
+
646
+
647
+
648
+
649
+
650
+
651
+
652
+
653
+
654
+
655
+
656
+
657
+
658
+
659
+
660
+
661
+
662
+
663
+
664
+
665
+
666
+
667
+
668
+
669
+
670
+
671
+
672
+
673
+
674
+
675
+
676
+
677
+
678
+
679
+
680
+
681
+
682
+
683
+
684
+
685
+
686
+
687
+
688
+
689
+
690
+
691
+
692
+
693
+
694
+
695
+
696
+
697
+
698
+
699
+
700
+
701
+
702
+
703
+
704
+
705
+
706
+
707
+
708
+
709
+
710
+
711
+
712
+
713
+
714
+
715
+
716
+
717
+
718
+
719
+
720
+
721
+
722
+
723
+ 退
724
+
725
+
726
+
727
+
728
+
729
+
730
+
731
+
732
+
733
+
734
+
735
+
736
+
737
+
738
+
739
+
740
+
741
+
742
+
743
+
744
+
745
+
746
+
747
+
748
+
749
+
750
+
751
+
752
+
753
+
754
+
755
+
756
+
757
+
758
+
759
+
760
+
761
+
762
+ 怀
763
+
764
+
765
+
766
+
767
+
768
+
769
+
770
+
771
+
772
+
773
+
774
+
775
+
776
+
777
+
778
+
779
+
780
+
781
+
782
+
783
+
784
+
785
+ 穿
786
+
787
+
788
+
789
+
790
+
791
+
792
+
793
+
794
+
795
+
796
+
797
+
798
+
799
+
800
+
801
+
802
+
803
+
804
+
805
+
806
+
807
+
808
+
809
+
810
+
811
+
812
+
813
+
814
+
815
+
816
+
817
+
818
+
819
+
820
+
821
+
822
+
823
+
824
+
825
+
826
+
827
+
828
+
829
+
830
+
831
+
832
+
833
+
834
+
835
+
836
+
837
+
838
+
839
+
840
+
841
+
842
+
843
+
844
+
845
+
846
+
847
+
848
+
849
+
850
+
851
+
852
+
853
+
854
+
855
+
856
+
857
+
858
+
859
+
860
+
861
+
862
+
863
+
864
+
865
+
866
+
867
+
868
+
869
+
870
+
871
+
872
+
873
+
874
+
875
+
876
+
877
+
878
+
879
+
880
+
881
+
882
+
883
+
884
+
885
+
886
+
887
+
888
+
889
+
890
+
891
+
892
+
893
+
894
+
895
+
896
+
897
+
898
+
899
+
900
+
901
+
902
+
903
+
904
+
905
+
906
+
907
+
908
+
909
+
910
+
911
+
912
+
913
+
914
+
915
+
916
+
917
+
918
+
919
+
920
+
921
+
922
+
923
+
924
+
925
+
926
+
927
+
928
+
929
+
930
+
931
+
932
+
933
+
934
+
935
+
936
+
937
+
938
+
939
+
940
+
941
+
942
+
943
+
944
+
945
+
946
+
947
+
948
+
949
+
950
+
951
+
952
+
953
+
954
+
955
+
956
+
957
+
958
+
959
+
960
+
961
+
962
+
963
+
964
+
965
+
966
+
967
+
968
+
969
+
970
+
971
+
972
+
973
+
974
+
975
+
976
+
977
+
978
+
979
+
980
+
981
+
982
+
983
+
984
+
985
+
986
+
987
+
988
+
989
+
990
+
991
+
992
+
993
+
994
+
995
+
996
+
997
+
998
+
999
+
1000
+
assets/color_idx.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"white": 0, "black": 1, "darkslategray": 2, "dimgray": 3, "darkolivegreen": 4, "midnightblue": 5, "saddlebrown": 6, "sienna": 7, "whitesmoke": 8, "darkslateblue": 9, "indianred": 10, "linen": 11, "maroon": 12, "khaki": 13, "sandybrown": 14, "gray": 15, "gainsboro": 16, "teal": 17, "peru": 18, "gold": 19, "snow": 20, "firebrick": 21, "crimson": 22, "chocolate": 23, "tomato": 24, "brown": 25, "goldenrod": 26, "antiquewhite": 27, "rosybrown": 28, "steelblue": 29, "floralwhite": 30, "seashell": 31, "darkgreen": 32, "oldlace": 33, "darkkhaki": 34, "burlywood": 35, "red": 36, "darkgray": 37, "orange": 38, "royalblue": 39, "seagreen": 40, "lightgray": 41, "tan": 42, "coral": 43, "beige": 44, "palevioletred": 45, "wheat": 46, "lavender": 47, "darkcyan": 48, "slateblue": 49, "slategray": 50, "orangered": 51, "silver": 52, "olivedrab": 53, "forestgreen": 54, "darkgoldenrod": 55, "ivory": 56, "darkorange": 57, "yellow": 58, "hotpink": 59, "ghostwhite": 60, "lightcoral": 61, "indigo": 62, "bisque": 63, "darkred": 64, "darksalmon": 65, "lightslategray": 66, "dodgerblue": 67, "lightpink": 68, "mistyrose": 69, "mediumvioletred": 70, "cadetblue": 71, "deeppink": 72, "salmon": 73, "palegoldenrod": 74, "blanchedalmond": 75, "lightseagreen": 76, "cornflowerblue": 77, "yellowgreen": 78, "greenyellow": 79, "navajowhite": 80, "papayawhip": 81, "mediumslateblue": 82, "purple": 83, "blueviolet": 84, "pink": 85, "cornsilk": 86, "lightsalmon": 87, "mediumpurple": 88, "moccasin": 89, "turquoise": 90, "mediumseagreen": 91, "lavenderblush": 92, "mediumblue": 93, "darkseagreen": 94, "mediumturquoise": 95, "paleturquoise": 96, "skyblue": 97, "lemonchiffon": 98, "olive": 99, "peachpuff": 100, "lightyellow": 101, "lightsteelblue": 102, "mediumorchid": 103, "plum": 104, "darkturquoise": 105, "aliceblue": 106, "mediumaquamarine": 107, "orchid": 108, "powderblue": 109, "blue": 110, "darkorchid": 111, "violet": 112, "lightskyblue": 113, "lightcyan": 114, "lightgoldenrodyellow": 115, "navy": 116, 
"thistle": 117, "honeydew": 118, "mintcream": 119, "lightblue": 120, "darkblue": 121, "darkmagenta": 122, "deepskyblue": 123, "magenta": 124, "limegreen": 125, "darkviolet": 126, "cyan": 127, "palegreen": 128, "aquamarine": 129, "lawngreen": 130, "lightgreen": 131, "azure": 132, "chartreuse": 133, "green": 134, "mediumspringgreen": 135, "lime": 136, "springgreen": 137}
assets/font_idx_512.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"Montserrat-Regular": 0, "Poppins-Italic": 1, "GlacialIndifference-Regular": 2, "OpenSans-ExtraBoldItalic": 3, "Montserrat-Bold": 4, "Now-Regular": 5, "Garet-Regular": 6, "LeagueSpartan-Bold": 7, "DMSans-Regular": 8, "OpenSauceOne-Regular": 9, "OpenSans-ExtraBold": 10, "KGPrimaryPenmanship": 11, "Anton-Regular": 12, "Aileron-BlackItalic": 13, "Quicksand-Light": 14, "Roboto-BoldItalic": 15, "TheSeasons-It": 16, "Kollektif": 17, "Inter-BoldItalic": 18, "Poppins-Medium": 19, "Poppins-Light": 20, "RoxboroughCF-RegularItalic": 21, "PlayfairDisplay-SemiBold": 22, "Agrandir-Italic": 23, "Lato-Regular": 24, "MoreSugarRegular": 25, "CanvaSans-RegularItalic": 26, "PublicSans-Italic": 27, "CodePro-NormalLC": 28, "Belleza-Regular": 29, "JosefinSans-Bold": 30, "HKGrotesk-Bold": 31, "Telegraf-Medium": 32, "BrittanySignatureRegular": 33, "Raleway-ExtraBoldItalic": 34, "Mont-RegularItalic": 35, "Arimo-BoldItalic": 36, "Lora-Italic": 37, "ArchivoBlack-Regular": 38, "Poppins": 39, "Barlow-Black": 40, "CormorantGaramond-Bold": 41, "LibreBaskerville-Regular": 42, "CanvaSchoolFontRegular": 43, "BebasNeueBold": 44, "LazydogRegular": 45, "FredokaOne-Regular": 46, "Horizon-Bold": 47, "Nourd-Regular": 48, "Hatton-Regular": 49, "Nunito-ExtraBoldItalic": 50, "CerebriSans-Regular": 51, "Montserrat-Light": 52, "TenorSans": 53, "Norwester-Regular": 54, "ClearSans-Bold": 55, "Cardo-Regular": 56, "Alice-Regular": 57, "Oswald-Regular": 58, "Gaegu-Bold": 59, "Muli-Black": 60, "TAN-PEARL-Regular": 61, "CooperHewitt-Book": 62, "Agrandir-Grand": 63, "BlackMango-Thin": 64, "DMSerifDisplay-Regular": 65, "Antonio-Bold": 66, "Sniglet-Regular": 67, "BeVietnam-Regular": 68, "NunitoSans10pt-BlackItalic": 69, "AbhayaLibre-ExtraBold": 70, "Rubik-Regular": 71, "PPNeueMachina-Regular": 72, "TAN - MON CHERI-Regular": 73, "Jua-Regular": 74, "Playlist-Script": 75, "SourceSansPro-BoldItalic": 76, "MoonTime-Regular": 77, "Eczar-ExtraBold": 78, "Gatwick-Regular": 79, "MonumentExtended-Regular": 80, 
"BarlowSemiCondensed-Regular": 81, "BarlowCondensed-Regular": 82, "Alegreya-Regular": 83, "DreamAvenue": 84, "RobotoCondensed-Italic": 85, "BobbyJones-Regular": 86, "Garet-ExtraBold": 87, "YesevaOne-Regular": 88, "Dosis-ExtraBold": 89, "LeagueGothic-Regular": 90, "OpenSans-Italic": 91, "TANAEGEAN-Regular": 92, "Maharlika-Regular": 93, "MarykateRegular": 94, "Cinzel-Regular": 95, "Agrandir-Wide": 96, "Chewy-Regular": 97, "BodoniFLF-BoldItalic": 98, "Nunito-BlackItalic": 99, "LilitaOne": 100, "HandyCasualCondensed-Regular": 101, "Ovo": 102, "Livvic-Regular": 103, "Agrandir-Narrow": 104, "CrimsonPro-Italic": 105, "AnonymousPro-Bold": 106, "NF-OneLittleFont-Bold": 107, "RedHatDisplay-BoldItalic": 108, "CodecPro-Regular": 109, "HalimunRegular": 110, "LibreFranklin-Black": 111, "TeXGyreTermes-BoldItalic": 112, "Shrikhand-Regular": 113, "TTNormsPro-Italic": 114, "Gagalin-Regular": 115, "OpenSans-Bold": 116, "GreatVibes-Regular": 117, "Breathing": 118, "HeroLight-Regular": 119, "KGPrimaryDots": 120, "Quicksand-Bold": 121, "Brice-ExtraLightSemiExpanded": 122, "Lato-BoldItalic": 123, "Fraunces9pt-Italic": 124, "AbrilFatface-Regular": 125, "BerkshireSwash-Regular": 126, "Atma-Bold": 127, "HolidayRegular": 128, "BebasNeueCyrillic": 129, "IntroRust-Base": 130, "Gistesy": 131, "BDScript-Regular": 132, "ApricotsRegular": 133, "Prompt-Black": 134, "TAN MERINGUE": 135, "Sukar Regular": 136, "GentySans-Regular": 137, "NeueEinstellung-Normal": 138, "Garet-Bold": 139, "FiraSans-Black": 140, "BantayogLight": 141, "NotoSerifDisplay-Black": 142, "TTChocolates-Regular": 143, "Ubuntu-Regular": 144, "Assistant-Bold": 145, "ABeeZee-Regular": 146, "LexendDeca-Regular": 147, "KingredSerif": 148, "Radley-Regular": 149, "BrownSugar": 150, "MigraItalic-ExtraboldItalic": 151, "ChildosArabic-Regular": 152, "PeaceSans": 153, "LondrinaSolid-Black": 154, "SpaceMono-BoldItalic": 155, "RobotoMono-Light": 156, "CourierPrime-Regular": 157, "Alata-Regular": 158, "Amsterdam-One": 159, 
"IreneFlorentina-Regular": 160, "CatchyMager": 161, "Alta_regular": 162, "ArticulatCF-Regular": 163, "Raleway-Regular": 164, "BrasikaDisplay": 165, "TANAngleton-Italic": 166, "NotoSerifDisplay-ExtraCondensedItalic": 167, "Bryndan Write": 168, "TTCommonsPro-It": 169, "AlexBrush-Regular": 170, "Antic-Regular": 171, "TTHoves-Bold": 172, "DroidSerif": 173, "AblationRegular": 174, "Marcellus-Regular": 175, "Sanchez-Italic": 176, "JosefinSans": 177, "Afrah-Regular": 178, "PinyonScript": 179, "TTInterphases-BoldItalic": 180, "Yellowtail-Regular": 181, "Gliker-Regular": 182, "BobbyJonesSoft-Regular": 183, "IBMPlexSans": 184, "Amsterdam-Three": 185, "Amsterdam-FourSlant": 186, "TTFors-Regular": 187, "Quattrocento": 188, "Sifonn-Basic": 189, "AlegreyaSans-Black": 190, "Daydream": 191, "AristotelicaProTx-Rg": 192, "NotoSerif": 193, "EBGaramond-Italic": 194, "HammersmithOne-Regular": 195, "RobotoSlab-Regular": 196, "DO-Sans-Regular": 197, "KGPrimaryDotsLined": 198, "Blinker-Regular": 199, "TAN NIMBUS": 200, "Blueberry-Regular": 201, "Rosario-Regular": 202, "Forum": 203, "MistrullyRegular": 204, "SourceSerifPro-Regular": 205, "Bugaki-Regular": 206, "CMUSerif-Roman": 207, "GulfsDisplay-NormalItalic": 208, "PTSans-Bold": 209, "Sensei-Medium": 210, "SquadaOne-Regular": 211, "Arapey-Italic": 212, "Parisienne-Regular": 213, "Aleo-Italic": 214, "QuicheDisplay-Italic": 215, "RocaOne-It": 216, "Funtastic-Regular": 217, "PTSerif-BoldItalic": 218, "Muller-RegularItalic": 219, "ArgentCF-Regular": 220, "Brightwall-Italic": 221, "Knewave-Regular": 222, "TYSerif-D": 223, "Agrandir-Tight": 224, "AlfaSlabOne-Regular": 225, "TANTangkiwood-Display": 226, "Kief-Montaser-Regular": 227, "Gotham-Book": 228, "JuliusSansOne-Regular": 229, "CocoGothic-Italic": 230, "SairaCondensed-Regular": 231, "DellaRespira-Regular": 232, "Questrial-Regular": 233, "BukhariScript-Regular": 234, "HelveticaWorld-Bold": 235, "TANKINDRED-Display": 236, "CinzelDecorative-Regular": 237, "Vidaloka-Regular": 238, 
"AlegreyaSansSC-Black": 239, "FeelingPassionate-Regular": 240, "QuincyCF-Regular": 241, "FiraCode-Regular": 242, "Genty-Regular": 243, "Nickainley-Normal": 244, "RubikOne-Regular": 245, "Gidole-Regular": 246, "Borsok": 247, "Gordita-RegularItalic": 248, "Scripter-Regular": 249, "Buffalo-Regular": 250, "KleinText-Regular": 251, "Creepster-Regular": 252, "Arvo-Bold": 253, "GabrielSans-NormalItalic": 254, "Heebo-Black": 255, "LexendExa-Regular": 256, "BrixtonSansTC-Regular": 257, "GildaDisplay-Regular": 258, "ChunkFive-Roman": 259, "Amaranth-BoldItalic": 260, "BubbleboddyNeue-Regular": 261, "MavenPro-Bold": 262, "TTDrugs-Italic": 263, "CyGrotesk-KeyRegular": 264, "VarelaRound-Regular": 265, "Ruda-Black": 266, "SafiraMarch": 267, "BloggerSans": 268, "TANHEADLINE-Regular": 269, "SloopScriptPro-Regular": 270, "NeueMontreal-Regular": 271, "Schoolbell-Regular": 272, "SigherRegular": 273, "InriaSerif-Regular": 274, "JetBrainsMono-Regular": 275, "MADEEvolveSans": 276, "Dekko": 277, "Handyman-Regular": 278, "Aileron-BoldItalic": 279, "Bright-Italic": 280, "Solway-Regular": 281, "Higuen-Regular": 282, "WedgesItalic": 283, "TANASHFORD-BOLD": 284, "IBMPlexMono": 285, "RacingSansOne-Regular": 286, "RegularBrush": 287, "OpenSans-LightItalic": 288, "SpecialElite-Regular": 289, "FuturaLTPro-Medium": 290, "MaragsaDisplay": 291, "BigShouldersDisplay-Regular": 292, "BDSans-Regular": 293, "RasputinRegular": 294, "Yvesyvesdrawing-BoldItalic": 295, "Bitter-Regular": 296, "LuckiestGuy-Regular": 297, "CanvaSchoolFontDotted": 298, "TTFirsNeue-Italic": 299, "Sunday-Regular": 300, "HKGothic-MediumItalic": 301, "CaveatBrush-Regular": 302, "HeliosExt": 303, "ArchitectsDaughter-Regular": 304, "Angelina": 305, "Calistoga-Regular": 306, "ArchivoNarrow-Regular": 307, "ObjectSans-MediumSlanted": 308, "AyrLucidityCondensed-Regular": 309, "Nexa-RegularItalic": 310, "Lustria-Regular": 311, "Amsterdam-TwoSlant": 312, "Virtual-Regular": 313, "Brusher-Regular": 314, "NF-Lepetitcochon-Regular": 315, 
"TANTWINKLE": 316, "LeJour-Serif": 317, "Prata-Regular": 318, "PPWoodland-Regular": 319, "PlayfairDisplay-BoldItalic": 320, "AmaticSC-Regular": 321, "Cabin-Regular": 322, "Manjari-Bold": 323, "MrDafoe-Regular": 324, "TTRamillas-Italic": 325, "Luckybones-Bold": 326, "DarkerGrotesque-Light": 327, "BellabooRegular": 328, "CormorantSC-Bold": 329, "GochiHand-Regular": 330, "Atteron": 331, "RocaTwo-Lt": 332, "ZCOOLXiaoWei-Regular": 333, "TANSONGBIRD": 334, "HeadingNow-74Regular": 335, "Luthier-BoldItalic": 336, "Oregano-Regular": 337, "AyrTropikaIsland-Int": 338, "Mali-Regular": 339, "DidactGothic-Regular": 340, "Lovelace-Regular": 341, "BakerieSmooth-Regular": 342, "CarterOne": 343, "HussarBd": 344, "OldStandard-Italic": 345, "TAN-ASTORIA-Display": 346, "rugratssans-Regular": 347, "BMHANNA": 348, "BetterSaturday": 349, "AdigianaToybox": 350, "Sailors": 351, "PlayfairDisplaySC-Italic": 352, "Etna-Regular": 353, "Revive80Signature": 354, "CAGenerated": 355, "Poppins-Regular": 356, "Jonathan-Regular": 357, "Pacifico-Regular": 358, "Saira-Black": 359, "Loubag-Regular": 360, "Decalotype-Black": 361, "Mansalva-Regular": 362, "Allura-Regular": 363, "ProximaNova-Bold": 364, "TANMIGNON-DISPLAY": 365, "ArsenicaAntiqua-Regular": 366, "BreulGroteskA-RegularItalic": 367, "HKModular-Bold": 368, "TANNightingale-Regular": 369, "AristotelicaProCndTxt-Rg": 370, "Aprila-Regular": 371, "Tomorrow-Regular": 372, "AngellaWhite": 373, "KaushanScript-Regular": 374, "NotoSans": 375, "LeJour-Script": 376, "BrixtonTC-Regular": 377, "OleoScript-Regular": 378, "Cakerolli-Regular": 379, "Lobster-Regular": 380, "FrunchySerif-Regular": 381, "PorcelainRegular": 382, "AlojaExtended": 383, "SergioTrendy-Italic": 384, "LovelaceText-Bold": 385, "Anaktoria": 386, "JimmyScript-Light": 387, "IBMPlexSerif": 388, "Marta": 389, "Mango-Regular": 390, "Overpass-Italic": 391, "Hagrid-Regular": 392, "ElikaGorica": 393, "Amiko-Regular": 394, "EFCOBrookshire-Regular": 395, "Caladea-Regular": 396, "MoonlightBold": 397, 
"Staatliches-Regular": 398, "Helios-Bold": 399, "Satisfy-Regular": 400, "NexaScript-Regular": 401, "Trocchi-Regular": 402, "March": 403, "IbarraRealNova-Regular": 404, "Nectarine-Regular": 405, "Overpass-Light": 406, "TruetypewriterPolyglOTT": 407, "Bangers-Regular": 408, "Lazord-BoldExpandedItalic": 409, "Chloe-Regular": 410, "BaskervilleDisplayPT-Regular": 411, "Bright-Regular": 412, "Vollkorn-Regular": 413, "Harmattan": 414, "SortsMillGoudy-Regular": 415, "Biryani-Bold": 416, "SugoProDisplay-Italic": 417, "Lazord-BoldItalic": 418, "Alike-Regular": 419, "PermanentMarker-Regular": 420, "Sacramento-Regular": 421, "HKGroteskPro-Italic": 422, "Aleo-BoldItalic": 423, "Noot": 424, "TANGARLAND-Regular": 425, "Twister": 426, "Arsenal-Italic": 427, "Bogart-Italic": 428, "BethEllen-Regular": 429, "Caveat-Regular": 430, "BalsamiqSans-Bold": 431, "BreeSerif-Regular": 432, "CodecPro-ExtraBold": 433, "Pierson-Light": 434, "CyGrotesk-WideRegular": 435, "Lumios-Marker": 436, "Comfortaa-Bold": 437, "TraceFontRegular": 438, "RTL-AdamScript-Regular": 439, "EastmanGrotesque-Italic": 440, "Kalam-Bold": 441, "ChauPhilomeneOne-Regular": 442, "Coiny-Regular": 443, "Lovera": 444, "Gellatio": 445, "TitilliumWeb-Bold": 446, "OilvareBase-Italic": 447, "Catamaran-Black": 448, "Anteb-Italic": 449, "SueEllenFrancisco": 450, "SweetApricot": 451, "BrightSunshine": 452, "IM_FELL_Double_Pica_Italic": 453, "Granaina-limpia": 454, "TANPARFAIT": 455, "AcherusGrotesque-Regular": 456, "AwesomeLathusca-Italic": 457, "Signika-Bold": 458, "Andasia": 459, "DO-AllCaps-Slanted": 460, "Zenaida-Regular": 461, "Fahkwang-Regular": 462, "Play-Regular": 463, "BERNIERRegular-Regular": 464, "PlumaThin-Regular": 465, "SportsWorld": 466, "Garet-Black": 467, "CarolloPlayscript-BlackItalic": 468, "Cheque-Regular": 469, "SEGO": 470, "BobbyJones-Condensed": 471, "NexaSlab-RegularItalic": 472, "DancingScript-Regular": 473, "PaalalabasDisplayWideBETA": 474, "Magnolia-Script": 475, "OpunMai-400It": 476, 
"MadelynFill-Regular": 477, "ZingRust-Base": 478, "FingerPaint-Regular": 479, "BostonAngel-Light": 480, "Gliker-RegularExpanded": 481, "Ahsing": 482, "Engagement-Regular": 483, "EyesomeScript": 484, "LibraSerifModern-Regular": 485, "London-Regular": 486, "AtkinsonHyperlegible-Regular": 487, "StadioNow-TextItalic": 488, "Aniyah": 489, "ITCAvantGardePro-Bold": 490, "Comica-Regular": 491, "Coustard-Regular": 492, "Brice-BoldCondensed": 493, "TANNEWYORK-Bold": 494, "TANBUSTER-Bold": 495, "Alatsi-Regular": 496, "TYSerif-Book": 497, "Jingleberry": 498, "Rajdhani-Bold": 499, "LobsterTwo-BoldItalic": 500, "BestLight-Medium": 501, "Hitchcut-Regular": 502, "GermaniaOne-Regular": 503, "Emitha-Script": 504, "LemonTuesday": 505, "Cubao_Free_Regular": 506, "MonterchiSerif-Regular": 507, "AllertaStencil-Regular": 508, "RTL-Sondos-Regular": 509, "HomemadeApple-Regular": 510, "CosmicOcto-Medium": 511}
assets/multilingual_cn-en_font_idx.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"en-Montserrat-Regular": 0, "en-Poppins-Italic": 1, "en-GlacialIndifference-Regular": 2, "en-OpenSans-ExtraBoldItalic": 3, "en-Montserrat-Bold": 4, "en-Now-Regular": 5, "en-Garet-Regular": 6, "en-LeagueSpartan-Bold": 7, "en-DMSans-Regular": 8, "en-OpenSauceOne-Regular": 9, "en-OpenSans-ExtraBold": 10, "en-KGPrimaryPenmanship": 11, "en-Anton-Regular": 12, "en-Aileron-BlackItalic": 13, "en-Quicksand-Light": 14, "en-Roboto-BoldItalic": 15, "en-TheSeasons-It": 16, "en-Kollektif": 17, "en-Inter-BoldItalic": 18, "en-Poppins-Medium": 19, "en-Poppins-Light": 20, "en-RoxboroughCF-RegularItalic": 21, "en-PlayfairDisplay-SemiBold": 22, "en-Agrandir-Italic": 23, "en-Lato-Regular": 24, "en-MoreSugarRegular": 25, "en-CanvaSans-RegularItalic": 26, "en-PublicSans-Italic": 27, "en-CodePro-NormalLC": 28, "en-Belleza-Regular": 29, "en-JosefinSans-Bold": 30, "en-HKGrotesk-Bold": 31, "en-Telegraf-Medium": 32, "en-BrittanySignatureRegular": 33, "en-Raleway-ExtraBoldItalic": 34, "en-Mont-RegularItalic": 35, "en-Arimo-BoldItalic": 36, "en-Lora-Italic": 37, "en-ArchivoBlack-Regular": 38, "en-Poppins": 39, "en-Barlow-Black": 40, "en-CormorantGaramond-Bold": 41, "en-LibreBaskerville-Regular": 42, "en-CanvaSchoolFontRegular": 43, "en-BebasNeueBold": 44, "en-LazydogRegular": 45, "en-FredokaOne-Regular": 46, "en-Horizon-Bold": 47, "en-Nourd-Regular": 48, "en-Hatton-Regular": 49, "en-Nunito-ExtraBoldItalic": 50, "en-CerebriSans-Regular": 51, "en-Montserrat-Light": 52, "en-TenorSans": 53, "en-Norwester-Regular": 54, "en-ClearSans-Bold": 55, "en-Cardo-Regular": 56, "en-Alice-Regular": 57, "en-Oswald-Regular": 58, "en-Gaegu-Bold": 59, "en-Muli-Black": 60, "en-TAN-PEARL-Regular": 61, "en-CooperHewitt-Book": 62, "en-Agrandir-Grand": 63, "en-BlackMango-Thin": 64, "en-DMSerifDisplay-Regular": 65, "en-Antonio-Bold": 66, "en-Sniglet-Regular": 67, "en-BeVietnam-Regular": 68, "en-NunitoSans10pt-BlackItalic": 69, "en-AbhayaLibre-ExtraBold": 70, "en-Rubik-Regular": 71, "en-PPNeueMachina-Regular": 72, 
"en-TAN - MON CHERI-Regular": 73, "en-Jua-Regular": 74, "en-Playlist-Script": 75, "en-SourceSansPro-BoldItalic": 76, "en-MoonTime-Regular": 77, "en-Eczar-ExtraBold": 78, "en-Gatwick-Regular": 79, "en-MonumentExtended-Regular": 80, "en-BarlowSemiCondensed-Regular": 81, "en-BarlowCondensed-Regular": 82, "en-Alegreya-Regular": 83, "en-DreamAvenue": 84, "en-RobotoCondensed-Italic": 85, "en-BobbyJones-Regular": 86, "en-Garet-ExtraBold": 87, "en-YesevaOne-Regular": 88, "en-Dosis-ExtraBold": 89, "en-LeagueGothic-Regular": 90, "en-OpenSans-Italic": 91, "en-TANAEGEAN-Regular": 92, "en-Maharlika-Regular": 93, "en-MarykateRegular": 94, "en-Cinzel-Regular": 95, "en-Agrandir-Wide": 96, "en-Chewy-Regular": 97, "en-BodoniFLF-BoldItalic": 98, "en-Nunito-BlackItalic": 99, "en-LilitaOne": 100, "en-HandyCasualCondensed-Regular": 101, "en-Ovo": 102, "en-Livvic-Regular": 103, "en-Agrandir-Narrow": 104, "en-CrimsonPro-Italic": 105, "en-AnonymousPro-Bold": 106, "en-NF-OneLittleFont-Bold": 107, "en-RedHatDisplay-BoldItalic": 108, "en-CodecPro-Regular": 109, "en-HalimunRegular": 110, "en-LibreFranklin-Black": 111, "en-TeXGyreTermes-BoldItalic": 112, "en-Shrikhand-Regular": 113, "en-TTNormsPro-Italic": 114, "en-Gagalin-Regular": 115, "en-OpenSans-Bold": 116, "en-GreatVibes-Regular": 117, "en-Breathing": 118, "en-HeroLight-Regular": 119, "en-KGPrimaryDots": 120, "en-Quicksand-Bold": 121, "en-Brice-ExtraLightSemiExpanded": 122, "en-Lato-BoldItalic": 123, "en-Fraunces9pt-Italic": 124, "en-AbrilFatface-Regular": 125, "en-BerkshireSwash-Regular": 126, "en-Atma-Bold": 127, "en-HolidayRegular": 128, "en-BebasNeueCyrillic": 129, "en-IntroRust-Base": 130, "en-Gistesy": 131, "en-BDScript-Regular": 132, "en-ApricotsRegular": 133, "en-Prompt-Black": 134, "en-TAN MERINGUE": 135, "en-Sukar Regular": 136, "en-GentySans-Regular": 137, "en-NeueEinstellung-Normal": 138, "en-Garet-Bold": 139, "en-FiraSans-Black": 140, "en-BantayogLight": 141, "en-NotoSerifDisplay-Black": 142, "en-TTChocolates-Regular": 143, 
"en-Ubuntu-Regular": 144, "en-Assistant-Bold": 145, "en-ABeeZee-Regular": 146, "en-LexendDeca-Regular": 147, "en-KingredSerif": 148, "en-Radley-Regular": 149, "en-BrownSugar": 150, "en-MigraItalic-ExtraboldItalic": 151, "en-ChildosArabic-Regular": 152, "en-PeaceSans": 153, "en-LondrinaSolid-Black": 154, "en-SpaceMono-BoldItalic": 155, "en-RobotoMono-Light": 156, "en-CourierPrime-Regular": 157, "en-Alata-Regular": 158, "en-Amsterdam-One": 159, "en-IreneFlorentina-Regular": 160, "en-CatchyMager": 161, "en-Alta_regular": 162, "en-ArticulatCF-Regular": 163, "en-Raleway-Regular": 164, "en-BrasikaDisplay": 165, "en-TANAngleton-Italic": 166, "en-NotoSerifDisplay-ExtraCondensedItalic": 167, "en-Bryndan Write": 168, "en-TTCommonsPro-It": 169, "en-AlexBrush-Regular": 170, "en-Antic-Regular": 171, "en-TTHoves-Bold": 172, "en-DroidSerif": 173, "en-AblationRegular": 174, "en-Marcellus-Regular": 175, "en-Sanchez-Italic": 176, "en-JosefinSans": 177, "en-Afrah-Regular": 178, "en-PinyonScript": 179, "en-TTInterphases-BoldItalic": 180, "en-Yellowtail-Regular": 181, "en-Gliker-Regular": 182, "en-BobbyJonesSoft-Regular": 183, "en-IBMPlexSans": 184, "en-Amsterdam-Three": 185, "en-Amsterdam-FourSlant": 186, "en-TTFors-Regular": 187, "en-Quattrocento": 188, "en-Sifonn-Basic": 189, "en-AlegreyaSans-Black": 190, "en-Daydream": 191, "en-AristotelicaProTx-Rg": 192, "en-NotoSerif": 193, "en-EBGaramond-Italic": 194, "en-HammersmithOne-Regular": 195, "en-RobotoSlab-Regular": 196, "en-DO-Sans-Regular": 197, "en-KGPrimaryDotsLined": 198, "en-Blinker-Regular": 199, "en-TAN NIMBUS": 200, "en-Blueberry-Regular": 201, "en-Rosario-Regular": 202, "en-Forum": 203, "en-MistrullyRegular": 204, "en-SourceSerifPro-Regular": 205, "en-Bugaki-Regular": 206, "en-CMUSerif-Roman": 207, "en-GulfsDisplay-NormalItalic": 208, "en-PTSans-Bold": 209, "en-Sensei-Medium": 210, "en-SquadaOne-Regular": 211, "en-Arapey-Italic": 212, "en-Parisienne-Regular": 213, "en-Aleo-Italic": 214, "en-QuicheDisplay-Italic": 215, 
"en-RocaOne-It": 216, "en-Funtastic-Regular": 217, "en-PTSerif-BoldItalic": 218, "en-Muller-RegularItalic": 219, "en-ArgentCF-Regular": 220, "en-Brightwall-Italic": 221, "en-Knewave-Regular": 222, "en-TYSerif-D": 223, "en-Agrandir-Tight": 224, "en-AlfaSlabOne-Regular": 225, "en-TANTangkiwood-Display": 226, "en-Kief-Montaser-Regular": 227, "en-Gotham-Book": 228, "en-JuliusSansOne-Regular": 229, "en-CocoGothic-Italic": 230, "en-SairaCondensed-Regular": 231, "en-DellaRespira-Regular": 232, "en-Questrial-Regular": 233, "en-BukhariScript-Regular": 234, "en-HelveticaWorld-Bold": 235, "en-TANKINDRED-Display": 236, "en-CinzelDecorative-Regular": 237, "en-Vidaloka-Regular": 238, "en-AlegreyaSansSC-Black": 239, "en-FeelingPassionate-Regular": 240, "en-QuincyCF-Regular": 241, "en-FiraCode-Regular": 242, "en-Genty-Regular": 243, "en-Nickainley-Normal": 244, "en-RubikOne-Regular": 245, "en-Gidole-Regular": 246, "en-Borsok": 247, "en-Gordita-RegularItalic": 248, "en-Scripter-Regular": 249, "en-Buffalo-Regular": 250, "en-KleinText-Regular": 251, "en-Creepster-Regular": 252, "en-Arvo-Bold": 253, "en-GabrielSans-NormalItalic": 254, "en-Heebo-Black": 255, "en-LexendExa-Regular": 256, "en-BrixtonSansTC-Regular": 257, "en-GildaDisplay-Regular": 258, "en-ChunkFive-Roman": 259, "en-Amaranth-BoldItalic": 260, "en-BubbleboddyNeue-Regular": 261, "en-MavenPro-Bold": 262, "en-TTDrugs-Italic": 263, "en-CyGrotesk-KeyRegular": 264, "en-VarelaRound-Regular": 265, "en-Ruda-Black": 266, "en-SafiraMarch": 267, "en-BloggerSans": 268, "en-TANHEADLINE-Regular": 269, "en-SloopScriptPro-Regular": 270, "en-NeueMontreal-Regular": 271, "en-Schoolbell-Regular": 272, "en-SigherRegular": 273, "en-InriaSerif-Regular": 274, "en-JetBrainsMono-Regular": 275, "en-MADEEvolveSans": 276, "en-Dekko": 277, "en-Handyman-Regular": 278, "en-Aileron-BoldItalic": 279, "en-Bright-Italic": 280, "en-Solway-Regular": 281, "en-Higuen-Regular": 282, "en-WedgesItalic": 283, "en-TANASHFORD-BOLD": 284, "en-IBMPlexMono": 285, 
"en-RacingSansOne-Regular": 286, "en-RegularBrush": 287, "en-OpenSans-LightItalic": 288, "en-SpecialElite-Regular": 289, "en-FuturaLTPro-Medium": 290, "en-MaragsaDisplay": 291, "en-BigShouldersDisplay-Regular": 292, "en-BDSans-Regular": 293, "en-RasputinRegular": 294, "en-Yvesyvesdrawing-BoldItalic": 295, "en-Bitter-Regular": 296, "en-LuckiestGuy-Regular": 297, "en-CanvaSchoolFontDotted": 298, "en-TTFirsNeue-Italic": 299, "en-Sunday-Regular": 300, "en-HKGothic-MediumItalic": 301, "en-CaveatBrush-Regular": 302, "en-HeliosExt": 303, "en-ArchitectsDaughter-Regular": 304, "en-Angelina": 305, "en-Calistoga-Regular": 306, "en-ArchivoNarrow-Regular": 307, "en-ObjectSans-MediumSlanted": 308, "en-AyrLucidityCondensed-Regular": 309, "en-Nexa-RegularItalic": 310, "en-Lustria-Regular": 311, "en-Amsterdam-TwoSlant": 312, "en-Virtual-Regular": 313, "en-Brusher-Regular": 314, "en-NF-Lepetitcochon-Regular": 315, "en-TANTWINKLE": 316, "en-LeJour-Serif": 317, "en-Prata-Regular": 318, "en-PPWoodland-Regular": 319, "en-PlayfairDisplay-BoldItalic": 320, "en-AmaticSC-Regular": 321, "en-Cabin-Regular": 322, "en-Manjari-Bold": 323, "en-MrDafoe-Regular": 324, "en-TTRamillas-Italic": 325, "en-Luckybones-Bold": 326, "en-DarkerGrotesque-Light": 327, "en-BellabooRegular": 328, "en-CormorantSC-Bold": 329, "en-GochiHand-Regular": 330, "en-Atteron": 331, "en-RocaTwo-Lt": 332, "en-ZCOOLXiaoWei-Regular": 333, "en-TANSONGBIRD": 334, "en-HeadingNow-74Regular": 335, "en-Luthier-BoldItalic": 336, "en-Oregano-Regular": 337, "en-AyrTropikaIsland-Int": 338, "en-Mali-Regular": 339, "en-DidactGothic-Regular": 340, "en-Lovelace-Regular": 341, "en-BakerieSmooth-Regular": 342, "en-CarterOne": 343, "en-HussarBd": 344, "en-OldStandard-Italic": 345, "en-TAN-ASTORIA-Display": 346, "en-rugratssans-Regular": 347, "en-BMHANNA": 348, "en-BetterSaturday": 349, "en-AdigianaToybox": 350, "en-Sailors": 351, "en-PlayfairDisplaySC-Italic": 352, "en-Etna-Regular": 353, "en-Revive80Signature": 354, "en-CAGenerated": 355, 
"en-Poppins-Regular": 356, "en-Jonathan-Regular": 357, "en-Pacifico-Regular": 358, "en-Saira-Black": 359, "en-Loubag-Regular": 360, "en-Decalotype-Black": 361, "en-Mansalva-Regular": 362, "en-Allura-Regular": 363, "en-ProximaNova-Bold": 364, "en-TANMIGNON-DISPLAY": 365, "en-ArsenicaAntiqua-Regular": 366, "en-BreulGroteskA-RegularItalic": 367, "en-HKModular-Bold": 368, "en-TANNightingale-Regular": 369, "en-AristotelicaProCndTxt-Rg": 370, "en-Aprila-Regular": 371, "en-Tomorrow-Regular": 372, "en-AngellaWhite": 373, "en-KaushanScript-Regular": 374, "en-NotoSans": 375, "en-LeJour-Script": 376, "en-BrixtonTC-Regular": 377, "en-OleoScript-Regular": 378, "en-Cakerolli-Regular": 379, "en-Lobster-Regular": 380, "en-FrunchySerif-Regular": 381, "en-PorcelainRegular": 382, "en-AlojaExtended": 383, "en-SergioTrendy-Italic": 384, "en-LovelaceText-Bold": 385, "en-Anaktoria": 386, "en-JimmyScript-Light": 387, "en-IBMPlexSerif": 388, "en-Marta": 389, "en-Mango-Regular": 390, "en-Overpass-Italic": 391, "en-Hagrid-Regular": 392, "en-ElikaGorica": 393, "en-Amiko-Regular": 394, "en-EFCOBrookshire-Regular": 395, "en-Caladea-Regular": 396, "en-MoonlightBold": 397, "en-Staatliches-Regular": 398, "en-Helios-Bold": 399, "en-Satisfy-Regular": 400, "en-NexaScript-Regular": 401, "en-Trocchi-Regular": 402, "en-March": 403, "en-IbarraRealNova-Regular": 404, "en-Nectarine-Regular": 405, "en-Overpass-Light": 406, "en-TruetypewriterPolyglOTT": 407, "en-Bangers-Regular": 408, "en-Lazord-BoldExpandedItalic": 409, "en-Chloe-Regular": 410, "en-BaskervilleDisplayPT-Regular": 411, "en-Bright-Regular": 412, "en-Vollkorn-Regular": 413, "en-Harmattan": 414, "en-SortsMillGoudy-Regular": 415, "en-Biryani-Bold": 416, "en-SugoProDisplay-Italic": 417, "en-Lazord-BoldItalic": 418, "en-Alike-Regular": 419, "en-PermanentMarker-Regular": 420, "en-Sacramento-Regular": 421, "en-HKGroteskPro-Italic": 422, "en-Aleo-BoldItalic": 423, "en-Noot": 424, "en-TANGARLAND-Regular": 425, "en-Twister": 426, "en-Arsenal-Italic": 
427, "en-Bogart-Italic": 428, "en-BethEllen-Regular": 429, "en-Caveat-Regular": 430, "en-BalsamiqSans-Bold": 431, "en-BreeSerif-Regular": 432, "en-CodecPro-ExtraBold": 433, "en-Pierson-Light": 434, "en-CyGrotesk-WideRegular": 435, "en-Lumios-Marker": 436, "en-Comfortaa-Bold": 437, "en-TraceFontRegular": 438, "en-RTL-AdamScript-Regular": 439, "en-EastmanGrotesque-Italic": 440, "en-Kalam-Bold": 441, "en-ChauPhilomeneOne-Regular": 442, "en-Coiny-Regular": 443, "en-Lovera": 444, "en-Gellatio": 445, "en-TitilliumWeb-Bold": 446, "en-OilvareBase-Italic": 447, "en-Catamaran-Black": 448, "en-Anteb-Italic": 449, "en-SueEllenFrancisco": 450, "en-SweetApricot": 451, "en-BrightSunshine": 452, "en-IM_FELL_Double_Pica_Italic": 453, "en-Granaina-limpia": 454, "en-TANPARFAIT": 455, "en-AcherusGrotesque-Regular": 456, "en-AwesomeLathusca-Italic": 457, "en-Signika-Bold": 458, "en-Andasia": 459, "en-DO-AllCaps-Slanted": 460, "en-Zenaida-Regular": 461, "en-Fahkwang-Regular": 462, "en-Play-Regular": 463, "en-BERNIERRegular-Regular": 464, "en-PlumaThin-Regular": 465, "en-SportsWorld": 466, "en-Garet-Black": 467, "en-CarolloPlayscript-BlackItalic": 468, "en-Cheque-Regular": 469, "en-SEGO": 470, "en-BobbyJones-Condensed": 471, "en-NexaSlab-RegularItalic": 472, "en-DancingScript-Regular": 473, "en-PaalalabasDisplayWideBETA": 474, "en-Magnolia-Script": 475, "en-OpunMai-400It": 476, "en-MadelynFill-Regular": 477, "en-ZingRust-Base": 478, "en-FingerPaint-Regular": 479, "en-BostonAngel-Light": 480, "en-Gliker-RegularExpanded": 481, "en-Ahsing": 482, "en-Engagement-Regular": 483, "en-EyesomeScript": 484, "en-LibraSerifModern-Regular": 485, "en-London-Regular": 486, "en-AtkinsonHyperlegible-Regular": 487, "en-StadioNow-TextItalic": 488, "en-Aniyah": 489, "en-ITCAvantGardePro-Bold": 490, "en-Comica-Regular": 491, "en-Coustard-Regular": 492, "en-Brice-BoldCondensed": 493, "en-TANNEWYORK-Bold": 494, "en-TANBUSTER-Bold": 495, "en-Alatsi-Regular": 496, "en-TYSerif-Book": 497, "en-Jingleberry": 498, 
"en-Rajdhani-Bold": 499, "en-LobsterTwo-BoldItalic": 500, "en-BestLight-Medium": 501, "en-Hitchcut-Regular": 502, "en-GermaniaOne-Regular": 503, "en-Emitha-Script": 504, "en-LemonTuesday": 505, "en-Cubao_Free_Regular": 506, "en-MonterchiSerif-Regular": 507, "en-AllertaStencil-Regular": 508, "en-RTL-Sondos-Regular": 509, "en-HomemadeApple-Regular": 510, "en-CosmicOcto-Medium": 511, "cn-HelloFont-FangHuaTi": 0, "cn-HelloFont-ID-DianFangSong-Bold": 1, "cn-HelloFont-ID-DianFangSong": 2, "cn-HelloFont-ID-DianHei-CEJ": 3, "cn-HelloFont-ID-DianHei-DEJ": 4, "cn-HelloFont-ID-DianHei-EEJ": 5, "cn-HelloFont-ID-DianHei-FEJ": 6, "cn-HelloFont-ID-DianHei-GEJ": 7, "cn-HelloFont-ID-DianKai-Bold": 8, "cn-HelloFont-ID-DianKai": 9, "cn-HelloFont-WenYiHei": 10, "cn-Hellofont-ID-ChenYanXingKai": 11, "cn-Hellofont-ID-DaZiBao": 12, "cn-Hellofont-ID-DaoCaoRen": 13, "cn-Hellofont-ID-JianSong": 14, "cn-Hellofont-ID-JiangHuZhaoPaiHei": 15, "cn-Hellofont-ID-KeSong": 16, "cn-Hellofont-ID-LeYuanTi": 17, "cn-Hellofont-ID-Pinocchio": 18, "cn-Hellofont-ID-QiMiaoTi": 19, "cn-Hellofont-ID-QingHuaKai": 20, "cn-Hellofont-ID-QingHuaXingKai": 21, "cn-Hellofont-ID-ShanShuiXingKai": 22, "cn-Hellofont-ID-ShouXieQiShu": 23, "cn-Hellofont-ID-ShouXieTongZhenTi": 24, "cn-Hellofont-ID-TengLingTi": 25, "cn-Hellofont-ID-XiaoLiShu": 26, "cn-Hellofont-ID-XuanZhenSong": 27, "cn-Hellofont-ID-ZhongLingXingKai": 28, "cn-HellofontIDJiaoTangTi": 29, "cn-HellofontIDJiuZhuTi": 30, "cn-HuXiaoBao-SaoBao": 31, "cn-HuXiaoBo-NanShen": 32, "cn-HuXiaoBo-ZhenShuai": 33, "cn-SourceHanSansSC-Bold": 34, "cn-SourceHanSansSC-ExtraLight": 35, "cn-SourceHanSansSC-Heavy": 36, "cn-SourceHanSansSC-Light": 37, "cn-SourceHanSansSC-Medium": 38, "cn-SourceHanSansSC-Normal": 39, "cn-SourceHanSansSC-Regular": 40, "cn-SourceHanSerifSC-Bold": 41, "cn-SourceHanSerifSC-ExtraLight": 42, "cn-SourceHanSerifSC-Heavy": 43, "cn-SourceHanSerifSC-Light": 44, "cn-SourceHanSerifSC-Medium": 45, "cn-SourceHanSerifSC-Regular": 46, "cn-SourceHanSerifSC-SemiBold": 
47, "cn-xiaowei": 48, "cn-AaJianHaoTi": 49, "cn-AlibabaPuHuiTi-Bold": 50, "cn-AlibabaPuHuiTi-Heavy": 51, "cn-AlibabaPuHuiTi-Light": 52, "cn-AlibabaPuHuiTi-Medium": 53, "cn-AlibabaPuHuiTi-Regular": 54, "cn-CanvaAcidBoldSC": 55, "cn-CanvaBreezeCN": 56, "cn-CanvaBumperCropSC": 57, "cn-CanvaCakeShopCN": 58, "cn-CanvaEndeavorBlackSC": 59, "cn-CanvaJoyHeiCN": 60, "cn-CanvaLiCN": 61, "cn-CanvaOrientalBrushCN": 62, "cn-CanvaPoster": 63, "cn-CanvaQinfuCalligraphyCN": 64, "cn-CanvaSweetHeartCN": 65, "cn-CanvaSwordLikeDreamCN": 66, "cn-CanvaTangyuanHandwritingCN": 67, "cn-CanvaWanderWorldCN": 68, "cn-CanvaWenCN": 69, "cn-DianZiChunYi": 70, "cn-GenSekiGothicTW-H": 71, "cn-GenWanMinTW-L": 72, "cn-GenYoMinTW-B": 73, "cn-GenYoMinTW-EL": 74, "cn-GenYoMinTW-H": 75, "cn-GenYoMinTW-M": 76, "cn-GenYoMinTW-R": 77, "cn-GenYoMinTW-SB": 78, "cn-HYQiHei-AZEJ": 79, "cn-HYQiHei-EES": 80, "cn-HanaMinA": 81, "cn-HappyZcool-2016": 82, "cn-HelloFont ZJ KeKouKeAiTi": 83, "cn-HelloFont-ID-BoBoTi": 84, "cn-HelloFont-ID-FuGuHei-25": 85, "cn-HelloFont-ID-FuGuHei-35": 86, "cn-HelloFont-ID-FuGuHei-45": 87, "cn-HelloFont-ID-FuGuHei-55": 88, "cn-HelloFont-ID-FuGuHei-65": 89, "cn-HelloFont-ID-FuGuHei-75": 90, "cn-HelloFont-ID-FuGuHei-85": 91, "cn-HelloFont-ID-HeiKa": 92, "cn-HelloFont-ID-HeiTang": 93, "cn-HelloFont-ID-JianSong-95": 94, "cn-HelloFont-ID-JueJiangHei-50": 95, "cn-HelloFont-ID-JueJiangHei-55": 96, "cn-HelloFont-ID-JueJiangHei-60": 97, "cn-HelloFont-ID-JueJiangHei-65": 98, "cn-HelloFont-ID-JueJiangHei-70": 99, "cn-HelloFont-ID-JueJiangHei-75": 100, "cn-HelloFont-ID-JueJiangHei-80": 101, "cn-HelloFont-ID-KuHeiTi": 102, "cn-HelloFont-ID-LingDongTi": 103, "cn-HelloFont-ID-LingLiTi": 104, "cn-HelloFont-ID-MuFengTi": 105, "cn-HelloFont-ID-NaiNaiJiangTi": 106, "cn-HelloFont-ID-PangDu": 107, "cn-HelloFont-ID-ReLieTi": 108, "cn-HelloFont-ID-RouRun": 109, "cn-HelloFont-ID-SaShuangShouXieTi": 110, "cn-HelloFont-ID-WangZheFengFan": 111, "cn-HelloFont-ID-YouQiTi": 112, "cn-Hellofont-ID-XiaLeTi": 113, 
"cn-Hellofont-ID-XianXiaTi": 114, "cn-HuXiaoBoKuHei": 115, "cn-IDDanMoXingKai": 116, "cn-IDJueJiangHei": 117, "cn-IDMeiLingTi": 118, "cn-IDQQSugar": 119, "cn-LiuJianMaoCao-Regular": 120, "cn-LongCang-Regular": 121, "cn-MaShanZheng-Regular": 122, "cn-PangMenZhengDao-3": 123, "cn-PangMenZhengDao-Cu": 124, "cn-PangMenZhengDao": 125, "cn-SentyCaramel": 126, "cn-SourceHanSerifSC": 127, "cn-WenCang-Regular": 128, "cn-WenQuanYiMicroHei": 129, "cn-XianErTi": 130, "cn-YRDZSTJF": 131, "cn-YS-HelloFont-BangBangTi": 132, "cn-ZCOOLKuaiLe-Regular": 133, "cn-ZCOOLQingKeHuangYou-Regular": 134, "cn-ZCOOLXiaoWei-Regular": 135, "cn-ZCOOL_KuHei": 136, "cn-ZhiMangXing-Regular": 137, "cn-baotuxiaobaiti": 138, "cn-jiangxizhuokai-Regular": 139, "cn-zcool-gdh": 140, "cn-zcoolqingkehuangyouti-Regular": 141, "cn-zcoolwenyiti": 142}
checkpoints/glyph-sdxl/byt5_mapper.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d8e8c5ac933bc21e80287d2c96aa64f6e03a1936094a4dc8906ab78ecb61063
3
+ size 301553807
checkpoints/glyph-sdxl/byt5_model.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:edea29b75df65cf6ed3a8c79341292b962eed5c1dc0c111b7dc10e0817d5341c
3
+ size 874506157
checkpoints/glyph-sdxl/optimizer.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:066c9516b1b436ce2ce2aa052fdc272a6daa09e857d876ddf956229df62dbd1e
3
+ size 3839437754
checkpoints/glyph-sdxl/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b76dcf1db79cb067cf7fa4cbecbf2df9c18cc4780e14c75a5081dffd64221c95
3
+ size 988
checkpoints/glyph-sdxl/scheduler.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4aac7effc1e494aeaf74df69283e6c01de6153aa5857e62e82aeb7c9d24c23df
3
+ size 1064
checkpoints/glyph-sdxl/unet_inserted_attn.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b6af4376281be262f3b52ca0b16b0244099161693f65a7db352f53878481767
3
+ size 908
checkpoints/glyph-sdxl/unet_lora.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ac9ec31fbe654b5822caa6a83b1c83c4a09e536a0cd0b23fa5985824260662c
3
+ size 743590514
configs/glyph_multilingual_sdxl_albedo.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Model Setting
2
+ pretrained_model_name_or_path = 'stablediffusionapi/albedobase-xl-20'
3
+ pretrained_vae_model_name_or_path = 'madebyollin/sdxl-vae-fp16-fix'
4
+ revision = None
5
+
6
+ byt5_max_length = 512
7
+ byt5_mapper_type = 'T5EncoderBlockByT5Mapper'
8
+ byt5_mapper_config = dict(
9
+ num_layers=4,
10
+ sdxl_channels=2048,
11
+ )
12
+ byt5_config = dict(
13
+ byt5_name='google/byt5-small',
14
+ special_token=True,
15
+ color_special_token=True,
16
+ font_special_token=True,
17
+ color_ann_path='assets/color_idx.json',
18
+ font_ann_path='assets/multilingual_cn-en_font_idx.json',
19
+ multilingual=True,
20
+ )
21
+
22
+ attn_block_to_modify = [
23
+ "down_blocks.1.attentions.0.transformer_blocks.0",
24
+ "down_blocks.1.attentions.0.transformer_blocks.1",
25
+ "down_blocks.1.attentions.1.transformer_blocks.0",
26
+ "down_blocks.1.attentions.1.transformer_blocks.1",
27
+ "down_blocks.2.attentions.0.transformer_blocks.0",
28
+ "down_blocks.2.attentions.0.transformer_blocks.1",
29
+ "down_blocks.2.attentions.0.transformer_blocks.2",
30
+ "down_blocks.2.attentions.0.transformer_blocks.3",
31
+ "down_blocks.2.attentions.0.transformer_blocks.4",
32
+ "down_blocks.2.attentions.0.transformer_blocks.5",
33
+ "down_blocks.2.attentions.0.transformer_blocks.6",
34
+ "down_blocks.2.attentions.0.transformer_blocks.7",
35
+ "down_blocks.2.attentions.0.transformer_blocks.8",
36
+ "down_blocks.2.attentions.0.transformer_blocks.9",
37
+ "down_blocks.2.attentions.1.transformer_blocks.0",
38
+ "down_blocks.2.attentions.1.transformer_blocks.1",
39
+ "down_blocks.2.attentions.1.transformer_blocks.2",
40
+ "down_blocks.2.attentions.1.transformer_blocks.3",
41
+ "down_blocks.2.attentions.1.transformer_blocks.4",
42
+ "down_blocks.2.attentions.1.transformer_blocks.5",
43
+ "down_blocks.2.attentions.1.transformer_blocks.6",
44
+ "down_blocks.2.attentions.1.transformer_blocks.7",
45
+ "down_blocks.2.attentions.1.transformer_blocks.8",
46
+ "down_blocks.2.attentions.1.transformer_blocks.9",
47
+ "up_blocks.0.attentions.0.transformer_blocks.0",
48
+ "up_blocks.0.attentions.0.transformer_blocks.1",
49
+ "up_blocks.0.attentions.0.transformer_blocks.2",
50
+ "up_blocks.0.attentions.0.transformer_blocks.3",
51
+ "up_blocks.0.attentions.0.transformer_blocks.4",
52
+ "up_blocks.0.attentions.0.transformer_blocks.5",
53
+ "up_blocks.0.attentions.0.transformer_blocks.6",
54
+ "up_blocks.0.attentions.0.transformer_blocks.7",
55
+ "up_blocks.0.attentions.0.transformer_blocks.8",
56
+ "up_blocks.0.attentions.0.transformer_blocks.9",
57
+ "up_blocks.0.attentions.1.transformer_blocks.0",
58
+ "up_blocks.0.attentions.1.transformer_blocks.1",
59
+ "up_blocks.0.attentions.1.transformer_blocks.2",
60
+ "up_blocks.0.attentions.1.transformer_blocks.3",
61
+ "up_blocks.0.attentions.1.transformer_blocks.4",
62
+ "up_blocks.0.attentions.1.transformer_blocks.5",
63
+ "up_blocks.0.attentions.1.transformer_blocks.6",
64
+ "up_blocks.0.attentions.1.transformer_blocks.7",
65
+ "up_blocks.0.attentions.1.transformer_blocks.8",
66
+ "up_blocks.0.attentions.1.transformer_blocks.9",
67
+ "up_blocks.0.attentions.2.transformer_blocks.0",
68
+ "up_blocks.0.attentions.2.transformer_blocks.1",
69
+ "up_blocks.0.attentions.2.transformer_blocks.2",
70
+ "up_blocks.0.attentions.2.transformer_blocks.3",
71
+ "up_blocks.0.attentions.2.transformer_blocks.4",
72
+ "up_blocks.0.attentions.2.transformer_blocks.5",
73
+ "up_blocks.0.attentions.2.transformer_blocks.6",
74
+ "up_blocks.0.attentions.2.transformer_blocks.7",
75
+ "up_blocks.0.attentions.2.transformer_blocks.8",
76
+ "up_blocks.0.attentions.2.transformer_blocks.9",
77
+ "up_blocks.1.attentions.0.transformer_blocks.0",
78
+ "up_blocks.1.attentions.0.transformer_blocks.1",
79
+ "up_blocks.1.attentions.1.transformer_blocks.0",
80
+ "up_blocks.1.attentions.1.transformer_blocks.1",
81
+ "up_blocks.1.attentions.2.transformer_blocks.0",
82
+ "up_blocks.1.attentions.2.transformer_blocks.1",
83
+ "mid_block.attentions.0.transformer_blocks.0",
84
+ "mid_block.attentions.0.transformer_blocks.1",
85
+ "mid_block.attentions.0.transformer_blocks.2",
86
+ "mid_block.attentions.0.transformer_blocks.3",
87
+ "mid_block.attentions.0.transformer_blocks.4",
88
+ "mid_block.attentions.0.transformer_blocks.5",
89
+ "mid_block.attentions.0.transformer_blocks.6",
90
+ "mid_block.attentions.0.transformer_blocks.7",
91
+ "mid_block.attentions.0.transformer_blocks.8",
92
+ "mid_block.attentions.0.transformer_blocks.9",
93
+ ]
94
+
95
+ unet_lora_rank = 128
96
+ inference_dtype = 'fp16'
configs/glyph_sdxl.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Model Setting
2
+ pretrained_model_name_or_path = 'stabilityai/stable-diffusion-xl-base-1.0'
3
+ pretrained_vae_model_name_or_path = 'madebyollin/sdxl-vae-fp16-fix'
4
+ revision = None
5
+
6
+ byt5_max_length = 512
7
+ byt5_mapper_type = 'T5EncoderBlockByT5Mapper'
8
+ byt5_mapper_config = dict(
9
+ num_layers=4,
10
+ sdxl_channels=2048,
11
+ )
12
+ byt5_config = dict(
13
+ byt5_name='google/byt5-small',
14
+ special_token=True,
15
+ color_special_token=True,
16
+ font_special_token=True,
17
+ color_ann_path='assets/color_idx.json',
18
+ font_ann_path='assets/font_idx_512.json',
19
+ multilingual=False,
20
+ )
21
+
22
+ attn_block_to_modify = [
23
+ "down_blocks.1.attentions.0.transformer_blocks.0",
24
+ "down_blocks.1.attentions.0.transformer_blocks.1",
25
+ "down_blocks.1.attentions.1.transformer_blocks.0",
26
+ "down_blocks.1.attentions.1.transformer_blocks.1",
27
+ "down_blocks.2.attentions.0.transformer_blocks.0",
28
+ "down_blocks.2.attentions.0.transformer_blocks.1",
29
+ "down_blocks.2.attentions.0.transformer_blocks.2",
30
+ "down_blocks.2.attentions.0.transformer_blocks.3",
31
+ "down_blocks.2.attentions.0.transformer_blocks.4",
32
+ "down_blocks.2.attentions.0.transformer_blocks.5",
33
+ "down_blocks.2.attentions.0.transformer_blocks.6",
34
+ "down_blocks.2.attentions.0.transformer_blocks.7",
35
+ "down_blocks.2.attentions.0.transformer_blocks.8",
36
+ "down_blocks.2.attentions.0.transformer_blocks.9",
37
+ "down_blocks.2.attentions.1.transformer_blocks.0",
38
+ "down_blocks.2.attentions.1.transformer_blocks.1",
39
+ "down_blocks.2.attentions.1.transformer_blocks.2",
40
+ "down_blocks.2.attentions.1.transformer_blocks.3",
41
+ "down_blocks.2.attentions.1.transformer_blocks.4",
42
+ "down_blocks.2.attentions.1.transformer_blocks.5",
43
+ "down_blocks.2.attentions.1.transformer_blocks.6",
44
+ "down_blocks.2.attentions.1.transformer_blocks.7",
45
+ "down_blocks.2.attentions.1.transformer_blocks.8",
46
+ "down_blocks.2.attentions.1.transformer_blocks.9",
47
+ "up_blocks.0.attentions.0.transformer_blocks.0",
48
+ "up_blocks.0.attentions.0.transformer_blocks.1",
49
+ "up_blocks.0.attentions.0.transformer_blocks.2",
50
+ "up_blocks.0.attentions.0.transformer_blocks.3",
51
+ "up_blocks.0.attentions.0.transformer_blocks.4",
52
+ "up_blocks.0.attentions.0.transformer_blocks.5",
53
+ "up_blocks.0.attentions.0.transformer_blocks.6",
54
+ "up_blocks.0.attentions.0.transformer_blocks.7",
55
+ "up_blocks.0.attentions.0.transformer_blocks.8",
56
+ "up_blocks.0.attentions.0.transformer_blocks.9",
57
+ "up_blocks.0.attentions.1.transformer_blocks.0",
58
+ "up_blocks.0.attentions.1.transformer_blocks.1",
59
+ "up_blocks.0.attentions.1.transformer_blocks.2",
60
+ "up_blocks.0.attentions.1.transformer_blocks.3",
61
+ "up_blocks.0.attentions.1.transformer_blocks.4",
62
+ "up_blocks.0.attentions.1.transformer_blocks.5",
63
+ "up_blocks.0.attentions.1.transformer_blocks.6",
64
+ "up_blocks.0.attentions.1.transformer_blocks.7",
65
+ "up_blocks.0.attentions.1.transformer_blocks.8",
66
+ "up_blocks.0.attentions.1.transformer_blocks.9",
67
+ "up_blocks.0.attentions.2.transformer_blocks.0",
68
+ "up_blocks.0.attentions.2.transformer_blocks.1",
69
+ "up_blocks.0.attentions.2.transformer_blocks.2",
70
+ "up_blocks.0.attentions.2.transformer_blocks.3",
71
+ "up_blocks.0.attentions.2.transformer_blocks.4",
72
+ "up_blocks.0.attentions.2.transformer_blocks.5",
73
+ "up_blocks.0.attentions.2.transformer_blocks.6",
74
+ "up_blocks.0.attentions.2.transformer_blocks.7",
75
+ "up_blocks.0.attentions.2.transformer_blocks.8",
76
+ "up_blocks.0.attentions.2.transformer_blocks.9",
77
+ "up_blocks.1.attentions.0.transformer_blocks.0",
78
+ "up_blocks.1.attentions.0.transformer_blocks.1",
79
+ "up_blocks.1.attentions.1.transformer_blocks.0",
80
+ "up_blocks.1.attentions.1.transformer_blocks.1",
81
+ "up_blocks.1.attentions.2.transformer_blocks.0",
82
+ "up_blocks.1.attentions.2.transformer_blocks.1",
83
+ "mid_block.attentions.0.transformer_blocks.0",
84
+ "mid_block.attentions.0.transformer_blocks.1",
85
+ "mid_block.attentions.0.transformer_blocks.2",
86
+ "mid_block.attentions.0.transformer_blocks.3",
87
+ "mid_block.attentions.0.transformer_blocks.4",
88
+ "mid_block.attentions.0.transformer_blocks.5",
89
+ "mid_block.attentions.0.transformer_blocks.6",
90
+ "mid_block.attentions.0.transformer_blocks.7",
91
+ "mid_block.attentions.0.transformer_blocks.8",
92
+ "mid_block.attentions.0.transformer_blocks.9",
93
+ ]
94
+
95
+ unet_lora_rank = 128
96
+ inference_dtype = 'fp16'
configs/glyph_sdxl_albedo.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #### Model Setting
2
+ pretrained_model_name_or_path = 'stablediffusionapi/albedobase-xl-20'
3
+ pretrained_vae_model_name_or_path = 'madebyollin/sdxl-vae-fp16-fix'
4
+ revision = None
5
+
6
+ byt5_max_length = 512
7
+ byt5_mapper_type = 'T5EncoderBlockByT5Mapper'
8
+ byt5_mapper_config = dict(
9
+ num_layers=4,
10
+ sdxl_channels=2048,
11
+ )
12
+ byt5_config = dict(
13
+ byt5_name='google/byt5-small',
14
+ special_token=True,
15
+ color_special_token=True,
16
+ font_special_token=True,
17
+ color_ann_path='assets/color_idx.json',
18
+ font_ann_path='assets/font_idx_512.json',
19
+ multilingual=False,
20
+ )
21
+
22
+ attn_block_to_modify = [
23
+ "down_blocks.1.attentions.0.transformer_blocks.0",
24
+ "down_blocks.1.attentions.0.transformer_blocks.1",
25
+ "down_blocks.1.attentions.1.transformer_blocks.0",
26
+ "down_blocks.1.attentions.1.transformer_blocks.1",
27
+ "down_blocks.2.attentions.0.transformer_blocks.0",
28
+ "down_blocks.2.attentions.0.transformer_blocks.1",
29
+ "down_blocks.2.attentions.0.transformer_blocks.2",
30
+ "down_blocks.2.attentions.0.transformer_blocks.3",
31
+ "down_blocks.2.attentions.0.transformer_blocks.4",
32
+ "down_blocks.2.attentions.0.transformer_blocks.5",
33
+ "down_blocks.2.attentions.0.transformer_blocks.6",
34
+ "down_blocks.2.attentions.0.transformer_blocks.7",
35
+ "down_blocks.2.attentions.0.transformer_blocks.8",
36
+ "down_blocks.2.attentions.0.transformer_blocks.9",
37
+ "down_blocks.2.attentions.1.transformer_blocks.0",
38
+ "down_blocks.2.attentions.1.transformer_blocks.1",
39
+ "down_blocks.2.attentions.1.transformer_blocks.2",
40
+ "down_blocks.2.attentions.1.transformer_blocks.3",
41
+ "down_blocks.2.attentions.1.transformer_blocks.4",
42
+ "down_blocks.2.attentions.1.transformer_blocks.5",
43
+ "down_blocks.2.attentions.1.transformer_blocks.6",
44
+ "down_blocks.2.attentions.1.transformer_blocks.7",
45
+ "down_blocks.2.attentions.1.transformer_blocks.8",
46
+ "down_blocks.2.attentions.1.transformer_blocks.9",
47
+ "up_blocks.0.attentions.0.transformer_blocks.0",
48
+ "up_blocks.0.attentions.0.transformer_blocks.1",
49
+ "up_blocks.0.attentions.0.transformer_blocks.2",
50
+ "up_blocks.0.attentions.0.transformer_blocks.3",
51
+ "up_blocks.0.attentions.0.transformer_blocks.4",
52
+ "up_blocks.0.attentions.0.transformer_blocks.5",
53
+ "up_blocks.0.attentions.0.transformer_blocks.6",
54
+ "up_blocks.0.attentions.0.transformer_blocks.7",
55
+ "up_blocks.0.attentions.0.transformer_blocks.8",
56
+ "up_blocks.0.attentions.0.transformer_blocks.9",
57
+ "up_blocks.0.attentions.1.transformer_blocks.0",
58
+ "up_blocks.0.attentions.1.transformer_blocks.1",
59
+ "up_blocks.0.attentions.1.transformer_blocks.2",
60
+ "up_blocks.0.attentions.1.transformer_blocks.3",
61
+ "up_blocks.0.attentions.1.transformer_blocks.4",
62
+ "up_blocks.0.attentions.1.transformer_blocks.5",
63
+ "up_blocks.0.attentions.1.transformer_blocks.6",
64
+ "up_blocks.0.attentions.1.transformer_blocks.7",
65
+ "up_blocks.0.attentions.1.transformer_blocks.8",
66
+ "up_blocks.0.attentions.1.transformer_blocks.9",
67
+ "up_blocks.0.attentions.2.transformer_blocks.0",
68
+ "up_blocks.0.attentions.2.transformer_blocks.1",
69
+ "up_blocks.0.attentions.2.transformer_blocks.2",
70
+ "up_blocks.0.attentions.2.transformer_blocks.3",
71
+ "up_blocks.0.attentions.2.transformer_blocks.4",
72
+ "up_blocks.0.attentions.2.transformer_blocks.5",
73
+ "up_blocks.0.attentions.2.transformer_blocks.6",
74
+ "up_blocks.0.attentions.2.transformer_blocks.7",
75
+ "up_blocks.0.attentions.2.transformer_blocks.8",
76
+ "up_blocks.0.attentions.2.transformer_blocks.9",
77
+ "up_blocks.1.attentions.0.transformer_blocks.0",
78
+ "up_blocks.1.attentions.0.transformer_blocks.1",
79
+ "up_blocks.1.attentions.1.transformer_blocks.0",
80
+ "up_blocks.1.attentions.1.transformer_blocks.1",
81
+ "up_blocks.1.attentions.2.transformer_blocks.0",
82
+ "up_blocks.1.attentions.2.transformer_blocks.1",
83
+ "mid_block.attentions.0.transformer_blocks.0",
84
+ "mid_block.attentions.0.transformer_blocks.1",
85
+ "mid_block.attentions.0.transformer_blocks.2",
86
+ "mid_block.attentions.0.transformer_blocks.3",
87
+ "mid_block.attentions.0.transformer_blocks.4",
88
+ "mid_block.attentions.0.transformer_blocks.5",
89
+ "mid_block.attentions.0.transformer_blocks.6",
90
+ "mid_block.attentions.0.transformer_blocks.7",
91
+ "mid_block.attentions.0.transformer_blocks.8",
92
+ "mid_block.attentions.0.transformer_blocks.9",
93
+ ]
94
+
95
+ unet_lora_rank = 128
96
+ inference_dtype = 'fp16'
demo/constants.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ MAX_TEXT_BOX = 20
2
+ MAX_PROMPT_LENGTH = 512
examples/easter.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "texts": [
3
+ "MAY ALLYOUR PRAYERS BE ANSWERED",
4
+ "HAVE A HAPPY",
5
+ "Easter Day"
6
+ ],
7
+ "styles": [
8
+ {
9
+ "color": "#5a741c",
10
+ "font-family": "Gagalin-Regular"
11
+ },
12
+ {
13
+ "color": "#5a741c",
14
+ "font-family": "Gagalin-Regular"
15
+ },
16
+ {
17
+ "color": "#5a741c",
18
+ "font-family": "Brusher-Regular"
19
+ }
20
+ ],
21
+ "bbox": [
22
+ [
23
+ 0.08267477203647416,
24
+ 0.5355623100303951,
25
+ 0.42857142857142855,
26
+ 0.07477203647416414
27
+ ],
28
+ [
29
+ 0.08389057750759879,
30
+ 0.1951367781155015,
31
+ 0.38054711246200607,
32
+ 0.03768996960486322
33
+ ],
34
+ [
35
+ 0.07537993920972644,
36
+ 0.2601823708206687,
37
+ 0.49544072948328266,
38
+ 0.14650455927051673
39
+ ]
40
+ ],
41
+ "bg_prompt": "Facebook Post. The image features a small bunny rabbit sitting in a basket filled with various flowers. The basket is placed on a yellow background, creating a vibrant and cheerful scene. The flowers surrounding the rabbit come in different sizes and colors, adding to the overall visual appeal of the image. The rabbit appears to be the main focus of the scene, and its presence among the flowers creates a sense of harmony and balance. Tags: green, yellow, minimalist, easter day, happy easter day, easter, happy easter, decoration, happy, egg, spring, selebration, poster, illustration, greeting, season, design, colorful, cute, template",
42
+ "seed": 1
43
+ }
examples/easter.png ADDED
examples/new_year.json ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "texts": [
3
+ "Happy New Year",
4
+ "2024",
5
+ "All THE BEST",
6
+ "A fresh start to start a change for the better."
7
+ ],
8
+ "styles": [
9
+ {
10
+ "color": "#7b1f7b",
11
+ "font-family": "Caveat-Regular"
12
+ },
13
+ {
14
+ "color": "#1d1d67",
15
+ "font-family": "Gagalin-Regular"
16
+ },
17
+ {
18
+ "color": "#060606",
19
+ "font-family": "Quicksand-Light"
20
+ },
21
+ {
22
+ "color": "#060606",
23
+ "font-family": "Quicksand-Light"
24
+ }
25
+ ],
26
+ "bbox": [
27
+ [
28
+ 0.2936170212765957,
29
+ 0.2887537993920973,
30
+ 0.40303951367781155,
31
+ 0.07173252279635259
32
+ ],
33
+ [
34
+ 0.24984802431610942,
35
+ 0.3951367781155015,
36
+ 0.46200607902735563,
37
+ 0.17203647416413373
38
+ ],
39
+ [
40
+ 0.3951367781155015,
41
+ 0.1094224924012158,
42
+ 0.2109422492401216,
43
+ 0.02796352583586626
44
+ ],
45
+ [
46
+ 0.20911854103343466,
47
+ 0.6127659574468085,
48
+ 0.5586626139817629,
49
+ 0.08085106382978724
50
+ ]
51
+ ],
52
+ "bg_prompt": "Instagram Posts. The image features a white background with a variety of colorful flowers and decorations. There are several pink flowers scattered throughout the scene, with some positioned closer to the top and others near the bottom. A blue flower can also be seen in the middle of the image. The overall composition creates a visually appealing and vibrant display. Tags: grey, navy, purple, pink, teal, colorful, illustration, happy, celebration, post, party, year, new, event, celebrate, happy new year, new year, countdown, sparkle, firework",
53
+ "seed": 1
54
+ }
examples/new_year.png ADDED
examples/pancake.json ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "texts": [
3
+ "Get 75% Discount for your first order",
4
+ "Order Now",
5
+ "National Pancake Day"
6
+ ],
7
+ "styles": [
8
+ {
9
+ "color": "#545454",
10
+ "font-family": "MoreSugarRegular"
11
+ },
12
+ {
13
+ "color": "#ffffff",
14
+ "font-family": "Chewy-Regular"
15
+ },
16
+ {
17
+ "color": "#593535",
18
+ "font-family": "Chewy-Regular"
19
+ }
20
+ ],
21
+ "bbox": [
22
+ [
23
+ 0.043161094224924014,
24
+ 0.5963525835866261,
25
+ 0.2936170212765957,
26
+ 0.08389057750759879
27
+ ],
28
+ [
29
+ 0.12279635258358662,
30
+ 0.79209726443769,
31
+ 0.26382978723404255,
32
+ 0.05167173252279635
33
+ ],
34
+ [
35
+ 0.044984802431610946,
36
+ 0.09787234042553192,
37
+ 0.4413373860182371,
38
+ 0.4158054711246201
39
+ ]
40
+ ],
41
+ "_id": "EAFKNkLcNfU-1-0-web-2-N41D3IbRZcs",
42
+ "category": "Instagram Posts",
43
+ "tags": [
44
+ "brown",
45
+ "peach",
46
+ "grey",
47
+ "modern",
48
+ "minimalist",
49
+ "simple",
50
+ "colorful",
51
+ "illustration",
52
+ "Instagram post",
53
+ "instagram",
54
+ "post",
55
+ "national pancake day",
56
+ "international pancake day",
57
+ "happy pancake day",
58
+ "pancake day",
59
+ "pancake",
60
+ "sweet",
61
+ "cake",
62
+ "discount",
63
+ "sale"
64
+ ],
65
+ "bg_prompt": "Instagram Posts. The image features a stack of pancakes with syrup and strawberries on top. The pancakes are arranged in a visually appealing manner, with some pancakes placed on top of each other. The syrup is drizzled generously over the pancakes, and the strawberries are scattered around, adding a touch of color and freshness to the scene. The overall presentation of the pancakes is appetizing and inviting. Tags: brown, peach, grey, modern, minimalist, simple, colorful, illustration, Instagram post, instagram, post, national pancake day, international pancake day, happy pancake day, pancake day, pancake, sweet, cake, discount, sale",
66
+ "seed": 1
67
+ }
examples/pancake.png ADDED
examples/shower.json ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "texts": [
3
+ "RSVP to +123-456-7890",
4
+ "Olivia Wilson",
5
+ "Baby Shower",
6
+ "Please Join Us For a",
7
+ "In Honoring",
8
+ "23 November, 2021 | 03:00 PM Fauget Hotels"
9
+ ],
10
+ "styles": [
11
+ {
12
+ "color": "#c27b33",
13
+ "font-family": "LilitaOne"
14
+ },
15
+ {
16
+ "color": "#83940f",
17
+ "font-family": "Sensei-Medium"
18
+ },
19
+ {
20
+ "color": "#889818",
21
+ "font-family": "Sensei-Medium"
22
+ },
23
+ {
24
+ "color": "#c27b33",
25
+ "font-family": "LilitaOne"
26
+ },
27
+ {
28
+ "color": "#c27b33",
29
+ "font-family": "LilitaOne"
30
+ },
31
+ {
32
+ "color": "#c27b33",
33
+ "font-family": "LilitaOne"
34
+ }
35
+ ],
36
+ "bbox": [
37
+ [
38
+ 0.07112462006079028,
39
+ 0.6462006079027356,
40
+ 0.3373860182370821,
41
+ 0.026747720364741642
42
+ ],
43
+ [
44
+ 0.07051671732522796,
45
+ 0.38662613981762917,
46
+ 0.37264437689969604,
47
+ 0.059574468085106386
48
+ ],
49
+ [
50
+ 0.07234042553191489,
51
+ 0.15623100303951368,
52
+ 0.6547112462006079,
53
+ 0.12401215805471125
54
+ ],
55
+ [
56
+ 0.0662613981762918,
57
+ 0.06747720364741641,
58
+ 0.3981762917933131,
59
+ 0.035866261398176294
60
+ ],
61
+ [
62
+ 0.07051671732522796,
63
+ 0.31550151975683893,
64
+ 0.22006079027355624,
65
+ 0.03951367781155015
66
+ ],
67
+ [
68
+ 0.06990881458966565,
69
+ 0.48328267477203646,
70
+ 0.39878419452887537,
71
+ 0.1094224924012158
72
+ ]
73
+ ],
74
+ "bg_prompt": "Cards and invitations. The image features a large gray elephant sitting in a field of flowers, holding a smaller elephant in its arms. The scene is quite serene and picturesque, with the two elephants being the main focus of the image. The field is filled with various flowers, creating a beautiful and vibrant backdrop for the elephants. Tags: Light green, orange, Illustration, watercolor, playful, Baby shower invitation, baby boy shower invitation, baby boy, welcoming baby boy, koala baby shower invitation, baby shower invitation for baby shower, baby boy invitation, background, playful baby shower card, baby shower, card, newborn, born, Baby Shirt Baby Shower Invitation",
75
+ "seed": 0
76
+ }
examples/shower.png ADDED
glyph_sdxl/custom_diffusers/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ from .pipelines import *
2
+ from .models import *
glyph_sdxl/custom_diffusers/models/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .cross_attn_insert_transformer_blocks import CrossAttnInsertBasicTransformerBlock
2
+
3
+ __all__ = ['CrossAttnInsertBasicTransformerBlock']
glyph_sdxl/custom_diffusers/models/cross_attn_insert_transformer_blocks.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional, Dict, Any
2
+ import copy
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from diffusers.models.attention import (
8
+ BasicTransformerBlock,
9
+ SinusoidalPositionalEmbedding,
10
+ AdaLayerNorm,
11
+ AdaLayerNormZero,
12
+ AdaLayerNormContinuous,
13
+ Attention,
14
+ FeedForward,
15
+ GatedSelfAttentionDense,
16
+ GELU,
17
+ GEGLU,
18
+ ApproximateGELU,
19
+ _chunked_feed_forward,
20
+ )
21
+
22
class CrossAttnInsertBasicTransformerBlock(BasicTransformerBlock):
    """Transformer block whose cross-attention attends to text and glyph tokens jointly.

    The layer layout (self-attention, cross-attention, feed-forward) is built
    the same way as in ``BasicTransformerBlock``. The difference is in
    :meth:`forward`: glyph encoder states and per-resolution attention masks are
    read from ``cross_attention_kwargs`` and concatenated with the regular
    ``encoder_hidden_states`` before the cross-attention layer is called.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
        glyph_cross_attention_dim: Optional[int] = None,
        activation_fn: str = "geglu",
        num_embeds_ada_norm: Optional[int] = None,
        attention_bias: bool = False,
        only_cross_attention: bool = False,
        double_self_attention: bool = False,
        upcast_attention: bool = False,
        norm_elementwise_affine: bool = True,
        norm_type: str = "layer_norm",  # 'layer_norm', 'ada_norm', 'ada_norm_zero', 'ada_norm_single', 'layer_norm_i2vgen'
        norm_eps: float = 1e-5,
        final_dropout: bool = False,
        attention_type: str = "default",
        positional_embeddings: Optional[str] = None,
        num_positional_embeddings: Optional[int] = None,
        ada_norm_continous_conditioning_embedding_dim: Optional[int] = None,
        ada_norm_bias: Optional[int] = None,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
    ):
        """Build the normalization, attention and feed-forward sub-layers.

        Args:
            dim: Channel width of the hidden states.
            num_attention_heads: Number of heads in both attention layers.
            attention_head_dim: Channels per attention head.
            dropout: Dropout probability for attention and feed-forward.
            cross_attention_dim: Width of the encoder states used by ``attn2``.
            glyph_cross_attention_dim: Accepted for interface compatibility;
                it is not used when constructing this class.
            activation_fn: Feed-forward activation name.
            num_embeds_ada_norm: Required for ``ada_norm`` / ``ada_norm_zero``.
            norm_type: Which normalization flavour to build.
            positional_embeddings: ``"sinusoidal"`` enables a positional
                embedding; requires ``num_positional_embeddings``.

        The remaining arguments are forwarded to the corresponding sub-layers.

        Raises:
            ValueError: If an adaptive norm is requested without
                ``num_embeds_ada_norm``, or positional embeddings are requested
                without ``num_positional_embeddings``.
        """
        # Deliberately bypass BasicTransformerBlock.__init__ (start the MRO
        # lookup after it) because every sub-module is constructed below.
        super(BasicTransformerBlock, self).__init__()
        self.only_cross_attention = only_cross_attention

        if norm_type in ("ada_norm", "ada_norm_zero") and num_embeds_ada_norm is None:
            raise ValueError(
                f"`norm_type` is set to {norm_type}, but `num_embeds_ada_norm` is not defined. Please make sure to"
                f" define `num_embeds_ada_norm` if setting `norm_type` to {norm_type}."
            )

        self.norm_type = norm_type
        self.num_embeds_ada_norm = num_embeds_ada_norm

        if positional_embeddings and (num_positional_embeddings is None):
            raise ValueError(
                "If `positional_embedding` type is defined, `num_positition_embeddings` must also be defined."
            )

        if positional_embeddings == "sinusoidal":
            self.pos_embed = SinusoidalPositionalEmbedding(dim, max_seq_length=num_positional_embeddings)
        else:
            self.pos_embed = None

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
        if norm_type == "ada_norm":
            self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_zero":
            self.norm1 = AdaLayerNormZero(dim, num_embeds_ada_norm)
        elif norm_type == "ada_norm_continuous":
            self.norm1 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "rms_norm",
            )
        else:
            self.norm1 = nn.LayerNorm(dim, elementwise_affine=norm_elementwise_affine, eps=norm_eps)

        self.attn1 = Attention(
            query_dim=dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
            dropout=dropout,
            bias=attention_bias,
            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
            upcast_attention=upcast_attention,
            out_bias=attention_out_bias,
        )

        # 2. Cross-Attn
        if cross_attention_dim is not None or double_self_attention:
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
            if norm_type == "ada_norm":
                self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm)
            elif norm_type == "ada_norm_continuous":
                self.norm2 = AdaLayerNormContinuous(
                    dim,
                    ada_norm_continous_conditioning_embedding_dim,
                    norm_elementwise_affine,
                    norm_eps,
                    ada_norm_bias,
                    "rms_norm",
                )
            else:
                self.norm2 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)

            self.attn2 = Attention(
                query_dim=dim,
                cross_attention_dim=cross_attention_dim if not double_self_attention else None,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
                dropout=dropout,
                bias=attention_bias,
                upcast_attention=upcast_attention,
                out_bias=attention_out_bias,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
        if norm_type == "ada_norm_continuous":
            self.norm3 = AdaLayerNormContinuous(
                dim,
                ada_norm_continous_conditioning_embedding_dim,
                norm_elementwise_affine,
                norm_eps,
                ada_norm_bias,
                "layer_norm",
            )

        elif norm_type in ["ada_norm_zero", "ada_norm", "layer_norm", "ada_norm_continuous"]:
            self.norm3 = nn.LayerNorm(dim, norm_eps, norm_elementwise_affine)
        elif norm_type == "layer_norm_i2vgen":
            self.norm3 = None

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )

        # 4. Fuser
        if attention_type == "gated" or attention_type == "gated-text-image":
            self.fuser = GatedSelfAttentionDense(dim, cross_attention_dim, num_attention_heads, attention_head_dim)

        # 5. Scale-shift for PixArt-Alpha.
        if norm_type == "ada_norm_single":
            self.scale_shift_table = nn.Parameter(torch.randn(6, dim) / dim**0.5)

        # let chunk size default to None
        self._chunk_size = None
        self._chunk_dim = 0

    def get_inserted_modules(self):
        """Return the sub-modules added on top of the original block (none here)."""
        return ()

    def get_inserted_modules_names(self):
        """Return the attribute names of the inserted sub-modules (none here)."""
        return ()

    def get_origin_modules(self):
        """Return the direct child modules that are not reported as inserted."""
        inserted_modules = self.get_inserted_modules()
        return tuple(module for module in self.children() if module not in inserted_modules)

    @classmethod
    def from_transformer_block(
        cls,
        transformer_block,
        glyph_cross_attention_dim,
    ):
        """Create an instance mirroring ``transformer_block`` and copy its weights.

        The constructor arguments are recovered from the attributes of the
        given block; its ``state_dict`` is then loaded non-strictly.

        Args:
            transformer_block: Source block; must use an ``nn.LayerNorm`` as
                ``norm1`` and must not have a ``fuser``.
            glyph_cross_attention_dim: Forwarded to the constructor.

        Returns:
            The new block with the source weights loaded.

        Raises:
            ValueError: If the feed-forward activation of the source block is
                not one of the recognised activation modules.
            AssertionError: If the source block has an unsupported layout or
                the state dict does not match.
        """
        inner_dim = transformer_block.attn1.query_dim
        num_attention_heads = transformer_block.attn1.heads
        attention_head_dim = transformer_block.attn1.inner_dim // num_attention_heads
        dropout = transformer_block.attn1.dropout
        cross_attention_dim = transformer_block.attn2.cross_attention_dim

        # Map the activation module back to the name FeedForward expects.
        ff_activation = transformer_block.ff.net[0]
        if isinstance(ff_activation, GELU):
            if ff_activation.approximate == "tanh":
                activation_fn = "gelu-approximate"
            else:
                activation_fn = "gelu"
        elif isinstance(ff_activation, GEGLU):
            activation_fn = "geglu"
        elif isinstance(ff_activation, ApproximateGELU):
            activation_fn = "geglu-approximate"
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError; report the actual problem instead.
            raise ValueError(
                f"Unsupported feed-forward activation module: {type(ff_activation).__name__}"
            )

        num_embeds_ada_norm = transformer_block.num_embeds_ada_norm
        attention_bias = transformer_block.attn1.to_q.bias is not None
        only_cross_attention = transformer_block.only_cross_attention
        double_self_attention = transformer_block.attn2.cross_attention_dim is None
        upcast_attention = transformer_block.attn1.upcast_attention
        norm_type = transformer_block.norm_type
        assert isinstance(transformer_block.norm1, nn.LayerNorm)
        norm_elementwise_affine = transformer_block.norm1.elementwise_affine
        norm_eps = transformer_block.norm1.eps
        assert getattr(transformer_block, 'fuser', None) is None
        attention_type = "default"
        model = cls(
            inner_dim,
            num_attention_heads,
            attention_head_dim,
            dropout=dropout,
            cross_attention_dim=cross_attention_dim,
            glyph_cross_attention_dim=glyph_cross_attention_dim,
            activation_fn=activation_fn,
            num_embeds_ada_norm=num_embeds_ada_norm,
            attention_bias=attention_bias,
            only_cross_attention=only_cross_attention,
            double_self_attention=double_self_attention,
            upcast_attention=upcast_attention,
            norm_type=norm_type,
            norm_elementwise_affine=norm_elementwise_affine,
            norm_eps=norm_eps,
            attention_type=attention_type,
        )
        missing_keys, unexpected_keys = model.load_state_dict(
            transformer_block.state_dict(),
            strict=False,
        )
        # Only parameters whose names start with 'glyph' may be absent from
        # the source block; everything else must have been copied.
        assert len(unexpected_keys) == 0
        assert all(i.startswith('glyph') for i in missing_keys)

        return model

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        class_labels: Optional[torch.LongTensor] = None,
        added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
    ) -> torch.FloatTensor:
        """Run self-attention, joint text/glyph cross-attention and feed-forward.

        The following keys are popped from ``cross_attention_kwargs`` before
        the remainder is forwarded to the attention layers:

        * ``"gligen"``: GLIGEN inputs for the fuser.
        * ``"glyph_encoder_hidden_states"``: glyph states concatenated to
          ``encoder_hidden_states`` along dim 1.
        * ``"glyph_attn_masks_dict"`` / ``"bg_attn_masks_dict"``: mappings
          indexed by ``hidden_states.shape[1]``; the selected masks are
          concatenated along the last dim.

        If no glyph states are supplied, the cross-attention runs on
        ``encoder_hidden_states`` alone (with the background mask, if any).

        Raises:
            ValueError: For an unknown ``norm_type``, or if glyph states are
                given with only one of the two attention masks.
            AssertionError: If ``encoder_attention_mask`` is passed.
        """
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
        batch_size = hidden_states.shape[0]

        if self.norm_type == "ada_norm":
            norm_hidden_states = self.norm1(hidden_states, timestep)
        elif self.norm_type == "ada_norm_zero":
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
                hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
            )
        elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
            norm_hidden_states = self.norm1(hidden_states)
        elif self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif self.norm_type == "ada_norm_single":
            shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
                self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
            ).chunk(6, dim=1)
            norm_hidden_states = self.norm1(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
            norm_hidden_states = norm_hidden_states.squeeze(1)
        else:
            raise ValueError("Incorrect norm used")

        if self.pos_embed is not None:
            norm_hidden_states = self.pos_embed(norm_hidden_states)

        # 1. Retrieve lora scale.
        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0

        # 2. Prepare GLIGEN inputs
        # Work on a copy so the caller's dict is not mutated by the pops below.
        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
        gligen_kwargs = cross_attention_kwargs.pop("gligen", None)

        glyph_encoder_hidden_states = cross_attention_kwargs.pop("glyph_encoder_hidden_states", None)
        # Each mask argument is a dict: visual_feat_len -> tensor(b, visual_feat_len, text_feat_len)
        glyph_attn_mask = cross_attention_kwargs.pop("glyph_attn_masks_dict", None)
        bg_attn_mask = cross_attention_kwargs.pop("bg_attn_masks_dict", None)
        if glyph_attn_mask is not None:
            glyph_attn_mask = glyph_attn_mask[hidden_states.shape[1]]
        if bg_attn_mask is not None:
            bg_attn_mask = bg_attn_mask[hidden_states.shape[1]]
        assert encoder_attention_mask is None, "encoder_attention_mask is not supported in this block."

        attn_output = self.attn1(
            norm_hidden_states,
            encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )
        if self.norm_type == "ada_norm_zero":
            attn_output = gate_msa.unsqueeze(1) * attn_output
        elif self.norm_type == "ada_norm_single":
            attn_output = gate_msa * attn_output

        hidden_states = attn_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        # 2.5 GLIGEN Control
        if gligen_kwargs is not None:
            hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])

        # 3. Cross-Attention
        if self.attn2 is not None:
            if self.norm_type == "ada_norm":
                norm_hidden_states = self.norm2(hidden_states, timestep)
            elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
                norm_hidden_states = self.norm2(hidden_states)
            elif self.norm_type == "ada_norm_single":
                # For PixArt norm2 isn't applied here:
                # https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
                norm_hidden_states = hidden_states
            elif self.norm_type == "ada_norm_continuous":
                norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
            else:
                raise ValueError("Incorrect norm")

            if self.pos_embed is not None and self.norm_type != "ada_norm_single":
                norm_hidden_states = self.pos_embed(norm_hidden_states)

            if glyph_encoder_hidden_states is None:
                # No glyph conditioning supplied: attend to the regular
                # encoder states only instead of failing inside torch.cat.
                cross_attn_states = encoder_hidden_states
                cross_attn_mask = bg_attn_mask
            else:
                if (bg_attn_mask is None) != (glyph_attn_mask is None):
                    raise ValueError(
                        "`bg_attn_masks_dict` and `glyph_attn_masks_dict` must be provided together."
                    )
                cross_attn_states = torch.cat([encoder_hidden_states, glyph_encoder_hidden_states], dim=1)
                if bg_attn_mask is None:
                    cross_attn_mask = None
                else:
                    cross_attn_mask = torch.cat([bg_attn_mask, glyph_attn_mask], dim=-1)

            attn_output = self.attn2(
                norm_hidden_states,
                encoder_hidden_states=cross_attn_states,
                attention_mask=cross_attn_mask,
                **cross_attention_kwargs,
            )

            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        # i2vgen doesn't have this norm 🤷‍♂️
        if self.norm_type == "ada_norm_continuous":
            norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
        elif not self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm3(hidden_states)

        if self.norm_type == "ada_norm_zero":
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]

        if self.norm_type == "ada_norm_single":
            norm_hidden_states = self.norm2(hidden_states)
            norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp

        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
            ff_output = _chunked_feed_forward(
                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
            )
        else:
            ff_output = self.ff(norm_hidden_states, scale=lora_scale)

        if self.norm_type == "ada_norm_zero":
            ff_output = gate_mlp.unsqueeze(1) * ff_output
        elif self.norm_type == "ada_norm_single":
            ff_output = gate_mlp * ff_output

        hidden_states = ff_output + hidden_states
        if hidden_states.ndim == 4:
            hidden_states = hidden_states.squeeze(1)

        return hidden_states
377
+
glyph_sdxl/custom_diffusers/pipelines/__init__.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ from .pipeline_stable_diffusion_glyph_xl import StableDiffusionGlyphXLPipeline
2
+
3
+ __all__ = [
4
+ 'StableDiffusionGlyphXLPipeline',
5
+ ]
glyph_sdxl/custom_diffusers/pipelines/pipeline_stable_diffusion_glyph_xl.py ADDED
@@ -0,0 +1,922 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from typing import Optional, List, Union, Dict, Tuple, Callable, Any
3
+ import torch
4
+
5
+ from transformers import T5EncoderModel, T5Tokenizer
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl import (
9
+ StableDiffusionXLPipeline,
10
+ AutoencoderKL,
11
+ CLIPTextModel,
12
+ CLIPTextModelWithProjection,
13
+ CLIPTokenizer,
14
+ UNet2DConditionModel,
15
+ KarrasDiffusionSchedulers,
16
+ CLIPVisionModelWithProjection,
17
+ CLIPImageProcessor,
18
+ VaeImageProcessor,
19
+ is_invisible_watermark_available,
20
+ StableDiffusionXLLoraLoaderMixin,
21
+ PipelineImageInput,
22
+ adjust_lora_scale_text_encoder,
23
+ scale_lora_layers,
24
+ unscale_lora_layers,
25
+ USE_PEFT_BACKEND,
26
+ StableDiffusionXLPipelineOutput,
27
+ ImageProjection,
28
+ logging,
29
+ rescale_noise_cfg,
30
+ retrieve_timesteps,
31
+ deprecate,
32
+ )
33
+ import numpy as np
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+ from diffusers.pipelines.stable_diffusion_xl.watermark import StableDiffusionXLWatermarker
37
+
38
+ class StableDiffusionGlyphXLPipeline(StableDiffusionXLPipeline):
39
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->byt5_text_encoder->image_encoder->unet->byt5_mapper->vae"
40
+ _optional_components = [
41
+ "tokenizer",
42
+ "tokenizer_2",
43
+ "byt5_tokenizer",
44
+ "text_encoder",
45
+ "text_encoder_2",
46
+ "byt5_text_encoder",
47
+ "byt5_mapper",
48
+ "image_encoder",
49
+ "feature_extractor",
50
+ ]
51
+ _callback_tensor_inputs = [
52
+ "latents",
53
+ "prompt_embeds",
54
+ "negative_prompt_embeds",
55
+ "add_text_embeds",
56
+ "add_time_ids",
57
+ "negative_pooled_prompt_embeds",
58
+ "negative_add_time_ids",
59
+ ]
60
def __init__(
    self,
    vae: AutoencoderKL,
    text_encoder: CLIPTextModel,
    text_encoder_2: CLIPTextModelWithProjection,
    byt5_text_encoder: T5EncoderModel,
    tokenizer: CLIPTokenizer,
    tokenizer_2: CLIPTokenizer,
    byt5_tokenizer: T5Tokenizer,
    byt5_mapper,
    unet: UNet2DConditionModel,
    scheduler: KarrasDiffusionSchedulers,
    byt5_max_length: int = 512,
    image_encoder: CLIPVisionModelWithProjection = None,
    feature_extractor: CLIPImageProcessor = None,
    force_zeros_for_empty_prompt: bool = True,
    add_watermarker: Optional[bool] = None,
):
    """Register all pipeline components, config values and helpers.

    In addition to the components registered here for the SDXL pipeline, a
    ByT5 text encoder, its tokenizer and a mapper module are registered, and
    ``byt5_max_length`` is stored both in the config and as an attribute.
    When ``add_watermarker`` is ``None`` the watermarker is enabled only if
    ``is_invisible_watermark_available()`` reports it as available.
    """
    # Start the MRO lookup *after* StableDiffusionXLPipeline so that its
    # own __init__ is skipped; the registration is done explicitly below.
    super(StableDiffusionXLPipeline, self).__init__()

    components = dict(
        vae=vae,
        text_encoder=text_encoder,
        text_encoder_2=text_encoder_2,
        byt5_text_encoder=byt5_text_encoder,
        tokenizer=tokenizer,
        tokenizer_2=tokenizer_2,
        byt5_tokenizer=byt5_tokenizer,
        byt5_mapper=byt5_mapper,
        unet=unet,
        scheduler=scheduler,
        image_encoder=image_encoder,
        feature_extractor=feature_extractor,
    )
    self.register_modules(**components)

    self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
    self.register_to_config(byt5_max_length=byt5_max_length)

    # One factor of two per VAE block after the first.
    num_vae_blocks = len(self.vae.config.block_out_channels)
    self.vae_scale_factor = 2 ** (num_vae_blocks - 1)
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
    self.byt5_max_length = byt5_max_length

    self.default_sample_size = self.unet.config.sample_size

    # An explicit True/False wins; None defers to library availability.
    if add_watermarker is None:
        add_watermarker = is_invisible_watermark_available()

    self.watermark = StableDiffusionXLWatermarker() if add_watermarker else None
108
+
109
+ def encode_prompt(
110
+ self,
111
+ prompt: str,
112
+ prompt_2: Optional[str] = None,
113
+ text_prompt = None,
114
+ device: Optional[torch.device] = None,
115
+ num_images_per_prompt: int = 1,
116
+ do_classifier_free_guidance: bool = True,
117
+ negative_prompt: Optional[str] = None,
118
+ negative_prompt_2: Optional[str] = None,
119
+ prompt_embeds: Optional[torch.FloatTensor] = None,
120
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
121
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
122
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
123
+ lora_scale: Optional[float] = None,
124
+ clip_skip: Optional[int] = None,
125
+ text_attn_mask: Optional[torch.LongTensor] = None,
126
+ byt5_prompt_embeds: Optional[torch.FloatTensor] = None,
127
+ ):
128
+ r"""
129
+ Encodes the prompt into text encoder hidden states.
130
+
131
+ Args:
132
+ prompt (`str` or `List[str]`, *optional*):
133
+ prompt to be encoded
134
+ prompt_2 (`str` or `List[str]`, *optional*):
135
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
136
+ used in both text-encoders
137
+ device: (`torch.device`):
138
+ torch device
139
+ num_images_per_prompt (`int`):
140
+ number of images that should be generated per prompt
141
+ do_classifier_free_guidance (`bool`):
142
+ whether to use classifier free guidance or not
143
+ negative_prompt (`str` or `List[str]`, *optional*):
144
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
145
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
146
+ less than `1`).
147
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
148
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
149
+ `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
150
+ prompt_embeds (`torch.FloatTensor`, *optional*):
151
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
152
+ provided, text embeddings will be generated from `prompt` input argument.
153
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
154
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
155
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
156
+ argument.
157
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
158
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
159
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
160
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
161
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
162
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
163
+ input argument.
164
+ lora_scale (`float`, *optional*):
165
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
166
+ clip_skip (`int`, *optional*):
167
+ Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
168
+ the output of the pre-final layer will be used for computing the prompt embeddings.
169
+ """
170
+ device = device or self._execution_device
171
+
172
+ # set lora scale so that monkey patched LoRA
173
+ # function of text encoder can correctly access it
174
+ if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
175
+ self._lora_scale = lora_scale
176
+
177
+ # dynamically adjust the LoRA scale
178
+ if self.text_encoder is not None:
179
+ if not USE_PEFT_BACKEND:
180
+ adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
181
+ else:
182
+ scale_lora_layers(self.text_encoder, lora_scale)
183
+
184
+ if self.text_encoder_2 is not None:
185
+ if not USE_PEFT_BACKEND:
186
+ adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
187
+ else:
188
+ scale_lora_layers(self.text_encoder_2, lora_scale)
189
+
190
+ prompt = [prompt] if isinstance(prompt, str) else prompt
191
+
192
+ if prompt is not None:
193
+ batch_size = len(prompt)
194
+ else:
195
+ batch_size = prompt_embeds.shape[0]
196
+
197
+ # Define tokenizers and text encoders
198
+ tokenizers = [self.tokenizer, self.tokenizer_2] if self.tokenizer is not None else [self.tokenizer_2]
199
+ text_encoders = (
200
+ [self.text_encoder, self.text_encoder_2] if self.text_encoder is not None else [self.text_encoder_2]
201
+ )
202
+
203
+ if prompt_embeds is None:
204
+ assert len(prompt) == 1
205
+ prompt_2 = prompt_2 or prompt
206
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
207
+
208
+ text_prompt = [text_prompt] if isinstance(text_prompt, str) else text_prompt
209
+
210
+ # textual inversion: procecss multi-vector tokens if necessary
211
+ prompt_embeds_list = []
212
+ prompts = [prompt, prompt_2]
213
+ text_input_id_batchs = []
214
+ for prompt, tokenizer in zip(prompts, tokenizers):
215
+ pad_token = tokenizer.pad_token_id
216
+ total_tokens = tokenizer(prompt, truncation=False)['input_ids'][0]
217
+ bos = total_tokens[0]
218
+ eos = total_tokens[-1]
219
+ total_tokens = total_tokens[1:-1]
220
+ new_total_tokens = []
221
+ empty_flag = True
222
+ while len(total_tokens) >= 75:
223
+ head_75_tokens = [total_tokens.pop(0) for _ in range(75)]
224
+ temp_77_token_ids = [bos] + head_75_tokens + [eos]
225
+ new_total_tokens.append(temp_77_token_ids)
226
+ empty_flag = False
227
+ if len(total_tokens) > 0 or empty_flag:
228
+ padding_len = 75 - len(total_tokens)
229
+ temp_77_token_ids = [bos] + total_tokens + [eos] + [pad_token] * padding_len
230
+ new_total_tokens.append(temp_77_token_ids)
231
+ # 1,segment_len, 77
232
+ new_total_tokens = torch.tensor(new_total_tokens, dtype=torch.long).unsqueeze(0)
233
+ text_input_id_batchs.append(new_total_tokens)
234
+ if text_input_id_batchs[0].shape[1] > text_input_id_batchs[1].shape[1]:
235
+ tokenizer = tokenizers[1]
236
+ pad_token = tokenizer.pad_token_id
237
+ bos = tokenizer.bos_token_id
238
+ eos = tokenizer.eos_token_id
239
+ padding_len = text_input_id_batchs[0].shape[1] - text_input_id_batchs[1].shape[1]
240
+ # padding_len, 77
241
+ padding_part = torch.tensor([[bos] + [eos] + [pad_token] * 75 for _ in range(padding_len)])
242
+ # 1, padding_len, 77
243
+ padding_part = padding_part.unsqueeze(0)
244
+ text_input_id_batchs[1] = torch.cat((text_input_id_batchs[1],padding_part), dim=1)
245
+ elif text_input_id_batchs[0].shape[1] < text_input_id_batchs[1].shape[1]:
246
+ tokenizer = tokenizers[0]
247
+ pad_token = tokenizer.pad_token_id
248
+ bos = tokenizer.bos_token_id
249
+ eos = tokenizer.eos_token_id
250
+ padding_len = text_input_id_batchs[1].shape[1] - text_input_id_batchs[0].shape[1]
251
+ # padding_len, 77
252
+ padding_part = torch.tensor([[bos] + [eos] + [pad_token] * 75 for _ in range(padding_len)])
253
+ # 1, padding_len, 77
254
+ padding_part = padding_part.unsqueeze(0)
255
+ text_input_id_batchs[0] = torch.cat((text_input_id_batchs[0],padding_part), dim=1)
256
+
257
+ embeddings = []
258
+ for segment_idx in range(text_input_id_batchs[0].shape[1]):
259
+ prompt_embeds_list = []
260
+ for i, text_encoder in enumerate(text_encoders):
261
+ # 1, segment_len, sequence_len
262
+ text_input_ids = text_input_id_batchs[i].to(text_encoder.device)
263
+ # 1, sequence_len, dim
264
+ prompt_embeds = text_encoder(
265
+ text_input_ids[:, segment_idx],
266
+ output_hidden_states=True,
267
+ )
268
+
269
+ # We are only ALWAYS interested in the pooled output of the final text encoder
270
+ temp_pooled_prompt_embeds = prompt_embeds[0]
271
+ if clip_skip is None:
272
+ prompt_embeds = prompt_embeds.hidden_states[-2]
273
+ else:
274
+ prompt_embeds = prompt_embeds.hidden_states[-(clip_skip + 2)]
275
+ bs_embed, seq_len, _ = prompt_embeds.shape
276
+ prompt_embeds = prompt_embeds.view(bs_embed, seq_len, -1)
277
+ prompt_embeds_list.append(prompt_embeds)
278
+ # b, sequence_len, dim
279
+ prompt_embeds = torch.concat(prompt_embeds_list, dim=-1)
280
+ embeddings.append(prompt_embeds)
281
+ if segment_idx == 0:
282
+ # use the first segment's pooled prompt embeddings as
283
+ # the pooled prompt embeddings
284
+ # b, dim->b, dim
285
+ pooled_prompt_embeds = temp_pooled_prompt_embeds.view(bs_embed, -1)
286
+ # b, segment_len * sequence_len, dim
287
+ prompt_embeds = torch.cat(embeddings, dim=1)
288
+
289
+ if byt5_prompt_embeds is None:
290
+ byt5_text_inputs = self.byt5_tokenizer(
291
+ text_prompt,
292
+ padding="max_length",
293
+ max_length=self.byt5_max_length,
294
+ truncation=True,
295
+ add_special_tokens=True,
296
+ return_tensors="pt",
297
+ )
298
+ byt5_text_input_ids = byt5_text_inputs.input_ids
299
+ byt5_attention_mask = byt5_text_inputs.attention_mask.to(self.byt5_text_encoder.device) if text_attn_mask is None else text_attn_mask.to(self.byt5_text_encoder.device, dtype=byt5_text_inputs.attention_mask.dtype)
300
+ with torch.cuda.amp.autocast(enabled=False):
301
+ byt5_prompt_embeds = self.byt5_text_encoder(
302
+ byt5_text_input_ids.to(self.byt5_text_encoder.device),
303
+ attention_mask=byt5_attention_mask.float(),
304
+ )
305
+ byt5_prompt_embeds = byt5_prompt_embeds[0]
306
+ byt5_prompt_embeds = self.byt5_mapper(byt5_prompt_embeds, byt5_attention_mask)
307
+
308
+ # get unconditional embeddings for classifier free guidance
309
+ zero_out_negative_prompt = negative_prompt is None and self.config.force_zeros_for_empty_prompt
310
+ if do_classifier_free_guidance and negative_prompt_embeds is None and zero_out_negative_prompt:
311
+ negative_prompt_embeds = torch.zeros_like(prompt_embeds)
312
+ negative_byt5_prompt_embeds = torch.zeros_like(byt5_prompt_embeds)
313
+ negative_pooled_prompt_embeds = torch.zeros_like(pooled_prompt_embeds)
314
+ elif do_classifier_free_guidance and negative_prompt_embeds is None:
315
+ raise NotImplementedError
316
+
317
+ if self.text_encoder_2 is not None:
318
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
319
+ else:
320
+ prompt_embeds = prompt_embeds.to(dtype=self.unet.dtype, device=device)
321
+
322
+ bs_embed, seq_len, _ = prompt_embeds.shape
323
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
324
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
325
+ prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
326
+
327
+ if do_classifier_free_guidance:
328
+ # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
329
+ seq_len = negative_prompt_embeds.shape[1]
330
+ byt5_seq_len = negative_byt5_prompt_embeds.shape[1]
331
+
332
+ if self.text_encoder_2 is not None:
333
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)
334
+ else:
335
+ negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.unet.dtype, device=device)
336
+ negative_byt5_prompt_embeds = negative_byt5_prompt_embeds.to(dtype=self.byt5_text_encoder.dtype, device=device)
337
+
338
+ negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
339
+ negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
340
+ negative_byt5_prompt_embeds = negative_byt5_prompt_embeds.repeat(1, num_images_per_prompt, 1)
341
+ negative_byt5_prompt_embeds = negative_byt5_prompt_embeds.view(batch_size * num_images_per_prompt, byt5_seq_len, -1)
342
+
343
+ pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
344
+ bs_embed * num_images_per_prompt, -1
345
+ )
346
+ if do_classifier_free_guidance:
347
+ negative_pooled_prompt_embeds = negative_pooled_prompt_embeds.repeat(1, num_images_per_prompt).view(
348
+ bs_embed * num_images_per_prompt, -1
349
+ )
350
+
351
+ if self.text_encoder is not None:
352
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
353
+ # Retrieve the original scale by scaling back the LoRA layers
354
+ unscale_lora_layers(self.text_encoder, lora_scale)
355
+
356
+ if self.text_encoder_2 is not None:
357
+ if isinstance(self, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
358
+ # Retrieve the original scale by scaling back the LoRA layers
359
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
360
+
361
+ return (
362
+ prompt_embeds,
363
+ negative_prompt_embeds,
364
+ pooled_prompt_embeds,
365
+ negative_pooled_prompt_embeds,
366
+ byt5_prompt_embeds,
367
+ negative_byt5_prompt_embeds,
368
+ )
369
+
370
@torch.no_grad()
def __call__(
    self,
    prompt: Union[str, List[str]] = None,
    prompt_2: Optional[Union[str, List[str]]] = None,
    text_prompt=None,
    texts=None,
    bboxes=None,
    height: Optional[int] = None,
    width: Optional[int] = None,
    num_inference_steps: int = 50,
    timesteps: List[int] = None,
    denoising_end: Optional[float] = None,
    guidance_scale: float = 5.0,
    negative_prompt: Optional[Union[str, List[str]]] = None,
    negative_prompt_2: Optional[Union[str, List[str]]] = None,
    num_images_per_prompt: Optional[int] = 1,
    eta: float = 0.0,
    generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
    latents: Optional[torch.FloatTensor] = None,
    prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_prompt_embeds: Optional[torch.FloatTensor] = None,
    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
    ip_adapter_image: Optional[PipelineImageInput] = None,
    output_type: Optional[str] = "pil",
    return_dict: bool = True,
    cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    guidance_rescale: float = 0.0,
    original_size: Optional[Tuple[int, int]] = None,
    crops_coords_top_left: Tuple[int, int] = (0, 0),
    target_size: Optional[Tuple[int, int]] = None,
    negative_original_size: Optional[Tuple[int, int]] = None,
    negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
    negative_target_size: Optional[Tuple[int, int]] = None,
    clip_skip: Optional[int] = None,
    callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
    # NOTE: list default kept for signature compatibility; it is only read, never mutated, in this method.
    callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    text_attn_mask: torch.LongTensor = None,
    denoising_start: Optional[float] = None,
    byt5_prompt_embeds: Optional[torch.FloatTensor] = None,
    **kwargs,
):
    r"""
    Function invoked when calling the pipeline for generation.

    Args:
        prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
            instead.
        prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
            used in both text-encoders
        text_prompt (*optional*):
            The glyph prompt forwarded to `encode_prompt`, where it is tokenized with the ByT5 tokenizer when
            `byt5_prompt_embeds` is not supplied.
        texts (*optional*):
            The individual text strings to render, in the same order as `bboxes`. Their UTF-8 byte lengths
            determine which span of the ByT5 sequence each bounding box may attend to (see
            `get_text_start_pos`). When `None`, no glyph/background attention masks are built.
        bboxes (*optional*):
            One `[x, y, w, h]` box per entry of `texts`, expressed as fractions of the canvas (they are multiplied
            by `self.default_sample_size`). Required when `texts` is given.
        height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The height in pixels of the generated image. This is set to 1024 by default for the best results.
            Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
            The width in pixels of the generated image. This is set to 1024 by default for the best results.
            Anything below 512 pixels won't work well for
            [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
            and checkpoints that are not specifically fine-tuned on low resolutions.
        num_inference_steps (`int`, *optional*, defaults to 50):
            The number of denoising steps. More denoising steps usually lead to a higher quality image at the
            expense of slower inference.
        timesteps (`List[int]`, *optional*):
            Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
            in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
            passed will be used. Must be in descending order.
        denoising_end (`float`, *optional*):
            When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
            completed before it is intentionally prematurely terminated. As a result, the returned sample will
            still retain a substantial amount of noise as determined by the discrete timesteps selected by the
            scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
            "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
            Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
        guidance_scale (`float`, *optional*, defaults to 5.0):
            Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
            `guidance_scale` is defined as `w` of equation 2. of [Imagen
            Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
            1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
            usually at the expense of lower image quality.
        negative_prompt (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation. If not defined, one has to pass
            `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
            less than `1`).
        negative_prompt_2 (`str` or `List[str]`, *optional*):
            The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
            `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        eta (`float`, *optional*, defaults to 0.0):
            Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
            [`schedulers.DDIMScheduler`], will be ignored for others.
        generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
            One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
            to make generation deterministic.
        latents (`torch.FloatTensor`, *optional*):
            Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
            generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
            tensor will be generated by sampling using the supplied random `generator`.
        prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
            provided, text embeddings will be generated from `prompt` input argument.
        negative_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
            argument.
        pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
            If not provided, pooled text embeddings will be generated from `prompt` input argument.
        negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
            weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
            input argument.
        ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
        output_type (`str`, *optional*, defaults to `"pil"`):
            The output format of the generate image. Choose between
            [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
        return_dict (`bool`, *optional*, defaults to `True`):
            Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
            of a plain tuple.
        cross_attention_kwargs (`dict`, *optional*):
            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
            `self.processor` in
            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            The dictionary is copied before the glyph-specific entries (`glyph_encoder_hidden_states`,
            `glyph_attn_masks_dict`, `bg_attn_masks_dict`) are added, so the caller's object is left untouched.
        guidance_rescale (`float`, *optional*, defaults to 0.0):
            Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
            Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_rescale` is defined as `φ` in equation 16. of
            [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
            Guidance rescale factor should fix overexposure when using zero terminal SNR.
        original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
            `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
            explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
            `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
            `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            For most cases, `target_size` should be set to the desired height and width of the generated image. If
            not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
            section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
        negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            To negatively condition the generation process based on a specific image resolution. Part of SDXL's
            micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
            To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
            micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
            To negatively condition the generation process based on a target image resolution. It should be as same
            as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
            [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
            information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
        clip_skip (`int`, *optional*):
            Stored on the pipeline and forwarded to `encode_prompt`.
        callback_on_step_end (`Callable`, *optional*):
            A function that calls at the end of each denoising steps during the inference. The function is called
            with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
            callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
            `callback_on_step_end_tensor_inputs`.
        callback_on_step_end_tensor_inputs (`List`, *optional*):
            The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
            will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
            `._callback_tensor_inputs` attribute of your pipeline class.
        text_attn_mask (`torch.LongTensor`, *optional*):
            Forwarded to `encode_prompt`; when given it is used as the ByT5 attention mask in place of the one
            produced by the ByT5 tokenizer.
        denoising_start (`float`, *optional*):
            Accepted for signature compatibility; it is not referenced by this method.
        byt5_prompt_embeds (`torch.FloatTensor`, *optional*):
            Pre-computed glyph embeddings. When provided, `encode_prompt` skips the ByT5 encoding of
            `text_prompt`.

    Raises:
        ValueError: If more than one prompt is supplied (only `batch_size == 1` is supported), or if `texts` is
            given without `bboxes`.

    Returns:
        [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
        [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
        `tuple`. When returning a tuple, the first element is a list with the generated images.
    """

    callback = kwargs.pop("callback", None)
    callback_steps = kwargs.pop("callback_steps", None)

    if callback is not None:
        deprecate(
            "callback",
            "1.0.0",
            "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
        )
    if callback_steps is not None:
        deprecate(
            "callback_steps",
            "1.0.0",
            "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
        )

    # 0. Default height and width to unet
    height = height or self.default_sample_size * self.vae_scale_factor
    width = width or self.default_sample_size * self.vae_scale_factor

    original_size = original_size or (height, width)
    target_size = target_size or (height, width)

    # 1. Check inputs. Raise error if not correct
    self.check_inputs(
        prompt,
        prompt_2,
        height,
        width,
        callback_steps,
        negative_prompt,
        negative_prompt_2,
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
        callback_on_step_end_tensor_inputs,
    )

    self._guidance_scale = guidance_scale
    self._guidance_rescale = guidance_rescale
    self._clip_skip = clip_skip
    self._cross_attention_kwargs = cross_attention_kwargs
    self._denoising_end = denoising_end
    self._interrupt = False

    # 2. Define call parameters
    if prompt is not None and isinstance(prompt, str):
        batch_size = 1
    elif prompt is not None and isinstance(prompt, list):
        batch_size = len(prompt)
    else:
        batch_size = prompt_embeds.shape[0]

    # The glyph masks below are built for a single layout, so fail fast (and
    # explicitly, rather than through an `assert` that `-O` would strip).
    if batch_size != 1:
        raise ValueError("batch_size > 1 is not supported")
    if texts is not None and bboxes is None:
        raise ValueError("`bboxes` must be provided together with `texts`.")

    device = self._execution_device

    # 3. Encode input prompt
    lora_scale = (
        self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
    )

    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
        byt5_prompt_embeds,
        negative_byt5_prompt_embeds,
    ) = self.encode_prompt(
        prompt=prompt,
        prompt_2=prompt_2,
        text_prompt=text_prompt,
        device=device,
        num_images_per_prompt=num_images_per_prompt,
        do_classifier_free_guidance=self.do_classifier_free_guidance,
        negative_prompt=negative_prompt,
        negative_prompt_2=negative_prompt_2,
        prompt_embeds=prompt_embeds,
        negative_prompt_embeds=negative_prompt_embeds,
        pooled_prompt_embeds=pooled_prompt_embeds,
        negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
        lora_scale=lora_scale,
        clip_skip=self.clip_skip,
        text_attn_mask=text_attn_mask,
        byt5_prompt_embeds=byt5_prompt_embeds,
    )

    # 4. Prepare timesteps
    timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)

    # 5. Prepare latent variables
    num_channels_latents = self.unet.config.in_channels
    latents = self.prepare_latents(
        batch_size * num_images_per_prompt,
        num_channels_latents,
        height,
        width,
        prompt_embeds.dtype,
        device,
        generator,
        latents,
    )

    # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
    extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

    # 7. Prepare added time ids & embeddings
    add_text_embeds = pooled_prompt_embeds
    if self.text_encoder_2 is None:
        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
    else:
        text_encoder_projection_dim = self.text_encoder_2.config.projection_dim

    add_time_ids = self._get_add_time_ids(
        original_size,
        crops_coords_top_left,
        target_size,
        dtype=prompt_embeds.dtype,
        text_encoder_projection_dim=text_encoder_projection_dim,
    )
    if negative_original_size is not None and negative_target_size is not None:
        negative_add_time_ids = self._get_add_time_ids(
            negative_original_size,
            negative_crops_coords_top_left,
            negative_target_size,
            dtype=prompt_embeds.dtype,
            text_encoder_projection_dim=text_encoder_projection_dim,
        )
    else:
        negative_add_time_ids = add_time_ids

    if self.do_classifier_free_guidance:
        # Unconditional half first, conditional half second (matches `noise_pred.chunk(2)` below).
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        byt5_prompt_embeds = torch.cat([negative_byt5_prompt_embeds, byt5_prompt_embeds], dim=0)

        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)

    prompt_embeds = prompt_embeds.to(device)
    byt5_prompt_embeds = byt5_prompt_embeds.to(device)
    add_text_embeds = add_text_embeds.to(device)
    add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

    if ip_adapter_image is not None:
        output_hidden_state = not isinstance(self.unet.encoder_hid_proj, ImageProjection)
        image_embeds, negative_image_embeds = self.encode_image(
            ip_adapter_image, device, num_images_per_prompt, output_hidden_state
        )
        if self.do_classifier_free_guidance:
            image_embeds = torch.cat([negative_image_embeds, image_embeds])
            image_embeds = image_embeds.to(device)

    # 8. Denoising loop
    num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

    # 8.1 Apply denoising_end
    if (
        self.denoising_end is not None
        and isinstance(self.denoising_end, float)
        and self.denoising_end > 0
        and self.denoising_end < 1
    ):
        discrete_timestep_cutoff = int(
            round(
                self.scheduler.config.num_train_timesteps
                - (self.denoising_end * self.scheduler.config.num_train_timesteps)
            )
        )
        num_inference_steps = len(list(filter(lambda ts: ts >= discrete_timestep_cutoff, timesteps)))
        timesteps = timesteps[:num_inference_steps]

    # 9. Optionally get Guidance Scale Embedding
    timestep_cond = None
    if self.unet.config.time_cond_proj_dim is not None:
        guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
        timestep_cond = self.get_guidance_scale_embedding(
            guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
        ).to(device=device, dtype=latents.dtype)

    # 10. Build the additive glyph / background attention masks, one per UNet
    # attention resolution, keyed by the number of spatial positions (h * w).
    # NOTE(review): masks are laid out on a square `default_sample_size` grid;
    # presumably callers are expected to generate at the default resolution.
    if texts is not None:
        # h, w, byt5_max_len: 1 where a position belongs to a text box / token span
        glyph_attn_mask = self.get_glyph_attn_mask(texts, bboxes)
        # h, w: True where no text box covers the position
        bg_attn_mask = glyph_attn_mask.sum(-1) == 0
        # 1, h, w, byt5_max_len
        glyph_attn_masks = glyph_attn_mask.unsqueeze(0).to(device)
        # 1, h, w
        bg_attn_masks = bg_attn_mask.unsqueeze(0).to(glyph_attn_masks.dtype).to(device)

        # Turn 0/1 indicators into additive biases: 0 where attention is
        # allowed, -10000 where it is blocked.
        # b, h, w, text_feat_len
        glyph_attn_masks = (1 - glyph_attn_masks) * -10000.0
        # b, h, w
        bg_attn_masks = (1 - bg_attn_masks) * -10000.0

        down_block_types = self.unet.config['down_block_types']
        num_down_sample = sum(1 if i == 'CrossAttnDownBlock2D' else 0 for i in down_block_types) - 1
        initial_resolution = self.default_sample_size
        initial_resolution = initial_resolution // 2**sum(1 if i == 'DownBlock2D' else 0 for i in down_block_types)
        resolution_list = [initial_resolution] + [initial_resolution // 2**i for i in range(1, num_down_sample + 1)]
        glyph_attn_masks_dict = {}
        bg_attn_masks_dict = {}
        # b, text_feat_len, h, w
        glyph_attn_masks = glyph_attn_masks.permute(0, 3, 1, 2)
        # b, 1, h, w
        bg_attn_masks = bg_attn_masks.unsqueeze(1)
        for mask_resolution in resolution_list:
            down_scaled_glyph_attn_masks = F.interpolate(
                glyph_attn_masks, size=(mask_resolution, mask_resolution), mode='nearest',
            )
            # b, text_feat_len, h, w -> b, h, w, text_feat_len -> b, h*w, text_feat_len
            down_scaled_glyph_attn_masks = down_scaled_glyph_attn_masks.permute(0, 2, 3, 1).flatten(1, 2)
            glyph_attn_masks_dict[mask_resolution * mask_resolution] = down_scaled_glyph_attn_masks

            down_scaled_bg_attn_masks = F.interpolate(
                bg_attn_masks, size=(mask_resolution, mask_resolution), mode='nearest',
            )
            # b, 1, h, w -> b, h, w -> b, h, w, 1 -> b, h*w, 1 -> b, h*w, clip_feat_len
            down_scaled_bg_attn_masks = down_scaled_bg_attn_masks.squeeze(1).unsqueeze(-1)
            down_scaled_bg_attn_masks = down_scaled_bg_attn_masks.flatten(1, 2)
            down_scaled_bg_attn_masks = down_scaled_bg_attn_masks.repeat(1, 1, prompt_embeds.shape[1])
            bg_attn_masks_dict[mask_resolution * mask_resolution] = down_scaled_bg_attn_masks
        if self.do_classifier_free_guidance:
            # The unconditional half gets an all-zero (i.e. non-blocking) mask.
            for key in glyph_attn_masks_dict:
                glyph_attn_masks_dict[key] = torch.cat([
                    torch.zeros_like(glyph_attn_masks_dict[key]),
                    glyph_attn_masks_dict[key]],
                    dim=0)
            for key in bg_attn_masks_dict:
                bg_attn_masks_dict[key] = torch.cat([
                    torch.zeros_like(bg_attn_masks_dict[key]),
                    bg_attn_masks_dict[key]],
                    dim=0)
    else:
        glyph_attn_masks_dict = None
        bg_attn_masks_dict = None

    # The glyph entries are loop-invariant, so assemble the UNet kwargs once.
    # Work on a copy: writing into `self.cross_attention_kwargs` would leak
    # these tensors into the dictionary object the caller passed in.
    unet_cross_attention_kwargs = dict(self.cross_attention_kwargs or {})
    unet_cross_attention_kwargs['glyph_encoder_hidden_states'] = byt5_prompt_embeds
    unet_cross_attention_kwargs['glyph_attn_masks_dict'] = glyph_attn_masks_dict
    unet_cross_attention_kwargs['bg_attn_masks_dict'] = bg_attn_masks_dict

    self._num_timesteps = len(timesteps)
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            if self.interrupt:
                continue

            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents

            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
            if ip_adapter_image is not None:
                added_cond_kwargs["image_embeds"] = image_embeds

            noise_pred = self.unet(
                latent_model_input,
                t,
                encoder_hidden_states=prompt_embeds,
                timestep_cond=timestep_cond,
                cross_attention_kwargs=unet_cross_attention_kwargs,
                added_cond_kwargs=added_cond_kwargs,
                return_dict=False,
            )[0]

            # perform guidance
            if self.do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

            if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
                # Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
                noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]

            if callback_on_step_end is not None:
                callback_kwargs = {}
                for k in callback_on_step_end_tensor_inputs:
                    callback_kwargs[k] = locals()[k]
                callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

                latents = callback_outputs.pop("latents", latents)
                prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
                add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
                negative_pooled_prompt_embeds = callback_outputs.pop(
                    "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
                )
                add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
                negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)

            # call the callback, if provided
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()
                if callback is not None and i % callback_steps == 0:
                    step_idx = i // getattr(self.scheduler, "order", 1)
                    callback(step_idx, t, latents)

    if output_type != "latent":
        # make sure the VAE is in float32 mode, as it overflows in float16
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast

        if needs_upcasting:
            self.upcast_vae()
            latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)

        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)

        # apply watermark if available
        if self.watermark is not None:
            image = self.watermark.apply_watermark(image)

        image = self.image_processor.postprocess(image, output_type=output_type)
    else:
        image = latents

    # Offload all models
    self.maybe_free_model_hooks()

    if not return_dict:
        return (image,)

    return StableDiffusionXLPipelineOutput(images=image)
881
+
882
def get_glyph_attn_mask(self, texts, bboxes):
    """Rasterize text boxes into a (position, ByT5-token) indicator mask.

    For the ``idx``-th box, every grid cell inside the box is marked with 1
    along the token span ``[start[idx], start[idx + 1])`` returned by
    ``self.get_text_start_pos(texts)``; everything else stays 0.

    Args:
        texts: The text strings, one per box, in the same order as ``bboxes``.
        bboxes: Iterable of ``[x, y, w, h]`` boxes given as fractions of the
            canvas; each value is scaled by ``self.default_sample_size`` and
            rounded to the nearest cell.

    Returns:
        A ``torch`` tensor of shape
        ``(default_sample_size, default_sample_size, byt5_max_length)``
        filled with 0s and 1s. Token spans that extend past
        ``byt5_max_length`` are truncated by the slice assignment.

    Raises:
        ValueError: If ``bboxes`` is ``None`` or holds more boxes than there
            are ``texts`` (there would be no token span for the extra boxes).
    """
    if bboxes is None:
        raise ValueError("`bboxes` must be provided together with `texts`.")

    resolution = self.default_sample_size
    text_idx_list = self.get_text_start_pos(texts)
    num_spans = len(text_idx_list) - 1
    if len(bboxes) > num_spans:
        raise ValueError(
            f"Got {len(bboxes)} bboxes but only {num_spans} texts; every bbox needs a matching text."
        )

    mask_tensor = torch.zeros(
        resolution, resolution, self.byt5_max_length,
    )
    for idx, bbox in enumerate(bboxes):
        # box is in [x, y, w, h] format; round each scaled value to the nearest cell
        scaled = [int(v * resolution + 0.5) for v in bbox]
        # keep the top-left corner on the grid ...
        x = min(max(scaled[0], 0), resolution - 1)
        y = min(max(scaled[1], 0), resolution - 1)
        # ... and make sure even a degenerate box covers at least one cell
        w = min(max(scaled[2], 1), resolution)
        h = min(max(scaled[3], 1), resolution)
        # area of [y:y+h, x:x+w]; slicing clamps boxes that overrun the grid
        mask_tensor[
            y: y + h,
            x: x + w,
            text_idx_list[idx]: text_idx_list[idx + 1]
        ] = 1
    return mask_tensor
902
+
903
def get_text_start_pos(self, texts):
    """Return the UTF-8 byte offsets that delimit each text's prompt segment.

    Each text is assumed to occupy one segment of the form
    ``Text "{text}" in 0, 1. `` in the glyph prompt, and segments are laid
    out back to back. Offsets are measured in UTF-8 bytes.

    Args:
        texts: Iterable of text strings.

    Returns:
        A list with ``len(texts) + 1`` integers: entry ``i`` is where the
        segment of ``texts[i]`` starts and entry ``i + 1`` is where it ends.
        For empty ``texts`` the result is ``[0]``.
    """
    # Segment template:  Text "{text}" in {color}, {type}.
    # NOTE(review): '0' and '1' are one-byte stand-ins for the two attribute
    # slots — presumably each real attribute marker occupies a single ByT5
    # token; confirm against the prompt formatter.
    attr_suffix = ", ".join(['0', '1'])

    pos_list = [0]
    for text in texts:
        segment = f'Text "{text}"' + " in " + attr_suffix + ". "
        # Only the byte length matters, so keep a running offset instead of
        # concatenating an ever-growing byte string.
        pos_list.append(pos_list[-1] + len(segment.encode('utf-8')))
    return pos_list
glyph_sdxl/modules/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from .simple_byt5_mapper import ByT5Mapper
2
+ from .byt5_block_byt5_mapper import T5EncoderBlockByT5Mapper
3
+
4
+ __all__ = [
5
+ 'ByT5Mapper',
6
+ 'T5EncoderBlockByT5Mapper',
7
+ ]
glyph_sdxl/modules/byt5_block_byt5_mapper.py ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
4
+
5
+ import warnings
6
+
7
+ import logging
8
+ from torch import Tensor
9
+ from diffusers import ModelMixin
10
+ from transformers.models.t5.modeling_t5 import T5LayerSelfAttention, T5LayerFF, T5LayerNorm
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class T5EncoderBlock(nn.Module):
    """A single encoder-only T5 block: self-attention followed by feed-forward.

    ``self.layer[0]`` is a ``T5LayerSelfAttention`` and ``self.layer[1]`` is a
    ``T5LayerFF``; there is no cross-attention sub-layer.
    """

    def __init__(self, config, has_relative_attention_bias=False):
        """Build the two sub-layers from a T5 ``config``.

        Args:
            config: Configuration object handed to ``T5LayerSelfAttention``
                and ``T5LayerFF``.
            has_relative_attention_bias: Forwarded to the self-attention
                layer.
        """
        super().__init__()
        self.layer = nn.ModuleList()
        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
        self.layer.append(T5LayerFF(config))

    @staticmethod
    def _clamp_fp16(hidden_states):
        """Clamp float16 activations to a finite range; other dtypes pass through.

        If any element is infinite the bound is ``finfo.max - 1000``,
        otherwise ``finfo.max``. This keeps fp16 training from propagating
        ``inf`` values.
        """
        if hidden_states.dtype != torch.float16:
            return hidden_states
        clamp_value = torch.where(
            torch.isinf(hidden_states).any(),
            torch.finfo(hidden_states.dtype).max - 1000,
            torch.finfo(hidden_states.dtype).max,
        )
        return torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        position_bias=None,
        layer_head_mask=None,
        output_attentions=False,
    ):
        """Run self-attention and feed-forward over ``hidden_states``.

        Args:
            hidden_states: Input activations.
            attention_mask: Forwarded to the self-attention layer.
            position_bias: Forwarded to the self-attention layer.
            layer_head_mask: Forwarded to the self-attention layer.
            output_attentions: Forwarded to the self-attention layer.

        Returns:
            A tuple ``(hidden_states, *extras)`` where ``extras`` is whatever
            the self-attention layer returned from index 2 onwards. The
            key/value cache slot at index 1 is dropped because caching is
            disabled here.
        """
        # NOTE(review): assumes the self-attention layer returns
        # (hidden_states, present_key_value, position_bias, [attn_weights]) and
        # accepts `past_key_value` — confirm against the pinned transformers version.
        self_attention_outputs = self.layer[0](
            hidden_states,
            attention_mask=attention_mask,
            position_bias=position_bias,
            layer_head_mask=layer_head_mask,
            past_key_value=None,
            use_cache=False,
            output_attentions=output_attentions,
        )
        hidden_states = self_attention_outputs[0]
        # Keep everything after the (unused) key/value cache slot.
        attention_outputs = self_attention_outputs[2:]

        # clamp inf values to enable fp16 training
        hidden_states = self._clamp_fp16(hidden_states)

        # Apply Feed Forward layer
        hidden_states = self.layer[-1](hidden_states)

        # clamp inf values to enable fp16 training
        hidden_states = self._clamp_fp16(hidden_states)

        return (hidden_states,) + attention_outputs
67
+
68
class T5EncoderBlockByT5Mapper(ModelMixin):
    """Stack of ``T5EncoderBlock`` layers with an optional projection to another width.

    Args:
        byt5_config: Configuration providing ``d_model`` and
            ``layer_norm_epsilon``; also forwarded to every ``T5EncoderBlock``.
        num_layers: Number of encoder blocks. With ``num_layers <= 0`` no
            blocks are created and only the normalisation (and the optional
            projection) is applied.
        sdxl_channels: When given, the output is projected from ``d_model`` to
            this width and normalised again; when ``None`` the output keeps
            width ``d_model``.
    """

    def __init__(self, byt5_config, num_layers, sdxl_channels=None):
        super().__init__()
        if num_layers > 0:
            # Only the first block is built with has_relative_attention_bias=True;
            # forward() hands the bias it returns to the following blocks.
            self.blocks = nn.ModuleList(
                [
                    T5EncoderBlock(byt5_config, has_relative_attention_bias=(i == 0))
                    for i in range(num_layers)
                ]
            )
        else:
            self.blocks = None
        self.layer_norm = T5LayerNorm(byt5_config.d_model, eps=byt5_config.layer_norm_epsilon)
        if sdxl_channels is not None:
            self.channel_mapper = nn.Linear(byt5_config.d_model, sdxl_channels)
            self.final_layer_norm = T5LayerNorm(sdxl_channels, eps=byt5_config.layer_norm_epsilon)
        else:
            self.channel_mapper = None
            self.final_layer_norm = None

    def get_extended_attention_mask(
        self,
        attention_mask: Tensor,
        input_shape: Tuple[int, ...],
        device: Optional[torch.device] = None,
        dtype: Optional[torch.dtype] = None,
    ) -> Tensor:
        """
        Turn a 0/1 attention mask into an additive mask that broadcasts over heads.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
                Either 2-D or 3-D.
            input_shape (`Tuple[int, ...]`):
                The shape of the input to the model. Only used in the error message.
            device (`torch.device`, *optional*):
                Unused; kept so the signature stays compatible with existing callers.
            dtype (`torch.dtype`, *optional*):
                Dtype of the returned mask. Defaults to `self.dtype`.

        Returns:
            `torch.Tensor`: The extended attention mask in `dtype`, 0.0 where the input
            mask is 1 and `torch.finfo(dtype).min` where it is 0.

        Raises:
            ValueError: If `attention_mask` is neither 2-D nor 3-D.
        """
        if dtype is None:
            dtype = self.dtype

        if attention_mask.dim() == 3:
            # A 3-D mask only needs a head axis inserted at position 1.
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # A 2-D mask gets two singleton axes inserted before its last axis.
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # The mask is added to the raw scores before the softmax, so the most
        # negative representable value effectively removes masked positions.
        extended_attention_mask = extended_attention_mask.to(dtype=dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min
        return extended_attention_mask

    def forward(self, inputs_embeds, attention_mask):
        """Apply the encoder blocks, normalise, and optionally project the embeddings.

        Args:
            inputs_embeds: Input embeddings; every axis except the last is
                treated as the input shape.
            attention_mask: 0/1 mask accepted by ``get_extended_attention_mask``.

        Returns:
            The transformed hidden states: last axis ``sdxl_channels`` wide when
            a channel mapper was configured, otherwise unchanged in width.
        """
        input_shape = inputs_embeds.size()[:-1]
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        hidden_states = inputs_embeds
        position_bias = None

        if self.blocks is not None:
            for layer_module in self.blocks:
                # Each block returns (hidden_states, position_bias); the bias is
                # fed back in so later blocks reuse the one from the first block.
                hidden_states, position_bias = layer_module(
                    hidden_states,
                    attention_mask=extended_attention_mask,
                    position_bias=position_bias,
                )
        hidden_states = self.layer_norm(hidden_states)
        if self.channel_mapper is not None:
            hidden_states = self.channel_mapper(hidden_states)
            hidden_states = self.final_layer_norm(hidden_states)
        return hidden_states
glyph_sdxl/modules/simple_byt5_mapper.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from diffusers import ModelMixin
2
+ import torch.nn as nn
3
+
4
class ByT5Mapper(ModelMixin):
    """Small MLP that maps ByT5 embeddings to the SDXL text-embedding width.

    Args:
        byt5_output_dim: Size of the last axis of the incoming embeddings.
        sdxl_text_dim: Size of the last axis of the produced embeddings.
    """

    def __init__(self, byt5_output_dim, sdxl_text_dim):
        super().__init__()
        # LayerNorm -> Linear -> ReLU -> Linear, constructed in this order.
        stages = [
            nn.LayerNorm(byt5_output_dim),
            nn.Linear(byt5_output_dim, sdxl_text_dim),
            nn.ReLU(),
            nn.Linear(sdxl_text_dim, sdxl_text_dim),
        ]
        self.mapper = nn.Sequential(*stages)

    def forward(self, byt5_embedding):
        """Return ``byt5_embedding`` passed through the mapper stack."""
        projected = self.mapper(byt5_embedding)
        return projected
16
+
glyph_sdxl/utils/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .parse_config import parse_config
2
+ from .constants import (
3
+ UNET_CKPT_NAME,
4
+ BYT5_CKPT_NAME,
5
+ BYT5_MAPPER_CKPT_NAME,
6
+ INSERTED_ATTN_CKPT_NAME,
7
+ huggingface_cache_dir,
8
+ )
9
+ from .load_pretrained_byt5 import load_byt5_and_byt5_tokenizer
10
+ from .format_prompt import PromptFormat, MultilingualPromptFormat
11
+
12
+ __all__ = [
13
+ 'parse_config',
14
+ 'UNET_CKPT_NAME',
15
+ 'BYT5_CKPT_NAME',
16
+ 'BYT5_MAPPER_CKPT_NAME',
17
+ 'huggingface_cache_dir',
18
+ 'load_byt5_and_byt5_tokenizer',
19
+ 'INSERTED_ATTN_CKPT_NAME',
20
+ 'PromptFormat',
21
+ 'MultilingualPromptFormat',
22
+ ]
23
+
glyph_sdxl/utils/constants.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Checkpoint file names. Each value matches a file listed under
# checkpoints/glyph-sdxl/ in this commit.
UNET_CKPT_NAME = "unet_lora.pt"
INSERTED_ATTN_CKPT_NAME = "unet_inserted_attn.pt"
BYT5_CKPT_NAME = "byt5_model.pt"
BYT5_MAPPER_CKPT_NAME = "byt5_mapper.pt"
# NOTE(review): presumably forwarded as `cache_dir` to the Hugging Face loaders,
# with None meaning the library default — confirm at the call sites.
huggingface_cache_dir = None
glyph_sdxl/utils/format_prompt.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import webcolors
3
+
4
+
5
def _css3_hex_to_names():
    """Return a ``{hex_value: css3_name}`` mapping on both old and new ``webcolors`` releases."""
    legacy_table = getattr(webcolors, "CSS3_HEX_TO_NAMES", None)
    if legacy_table is not None:
        return legacy_table
    # Newer webcolors releases no longer expose the constant; rebuild the
    # mapping from the public API. hex_to_name() resolves aliases that share
    # one hex value to a single name.
    table = {}
    for name in webcolors.names(spec="css3"):
        hex_value = webcolors.name_to_hex(name, spec="css3")
        table[hex_value] = webcolors.hex_to_name(hex_value, spec="css3")
    return table


def closest_color(requested_color):
    """Return the CSS3 color name nearest to ``requested_color``.

    Distance is the squared Euclidean distance between the RGB components.

    Args:
        requested_color: Indexable holding the red, green and blue components
            at positions 0, 1 and 2.

    Returns:
        The name of the nearest CSS3 color. When several colors are equally
        near, the one encountered last in the mapping wins.
    """
    best_name = None
    best_distance = None
    for hex_value, name in _css3_hex_to_names().items():
        r_c, g_c, b_c = webcolors.hex_to_rgb(hex_value)
        distance = (
            (r_c - requested_color[0]) ** 2
            + (g_c - requested_color[1]) ** 2
            + (b_c - requested_color[2]) ** 2
        )
        # "<=" rather than "<": a later entry at the same distance replaces
        # the earlier one, matching the previous dict-overwrite behaviour.
        if best_distance is None or distance <= best_distance:
            best_name, best_distance = name, distance
    return best_name
14
+
15
def convert_rgb_to_names(rgb_tuple):
    """Map an RGB triple to a color name, falling back to the nearest named color.

    Args:
        rgb_tuple: RGB value accepted by ``webcolors.rgb_to_name``.

    Returns:
        The exact name when ``webcolors.rgb_to_name`` knows one, otherwise the
        result of ``closest_color(rgb_tuple)``.
    """
    try:
        return webcolors.rgb_to_name(rgb_tuple)
    except ValueError:
        # No exact named match: settle for the nearest named color.
        return closest_color(rgb_tuple)
21
+
22
class PromptFormat():
    """Build glyph text prompts of the form ``Text "{text}" in <color-i>, <font-j>. ``.

    Args:
        font_path: JSON file mapping font-family names to font indices.
        color_path: JSON file mapping color names to color indices.
    """

    def __init__(
        self,
        font_path: str = 'assets/font_idx_512.json',
        color_path: str = 'assets/color_idx.json',
    ):
        # Read as UTF-8 explicitly so loading does not depend on the platform
        # default encoding.
        with open(font_path, 'r', encoding='utf-8') as f:
            self.font_dict = json.load(f)
        with open(color_path, 'r', encoding='utf-8') as f:
            self.color_dict = json.load(f)

    def format_checker(self, texts, styles):
        """Validate that every text has a style with a known font and color.

        Args:
            texts: Sequence of text strings.
            styles: Sequence of dicts with ``'font-family'`` and ``'color'``
                (a hex color string) entries, one per text.

        Raises:
            AssertionError: On a length mismatch, an unknown font-family, or a
                color whose resolved name is not in the color table. Raised
                explicitly rather than via ``assert`` so the checks survive
                ``python -O``; the exception type is unchanged for callers.
        """
        if len(texts) != len(styles):
            raise AssertionError('length of texts must be equal to length of styles')
        for style in styles:
            if style['font-family'] not in self.font_dict:
                raise AssertionError(f"invalid font-family: {style['font-family']}")
            rgb_color = webcolors.hex_to_rgb(style['color'])
            color_name = convert_rgb_to_names(rgb_color)
            if color_name not in self.color_dict:
                raise AssertionError(
                    f"invalid color hex {style['color']} (resolved to color name '{color_name}')"
                )

    def format_prompt(self, texts, styles):
        """Return the concatenated prompt for all ``(text, style)`` pairs.

        Each pair contributes ``Text "{text}" in <color-{i}>, <font-{j}>. ``,
        where ``i`` and ``j`` are looked up in the color and font tables.

        Args:
            texts: Sequence of text strings.
            styles: Sequence of style dicts, see ``format_checker``.

        Returns:
            The prompt string; empty when ``texts`` is empty.
        """
        self.format_checker(texts, styles)

        segments = []
        for text, style in zip(texts, styles):
            # format color
            rgb_color = webcolors.hex_to_rgb(style["color"])
            color_name = convert_rgb_to_names(rgb_color)
            attr_list = [f"<color-{self.color_dict[color_name]}>"]

            # format font
            attr_list.append(f"<font-{self.font_dict[style['font-family']]}>")

            attr_suffix = ", ".join(attr_list)
            segments.append(f'Text "{text}" in {attr_suffix}. ')
        return "".join(segments)
67
+
68
+
69
class MultilingualPromptFormat():
    """Build glyph text prompts of the form ``Text "{text}" in <color-i>, <xx-font-j>. ``.

    ``xx`` is the first two characters of the style's font-family name.

    Args:
        font_path: JSON file mapping font-family names to font indices.
        color_path: JSON file mapping color names to color indices.
    """

    def __init__(
        self,
        font_path: str = 'assets/multilingual_cn-en_font_idx.json',
        color_path: str = 'assets/color_idx.json',
    ):
        # Read as UTF-8 explicitly so loading does not depend on the platform
        # default encoding.
        with open(font_path, 'r', encoding='utf-8') as f:
            self.font_dict = json.load(f)
        with open(color_path, 'r', encoding='utf-8') as f:
            self.color_dict = json.load(f)

    def format_checker(self, texts, styles):
        """Validate that every text has a style with a known font and color.

        Args:
            texts: Sequence of text strings.
            styles: Sequence of dicts with ``'font-family'`` and ``'color'``
                (a hex color string) entries, one per text.

        Raises:
            AssertionError: On a length mismatch, an unknown font-family, or a
                color whose resolved name is not in the color table. Raised
                explicitly rather than via ``assert`` so the checks survive
                ``python -O``; the exception type is unchanged for callers.
        """
        if len(texts) != len(styles):
            raise AssertionError('length of texts must be equal to length of styles')
        for style in styles:
            if style['font-family'] not in self.font_dict:
                raise AssertionError(f"invalid font-family: {style['font-family']}")
            rgb_color = webcolors.hex_to_rgb(style['color'])
            color_name = convert_rgb_to_names(rgb_color)
            if color_name not in self.color_dict:
                raise AssertionError(
                    f"invalid color hex {style['color']} (resolved to color name '{color_name}')"
                )

    def format_prompt(self, texts, styles):
        """Return the concatenated prompt for all ``(text, style)`` pairs.

        Each pair contributes ``Text "{text}" in <color-{i}>, <{xx}-font-{j}>. ``,
        where ``i`` and ``j`` are looked up in the color and font tables and
        ``xx`` is the first two characters of the font-family name.

        Args:
            texts: Sequence of text strings.
            styles: Sequence of style dicts, see ``format_checker``.

        Returns:
            The prompt string; empty when ``texts`` is empty.
        """
        self.format_checker(texts, styles)

        segments = []
        for text, style in zip(texts, styles):
            # format color
            rgb_color = webcolors.hex_to_rgb(style["color"])
            color_name = convert_rgb_to_names(rgb_color)
            attr_list = [f"<color-{self.color_dict[color_name]}>"]

            # format font: the two-character prefix of the family name is part of the token
            font_family = style['font-family']
            attr_list.append(f"<{font_family[:2]}-font-{self.font_dict[font_family]}>")

            attr_suffix = ", ".join(attr_list)
            segments.append(f'Text "{text}" in {attr_suffix}. ')
        return "".join(segments)
glyph_sdxl/utils/load_pretrained_byt5.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+
3
+ from transformers import AutoTokenizer, T5ForConditionalGeneration
4
+ from diffusers.utils import logging
5
+
6
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
7
+
8
def add_special_token(tokenizer, text_encoder, add_color, add_font, color_ann_path, font_ann_path, multilingual=False):
    """Register color/font special tokens and resize the encoder's embeddings.

    Args:
        tokenizer: Object providing ``add_tokens(tokens, special_tokens=...)``
            and ``len()``.
        text_encoder: Object providing ``resize_token_embeddings(size)``.
        add_color: Whether to add the ``<color-i>`` tokens.
        add_font: Whether to add the font tokens.
        color_ann_path: JSON file whose entry count determines how many
            ``<color-i>`` tokens exist.
        font_ann_path: JSON file mapping font codes to indices.
        multilingual: When true, font tokens are ``<{xx}-font-{idx}>`` with
            ``xx`` the first two characters of each font code and ``idx`` its
            value in the JSON file; otherwise they are ``<font-0>`` up to
            ``<font-{n-1}>`` for ``n`` entries.

    Both annotation files are read even when the matching ``add_*`` flag is
    false. Color tokens are added before font tokens, and that order is kept
    as-is.
    """
    # Read as UTF-8 explicitly so loading does not depend on the platform
    # default encoding.
    with open(font_ann_path, 'r', encoding='utf-8') as f:
        idx_font_dict = json.load(f)
    with open(color_ann_path, 'r', encoding='utf-8') as f:
        idx_color_dict = json.load(f)

    if multilingual:
        font_token = [
            f'<{font_code[:2]}-font-{font_idx}>'
            for font_code, font_idx in idx_font_dict.items()
        ]
    else:
        font_token = [f'<font-{i}>' for i in range(len(idx_font_dict))]
    color_token = [f'<color-{i}>' for i in range(len(idx_color_dict))]

    additional_special_tokens = []
    if add_color:
        additional_special_tokens += color_token
    if add_font:
        additional_special_tokens += font_token
    tokenizer.add_tokens(additional_special_tokens, special_tokens=True)
    text_encoder.resize_token_embeddings(len(tokenizer))
29
+
30
def load_byt5_and_byt5_tokenizer(
    byt5_name='google/byt5-small',
    special_token=False,
    color_special_token=False,
    font_special_token=False,
    color_ann_path='assets/color_idx.json',
    font_ann_path='assets/font_idx_512.json',
    huggingface_cache_dir=None,
    multilingual=False,
):
    """Load the ByT5 encoder and tokenizer, optionally adding color/font tokens.

    Args:
        byt5_name: Model identifier passed to ``from_pretrained``.
        special_token: When true, ``add_special_token`` is applied to the
            loaded tokenizer and encoder.
        color_special_token: Passed to ``add_special_token`` as ``add_color``.
        font_special_token: Passed to ``add_special_token`` as ``add_font``.
        color_ann_path: Color annotation JSON handed to ``add_special_token``.
        font_ann_path: Font annotation JSON handed to ``add_special_token``.
        huggingface_cache_dir: Passed as ``cache_dir`` to ``from_pretrained``.
        multilingual: Passed through to ``add_special_token``.

    Returns:
        ``(byt5_text_encoder, byt5_tokenizer)``.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        byt5_name, cache_dir=huggingface_cache_dir,
    )
    seq2seq_model = T5ForConditionalGeneration.from_pretrained(
        byt5_name, cache_dir=huggingface_cache_dir,
    )
    # Only the encoder half of the model is kept.
    encoder = seq2seq_model.get_encoder()

    if special_token:
        token_options = dict(
            add_color=color_special_token,
            add_font=font_special_token,
            color_ann_path=color_ann_path,
            font_ann_path=font_ann_path,
            multilingual=multilingual,
        )
        add_special_token(tokenizer, encoder, **token_options)

    logger.info('Loaded original byt5 weight')

    return encoder, tokenizer
glyph_sdxl/utils/parse_config.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import os.path as osp
4
+ from mmengine.config import Config
5
+
6
+
7
def parse_config(path=None):
    """Load a config file and record on it the path it was loaded from.

    Args:
        path: Config file path. When ``None``, the path is taken from the
            positional command-line argument ``config_dir``.

    Returns:
        The object returned by ``Config.fromfile``, with its ``config_dir``
        attribute set to the path that was loaded.
    """
    config_path = path
    if config_path is None:
        # No explicit path: read it from the command line.
        cli = argparse.ArgumentParser()
        cli.add_argument('config_dir', type=str)
        config_path = cli.parse_args().config_dir

    config = Config.fromfile(config_path)
    config.config_dir = config_path
    return config
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ transformers==4.36.2
2
+ diffusers==0.26.1
3
+ mmengine
4
+ accelerate
5
+ torch==2.2.0
6
+ torchvision==0.17.0
7
+ deepspeed
8
+ peft
9
+ webcolors
10
+ gradio