Artiprocher committed on
Commit
2b7b2cd
·
1 Parent(s): 96b5d0f
README.md CHANGED
@@ -1,3 +1,49 @@
1
  ---
2
- license: mit
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ license: apache-2.0
3
+ tags:
4
+ - pytorch
5
+ - diffusers
6
+ - text-to-image
7
  ---
8
+
9
+ # Chinese Latent Diffusion Model
10
+
11
+ 我们开源了一个中文 Latent Diffusion 模型,为中文古诗词生成精美配图。
12
+
13
+ * Github: [EasyNLP](https://github.com/alibaba/EasyNLP)
14
+
15
+ ## 模型介绍
16
+
17
+ 模型分成四部分:
18
+
19
+ * Text Encoder:把中文文本输入转化成 Embedding 向量
20
+ * Latent Diffusion Model:在 Latent 空间中根据文本输入处理随机生成的噪声
21
+ * Autoencoder:将 Latent 空间中的张量还原为图片
22
+ * Super Resolution:提升图片分辨率
23
+
24
+ 我们使用中文模型 [CLIP-ViT-L](https://wukong-dataset.github.io/wukong-dataset/benchmark.html) 作为 Text Encoder,使用 [latent-diffusion](https://github.com/CompVis/latent-diffusion) 中的 Autoencoder,使用 [ESRGAN](https://github.com/xinntao/ESRGAN) 作为 Super Resolution 模型。我们基于 [Noah-Wukong](https://wukong-dataset.github.io/wukong-dataset/) 数据集中的两千万图文对,对 Latent Diffusion Model 进行了预训练。
25
+
26
+ 我们在私有美食数据集上进行了微调,以生成精美的美食图片。
27
+
28
+ ## 使用
29
+
30
+ 基于 Diffusers 开发,请先安装 Diffusers:
31
+
32
+ ```
33
+ pip install diffusers
34
+ ```
35
+
36
+ ```python
37
+ from LdmZhPipeline import LDMZhTextToImagePipeline
38
+
39
+ generator = LDMZhTextToImagePipeline.from_pretrained("alibaba-pai/pai-diffusion-food-large-zh")
40
+ generator.to("cuda")
41
+ image = generator("番茄炒蛋").images[0]
42
+ image.save("food.png")
43
+ ```
44
+
45
+ 超分辨率模块默认是关闭的,如需启用,请添加参数 `use_sr=True`。
46
+
47
+ ```python
48
+ image = generator("番茄炒蛋", use_sr=True).images[0]
49
+ ```
bert/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "_name_or_path": "./WukongClipTextEncoder"
3
+ }
bert/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f631d47508ad78826f22d0498b57bf1c7bd6c6530c8ca16496e83acfec780415
3
+ size 407646444
model_index.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LDMTextToImagePipeline",
3
+ "_diffusers_version": "0.0.4",
4
+ "bert": [
5
+ "LdmZhPipeline",
6
+ "WukongClipTextEncoder"
7
+ ],
8
+ "scheduler": [
9
+ "diffusers",
10
+ "DDIMScheduler"
11
+ ],
12
+ "tokenizer": [
13
+ "transformers",
14
+ "BertTokenizer"
15
+ ],
16
+ "unet": [
17
+ "diffusers",
18
+ "UNet2DConditionModel"
19
+ ],
20
+ "vqvae": [
21
+ "diffusers",
22
+ "AutoencoderKL"
23
+ ],
24
+ "sr": [
25
+ "LdmZhPipeline",
26
+ "ESRGAN"
27
+ ]
28
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "DDIMScheduler",
3
+ "_diffusers_version": "0.0.4",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "num_train_timesteps": 1000,
9
+ "timestep_values": null,
10
+ "trained_betas": null
11
+ }
sr/config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "_name_or_path": "./ESRGAN"
3
+ }
sr/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e3e8b6f99d7377a864d9db9bf09d4d345db74dca801db1f4004757ed2ab7746
3
+ size 67028637
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_basic_tokenize": true,
4
+ "do_lower_case": true,
5
+ "mask_token": "[MASK]",
6
+ "model_max_length": 512,
7
+ "name_or_path": "fusing/latent-diffusion-text2im-large",
8
+ "never_split": null,
9
+ "pad_token": "[PAD]",
10
+ "sep_token": "[SEP]",
11
+ "special_tokens_map_file": null,
12
+ "strip_accents": null,
13
+ "tokenize_chinese_chars": true,
14
+ "tokenizer_class": "BertTokenizer",
15
+ "unk_token": "[UNK]"
16
+ }
tokenizer/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
unet/config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "UNet2DConditionModel",
3
+ "_diffusers_version": "0.0.4",
4
+ "act_fn": "silu",
5
+ "attention_head_dim": 8,
6
+ "block_out_channels": [
7
+ 320,
8
+ 640,
9
+ 1280,
10
+ 1280
11
+ ],
12
+ "center_input_sample": false,
13
+ "down_block_types": [
14
+ "CrossAttnDownBlock2D",
15
+ "CrossAttnDownBlock2D",
16
+ "CrossAttnDownBlock2D",
17
+ "DownBlock2D"
18
+ ],
19
+ "downsample_padding": 1,
20
+ "flip_sin_to_cos": true,
21
+ "freq_shift": 0,
22
+ "in_channels": 4,
23
+ "layers_per_block": 2,
24
+ "mid_block_scale_factor": 1,
25
+ "norm_eps": 1e-05,
26
+ "norm_num_groups": 32,
27
+ "out_channels": 4,
28
+ "sample_size": 32,
29
+ "up_block_types": [
30
+ "UpBlock2D",
31
+ "CrossAttnUpBlock2D",
32
+ "CrossAttnUpBlock2D",
33
+ "CrossAttnUpBlock2D"
34
+ ],
35
+ "cross_attention_dim": 768
36
+ }
unet/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d5961eef51e952fef92c1e7eeedbfda88dcbf2aa7542db7c45b101fcc97402a
3
+ size 3438322101
vqvae/config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKL",
3
+ "_diffusers_version": "0.1.2",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 512,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "DownEncoderBlock2D",
13
+ "DownEncoderBlock2D",
14
+ "DownEncoderBlock2D",
15
+ "DownEncoderBlock2D"
16
+ ],
17
+ "in_channels": 3,
18
+ "latent_channels": 4,
19
+ "layers_per_block": 2,
20
+ "out_channels": 3,
21
+ "sample_size": 256,
22
+ "up_block_types": [
23
+ "UpDecoderBlock2D",
24
+ "UpDecoderBlock2D",
25
+ "UpDecoderBlock2D",
26
+ "UpDecoderBlock2D"
27
+ ]
28
+ }
vqvae/diffusion_pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1b134cded8eb78b184aefb8805b6b572f36fa77b255c483665dda931fa0130c5
3
+ size 334707217