feishen29 committed
Commit 4a4c777
1 Parent(s): b18e2e0

Delete ckpt

Files changed (37)
  1. ckpt/.DS_Store +0 -0
  2. ckpt/ControlNet/body_pose_model.pth +0 -3
  3. ckpt/ControlNet/facenet.pth +0 -3
  4. ckpt/ControlNet/hand_pose_model.pth +0 -3
  5. ckpt/IMAGDressing-v1_512.pt +0 -3
  6. ckpt/buffalo_l.zip +0 -3
  7. ckpt/control_v11p_sd15_openpose/.gitattributes +0 -34
  8. ckpt/control_v11p_sd15_openpose/README.md +0 -163
  9. ckpt/control_v11p_sd15_openpose/config.json +0 -42
  10. ckpt/control_v11p_sd15_openpose/control_net_open_pose.py +0 -60
  11. ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.bin +0 -3
  12. ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.fp16.bin +0 -3
  13. ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.fp16.safetensors +0 -3
  14. ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.safetensors +0 -3
  15. ckpt/control_v11p_sd15_openpose/images/control.png +0 -0
  16. ckpt/control_v11p_sd15_openpose/images/image_out.png +0 -0
  17. ckpt/control_v11p_sd15_openpose/images/input.png +0 -0
  18. ckpt/control_v11p_sd15_openpose/sd.png +0 -0
  19. ckpt/image_encoder/.DS_Store +0 -0
  20. ckpt/image_encoder/config.json +0 -23
  21. ckpt/image_encoder/model.safetensors +0 -3
  22. ckpt/image_encoder/pytorch_model.bin +0 -3
  23. ckpt/ip-adapter-faceid-plus_sd15.bin +0 -3
  24. ckpt/scheduler/scheduler_config.json +0 -21
  25. ckpt/sd-vae-ft-mse/.gitattributes +0 -33
  26. ckpt/sd-vae-ft-mse/README.md +0 -83
  27. ckpt/sd-vae-ft-mse/config.json +0 -29
  28. ckpt/sd-vae-ft-mse/diffusion_pytorch_model.bin +0 -3
  29. ckpt/sd-vae-ft-mse/diffusion_pytorch_model.safetensors +0 -3
  30. ckpt/text_encoder/config.json +0 -25
  31. ckpt/text_encoder/model.safetensors +0 -3
  32. ckpt/tokenizer/merges.txt +0 -0
  33. ckpt/tokenizer/special_tokens_map.json +0 -24
  34. ckpt/tokenizer/tokenizer_config.json +0 -33
  35. ckpt/tokenizer/vocab.json +0 -0
  36. ckpt/unet/config.json +0 -60
  37. ckpt/unet/diffusion_pytorch_model.safetensors +0 -3
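
Most of the deleted weight files above show "+0 -3" because they were tracked with Git LFS: the repository stored only a three-line pointer file, not the binary itself. As a reference for reading the hunks below, a pointer file has this layout (the placeholder values here are illustrative; each hunk shows the real `oid` and `size` for its file):

```
version https://git-lfs.github.com/spec/v1
oid sha256:<sha-256 digest of the stored binary>
size <size of the stored binary in bytes>
```

Deleting such a file therefore removes only the three-line pointer from the repository tree; the large binary it referenced is simply no longer fetched on checkout.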
ckpt/.DS_Store DELETED
Binary file (6.15 kB)
 
ckpt/ControlNet/body_pose_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:25a948c16078b0f08e236bda51a385d855ef4c153598947c28c0d47ed94bb746
- size 209267595
 
ckpt/ControlNet/facenet.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8beb52e548624ffcc4aed12af7aee7dcbfaeea420c75609fee999fe7add79d43
- size 153718792
 
ckpt/ControlNet/hand_pose_model.pth DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b76b00d1750901abd07b9f9d8c98cc3385b8fe834a26d4b4f0aad439e75fc600
- size 147341049
 
ckpt/IMAGDressing-v1_512.pt DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c37a38119c1420735345013ce79b28f927f877267297780b6669f0faa9701ce6
- size 3547959907
 
ckpt/buffalo_l.zip DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:80ffe37d8a5940d59a7384c201a2a38d4741f2f3c51eef46ebb28218a7b0ca2f
- size 288621354
 
ckpt/control_v11p_sd15_openpose/.gitattributes DELETED
@@ -1,34 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
ckpt/control_v11p_sd15_openpose/README.md DELETED
@@ -1,163 +0,0 @@
- ---
- license: openrail
- base_model: runwayml/stable-diffusion-v1-5
- tags:
- - art
- - controlnet
- - stable-diffusion
- - controlnet-v1-1
- - image-to-image
- duplicated_from: ControlNet-1-1-preview/control_v11p_sd15_openpose
- ---
-
- # Controlnet - v1.1 - *openpose Version*
-
- **Controlnet v1.1** is the successor model of [Controlnet v1.0](https://huggingface.co/lllyasviel/ControlNet)
- and was released in [lllyasviel/ControlNet-v1-1](https://huggingface.co/lllyasviel/ControlNet-v1-1) by [Lvmin Zhang](https://huggingface.co/lllyasviel).
-
- This checkpoint is a conversion of [the original checkpoint](https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_openpose.pth) into `diffusers` format.
- It can be used in combination with **Stable Diffusion**, such as [runwayml/stable-diffusion-v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5).
-
-
- For more details, please also have a look at the [🧨 Diffusers docs](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/controlnet).
-
-
- ControlNet is a neural network structure to control diffusion models by adding extra conditions.
-
- ![img](./sd.png)
-
- This checkpoint corresponds to the ControlNet conditioned on **openpose images**.
-
- ## Model Details
- - **Developed by:** Lvmin Zhang, Maneesh Agrawala
- - **Model type:** Diffusion-based text-to-image generation model
- - **Language(s):** English
- - **License:** [The CreativeML OpenRAIL M license](https://huggingface.co/spaces/CompVis/stable-diffusion-license) is an [Open RAIL M license](https://www.licenses.ai/blog/2022/8/18/naming-convention-of-responsible-ai-licenses), adapted from the work that [BigScience](https://bigscience.huggingface.co/) and [the RAIL Initiative](https://www.licenses.ai/) are jointly carrying in the area of responsible AI licensing. See also [the article about the BLOOM Open RAIL license](https://bigscience.huggingface.co/blog/the-bigscience-rail-license) on which our license is based.
- - **Resources for more information:** [GitHub Repository](https://github.com/lllyasviel/ControlNet), [Paper](https://arxiv.org/abs/2302.05543).
- - **Cite as:**
-
- @misc{zhang2023adding,
- title={Adding Conditional Control to Text-to-Image Diffusion Models},
- author={Lvmin Zhang and Maneesh Agrawala},
- year={2023},
- eprint={2302.05543},
- archivePrefix={arXiv},
- primaryClass={cs.CV}
- }
-
- ## Introduction
-
- Controlnet was proposed in [*Adding Conditional Control to Text-to-Image Diffusion Models*](https://arxiv.org/abs/2302.05543) by
- Lvmin Zhang, Maneesh Agrawala.
-
- The abstract reads as follows:
-
- *We present a neural network structure, ControlNet, to control pretrained large diffusion models to support additional input conditions.
- The ControlNet learns task-specific conditions in an end-to-end way, and the learning is robust even when the training dataset is small (< 50k).
- Moreover, training a ControlNet is as fast as fine-tuning a diffusion model, and the model can be trained on a personal devices.
- Alternatively, if powerful computation clusters are available, the model can scale to large amounts (millions to billions) of data.
- We report that large diffusion models like Stable Diffusion can be augmented with ControlNets to enable conditional inputs like edge maps, segmentation maps, keypoints, etc.
- This may enrich the methods to control large diffusion models and further facilitate related applications.*
-
- ## Example
-
- It is recommended to use the checkpoint with [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5) as the checkpoint
- has been trained on it.
- Experimentally, the checkpoint can be used with other diffusion models such as dreamboothed stable diffusion.
-
- **Note**: If you want to process an image to create the auxiliary conditioning, external dependencies are required as shown below:
-
- 1. Install https://github.com/patrickvonplaten/controlnet_aux
-
- ```sh
- $ pip install controlnet_aux==0.3.0
- ```
-
- 2. Let's install `diffusers` and related packages:
-
- ```
- $ pip install diffusers transformers accelerate
- ```
-
- 3. Run code:
-
- ```python
- import torch
- import os
- from huggingface_hub import HfApi
- from pathlib import Path
- from diffusers.utils import load_image
- from PIL import Image
- import numpy as np
- from controlnet_aux import OpenposeDetector
-
- from diffusers import (
- ControlNetModel,
- StableDiffusionControlNetPipeline,
- UniPCMultistepScheduler,
- )
-
- checkpoint = "lllyasviel/control_v11p_sd15_openpose"
-
- image = load_image(
- "https://huggingface.co/lllyasviel/control_v11p_sd15_openpose/resolve/main/images/input.png"
- )
-
- prompt = "chef in the kitchen"
-
- processor = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
-
- control_image = processor(image, hand_and_face=True)
- control_image.save("./images/control.png")
-
- controlnet = ControlNetModel.from_pretrained(checkpoint, torch_dtype=torch.float16)
- pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
- )
-
- pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
- pipe.enable_model_cpu_offload()
-
- generator = torch.manual_seed(0)
- image = pipe(prompt, num_inference_steps=30, generator=generator, image=control_image).images[0]
-
- image.save('images/image_out.png')
-
- ```
-
- ![bird](./images/input.png)
-
- ![bird_canny](./images/control.png)
-
- ![bird_canny_out](./images/image_out.png)
-
- ## Other released checkpoints v1-1
-
- The authors released 14 different checkpoints, each trained with [Stable Diffusion v1-5](https://huggingface.co/runwayml/stable-diffusion-v1-5)
- on a different type of conditioning:
-
- | Model Name | Control Image Overview| Control Image Example | Generated Image Example |
- |---|---|---|---|
- |[lllyasviel/control_v11p_sd15_canny](https://huggingface.co/lllyasviel/control_v11p_sd15_canny)<br/> *Trained with canny edge detection* | A monochrome image with white edges on a black background.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_canny/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11e_sd15_ip2p](https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p)<br/> *Trained with pixel to pixel instruction* | No condition .|<a href="https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11e_sd15_ip2p/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_inpaint](https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint)<br/> Trained with image inpainting | No condition.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint/resolve/main/images/output.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_inpaint/resolve/main/images/output.png"/></a>|
- |[lllyasviel/control_v11p_sd15_mlsd](https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd)<br/> Trained with multi-level line segment detection | An image with annotated line segments.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_mlsd/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11f1p_sd15_depth](https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth)<br/> Trained with depth estimation | An image with depth information, usually represented as a grayscale image.|<a href="https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_normalbae](https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae)<br/> Trained with surface normal estimation | An image with surface normal information, usually represented as a color-coded image.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_normalbae/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_seg](https://huggingface.co/lllyasviel/control_v11p_sd15_seg)<br/> Trained with image segmentation | An image with segmented regions, usually represented as a color-coded image.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_seg/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_seg/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_seg/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_seg/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_lineart](https://huggingface.co/lllyasviel/control_v11p_sd15_lineart)<br/> Trained with line art generation | An image with line art, usually black lines on a white background.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_lineart/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_lineart/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_lineart/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_lineart/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15s2_lineart_anime](https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime)<br/> Trained with anime line art generation | An image with anime-style line art.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15s2_lineart_anime/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_openpose](https://huggingface.co/lllyasviel/control_v11p_sd15_openpose)<br/> Trained with human pose estimation | An image with human poses, usually represented as a set of keypoints or skeletons.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_openpose/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_openpose/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_openpose/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_openpose/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_scribble](https://huggingface.co/lllyasviel/control_v11p_sd15_scribble)<br/> Trained with scribble-based image generation | An image with scribbles, usually random or user-drawn strokes.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_scribble/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_scribble/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_scribble/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_scribble/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11p_sd15_softedge](https://huggingface.co/lllyasviel/control_v11p_sd15_softedge)<br/> Trained with soft edge image generation | An image with soft edges, usually to create a more painterly or artistic effect.|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_softedge/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11p_sd15_softedge/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11p_sd15_softedge/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11p_sd15_softedge/resolve/main/images/image_out.png"/></a>|
- |[lllyasviel/control_v11e_sd15_shuffle](https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle)<br/> Trained with image shuffling | An image with shuffled patches or regions.|<a href="https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png"><img width="64" style="margin:0;padding:0;" src="https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/control.png"/></a>|<a href="https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/image_out.png"><img width="64" src="https://huggingface.co/lllyasviel/control_v11e_sd15_shuffle/resolve/main/images/image_out.png"/></a>|
-
- ## Improvements in Openpose 1.1:
-
- - The improvement of this model is mainly based on our improved implementation of OpenPose. We carefully reviewed the difference between the pytorch OpenPose and CMU's c++ openpose. Now the processor should be more accurate, especially for hands. The improvement of processor leads to the improvement of Openpose 1.1.
- - More inputs are supported (hand and face).
- - The training dataset of previous cnet 1.0 has several problems including (1) a small group of greyscale human images are duplicated thousands of times (!!), causing the previous model somewhat likely to generate grayscale human images; (2) some images has low quality, very blurry, or significant JPEG artifacts; (3) a small group of images has wrong paired prompts caused by a mistake in our data processing scripts. The new model fixed all problems of the training dataset and should be more reasonable in many cases.
-
- ## More information
-
- For more information, please also have a look at the [Diffusers ControlNet Blog Post](https://huggingface.co/blog/controlnet) and have a look at the [official docs](https://github.com/lllyasviel/ControlNet-v1-1-nightly).
 
ckpt/control_v11p_sd15_openpose/config.json DELETED
@@ -1,42 +0,0 @@
- {
- "_class_name": "ControlNetModel",
- "_diffusers_version": "0.16.0.dev0",
- "_name_or_path": "/home/patrick/controlnet_v1_1/control_v11p_sd15_openpose",
- "act_fn": "silu",
- "attention_head_dim": 8,
- "block_out_channels": [
- 320,
- 640,
- 1280,
- 1280
- ],
- "class_embed_type": null,
- "conditioning_embedding_out_channels": [
- 16,
- 32,
- 96,
- 256
- ],
- "controlnet_conditioning_channel_order": "rgb",
- "cross_attention_dim": 768,
- "down_block_types": [
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "DownBlock2D"
- ],
- "downsample_padding": 1,
- "flip_sin_to_cos": true,
- "freq_shift": 0,
- "in_channels": 4,
- "layers_per_block": 2,
- "mid_block_scale_factor": 1,
- "norm_eps": 1e-05,
- "norm_num_groups": 32,
- "num_class_embeds": null,
- "only_cross_attention": false,
- "projection_class_embeddings_input_dim": null,
- "resnet_time_scale_shift": "default",
- "upcast_attention": false,
- "use_linear_projection": false
- }
 
ckpt/control_v11p_sd15_openpose/control_net_open_pose.py DELETED
@@ -1,60 +0,0 @@
- #!/usr/bin/env python3
- import torch
- import os
- from huggingface_hub import HfApi
- from pathlib import Path
- from diffusers.utils import load_image
- from controlnet_aux import OpenposeDetector
-
- from diffusers import (
- ControlNetModel,
- StableDiffusionControlNetPipeline,
- UniPCMultistepScheduler,
- )
- import sys
-
- checkpoint = sys.argv[1]
-
- <<<<<<< HEAD
- image = load_image("https://github.com/lllyasviel/ControlNet-v1-1-nightly/raw/main/test_imgs/demo.jpg").resize((512, 512))
- prompt = "The pope with sunglasses rapping with a mic"
-
-
- openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
- image = openpose(image, hand_and_face=True)
- =======
- image = load_image("https://huggingface.co/lllyasviel/sd-controlnet-openpose/resolve/main/images/pose.png")
- prompt = "chef in the kitchen"
-
-
- openpose = OpenposeDetector.from_pretrained('lllyasviel/ControlNet')
- image = openpose(image)
- >>>>>>> 6e2c3bc1a649ac194d79bb2f4ee11900d7f0e8f6
-
- controlnet = ControlNetModel.from_pretrained(checkpoint, torch_dtype=torch.float16)
- pipe = StableDiffusionControlNetPipeline.from_pretrained(
- "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
- )
-
- pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
- pipe.enable_model_cpu_offload()
-
- generator = torch.manual_seed(33)
- <<<<<<< HEAD
- out_image = pipe(prompt, num_inference_steps=35, generator=generator, image=image).images[0]
- =======
- out_image = pipe(prompt, num_inference_steps=20, generator=generator, image=image).images[0]
- >>>>>>> 6e2c3bc1a649ac194d79bb2f4ee11900d7f0e8f6
-
- path = os.path.join(Path.home(), "images", "aa.png")
- out_image.save(path)
-
- api = HfApi()
-
- api.upload_file(
- path_or_fileobj=path,
- path_in_repo=path.split("/")[-1],
- repo_id="patrickvonplaten/images",
- repo_type="dataset",
- )
- print("https://huggingface.co/datasets/patrickvonplaten/images/blob/main/aa.png")
 
ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:40c80b93aea10c31de2d282adbe8bbb945611a037ca36e0cd55d3ee7d59fedce
- size 1445254969
 
ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.fp16.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:65c13c04dc49231f7044373e3f0dbd2f44b01a445c8577ea919cd5ff5fac29a6
- size 722698343
 
ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.fp16.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b25b1125e870275550b2a7de289056cb3c236c01c293bd5ba883657b1c006e3e
- size 722598642
 
ckpt/control_v11p_sd15_openpose/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:46b10abb28f3750aba7eea208e188539f7945d9256de9a248cbb9902f2276988
- size 1445157124
 
ckpt/control_v11p_sd15_openpose/images/control.png DELETED
Binary file (8.41 kB)
 
ckpt/control_v11p_sd15_openpose/images/image_out.png DELETED
Binary file (655 kB)
 
ckpt/control_v11p_sd15_openpose/images/input.png DELETED
Binary file (16.3 kB)
 
ckpt/control_v11p_sd15_openpose/sd.png DELETED
Binary file (59.5 kB)
 
ckpt/image_encoder/.DS_Store DELETED
Binary file (6.15 kB)
 
ckpt/image_encoder/config.json DELETED
@@ -1,23 +0,0 @@
- {
- "_name_or_path": "./image_encoder",
- "architectures": [
- "CLIPVisionModelWithProjection"
- ],
- "attention_dropout": 0.0,
- "dropout": 0.0,
- "hidden_act": "gelu",
- "hidden_size": 1280,
- "image_size": 224,
- "initializer_factor": 1.0,
- "initializer_range": 0.02,
- "intermediate_size": 5120,
- "layer_norm_eps": 1e-05,
- "model_type": "clip_vision_model",
- "num_attention_heads": 16,
- "num_channels": 3,
- "num_hidden_layers": 32,
- "patch_size": 14,
- "projection_dim": 1024,
- "torch_dtype": "float16",
- "transformers_version": "4.28.0.dev0"
- }
 
ckpt/image_encoder/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6ca9667da1ca9e0b0f75e46bb030f7e011f44f86cbfb8d5a36590fcd7507b030
- size 2528373448
 
ckpt/image_encoder/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:3d3ec1e66737f77a4f3bc2df3c52eacefc69ce7825e2784183b1d4e9877d9193
- size 2528481905
 
ckpt/ip-adapter-faceid-plus_sd15.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:252fb53e0d018489d9e7f9b9e2001a52ff700e491894011ada7cfb471e0fadf2
- size 156558503
 
ckpt/scheduler/scheduler_config.json DELETED
@@ -1,21 +0,0 @@
- {
- "_class_name": "DEISMultistepScheduler",
- "_diffusers_version": "0.16.1",
- "algorithm_type": "deis",
- "beta_end": 0.012,
- "beta_schedule": "scaled_linear",
- "beta_start": 0.00085,
- "clip_sample": false,
- "clip_sample_range": 1.0,
- "dynamic_thresholding_ratio": 0.995,
- "lower_order_final": true,
- "num_train_timesteps": 1000,
- "prediction_type": "epsilon",
- "sample_max_value": 1.0,
- "set_alpha_to_one": false,
- "solver_order": 2,
- "solver_type": "logrho",
- "steps_offset": 1,
- "thresholding": false,
- "trained_betas": null
- }
 
ckpt/sd-vae-ft-mse/.gitattributes DELETED
@@ -1,33 +0,0 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- diffusion_pytorch_model.safetensors filter=lfs diff=lfs merge=lfs -text
 
ckpt/sd-vae-ft-mse/README.md DELETED
@@ -1,83 +0,0 @@
- ---
- license: mit
- tags:
- - stable-diffusion
- - stable-diffusion-diffusers
- inference: false
- ---
- # Improved Autoencoders
-
- ## Utilizing
- These weights are intended to be used with the [🧨 diffusers library](https://github.com/huggingface/diffusers). If you are looking for the model to use with the original [CompVis Stable Diffusion codebase](https://github.com/CompVis/stable-diffusion), [come here](https://huggingface.co/stabilityai/sd-vae-ft-mse-original).
-
- #### How to use with 🧨 diffusers
- You can integrate this fine-tuned VAE decoder to your existing `diffusers` workflows, by including a `vae` argument to the `StableDiffusionPipeline`
- ```py
- from diffusers.models import AutoencoderKL
- from diffusers import StableDiffusionPipeline
-
- model = "CompVis/stable-diffusion-v1-4"
- vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse")
- pipe = StableDiffusionPipeline.from_pretrained(model, vae=vae)
- ```
-
- ## Decoder Finetuning
- We publish two kl-f8 autoencoder versions, finetuned from the original [kl-f8 autoencoder](https://github.com/CompVis/latent-diffusion#pretrained-autoencoding-models) on a 1:1 ratio of [LAION-Aesthetics](https://laion.ai/blog/laion-aesthetics/) and LAION-Humans, an unreleased subset containing only SFW images of humans. The intent was to fine-tune on the Stable Diffusion training set (the autoencoder was originally trained on OpenImages) but also enrich the dataset with images of humans to improve the reconstruction of faces.
- The first, _ft-EMA_, was resumed from the original checkpoint, trained for 313198 steps and uses EMA weights. It uses the same loss configuration as the original checkpoint (L1 + LPIPS).
- The second, _ft-MSE_, was resumed from _ft-EMA_ and uses EMA weights and was trained for another 280k steps using a different loss, with more emphasis
- on MSE reconstruction (MSE + 0.1 * LPIPS). It produces somewhat ``smoother'' outputs. The batch size for both versions was 192 (16 A100s, batch size 12 per GPU).
- To keep compatibility with existing models, only the decoder part was finetuned; the checkpoints can be used as a drop-in replacement for the existing autoencoder.
-
- _Original kl-f8 VAE vs f8-ft-EMA vs f8-ft-MSE_
-
- ## Evaluation
- ### COCO 2017 (256x256, val, 5000 images)
- | Model | train steps | rFID | PSNR | SSIM | PSIM | Link | Comments
- |----------|---------|------|--------------|---------------|---------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|
- | | | | | | | | |
- | original | 246803 | 4.99 | 23.4 +/- 3.8 | 0.69 +/- 0.14 | 1.01 +/- 0.28 | https://ommer-lab.com/files/latent-diffusion/kl-f8.zip | as used in SD |
- | ft-EMA | 560001 | 4.42 | 23.8 +/- 3.9 | 0.69 +/- 0.13 | 0.96 +/- 0.27 | https://huggingface.co/stabilityai/sd-vae-ft-ema-original/resolve/main/vae-ft-ema-560000-ema-pruned.ckpt | slightly better overall, with EMA |
- | ft-MSE | 840001 | 4.70 | 24.5 +/- 3.7 | 0.71 +/- 0.13 | 0.92 +/- 0.27 | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.ckpt | resumed with EMA from ft-EMA, emphasis on MSE (rec. loss = MSE + 0.1 * LPIPS), smoother outputs |
-
-
- ### LAION-Aesthetics 5+ (256x256, subset, 10000 images)
- | Model | train steps | rFID | PSNR | SSIM | PSIM | Link | Comments
- |----------|-----------|------|--------------|---------------|---------------|-----------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------|
- | | | | | | | | |
- | original | 246803 | 2.61 | 26.0 +/- 4.4 | 0.81 +/- 0.12 | 0.75 +/- 0.36 | https://ommer-lab.com/files/latent-diffusion/kl-f8.zip | as used in SD |
- | ft-EMA | 560001 | 1.77 | 26.7 +/- 4.8 | 0.82 +/- 0.12 | 0.67 +/- 0.34 | https://huggingface.co/stabilityai/sd-vae-ft-ema-original/resolve/main/vae-ft-ema-560000-ema-pruned.ckpt | slightly better overall, with EMA |
- | ft-MSE | 840001 | 1.88 | 27.3 +/- 4.7 | 0.83 +/- 0.11 | 0.65 +/- 0.34 | https://huggingface.co/stabilityai/sd-vae-ft-mse-original/resolve/main/vae-ft-mse-840000-ema-pruned.ckpt | resumed with EMA from ft-EMA, emphasis on MSE (rec. loss = MSE + 0.1 * LPIPS), smoother outputs |
-
-
- ### Visual
- _Visualization of reconstructions on 256x256 images from the COCO2017 validation dataset._
-
- <p align="center">
- <br>
- <b>
- 256x256: ft-EMA (left), ft-MSE (middle), original (right)</b>
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00025_merged.png />
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00011_merged.png />
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00037_merged.png />
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00043_merged.png />
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00053_merged.png />
- </p>
-
- <p align="center">
- <img src=https://huggingface.co/stabilityai/stable-diffusion-decoder-finetune/resolve/main/eval/ae-decoder-tuning-reconstructions/merged/00029_merged.png />
- </p>
 
ckpt/sd-vae-ft-mse/config.json DELETED
@@ -1,29 +0,0 @@
- {
- "_class_name": "AutoencoderKL",
- "_diffusers_version": "0.4.2",
- "act_fn": "silu",
- "block_out_channels": [
- 128,
- 256,
- 512,
- 512
- ],
- "down_block_types": [
- "DownEncoderBlock2D",
- "DownEncoderBlock2D",
- "DownEncoderBlock2D",
- "DownEncoderBlock2D"
- ],
- "in_channels": 3,
- "latent_channels": 4,
- "layers_per_block": 2,
- "norm_num_groups": 32,
- "out_channels": 3,
- "sample_size": 256,
- "up_block_types": [
- "UpDecoderBlock2D",
- "UpDecoderBlock2D",
- "UpDecoderBlock2D",
- "UpDecoderBlock2D"
- ]
- }
 
ckpt/sd-vae-ft-mse/diffusion_pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1b4889b6b1d4ce7ae320a02dedaeff1780ad77d415ea0d744b476155c6377ddc
- size 334707217
 
ckpt/sd-vae-ft-mse/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
- size 334643276
 
ckpt/text_encoder/config.json DELETED
@@ -1,25 +0,0 @@
- {
- "_name_or_path": "openai/clip-vit-large-patch14",
- "architectures": [
- "CLIPTextModel"
- ],
- "attention_dropout": 0.0,
- "bos_token_id": 0,
- "dropout": 0.0,
- "eos_token_id": 2,
- "hidden_act": "quick_gelu",
- "hidden_size": 768,
- "initializer_factor": 1.0,
- "initializer_range": 0.02,
- "intermediate_size": 3072,
- "layer_norm_eps": 1e-05,
- "max_position_embeddings": 77,
- "model_type": "clip_text_model",
- "num_attention_heads": 12,
- "num_hidden_layers": 12,
- "pad_token_id": 1,
- "projection_dim": 768,
- "torch_dtype": "float32",
- "transformers_version": "4.30.2",
- "vocab_size": 49408
- }
 
ckpt/text_encoder/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8ee67788c3c9a6f5d73de077b644f1d4317258b55fbcc372dc385e8e5587c1cc
- size 492265880
 
ckpt/tokenizer/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
ckpt/tokenizer/special_tokens_map.json DELETED
@@ -1,24 +0,0 @@
- {
- "bos_token": {
- "content": "<|startoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "eos_token": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "pad_token": "<|endoftext|>",
- "unk_token": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
- }
 
ckpt/tokenizer/tokenizer_config.json DELETED
@@ -1,33 +0,0 @@
- {
- "add_prefix_space": false,
- "bos_token": {
- "__type": "AddedToken",
- "content": "<|startoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "clean_up_tokenization_spaces": true,
- "do_lower_case": true,
- "eos_token": {
- "__type": "AddedToken",
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- },
- "errors": "replace",
- "model_max_length": 77,
- "pad_token": "<|endoftext|>",
- "tokenizer_class": "CLIPTokenizer",
- "unk_token": {
- "__type": "AddedToken",
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false
- }
- }
 
ckpt/tokenizer/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
ckpt/unet/config.json DELETED
@@ -1,60 +0,0 @@
- {
- "_class_name": "UNet2DConditionModel",
- "_diffusers_version": "0.16.1",
- "act_fn": "silu",
- "addition_embed_type": null,
- "addition_embed_type_num_heads": 64,
- "attention_head_dim": 8,
- "block_out_channels": [
- 320,
- 640,
- 1280,
- 1280
- ],
- "center_input_sample": false,
- "class_embed_type": null,
- "class_embeddings_concat": false,
- "conv_in_kernel": 3,
- "conv_out_kernel": 3,
- "cross_attention_dim": 768,
- "cross_attention_norm": null,
- "down_block_types": [
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "CrossAttnDownBlock2D",
- "DownBlock2D"
- ],
- "downsample_padding": 1,
- "dual_cross_attention": false,
- "encoder_hid_dim": null,
- "flip_sin_to_cos": true,
- "freq_shift": 0,
- "in_channels": 4,
- "layers_per_block": 2,
- "mid_block_only_cross_attention": null,
- "mid_block_scale_factor": 1,
- "mid_block_type": "UNetMidBlock2DCrossAttn",
- "norm_eps": 1e-05,
- "norm_num_groups": 32,
- "num_class_embeds": null,
- "only_cross_attention": false,
- "out_channels": 4,
- "projection_class_embeddings_input_dim": null,
- "resnet_out_scale_factor": 1.0,
- "resnet_skip_time_act": false,
- "resnet_time_scale_shift": "default",
- "sample_size": 64,
- "time_cond_proj_dim": null,
- "time_embedding_act_fn": null,
- "time_embedding_dim": null,
- "time_embedding_type": "positional",
- "timestep_post_act": null,
- "up_block_types": [
- "UpBlock2D",
- "CrossAttnUpBlock2D",
- "CrossAttnUpBlock2D",
- "CrossAttnUpBlock2D"
- ],
- "upcast_attention": false,
- "use_linear_projection": false
- }
 
ckpt/unet/diffusion_pytorch_model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f75956623c8f95b40e62b3ee45f5dcba8e353b53c33c6765e517c5a8bb3dfbfe
- size 3438167536