Commit b2fc543

Duplicate from cagliostrolab/animagine-xl-3.1

Co-authored-by: Asahina <[email protected]>
- .gitattributes +35 -0
- README.md +406 -0
- animagine-xl-3.1.safetensors +3 -0
- model_index.json +41 -0
- scheduler/scheduler_config.json +17 -0
- text_encoder/config.json +24 -0
- text_encoder/model.safetensors +3 -0
- text_encoder_2/config.json +24 -0
- text_encoder_2/model.safetensors +3 -0
- tokenizer/merges.txt +0 -0
- tokenizer/special_tokens_map.json +30 -0
- tokenizer/tokenizer_config.json +30 -0
- tokenizer/vocab.json +0 -0
- tokenizer_2/merges.txt +0 -0
- tokenizer_2/special_tokens_map.json +24 -0
- tokenizer_2/tokenizer_config.json +38 -0
- tokenizer_2/vocab.json +0 -0
- unet/config.json +72 -0
- unet/diffusion_pytorch_model.safetensors +3 -0
- vae/config.json +32 -0
- vae/diffusion_pytorch_model.safetensors +3 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,406 @@
---
license: other
license_name: faipl-1.0-sd
license_link: https://freedevproject.org/faipl-1.0-sd/
language:
- en
tags:
- text-to-image
- stable-diffusion
- safetensors
- stable-diffusion-xl
base_model: cagliostrolab/animagine-xl-3.0
widget:
- text: 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck, masterpiece, best quality, very aesthetic, absurdres
  parameters:
    negative_prompt: nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]
  example_title: 1girl
- text: 1boy, male focus, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck, masterpiece, best quality, very aesthetic, absurdres
  parameters:
    negative_prompt: nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]
  example_title: 1boy
---

<style>
  .title-container {
    display: flex;
    justify-content: center;
    align-items: center;
    height: 100vh; /* Adjust this value to position the title vertically */
  }

  .title {
    font-size: 2.5em;
    text-align: center;
    color: #333;
    font-family: 'Helvetica Neue', sans-serif;
    text-transform: uppercase;
    letter-spacing: 0.1em;
    padding: 0.5em 0;
    background: transparent;
  }

  .title span {
    background: -webkit-linear-gradient(45deg, #7ed56f, #28b485);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
  }

  .custom-table {
    table-layout: fixed;
    width: 100%;
    border-collapse: collapse;
    margin-top: 2em;
  }

  .custom-table td {
    width: 50%;
    vertical-align: top;
    padding: 10px;
    box-shadow: 0px 0px 0px 0px rgba(0, 0, 0, 0.15);
  }

  .custom-image-container {
    position: relative;
    width: 100%;
    margin-bottom: 0em;
    overflow: hidden;
    border-radius: 10px;
    transition: transform .7s; /* Smooth transition for the container */
  }

  .custom-image-container:hover {
    transform: scale(1.05); /* Scale the container on hover */
  }

  .custom-image {
    width: 100%;
    height: auto;
    object-fit: cover;
    border-radius: 10px;
    transition: transform .7s;
    margin-bottom: 0em;
  }

  .nsfw-filter {
    filter: blur(8px); /* Apply a blur effect */
    transition: filter 0.3s ease; /* Smooth transition for the blur effect */
  }

  .custom-image-container:hover .nsfw-filter {
    filter: none; /* Remove the blur effect on hover */
  }

  .overlay {
    position: absolute;
    bottom: 0;
    left: 0;
    right: 0;
    color: white;
    width: 100%;
    height: 40%;
    display: flex;
    flex-direction: column;
    justify-content: center;
    align-items: center;
    font-size: 1vw;
    font-weight: bold;
    text-align: center;
    opacity: 0; /* Keep the text fully opaque */
    background: linear-gradient(0deg, rgba(0, 0, 0, 0.8) 60%, rgba(0, 0, 0, 0) 100%);
    transition: opacity .5s;
  }

  .custom-image-container:hover .overlay {
    opacity: 1;
  }

  .overlay-text {
    background: linear-gradient(45deg, #7ed56f, #28b485);
    -webkit-background-clip: text;
    color: transparent;
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.7);
  }

  .overlay-subtext {
    font-size: 0.75em;
    margin-top: 0.5em;
    font-style: italic;
  }

  .overlay,
  .overlay-subtext {
    text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.5);
  }
</style>

<h1 class="title">
  <span>Animagine XL 3.1</span>
</h1>

<table class="custom-table">
  <tr>
    <td>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/yq_5AWegnLsGyCYyqJ-1G.png" alt="sample1">
      </div>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/sp6w1elvXVTbckkU74v3o.png" alt="sample4">
      </div>
    </td>
    <td>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/OYBuX1XzffN7Pxi4c75JV.png" alt="sample2">
      </div>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/ytT3Oaf-atbqrnPIqz_dq.png" alt="sample3">
      </div>
    </td>
    <td>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/0oRq204okFxRGECmrIK6d.png" alt="sample1">
      </div>
      <div class="custom-image-container">
        <img class="custom-image" src="https://cdn-uploads.huggingface.co/production/uploads/6365c8dbf31ef76df4042821/DW51m0HlDuAlXwu8H8bIS.png" alt="sample4">
      </div>
    </td>
  </tr>
</table>

**Animagine XL 3.1** is an update in the Animagine XL V3 series, enhancing the previous version, Animagine XL 3.0. This open-source, anime-themed text-to-image model has been improved for generating anime-style images with higher quality. It includes a broader range of characters from well-known anime series, an optimized dataset, and new aesthetic tags for better image creation. Built on Stable Diffusion XL, Animagine XL 3.1 aims to be a valuable resource for anime fans, artists, and content creators by producing accurate and detailed representations of anime characters.

## Model Details
- **Developed by**: [Cagliostro Research Lab](https://huggingface.co/cagliostrolab)
- **In collaboration with**: [SeaArt.ai](https://www.seaart.ai/)
- **Model type**: Diffusion-based text-to-image generative model
- **Model Description**: Animagine XL 3.1 generates high-quality anime images from textual prompts. It boasts enhanced hand anatomy, improved concept understanding, and advanced prompt interpretation.
- **License**: [Fair AI Public License 1.0-SD](https://freedevproject.org/faipl-1.0-sd/)
- **Fine-tuned from**: [Animagine XL 3.0](https://huggingface.co/cagliostrolab/animagine-xl-3.0)

## Gradio & Colab Integration

Try the demo powered by Gradio in Hugging Face Spaces: [![Open In Spaces](https://img.shields.io/badge/🤗-Open%20In%20Spaces-blue.svg)](https://huggingface.co/spaces/cagliostrolab/animagine-xl-3.1)

Or open the demo in Google Colab: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/#fileId=https%3A//huggingface.co/spaces/cagliostrolab/animagine-xl-3.1/blob/main/demo.ipynb)

## 🧨 Diffusers Installation

First install the required libraries:

```bash
pip install diffusers transformers accelerate safetensors --upgrade
```

Then run image generation with the following example code:

```python
import os

import torch
from diffusers import DiffusionPipeline

# Load the pipeline in half precision from the safetensors weights.
pipe = DiffusionPipeline.from_pretrained(
    "cagliostrolab/animagine-xl-3.1",
    torch_dtype=torch.float16,
    use_safetensors=True,
)
pipe.to('cuda')

prompt = "1girl, souryuu asuka langley, neon genesis evangelion, solo, upper body, v, smile, looking at viewer, outdoors, night"
negative_prompt = "nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]"

image = pipe(
    prompt,
    negative_prompt=negative_prompt,
    width=832,
    height=1216,
    guidance_scale=7,
    num_inference_steps=28
).images[0]

# Ensure the output directory exists before saving.
os.makedirs("output", exist_ok=True)
image.save("./output/asuka_test.png")
```

## Usage Guidelines

### Tag Ordering

For optimal results, follow the structured prompt template below, since the model was trained on prompts ordered this way:

```
1girl/1boy, character name, from what series, everything else in any order.
```
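
For example, the sample prompt used in the Diffusers snippet above follows this ordering:

```
1girl, souryuu asuka langley, neon genesis evangelion, solo, upper body, v, smile, looking at viewer, outdoors, night
```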

## Special Tags

Animagine XL 3.1 uses special tags to steer the result toward quality, rating, creation date, and aesthetics. While the model can generate images without these tags, using them helps achieve better results.

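As an illustration (a constructed example combining the tag categories documented in the subsections below, not a prompt taken from the card itself), a fully tagged prompt might look like:

```
1girl, souryuu asuka langley, neon genesis evangelion, solo, upper body, smile, safe, newest, masterpiece, best quality, very aesthetic
```
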
### Quality Modifiers

Quality tags now consider both scores and post ratings to ensure a balanced quality distribution. We've refined labels for greater clarity, such as changing 'high quality' to 'great quality'.

| Quality Modifier | Score Criterion |
|------------------|-----------------|
| `masterpiece`    | > 95%           |
| `best quality`   | > 85% & ≤ 95%   |
| `great quality`  | > 75% & ≤ 85%   |
| `good quality`   | > 50% & ≤ 75%   |
| `normal quality` | > 25% & ≤ 50%   |
| `low quality`    | > 10% & ≤ 25%   |
| `worst quality`  | ≤ 10%           |

### Rating Modifiers

We've also streamlined our rating tags for simplicity and clarity, aiming to establish global rules that can be applied across different models. For example, the tag 'rating: general' is now simply 'general', and 'rating: sensitive' has been condensed to 'sensitive'.

| Rating Modifier  | Rating Criterion |
|------------------|------------------|
| `safe`           | General          |
| `sensitive`      | Sensitive        |
| `nsfw`           | Questionable     |
| `explicit, nsfw` | Explicit         |

### Year Modifier

We've also redefined the year range to steer results towards specific modern or vintage anime art styles more accurately. This update simplifies the range, focusing on relevance to current and past eras.

| Year Tag | Year Range   |
|----------|--------------|
| `newest` | 2021 to 2024 |
| `recent` | 2018 to 2020 |
| `mid`    | 2015 to 2017 |
| `early`  | 2011 to 2014 |
| `oldest` | 2005 to 2010 |

### Aesthetic Tags

We've enhanced our tagging system with aesthetic tags to refine content categorization based on visual appeal. These tags are derived from evaluations made by a specialized ViT (Vision Transformer) image classification model, specifically trained on anime data. For this purpose, we utilized the model [shadowlilac/aesthetic-shadow-v2](https://huggingface.co/shadowlilac/aesthetic-shadow-v2), which assesses the aesthetic value of content before it undergoes training. This ensures that each piece of content is not only relevant and accurate but also visually appealing.

| Aesthetic Tag      | Score Range     |
|--------------------|-----------------|
| `very aesthetic`   | > 0.71          |
| `aesthetic`        | > 0.45 & < 0.71 |
| `displeasing`      | > 0.27 & < 0.45 |
| `very displeasing` | ≤ 0.27          |

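To make the thresholds above concrete, here is a minimal sketch (not from the card) of scoring an image with the same classifier the lab credits. The exact output labels of `aesthetic-shadow-v2`, the assumption that the top prediction's score maps directly onto these ranges, and the file name `sample.png` are all illustrative, not confirmed by the card:

```python
# Illustrative sketch only: label names and score-to-tag mapping are assumptions.
from transformers import pipeline

scorer = pipeline("image-classification", model="shadowlilac/aesthetic-shadow-v2")
predictions = scorer("sample.png")  # list of {"label": ..., "score": ...} dicts

score = predictions[0]["score"]  # assume the top prediction's score is the aesthetic value
if score > 0.71:
    tag = "very aesthetic"
elif score > 0.45:
    tag = "aesthetic"
elif score > 0.27:
    tag = "displeasing"
else:
    tag = "very displeasing"
print(tag)
```
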
## Recommended settings

To guide the model towards generating high-aesthetic images, use negative prompts like:

```
nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]
```

For higher quality outcomes, prepend prompts with:

```
masterpiece, best quality, very aesthetic, absurdres
```

It's recommended to use a lower classifier-free guidance (CFG) scale of around 5-7, fewer than 30 sampling steps, and Euler Ancestral (Euler a) as the sampler.

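As a rough sketch (not part of the original card) of how these recommendations map onto the Diffusers example above, one might configure the sampler and guidance like this; the specific values 6.0 and 28 are just one point inside the recommended ranges:

```python
import torch
from diffusers import DiffusionPipeline, EulerAncestralDiscreteScheduler

pipe = DiffusionPipeline.from_pretrained(
    "cagliostrolab/animagine-xl-3.1",
    torch_dtype=torch.float16,
    use_safetensors=True,
).to("cuda")

# Swap in Euler Ancestral ("Euler a"), the recommended sampler.
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

image = pipe(
    "masterpiece, best quality, very aesthetic, absurdres, 1girl, green hair, sweater, looking at viewer, upper body, beanie, outdoors, night, turtleneck",
    negative_prompt="nsfw, lowres, (bad), text, error, fewer, extra, missing, worst quality, jpeg artifacts, low quality, watermark, unfinished, displeasing, oldest, early, chromatic aberration, signature, extra digits, artistic error, username, scan, [abstract]",
    guidance_scale=6.0,      # recommended CFG range is roughly 5-7
    num_inference_steps=28,  # recommended to stay below 30 steps
).images[0]
image.save("recommended_settings_test.png")
```
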
### Multi Aspect Resolution

This model supports generating images at the following dimensions:

| Dimensions    | Aspect Ratio    |
|---------------|-----------------|
| `1024 x 1024` | 1:1 Square      |
| `1152 x 896`  | 9:7             |
| `896 x 1152`  | 7:9             |
| `1216 x 832`  | 19:13           |
| `832 x 1216`  | 13:19           |
| `1344 x 768`  | 7:4 Horizontal  |
| `768 x 1344`  | 4:7 Vertical    |
| `1536 x 640`  | 12:5 Horizontal |
| `640 x 1536`  | 5:12 Vertical   |

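For instance, a small helper (illustrative, not from the card) can snap a desired aspect ratio to the nearest supported bucket from the table above:

```python
# The officially supported (width, height) buckets from the table above.
SUPPORTED_RESOLUTIONS = [
    (1024, 1024), (1152, 896), (896, 1152), (1216, 832), (832, 1216),
    (1344, 768), (768, 1344), (1536, 640), (640, 1536),
]

def closest_resolution(target_ratio: float) -> tuple[int, int]:
    """Return the supported (width, height) whose w/h ratio is closest to target_ratio."""
    return min(SUPPORTED_RESOLUTIONS, key=lambda wh: abs(wh[0] / wh[1] - target_ratio))

print(closest_resolution(2 / 3))  # -> (832, 1216), the 13:19 portrait bucket
```
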
## Training and Hyperparameters

**Animagine XL 3.1** was trained on 2x A100 80GB GPUs for approximately 15 days, totaling over 350 GPU hours. The training process consisted of three stages:
- **Pretraining**: Utilized a data-rich collection of 870k ordered and tagged images to increase Animagine XL 3.0's model knowledge.
- **Finetuning - First Stage**: Employed labeled and curated aesthetic datasets to refine the broken U-Net after pretraining.
- **Finetuning - Second Stage**: Utilized labeled and curated aesthetic datasets to refine the model's art style and improve hand and anatomy rendering.

### Hyperparameters

| Stage                    | Epochs | UNet lr | Train Text Encoder | Batch Size | Noise Offset | Optimizer | LR Scheduler                  | Grad Acc Steps | GPUs |
|--------------------------|--------|---------|--------------------|------------|--------------|-----------|-------------------------------|----------------|------|
| **Pretraining**          | 10     | 1e-5    | True               | 16         | N/A          | AdamW     | Cosine Annealing Warm Restart | 3              | 2    |
| **Finetuning 1st Stage** | 10     | 2e-6    | False              | 48         | 0.0357       | Adafactor | Constant with Warmup          | 1              | 1    |
| **Finetuning 2nd Stage** | 15     | 1e-6    | False              | 48         | 0.0357       | Adafactor | Constant with Warmup          | 1              | 1    |

## Model Comparison (Pretraining only)

### Training Config

| Configuration Item             | Animagine XL 3.0                                                 | Animagine XL 3.1                                                      |
|--------------------------------|------------------------------------------------------------------|-----------------------------------------------------------------------|
| **GPU**                        | 2 x A100 80G                                                     | 2 x A100 80G                                                          |
| **Dataset**                    | 1,271,990 images                                                 | 873,504 images                                                        |
| **Shuffle Separator**          | True                                                             | True                                                                  |
| **Num Epochs**                 | 10                                                               | 10                                                                    |
| **Learning Rate**              | 7.5e-6                                                           | 1e-5                                                                  |
| **Text Encoder Learning Rate** | 3.75e-6                                                          | 1e-5                                                                  |
| **Effective Batch Size**       | 48 x 1 x 2                                                       | 16 x 3 x 2                                                            |
| **Optimizer**                  | Adafactor                                                        | AdamW                                                                 |
| **Optimizer Args**             | Scale Parameter: False, Relative Step: False, Warmup Init: False | Weight Decay: 0.1, Betas: (0.9, 0.99)                                 |
| **LR Scheduler**               | Constant with Warmup                                             | Cosine Annealing Warm Restart                                         |
| **LR Scheduler Args**          | Warmup Steps: 100                                                | Num Cycles: 10, Min LR: 1e-6, LR Decay: 0.9, First Cycle Steps: 9,099 |

Source code and training config are available here: https://github.com/cagliostrolab/sd-scripts/tree/main/notebook

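To illustrate the 3.1 LR scheduler args from the table above, here is a rough sketch of such a schedule. It assumes fixed-length cycles and no warmup, which is a simplification of the custom scheduler credited in the acknowledgements below, not the lab's actual implementation:

```python
import math

# Rough illustration of cosine annealing with warm restarts using the listed args:
# base LR 1e-5, min LR 1e-6, per-cycle LR decay 0.9, first cycle of 9,099 steps.
def lr_at(step: int, base_lr: float = 1e-5, min_lr: float = 1e-6,
          cycle_steps: int = 9_099, decay: float = 0.9) -> float:
    cycle, pos = divmod(step, cycle_steps)        # which restart cycle, position within it
    peak = max(base_lr * decay ** cycle, min_lr)  # the peak LR decays at each restart
    # Cosine-anneal from the cycle's peak down to min_lr over the cycle.
    return min_lr + 0.5 * (peak - min_lr) * (1 + math.cos(math.pi * pos / cycle_steps))

print(lr_at(0), lr_at(9_098), lr_at(9_099))  # cycle-0 start, cycle-0 end, cycle-1 start
```
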
### Acknowledgements

The development and release of Animagine XL 3.1 would not have been possible without the invaluable contributions and support from the following individuals and organizations:

- **[SeaArt.ai](https://www.seaart.ai/)**: Our collaboration partner and sponsor.
- **[Shadow Lilac](https://huggingface.co/shadowlilac)**: For providing the aesthetic classification model, [aesthetic-shadow-v2](https://huggingface.co/shadowlilac/aesthetic-shadow-v2).
- **[Derrian Distro](https://github.com/derrian-distro)**: For their custom learning rate scheduler, adapted from [LoRA Easy Training Scripts](https://github.com/derrian-distro/LoRA_Easy_Training_Scripts/blob/main/custom_scheduler/LoraEasyCustomOptimizer/CustomOptimizers.py).
- **[Kohya SS](https://github.com/kohya-ss)**: For their comprehensive training scripts.
- **Cagliostrolab Collaborators**: For their dedication to model training, project management, and data curation.
- **Early Testers**: For their valuable feedback and quality assurance efforts.
- **NovelAI**: For their innovative approach to aesthetic tagging, which served as an inspiration for our implementation.
- **KBlueLeaf**: For providing inspiration in balancing quality tag distribution and managing tags, based on [Hakubooru Metainfo](https://github.com/KohakuBlueleaf/HakuBooru/blob/main/hakubooru/metainfo.py).

Thank you all for your support and expertise in pushing the boundaries of anime-style image generation.

## Collaborators

- [Linaqruf](https://huggingface.co/Linaqruf)
- [ItsMeBell](https://huggingface.co/ItsMeBell)
- [Asahina2K](https://huggingface.co/Asahina2K)
- [DamarJati](https://huggingface.co/DamarJati)
- [Zwicky18](https://huggingface.co/Zwicky18)
- [Scipius2121](https://huggingface.co/Scipius2121)
- [Raelina](https://huggingface.co/Raelina)
- [Kayfahaarukku](https://huggingface.co/kayfahaarukku)
- [Kriz](https://huggingface.co/Kr1SsSzz)

## Limitations

While Animagine XL 3.1 represents a significant advancement in anime-style image generation, it is important to acknowledge its limitations:

1. **Anime-Focused**: This model is specifically designed for generating anime-style images and is not suitable for creating realistic photos.
2. **Prompt Complexity**: This model may not be suitable for users who expect high-quality results from short or simple prompts. The training focus was on concept understanding rather than aesthetic refinement, so more detailed and specific prompts may be required to achieve the desired output.
3. **Prompt Format**: Animagine XL 3.1 is optimized for Danbooru-style tags rather than natural language prompts. For best results, users are encouraged to format their prompts using the appropriate tags and syntax.
4. **Anatomy and Hand Rendering**: Despite the improvements made in anatomy and hand rendering, there may still be instances where the model produces suboptimal results in these areas.
5. **Dataset Size**: The dataset used for training Animagine XL 3.1 consists of approximately 870,000 images. When combined with the previous iteration's dataset (1.2 million), the total training data amounts to around 2.1 million images. While substantial, this dataset size may still be considered limited in scope for an "ultimate" anime model.
6. **NSFW Content**: Animagine XL 3.1 has been designed to generate more balanced NSFW content. However, it is important to note that the model may still produce NSFW results, even if not explicitly prompted.

By acknowledging these limitations, we aim to provide transparency and set realistic expectations for users of Animagine XL 3.1. Despite these constraints, we believe that the model represents a significant step forward in anime-style image generation and offers a powerful tool for artists, designers, and enthusiasts alike.

## License

Based on Animagine XL 3.0, Animagine XL 3.1 falls under the [Fair AI Public License 1.0-SD](https://freedevproject.org/faipl-1.0-sd/), which is compatible with the Stable Diffusion models' license. Key points:

1. **Modification Sharing:** If you modify Animagine XL 3.1, you must share both your changes and the original license.
2. **Source Code Accessibility:** If your modified version is network-accessible, provide a way (like a download link) for others to get the source code. This applies to derived models too.
3. **Distribution Terms:** Any distribution must be under this license or another with similar rules.
4. **Compliance:** Non-compliance must be fixed within 30 days to avoid license termination, emphasizing transparency and adherence to open-source values.

The choice of this license aims to keep Animagine XL 3.1 open and modifiable, aligning with the spirit of the open-source community. It protects contributors and users, encouraging a collaborative, ethical open-source community. This ensures the model not only benefits from communal input but also respects open-source development freedoms.

## Cagliostro Lab Discord Server

The Cagliostro Lab Discord server is finally open to the public:
https://discord.gg/cqh9tZgbGc

Feel free to join our Discord server!
animagine-xl-3.1.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3c47aedb06418c6c331443cd89f2b3b3b34b7ed2102a3d4c4408a8d35aad6b0
size 6938325776
model_index.json
ADDED
@@ -0,0 +1,41 @@
{
  "_class_name": "StableDiffusionXLPipeline",
  "_diffusers_version": "0.26.3",
  "feature_extractor": [
    null,
    null
  ],
  "force_zeros_for_empty_prompt": true,
  "image_encoder": [
    null,
    null
  ],
  "scheduler": [
    "diffusers",
    "EulerAncestralDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "CLIPTextModel"
  ],
  "text_encoder_2": [
    "transformers",
    "CLIPTextModelWithProjection"
  ],
  "tokenizer": [
    "transformers",
    "CLIPTokenizer"
  ],
  "tokenizer_2": [
    "transformers",
    "CLIPTokenizer"
  ],
  "unet": [
    "diffusers",
    "UNet2DConditionModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKL"
  ]
}
scheduler/scheduler_config.json
ADDED
@@ -0,0 +1,17 @@
{
  "_class_name": "EulerAncestralDiscreteScheduler",
  "_diffusers_version": "0.26.3",
  "beta_end": 0.012,
  "beta_schedule": "scaled_linear",
  "beta_start": 0.00085,
  "interpolation_type": "linear",
  "num_train_timesteps": 1000,
  "prediction_type": "epsilon",
  "rescale_betas_zero_snr": false,
  "sample_max_value": 1.0,
  "set_alpha_to_one": false,
  "skip_prk_steps": true,
  "steps_offset": 1,
  "timestep_spacing": "leading",
  "trained_betas": null
}
text_encoder/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "CLIPTextModel"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "projection_dim": 768,
  "torch_dtype": "float16",
  "transformers_version": "4.38.1",
  "vocab_size": 49408
}
text_encoder/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d869e7dd2f03c80673e595f128046ae0063451ddf041821b537c44cedd712f7
size 246144152
text_encoder_2/config.json
ADDED
@@ -0,0 +1,24 @@
{
  "architectures": [
    "CLIPTextModelWithProjection"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "dropout": 0.0,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_size": 1280,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 5120,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 77,
  "model_type": "clip_text_model",
  "num_attention_heads": 20,
  "num_hidden_layers": 32,
  "pad_token_id": 1,
  "projection_dim": 1280,
  "torch_dtype": "float16",
  "transformers_version": "4.38.1",
  "vocab_size": 49408
}
text_encoder_2/model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2f6541a64af35bb59231e19c8efa840bdb076554ee17d4b418c4197ff07fc87
size 1389382176
tokenizer/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer/special_tokens_map.json
ADDED
@@ -0,0 +1,30 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer/tokenizer_config.json
ADDED
@@ -0,0 +1,30 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "<|endoftext|>",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer/vocab.json
ADDED
The diff for this file is too large to render.
tokenizer_2/merges.txt
ADDED
The diff for this file is too large to render.
tokenizer_2/special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<|startoftext|>",
    "lstrip": false,
    "normalized": true,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "!",
  "unk_token": {
    "content": "<|endoftext|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
tokenizer_2/tokenizer_config.json
ADDED
@@ -0,0 +1,38 @@
{
  "add_prefix_space": false,
  "added_tokens_decoder": {
    "0": {
      "content": "!",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49406": {
      "content": "<|startoftext|>",
      "lstrip": false,
      "normalized": true,
      "rstrip": false,
      "single_word": false,
      "special": true
    },
    "49407": {
      "content": "<|endoftext|>",
      "lstrip": false,
      "normalized": false,
      "rstrip": false,
      "single_word": false,
      "special": true
    }
  },
  "bos_token": "<|startoftext|>",
  "clean_up_tokenization_spaces": true,
  "do_lower_case": true,
  "eos_token": "<|endoftext|>",
  "errors": "replace",
  "model_max_length": 77,
  "pad_token": "!",
  "tokenizer_class": "CLIPTokenizer",
  "unk_token": "<|endoftext|>"
}
tokenizer_2/vocab.json
ADDED
The diff for this file is too large to render.
unet/config.json
ADDED
@@ -0,0 +1,72 @@
{
  "_class_name": "UNet2DConditionModel",
  "_diffusers_version": "0.26.3",
  "act_fn": "silu",
  "addition_embed_type": "text_time",
  "addition_embed_type_num_heads": 64,
  "addition_time_embed_dim": 256,
  "attention_head_dim": [
    5,
    10,
    20
  ],
  "attention_type": "default",
  "block_out_channels": [
    320,
    640,
    1280
  ],
  "center_input_sample": false,
  "class_embed_type": null,
  "class_embeddings_concat": false,
  "conv_in_kernel": 3,
  "conv_out_kernel": 3,
  "cross_attention_dim": 2048,
  "cross_attention_norm": null,
  "down_block_types": [
    "DownBlock2D",
    "CrossAttnDownBlock2D",
    "CrossAttnDownBlock2D"
  ],
  "downsample_padding": 1,
  "dropout": 0.0,
  "dual_cross_attention": false,
  "encoder_hid_dim": null,
  "encoder_hid_dim_type": null,
  "flip_sin_to_cos": true,
  "freq_shift": 0,
  "in_channels": 4,
  "layers_per_block": 2,
  "mid_block_only_cross_attention": null,
  "mid_block_scale_factor": 1,
  "mid_block_type": "UNetMidBlock2DCrossAttn",
  "norm_eps": 1e-05,
  "norm_num_groups": 32,
  "num_attention_heads": null,
  "num_class_embeds": null,
  "only_cross_attention": false,
  "out_channels": 4,
  "projection_class_embeddings_input_dim": 2816,
  "resnet_out_scale_factor": 1.0,
  "resnet_skip_time_act": false,
  "resnet_time_scale_shift": "default",
  "reverse_transformer_layers_per_block": null,
  "sample_size": 128,
  "time_cond_proj_dim": null,
  "time_embedding_act_fn": null,
  "time_embedding_dim": null,
  "time_embedding_type": "positional",
  "timestep_post_act": null,
  "transformer_layers_per_block": [
    1,
    2,
    10
  ],
  "up_block_types": [
    "CrossAttnUpBlock2D",
    "CrossAttnUpBlock2D",
    "UpBlock2D"
  ],
  "upcast_attention": false,
  "use_linear_projection": true
}
unet/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c1e43f5fa892e1c54c99fc7caebf9c3426910ea5a730861ff89dead23b9f260e
size 5135149760
vae/config.json
ADDED
@@ -0,0 +1,32 @@
{
  "_class_name": "AutoencoderKL",
  "_diffusers_version": "0.26.3",
  "_name_or_path": "madebyollin/sdxl-vae-fp16-fix",
  "act_fn": "silu",
  "block_out_channels": [
    128,
    256,
    512,
    512
  ],
  "down_block_types": [
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D",
    "DownEncoderBlock2D"
  ],
  "force_upcast": false,
  "in_channels": 3,
  "latent_channels": 4,
  "layers_per_block": 2,
  "norm_num_groups": 32,
  "out_channels": 3,
  "sample_size": 512,
  "scaling_factor": 0.13025,
  "up_block_types": [
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D",
    "UpDecoderBlock2D"
  ]
}
vae/diffusion_pytorch_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6353737672c94b96174cb590f711eac6edf2fcce5b6e91aa9d73c5adc589ee48
size 167335342