lisonallen committed
Commit 50f328c · 1 Parent(s): 4292ab9

Add Hugging Face Space deployment configuration files and dependencies

Files changed (10):
  1. .gitattributes +5 -0
  2. .gitignore +27 -23
  3. Dockerfile +42 -0
  4. README-HF.md +36 -0
  5. README.md +24 -462
  6. app.py +387 -0
  7. diffusers_helper/__init__.py +1 -0
  8. diffusers_helper/hf_login.py +21 -17
  9. requirements.txt +4 -1
  10. setup.sh +7 -0
.gitattributes ADDED
@@ -0,0 +1,5 @@
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.mp4 filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,16 +1,8 @@
- hf_download/
- outputs/
- repo/
-
- # Byte-compiled / optimized / DLL files
+ # Python
  __pycache__/
  *.py[cod]
  *$py.class
-
- # C extensions
  *.so
-
- # Distribution / packaging
  .Python
  build/
  develop-eggs/
@@ -24,15 +16,36 @@ parts/
  sdist/
  var/
  wheels/
- share/python-wheels/
  *.egg-info/
  .installed.cfg
  *.egg
- MANIFEST

- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ # Project specific
+ outputs/
+ hf_download/
+ *.mp4
+ *.safetensors
+ *.bin
+ *.pt
+ *.pth
+
+ # Environment
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+ .DS_Store
+
+ # IDE settings
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+
+ # Byte-compiled / optimized / DLL files
  *.manifest
  *.spec

@@ -131,15 +144,6 @@ celerybeat.pid
  # SageMath parsed files
  *.sage.py

- # Environments
- .env
- .venv
- env/
- venv/
- ENV/
- env.bak/
- venv.bak/
-
  # Spyder project settings
  .spyderproject
  .spyproject
Dockerfile ADDED
@@ -0,0 +1,42 @@
+ FROM nvidia/cuda:12.1.1-cudnn8-runtime-ubuntu22.04
+
+ # Non-interactive installation to avoid prompts
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV TZ=Asia/Shanghai
+
+ # Install basic tools and Python
+ RUN apt-get update && apt-get install -y \
+     git \
+     python3 \
+     python3-pip \
+     ffmpeg \
+     libgl1-mesa-glx \
+     libglib2.0-0 \
+     && apt-get clean \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set the working directory
+ WORKDIR /app
+
+ # Copy the required files
+ COPY requirements.txt ./
+ COPY app.py ./
+ COPY setup.sh ./
+ COPY README.md ./
+ COPY diffusers_helper ./diffusers_helper
+
+ # Install Python dependencies
+ RUN pip3 install --no-cache-dir -r requirements.txt
+
+ # Create the required directories
+ RUN mkdir -p /app/outputs
+ RUN mkdir -p /app/hf_download
+
+ # Set permissions
+ RUN chmod +x setup.sh
+
+ # Set environment variables
+ ENV HF_HOME=/app/hf_download
+
+ # Run the application
+ CMD ["python3", "app.py"]
README-HF.md ADDED
@@ -0,0 +1,36 @@
+ # FramePack - Image-to-Video Generation
+
+ ![FramePack cover image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/gradio-spaces/gradio-banner.png)
+
+ An AI application that turns a static image into a moving video. Upload a picture of a person, add a motion description, and generate a smooth video!
+
+ ## How to Use
+
+ 1. Upload an image of a person
+ 2. Enter a prompt describing the desired motion (e.g. "The girl dances gracefully")
+ 3. Adjust the video length and other optional parameters
+ 4. Click the "开始生成" (Start Generation) button
+ 5. Wait for the video to be generated (the process is progressive and keeps extending the video length)
+
+ ## Example Prompts
+
+ - "The girl dances gracefully, with clear movements, full of charm."
+ - "The man dances energetically, leaping mid-air with fluid arm swings and quick footwork."
+ - "A character doing some simple body movements."
+
+ ## Technical Highlights
+
+ - Built on Hunyuan Video and the FramePack architecture
+ - Runs on low-VRAM GPUs
+ - Can generate videos up to 120 seconds long
+ - Uses TeaCache to speed up generation
+
+ ## Notes
+
+ - Video generation runs in reverse order, so the ending motion is generated before the starting motion
+ - The models (about 30GB) are downloaded on first use, so please be patient
+ - If you hit an out-of-memory error, increase the "GPU推理保留内存" (GPU inference preserved memory) value
+
+ ---
+
+ Original project: [FramePack GitHub](https://github.com/lllyasviel/FramePack)
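
The "progressive" behaviour described above comes from the section arithmetic in app.py (added later in this commit): the requested length is split into a number of latent sections that are sampled one by one. A small sketch of that arithmetic, assuming the app's defaults (30 fps, latent_window_size = 9):

def total_latent_sections(total_second_length: float, latent_window_size: int = 9) -> int:
    # Same formula as worker() in app.py: 30 fps, 4 frames per latent step, at least one section.
    return int(max(round((total_second_length * 30) / (latent_window_size * 4)), 1))

frames_per_section = 9 * 4 - 3  # num_frames in app.py for the default window size
for seconds in (1, 5, 60, 120):
    print(f"{seconds:>3}s -> {total_latent_sections(seconds)} section(s), {frames_per_section} frames sampled per section")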
README.md CHANGED
@@ -1,477 +1,39 @@
1
- <p align="center">
2
- <img src="https://github.com/user-attachments/assets/2cc030b4-87e1-40a0-b5bf-1b7d6b62820b" width="300">
3
- </p>
4
-
5
  # FramePack
6
 
7
- Official implementation and desktop software for ["Packing Input Frame Context in Next-Frame Prediction Models for Video Generation"](https://lllyasviel.github.io/frame_pack_gitpage/).
8
-
9
- Links: [**Paper**](https://lllyasviel.github.io/frame_pack_gitpage/pack.pdf), [**Project Page**](https://lllyasviel.github.io/frame_pack_gitpage/)
10
-
11
- FramePack is a next-frame (next-frame-section) prediction neural network structure that generates videos progressively.
12
-
13
- FramePack compresses input contexts to a constant length so that the generation workload is invariant to video length.
14
-
15
- FramePack can process a very large number of frames with 13B models even on laptop GPUs.
16
-
17
- FramePack can be trained with a much larger batch size, similar to the batch size for image diffusion training.
18
-
19
- **Video diffusion, but feels like image diffusion.**
20
-
21
- # Requirements
22
-
23
- Note that this repo is a functional desktop software with minimal standalone high-quality sampling system and memory management.
24
-
25
- **Start with this repo before you try anything else!**
26
-
27
- Requirements:
28
-
29
- * Nvidia GPU in RTX 30XX, 40XX, 50XX series that supports fp16 and bf16. The GTX 10XX/20XX are not tested.
30
- * Linux or Windows operating system.
31
- * At least 6GB GPU memory.
32
-
33
- To generate 1-minute video (60 seconds) at 30fps (1800 frames) using 13B model, the minimal required GPU memory is 6GB. (Yes 6 GB, not a typo. Laptop GPUs are okay.)
34
-
35
- About speed, on my RTX 4090 desktop it generates at a speed of 2.5 seconds/frame (unoptimized) or 1.5 seconds/frame (teacache). On my laptops like 3070ti laptop or 3060 laptop, it is about 4x to 8x slower.
36
-
37
- In any case, you will directly see the generated frames since it is next-frame(-section) prediction. So you will get lots of visual feedback before the entire video is generated.
38
-
39
- # Installation
40
-
41
- **Windows**:
42
-
43
- [>>> Click Here to Download One-Click Package (CUDA 12.6 + Pytorch 2.6) <<<](https://github.com/lllyasviel/FramePack/releases/download/windows/framepack_cu126_torch26.7z)
44
-
45
- After you download, you uncompress, use `update.bat` to update, and use `run.bat` to run.
46
-
47
- Note that running `update.bat` is important, otherwise you may be using a previous version with potential bugs unfixed.
48
-
49
- ![image](https://github.com/lllyasviel/stable-diffusion-webui-forge/assets/19834515/c49bd60d-82bd-4086-9859-88d472582b94)
50
-
51
- Note that the models will be downloaded automatically. You will download more than 30GB from HuggingFace.
52
-
53
- **Linux**:
54
-
55
- We recommend having an independent Python 3.10.
56
-
57
- pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
58
- pip install -r requirements.txt
59
-
60
- To start the GUI, run:
61
-
62
- python demo_gradio.py
63
-
64
- Note that it supports `--share`, `--port`, `--server`, and so on.
65
-
66
- The software supports PyTorch attention, xformers, flash-attn, sage-attention. By default, it will just use PyTorch attention. You can install those attention kernels if you know how.
67
-
68
- For example, to install sage-attention (linux):
69
-
70
- pip install sageattention==1.0.6
71
-
72
- However, you are highly recommended to first try without sage-attention since it will influence results, though the influence is minimal.
73
-
74
- # GUI
75
-
76
- ![ui](https://github.com/user-attachments/assets/8c5cdbb1-b80c-4b7e-ac27-83834ac24cc4)
77
-
78
- On the left you upload an image and write a prompt.
79
-
80
- On the right are the generated videos and latent previews.
81
-
82
- Because this is a next-frame-section prediction model, videos will be generated longer and longer.
83
-
84
- You will see the progress bar for each section and the latent preview for the next section.
85
-
86
- Note that the initial progress may be slower than later diffusion as the device may need some warmup.
87
-
88
- # Sanity Check
89
-
90
- Before trying your own inputs, we highly recommend going through the sanity check to find out if any hardware or software went wrong.
91
-
92
- Next-frame-section prediction models are very sensitive to subtle differences in noise and hardware. Usually, people will get slightly different results on different devices, but the results should look overall similar. In some cases, if possible, you'll get exactly the same results.
93
-
94
- ## Image-to-5-seconds
95
-
96
- Download this image:
97
-
98
- <img src="https://github.com/user-attachments/assets/f3bc35cf-656a-4c9c-a83a-bbab24858b09" width="150">
99
-
100
- Copy this prompt:
101
-
102
- `The man dances energetically, leaping mid-air with fluid arm swings and quick footwork.`
103
-
104
- Set like this:
105
-
106
- (all default parameters, with teacache turned off)
107
- ![image](https://github.com/user-attachments/assets/0071fbb6-600c-4e0f-adc9-31980d540e9d)
108
-
109
- The result will be:
110
-
111
- <table>
112
- <tr>
113
- <td align="center" width="300">
114
- <video
115
- src="https://github.com/user-attachments/assets/bc74f039-2b14-4260-a30b-ceacf611a185"
116
- controls
117
- style="max-width:100%;">
118
- </video>
119
- </td>
120
- </tr>
121
- <tr>
122
- <td align="center">
123
- <em>Video may be compressed by GitHub</em>
124
- </td>
125
- </tr>
126
- </table>
127
-
128
- **Important Note:**
129
-
130
- Again, this is a next-frame-section prediction model. This means you will generate videos frame-by-frame or section-by-section.
131
-
132
- **If you get a much shorter video in the UI, like a video with only 1 second, then it is totally expected.** You just need to wait. More sections will be generated to complete the video.
133
-
134
- ## Know the influence of TeaCache and Quantization
135
-
136
- Download this image:
137
-
138
- <img src="https://github.com/user-attachments/assets/42293e30-bdd4-456d-895c-8fedff71be04" width="150">
139
-
140
- Copy this prompt:
141
-
142
- `The girl dances gracefully, with clear movements, full of charm.`
143
-
144
- Set like this:
145
-
146
- ![image](https://github.com/user-attachments/assets/4274207d-5180-4824-a552-d0d801933435)
147
-
148
- Turn off teacache:
149
-
150
- ![image](https://github.com/user-attachments/assets/53b309fb-667b-4aa8-96a1-f129c7a09ca6)
151
-
152
- You will get this:
153
-
154
- <table>
155
- <tr>
156
- <td align="center" width="300">
157
- <video
158
- src="https://github.com/user-attachments/assets/04ab527b-6da1-4726-9210-a8853dda5577"
159
- controls
160
- style="max-width:100%;">
161
- </video>
162
- </td>
163
- </tr>
164
- <tr>
165
- <td align="center">
166
- <em>Video may be compressed by GitHub</em>
167
- </td>
168
- </tr>
169
- </table>
170
-
171
- Now turn on teacache:
172
-
173
- ![image](https://github.com/user-attachments/assets/16ad047b-fbcc-4091-83dc-d46bea40708c)
174
-
175
- About 30% users will get this (the other 70% will get other random results depending on their hardware):
176
-
177
- <table>
178
- <tr>
179
- <td align="center" width="300">
180
- <video
181
- src="https://github.com/user-attachments/assets/149fb486-9ccc-4a48-b1f0-326253051e9b"
182
- controls
183
- style="max-width:100%;">
184
- </video>
185
- </td>
186
- </tr>
187
- <tr>
188
- <td align="center">
189
- <em>A typical worse result.</em>
190
- </td>
191
- </tr>
192
- </table>
193
 
194
- So you can see that teacache is not really lossless and sometimes can influence the result a lot.
195
 
196
- We recommend using teacache to try ideas and then using the full diffusion process to get high-quality results.
 
 
 
 
197
 
198
- This recommendation also applies to sage-attention, bnb quant, gguf, etc., etc.
199
 
200
- ## Image-to-1-minute
 
 
 
 
201
 
202
- <img src="https://github.com/user-attachments/assets/820af6ca-3c2e-4bbc-afe8-9a9be1994ff5" width="150">
203
 
204
- `The girl dances gracefully, with clear movements, full of charm.`
 
 
205
 
206
- ![image](https://github.com/user-attachments/assets/8c34fcb2-288a-44b3-a33d-9d2324e30cbd)
207
 
208
- Set video length to 60 seconds:
 
 
209
 
210
- ![image](https://github.com/user-attachments/assets/5595a7ea-f74e-445e-ad5f-3fb5b4b21bee)
211
 
212
- If everything is in order you will get some result like this eventually.
213
-
214
- 60s version:
215
-
216
- <table>
217
- <tr>
218
- <td align="center" width="300">
219
- <video
220
- src="https://github.com/user-attachments/assets/c3be4bde-2e33-4fd4-b76d-289a036d3a47"
221
- controls
222
- style="max-width:100%;">
223
- </video>
224
- </td>
225
- </tr>
226
- <tr>
227
- <td align="center">
228
- <em>Video may be compressed by GitHub</em>
229
- </td>
230
- </tr>
231
- </table>
232
-
233
- 6s version:
234
-
235
- <table>
236
- <tr>
237
- <td align="center" width="300">
238
- <video
239
- src="https://github.com/user-attachments/assets/37fe2c33-cb03-41e8-acca-920ab3e34861"
240
- controls
241
- style="max-width:100%;">
242
- </video>
243
- </td>
244
- </tr>
245
- <tr>
246
- <td align="center">
247
- <em>Video may be compressed by GitHub</em>
248
- </td>
249
- </tr>
250
- </table>
251
-
252
- # More Examples
253
-
254
- Many more examples are in [**Project Page**](https://lllyasviel.github.io/frame_pack_gitpage/).
255
-
256
- Below are some more examples that you may be interested in reproducing.
257
-
258
- ---
259
-
260
- <img src="https://github.com/user-attachments/assets/99f4d281-28ad-44f5-8700-aa7a4e5638fa" width="150">
261
-
262
- `The girl dances gracefully, with clear movements, full of charm.`
263
-
264
- ![image](https://github.com/user-attachments/assets/0e98bfca-1d91-4b1d-b30f-4236b517c35e)
265
-
266
- <table>
267
- <tr>
268
- <td align="center" width="300">
269
- <video
270
- src="https://github.com/user-attachments/assets/cebe178a-09ce-4b7a-8f3c-060332f4dab1"
271
- controls
272
- style="max-width:100%;">
273
- </video>
274
- </td>
275
- </tr>
276
- <tr>
277
- <td align="center">
278
- <em>Video may be compressed by GitHub</em>
279
- </td>
280
- </tr>
281
- </table>
282
-
283
- ---
284
-
285
- <img src="https://github.com/user-attachments/assets/853f4f40-2956-472f-aa7a-fa50da03ed92" width="150">
286
-
287
- `The girl suddenly took out a sign that said “cute” using right hand`
288
-
289
- ![image](https://github.com/user-attachments/assets/d51180e4-5537-4e25-a6c6-faecae28648a)
290
-
291
- <table>
292
- <tr>
293
- <td align="center" width="300">
294
- <video
295
- src="https://github.com/user-attachments/assets/116069d2-7499-4f38-ada7-8f85517d1fbb"
296
- controls
297
- style="max-width:100%;">
298
- </video>
299
- </td>
300
- </tr>
301
- <tr>
302
- <td align="center">
303
- <em>Video may be compressed by GitHub</em>
304
- </td>
305
- </tr>
306
- </table>
307
-
308
- ---
309
-
310
- <img src="https://github.com/user-attachments/assets/6d87c53f-81b2-4108-a704-697164ae2e81" width="150">
311
-
312
- `The girl skateboarding, repeating the endless spinning and dancing and jumping on a skateboard, with clear movements, full of charm.`
313
-
314
- ![image](https://github.com/user-attachments/assets/c2cfa835-b8e6-4c28-97f8-88f42da1ffdf)
315
-
316
- <table>
317
- <tr>
318
- <td align="center" width="300">
319
- <video
320
- src="https://github.com/user-attachments/assets/d9e3534a-eb17-4af2-a8ed-8e692e9993d2"
321
- controls
322
- style="max-width:100%;">
323
- </video>
324
- </td>
325
- </tr>
326
- <tr>
327
- <td align="center">
328
- <em>Video may be compressed by GitHub</em>
329
- </td>
330
- </tr>
331
- </table>
332
-
333
- ---
334
-
335
- <img src="https://github.com/user-attachments/assets/6e95d1a5-9674-4c9a-97a9-ddf704159b79" width="150">
336
-
337
- `The girl dances gracefully, with clear movements, full of charm.`
338
-
339
- ![image](https://github.com/user-attachments/assets/7412802a-ce44-4188-b1a4-cfe19f9c9118)
340
-
341
- <table>
342
- <tr>
343
- <td align="center" width="300">
344
- <video
345
- src="https://github.com/user-attachments/assets/e1b3279e-e30d-4d32-b55f-2fb1d37c81d2"
346
- controls
347
- style="max-width:100%;">
348
- </video>
349
- </td>
350
- </tr>
351
- <tr>
352
- <td align="center">
353
- <em>Video may be compressed by GitHub</em>
354
- </td>
355
- </tr>
356
- </table>
357
-
358
- ---
359
-
360
- <img src="https://github.com/user-attachments/assets/90fc6d7e-8f6b-4f8c-a5df-ee5b1c8b63c9" width="150">
361
-
362
- `The man dances flamboyantly, swinging his hips and striking bold poses with dramatic flair.`
363
-
364
- ![image](https://github.com/user-attachments/assets/1dcf10a3-9747-4e77-a269-03a9379dd9af)
365
-
366
- <table>
367
- <tr>
368
- <td align="center" width="300">
369
- <video
370
- src="https://github.com/user-attachments/assets/aaa4481b-7bf8-4c64-bc32-909659767115"
371
- controls
372
- style="max-width:100%;">
373
- </video>
374
- </td>
375
- </tr>
376
- <tr>
377
- <td align="center">
378
- <em>Video may be compressed by GitHub</em>
379
- </td>
380
- </tr>
381
- </table>
382
 
383
  ---
384
 
385
- <img src="https://github.com/user-attachments/assets/62ecf987-ec0c-401d-b3c9-be9ffe84ee5b" width="150">
386
-
387
- `The woman dances elegantly among the blossoms, spinning slowly with flowing sleeves and graceful hand movements.`
388
-
389
- ![image](https://github.com/user-attachments/assets/396f06bc-e399-4ac3-9766-8a42d4f8d383)
390
-
391
-
392
- <table>
393
- <tr>
394
- <td align="center" width="300">
395
- <video
396
- src="https://github.com/user-attachments/assets/f23f2f37-c9b8-45d5-a1be-7c87bd4b41cf"
397
- controls
398
- style="max-width:100%;">
399
- </video>
400
- </td>
401
- </tr>
402
- <tr>
403
- <td align="center">
404
- <em>Video may be compressed by GitHub</em>
405
- </td>
406
- </tr>
407
- </table>
408
-
409
- ---
410
-
411
- <img src="https://github.com/user-attachments/assets/4f740c1a-2d2f-40a6-9613-d6fe64c428aa" width="150">
412
-
413
- `The young man writes intensely, flipping papers and adjusting his glasses with swift, focused movements.`
414
-
415
- ![image](https://github.com/user-attachments/assets/c4513c4b-997a-429b-b092-bb275a37b719)
416
-
417
- <table>
418
- <tr>
419
- <td align="center" width="300">
420
- <video
421
- src="https://github.com/user-attachments/assets/62e9910e-aea6-4b2b-9333-2e727bccfc64"
422
- controls
423
- style="max-width:100%;">
424
- </video>
425
- </td>
426
- </tr>
427
- <tr>
428
- <td align="center">
429
- <em>Video may be compressed by GitHub</em>
430
- </td>
431
- </tr>
432
- </table>
433
-
434
- ---
435
-
436
- # Prompting Guideline
437
-
438
- Many people would ask how to write better prompts.
439
-
440
- Below is a ChatGPT template that I personally often use to get prompts:
441
-
442
- You are an assistant that writes short, motion-focused prompts for animating images.
443
-
444
- When the user sends an image, respond with a single, concise prompt describing visual motion (such as human activity, moving objects, or camera movements). Focus only on how the scene could come alive and become dynamic using brief phrases.
445
-
446
- Larger and more dynamic motions (like dancing, jumping, running, etc.) are preferred over smaller or more subtle ones (like standing still, sitting, etc.).
447
-
448
- Describe subject, then motion, then other things. For example: "The girl dances gracefully, with clear movements, full of charm."
449
-
450
- If there is something that can dance (like a man, girl, robot, etc.), then prefer to describe it as dancing.
451
-
452
- Stay in a loop: one image in, one motion prompt out. Do not explain, ask questions, or generate multiple options.
453
-
454
- You paste the instruct to ChatGPT and then feed it an image to get prompt like this:
455
-
456
- ![image](https://github.com/user-attachments/assets/586c53b9-0b8c-4c94-b1d3-d7e7c1a705c3)
457
-
458
- *The man dances powerfully, striking sharp poses and gliding smoothly across the reflective floor.*
459
-
460
- Usually this will give you a prompt that works well.
461
-
462
- You can also write prompts yourself. Concise prompts are usually preferred, for example:
463
-
464
- *The girl dances gracefully, with clear movements, full of charm.*
465
-
466
- *The man dances powerfully, with clear movements, full of energy.*
467
-
468
- and so on.
469
-
470
- # Cite
471
-
472
- @article{zhang2025framepack,
473
- title={Packing Input Frame Contexts in Next-Frame Prediction Models for Video Generation},
474
- author={Lvmin Zhang and Maneesh Agrawala},
475
- journal={Arxiv},
476
- year={2025}
477
- }
  # FramePack

+ FramePack is an image-to-video generation tool that uses diffusion models to turn static images into dynamic videos.

+ ## Features

+ - Generates smooth motion videos from a single image
+ - Built on HunyuanVideo and the FramePack architecture
+ - Runs on low-VRAM GPUs (6GB minimum)
+ - Can generate videos up to 120 seconds long
+ - Uses TeaCache to speed up generation

+ ## How to Use

+ 1. Upload an image of a person
+ 2. Enter a prompt describing the desired motion
+ 3. Set the desired video length in seconds
+ 4. Click the "开始生成" (Start Generation) button
+ 5. Wait for the video to be generated (the process is progressive and keeps extending the video length)

+ ## Example Prompts

+ - "The girl dances gracefully, with clear movements, full of charm."
+ - "A character doing some simple body movements."
+ - "The man dances energetically, leaping mid-air with fluid arm swings and quick footwork."

+ ## Notes

+ - Video generation runs in reverse order, so the ending motion is generated before the starting motion
+ - For higher-quality results, turn off the TeaCache option
+ - If you hit an out-of-memory error, increase the "GPU推理保留内存" (GPU inference preserved memory) value

+ ## Technical Details

+ This app is based on the [FramePack](https://github.com/lllyasviel/FramePack) project and uses the Hunyuan Video model together with the FramePack technique for video generation. FramePack compresses the input context to a constant length, so the generation workload is independent of the video length, which makes it possible to process a large number of frames even on a laptop GPU.

  ---

+ Original project: [FramePack GitHub](https://github.com/lllyasviel/FramePack)
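
The constant-length context mentioned under "Technical Details" corresponds to the tensors built in worker() in app.py: the sampler is always conditioned on one start latent plus a fixed 1 + 2 + 16 history latents, no matter how long the accumulated history grows. A minimal sketch of that bookkeeping (the 64 x 96 latent resolution is just an assumed example of height // 8 by width // 8):

import torch

start_latent = torch.zeros(1, 16, 1, 64, 96)              # encoded input image
history_latents = torch.zeros(1, 16, 1 + 2 + 16, 64, 96)  # fixed-size history buffer

# Split the history into the 1x / 2x / 4x conditioning groups, as in app.py.
clean_latents_post, clean_latents_2x, clean_latents_4x = \
    history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)

# The context handed to the sampler never grows with the video length:
print(clean_latents.shape)     # torch.Size([1, 16, 2, 64, 96])
print(clean_latents_2x.shape)  # torch.Size([1, 16, 2, 64, 96])
print(clean_latents_4x.shape)  # torch.Size([1, 16, 16, 64, 96])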
app.py ADDED
@@ -0,0 +1,387 @@
1
+ from diffusers_helper.hf_login import login
2
+
3
+ import os
4
+
5
+ os.environ['HF_HOME'] = os.path.abspath(os.path.realpath(os.path.join(os.path.dirname(__file__), './hf_download')))
6
+
7
+ import gradio as gr
8
+ import torch
9
+ import traceback
10
+ import einops
11
+ import safetensors.torch as sf
12
+ import numpy as np
13
+ import math
14
+
15
+ from PIL import Image
16
+ from diffusers import AutoencoderKLHunyuanVideo
17
+ from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
18
+ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
19
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
20
+ from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
21
+ from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
22
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
23
+ from diffusers_helper.thread_utils import AsyncStream, async_run
24
+ from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
25
+ from transformers import SiglipImageProcessor, SiglipVisionModel
26
+ from diffusers_helper.clip_vision import hf_clip_vision_encode
27
+ from diffusers_helper.bucket_tools import find_nearest_bucket
28
+
29
+ # Get the available CUDA memory
30
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
31
+ high_vram = free_mem_gb > 60
32
+
33
+ print(f'Free VRAM {free_mem_gb} GB')
34
+ print(f'High-VRAM Mode: {high_vram}')
35
+
36
+ # Load the models
37
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
38
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
39
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
40
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
41
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
42
+
43
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
44
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
45
+
46
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()
47
+
48
+ vae.eval()
49
+ text_encoder.eval()
50
+ text_encoder_2.eval()
51
+ image_encoder.eval()
52
+ transformer.eval()
53
+
54
+ if not high_vram:
55
+ vae.enable_slicing()
56
+ vae.enable_tiling()
57
+
58
+ transformer.high_quality_fp32_output_for_inference = True
59
+ print('transformer.high_quality_fp32_output_for_inference = True')
60
+
61
+ transformer.to(dtype=torch.bfloat16)
62
+ vae.to(dtype=torch.float16)
63
+ image_encoder.to(dtype=torch.float16)
64
+ text_encoder.to(dtype=torch.float16)
65
+ text_encoder_2.to(dtype=torch.float16)
66
+
67
+ vae.requires_grad_(False)
68
+ text_encoder.requires_grad_(False)
69
+ text_encoder_2.requires_grad_(False)
70
+ image_encoder.requires_grad_(False)
71
+ transformer.requires_grad_(False)
72
+
73
+ if not high_vram:
74
+ # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
75
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
76
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
77
+ else:
78
+ text_encoder.to(gpu)
79
+ text_encoder_2.to(gpu)
80
+ image_encoder.to(gpu)
81
+ vae.to(gpu)
82
+ transformer.to(gpu)
83
+
84
+ stream = AsyncStream()
85
+
86
+ outputs_folder = './outputs/'
87
+ os.makedirs(outputs_folder, exist_ok=True)
88
+
89
+
90
+ @torch.no_grad()
91
+ def worker(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
92
+ total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
93
+ total_latent_sections = int(max(round(total_latent_sections), 1))
94
+
95
+ job_id = generate_timestamp()
96
+
97
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
98
+
99
+ try:
100
+ # Clean GPU
101
+ if not high_vram:
102
+ unload_complete_models(
103
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
104
+ )
105
+
106
+ # Text encoding
107
+
108
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
109
+
110
+ if not high_vram:
111
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
112
+ load_model_as_complete(text_encoder_2, target_device=gpu)
113
+
114
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
115
+
116
+ if cfg == 1:
117
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
118
+ else:
119
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
120
+
121
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
122
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
123
+
124
+ # Processing input image
125
+
126
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
127
+
128
+ H, W, C = input_image.shape
129
+ height, width = find_nearest_bucket(H, W, resolution=640)
130
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
131
+
132
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}.png'))
133
+
134
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
135
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
136
+
137
+ # VAE encoding
138
+
139
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
140
+
141
+ if not high_vram:
142
+ load_model_as_complete(vae, target_device=gpu)
143
+
144
+ start_latent = vae_encode(input_image_pt, vae)
145
+
146
+ # CLIP Vision
147
+
148
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
149
+
150
+ if not high_vram:
151
+ load_model_as_complete(image_encoder, target_device=gpu)
152
+
153
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
154
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
155
+
156
+ # Dtype
157
+
158
+ llama_vec = llama_vec.to(transformer.dtype)
159
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
160
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
161
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
162
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
163
+
164
+ # Sampling
165
+
166
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
167
+
168
+ rnd = torch.Generator("cpu").manual_seed(seed)
169
+ num_frames = latent_window_size * 4 - 3
170
+
171
+ history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32).cpu()
172
+ history_pixels = None
173
+ total_generated_latent_frames = 0
174
+
175
+ latent_paddings = reversed(range(total_latent_sections))
176
+
177
+ if total_latent_sections > 4:
178
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
179
+ # items looks better than expanding it when total_latent_sections > 4
180
+ # One can try to remove below trick and just
181
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
182
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
183
+
184
+ for latent_padding in latent_paddings:
185
+ is_last_section = latent_padding == 0
186
+ latent_padding_size = latent_padding * latent_window_size
187
+
188
+ if stream.input_queue.top() == 'end':
189
+ stream.output_queue.push(('end', None))
190
+ return
191
+
192
+ print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}')
193
+
194
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
195
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
196
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
197
+
198
+ clean_latents_pre = start_latent.to(history_latents)
199
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
200
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
201
+
202
+ if not high_vram:
203
+ unload_complete_models()
204
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
205
+
206
+ if use_teacache:
207
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
208
+ else:
209
+ transformer.initialize_teacache(enable_teacache=False)
210
+
211
+ def callback(d):
212
+ preview = d['denoised']
213
+ preview = vae_decode_fake(preview)
214
+
215
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
216
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
217
+
218
+ if stream.input_queue.top() == 'end':
219
+ stream.output_queue.push(('end', None))
220
+ raise KeyboardInterrupt('User ends the task.')
221
+
222
+ current_step = d['i'] + 1
223
+ percentage = int(100.0 * current_step / steps)
224
+ hint = f'Sampling {current_step}/{steps}'
225
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
226
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
227
+ return
228
+
229
+ generated_latents = sample_hunyuan(
230
+ transformer=transformer,
231
+ sampler='unipc',
232
+ width=width,
233
+ height=height,
234
+ frames=num_frames,
235
+ real_guidance_scale=cfg,
236
+ distilled_guidance_scale=gs,
237
+ guidance_rescale=rs,
238
+ # shift=3.0,
239
+ num_inference_steps=steps,
240
+ generator=rnd,
241
+ prompt_embeds=llama_vec,
242
+ prompt_embeds_mask=llama_attention_mask,
243
+ prompt_poolers=clip_l_pooler,
244
+ negative_prompt_embeds=llama_vec_n,
245
+ negative_prompt_embeds_mask=llama_attention_mask_n,
246
+ negative_prompt_poolers=clip_l_pooler_n,
247
+ device=gpu,
248
+ dtype=torch.bfloat16,
249
+ image_embeddings=image_encoder_last_hidden_state,
250
+ latent_indices=latent_indices,
251
+ clean_latents=clean_latents,
252
+ clean_latent_indices=clean_latent_indices,
253
+ clean_latents_2x=clean_latents_2x,
254
+ clean_latent_2x_indices=clean_latent_2x_indices,
255
+ clean_latents_4x=clean_latents_4x,
256
+ clean_latent_4x_indices=clean_latent_4x_indices,
257
+ callback=callback,
258
+ )
259
+
260
+ if is_last_section:
261
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
262
+
263
+ total_generated_latent_frames += int(generated_latents.shape[2])
264
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
265
+
266
+ if not high_vram:
267
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
268
+ load_model_as_complete(vae, target_device=gpu)
269
+
270
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
271
+
272
+ if history_pixels is None:
273
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
274
+ else:
275
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
276
+ overlapped_frames = latent_window_size * 4 - 3
277
+
278
+ current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
279
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
280
+
281
+ if not high_vram:
282
+ unload_complete_models()
283
+
284
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
285
+
286
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=30)
287
+
288
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
289
+
290
+ stream.output_queue.push(('file', output_filename))
291
+
292
+ if is_last_section:
293
+ break
294
+ except:
295
+ traceback.print_exc()
296
+
297
+ if not high_vram:
298
+ unload_complete_models(
299
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
300
+ )
301
+
302
+ stream.output_queue.push(('end', None))
303
+ return
304
+
305
+
306
+ def process(input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache):
307
+ global stream
308
+ assert input_image is not None, 'No input image!'
309
+
310
+ yield None, None, '', '', gr.update(interactive=False), gr.update(interactive=True)
311
+
312
+ stream = AsyncStream()
313
+
314
+ async_run(worker, input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache)
315
+
316
+ output_filename = None
317
+
318
+ while True:
319
+ flag, data = stream.output_queue.next()
320
+
321
+ if flag == 'file':
322
+ output_filename = data
323
+ yield output_filename, gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True)
324
+
325
+ if flag == 'progress':
326
+ preview, desc, html = data
327
+ yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
328
+
329
+ if flag == 'end':
330
+ yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
331
+ break
332
+
333
+
334
+ def end_process():
335
+ stream.input_queue.push('end')
336
+
337
+
338
+ quick_prompts = [
339
+ 'The girl dances gracefully, with clear movements, full of charm.',
340
+ 'A character doing some simple body movements.',
341
+ ]
342
+ quick_prompts = [[x] for x in quick_prompts]
343
+
344
+
345
+ css = make_progress_bar_css()
346
+ block = gr.Blocks(css=css).queue()
347
+ with block:
348
+ gr.Markdown('# FramePack - 图像到视频生成')
349
+ with gr.Row():
350
+ with gr.Column():
351
+ input_image = gr.Image(sources='upload', type="numpy", label="上传图像", height=320)
352
+ prompt = gr.Textbox(label="提示词", value='')
353
+ example_quick_prompts = gr.Dataset(samples=quick_prompts, label='快速提示词列表', samples_per_page=1000, components=[prompt])
354
+ example_quick_prompts.click(lambda x: x[0], inputs=[example_quick_prompts], outputs=prompt, show_progress=False, queue=False)
355
+
356
+ with gr.Row():
357
+ start_button = gr.Button(value="开始生成")
358
+ end_button = gr.Button(value="结束生成", interactive=False)
359
+
360
+ with gr.Group():
361
+ use_teacache = gr.Checkbox(label='使用TeaCache', value=True, info='速度更快,但可能会使手指和手的生成效果稍差。')
362
+
363
+ n_prompt = gr.Textbox(label="负面提示词", value="", visible=False) # Not used
364
+ seed = gr.Number(label="随机种子", value=31337, precision=0)
365
+
366
+ total_second_length = gr.Slider(label="视频长度(秒)", minimum=1, maximum=120, value=5, step=0.1)
367
+ latent_window_size = gr.Slider(label="潜在窗口大小", minimum=1, maximum=33, value=9, step=1, visible=False) # Should not change
368
+ steps = gr.Slider(label="推理步数", minimum=1, maximum=100, value=25, step=1, info='不建议修改此值。')
369
+
370
+ cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, visible=False) # Should not change
371
+ gs = gr.Slider(label="蒸馏CFG比例", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='不建议修改此值。')
372
+ rs = gr.Slider(label="CFG重缩放", minimum=0.0, maximum=1.0, value=0.0, step=0.01, visible=False) # Should not change
373
+
374
+ gpu_memory_preservation = gr.Slider(label="GPU推理保留内存(GB)(值越大速度越慢)", minimum=6, maximum=128, value=6, step=0.1, info="如果出现OOM错误,请将此值设置得更大。值越大,速度越慢。")
375
+
376
+ with gr.Column():
377
+ preview_image = gr.Image(label="下一批潜变量", height=200, visible=False)
378
+ result_video = gr.Video(label="生成的视频", autoplay=True, show_share_button=False, height=512, loop=True)
379
+ gr.Markdown('注意:由于采样是倒序的,结束动作将在开始动作之前生成。如果视频中没有出现起始动作,请继续等待,它将在稍后生成。')
380
+ progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
381
+ progress_bar = gr.HTML('', elem_classes='no-generating-animation')
382
+ ips = [input_image, prompt, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache]
383
+ start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
384
+ end_button.click(fn=end_process)
385
+
386
+
387
+ block.launch()
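
worker() and process() above communicate through the AsyncStream queues: the background job pushes ('progress' | 'file' | 'end', data) tuples and the Gradio generator drains them, swapping in each longer partial video as it arrives. The sketch below recreates that pattern with only the standard library (queue and threading stand in for AsyncStream and async_run); it is an illustration of the flow, not the helper's actual implementation.

import queue
import threading
import time

output_queue: "queue.Queue[tuple[str, object]]" = queue.Queue()

def fake_worker(n_sections: int) -> None:
    # Plays the role of worker(): emit progress, then a longer partial file, then 'end'.
    for i in range(n_sections):
        time.sleep(0.1)  # pretend to sample one section
        output_queue.put(('progress', f'section {i + 1}/{n_sections} done'))
        output_queue.put(('file', f'outputs/job_{i + 1}.mp4'))
    output_queue.put(('end', None))

threading.Thread(target=fake_worker, args=(3,), daemon=True).start()

# Plays the role of process(): drain the queue until 'end' arrives.
while True:
    flag, data = output_queue.get()        # mirrors stream.output_queue.next()
    if flag == 'progress':
        print('progress:', data)
    elif flag == 'file':
        print('new partial video:', data)  # the UI swaps in the longer video here
    elif flag == 'end':
        break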
diffusers_helper/__init__.py ADDED
@@ -0,0 +1 @@
+ # diffusers_helper package
diffusers_helper/hf_login.py CHANGED
@@ -1,21 +1,25 @@
  import os
+ from huggingface_hub import login as hf_hub_login  # aliased so it is not shadowed by the helper below

+ def login():
+     # When running inside a Hugging Face Space, rely on the environment token
+     if os.environ.get('SPACE_ID') is not None:
+         print("Running in Hugging Face Space, using environment HF_TOKEN")
+         # The Space already has the required access; no extra login is needed
+         return

- def login(token):
-     from huggingface_hub import login
-     import time
+     # If the local environment provides a token, use it to log in
+     hf_token = os.environ.get('HF_TOKEN')
+     if hf_token:
+         print("Logging in with HF_TOKEN from environment")
+         hf_hub_login(token=hf_token)
+         return
+
+     # Check for a cached token
+     cache_file = os.path.expanduser('~/.huggingface/token')
+     if os.path.exists(cache_file):
+         print("Found cached Hugging Face token")
+         return

-     while True:
-         try:
-             login(token)
-             print('HF login ok.')
-             break
-         except Exception as e:
-             print(f'HF login failed: {e}. Retrying')
-             time.sleep(0.5)
-
-
- hf_token = os.environ.get('HF_TOKEN', None)
-
- if hf_token is not None:
-     login(hf_token)
+     print("No Hugging Face token found. Using public access.")
+     # Without a token, public access is used; it may be slower and rate-limited
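
For reference, app.py only imports this helper at the top of the file; a caller that wants the token applied before the gated model downloads would invoke it explicitly. A minimal, hypothetical usage sketch:

from diffusers_helper.hf_login import login

# Hypothetical call site: run before any from_pretrained() download so a configured
# HF_TOKEN or cached credential is picked up; it is a no-op inside a Space.
login()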
requirements.txt CHANGED
@@ -9,7 +9,10 @@ numpy==1.26.2
  scipy==1.12.0
  requests==2.31.0
  torchsde==0.2.6
-
+ torch>=2.0.0
+ torchvision
+ torchaudio
  einops
  opencv-contrib-python
  safetensors
+ huggingface_hub
setup.sh ADDED
@@ -0,0 +1,7 @@
+ #!/bin/bash
+ # Create the required directories
+ mkdir -p hf_download
+ mkdir -p outputs
+
+ # The models are downloaded automatically on first run if they are not already present
+ echo "Environment ready. Run 'python app.py' to start the application."