yijin928 committed on
Commit
25bb7a0
·
verified ·
1 Parent(s): 1b80e0f

Upload 70 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. custom_nodes/ComfyUI-CogVideoXWrapper/.gitattributes +2 -0
  3. custom_nodes/ComfyUI-CogVideoXWrapper/.github/FUNDING.yml +2 -0
  4. custom_nodes/ComfyUI-CogVideoXWrapper/.github/workflows/publish.yml +24 -0
  5. custom_nodes/ComfyUI-CogVideoXWrapper/.gitignore +11 -0
  6. custom_nodes/ComfyUI-CogVideoXWrapper/LICENSE +201 -0
  7. custom_nodes/ComfyUI-CogVideoXWrapper/__init__.py +7 -0
  8. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/__init__.cpython-311.pyc +0 -0
  9. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/__init__.cpython-312.pyc +0 -0
  10. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/custom_cogvideox_transformer_3d.cpython-311.pyc +0 -0
  11. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/custom_cogvideox_transformer_3d.cpython-312.pyc +0 -0
  12. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/embeddings.cpython-311.pyc +0 -0
  13. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/embeddings.cpython-312.pyc +0 -0
  14. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/model_loading.cpython-311.pyc +0 -0
  15. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/model_loading.cpython-312.pyc +0 -0
  16. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/nodes.cpython-311.pyc +0 -0
  17. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/nodes.cpython-312.pyc +0 -0
  18. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/pipeline_cogvideox.cpython-311.pyc +0 -0
  19. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/pipeline_cogvideox.cpython-312.pyc +0 -0
  20. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/utils.cpython-311.pyc +0 -0
  21. custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/utils.cpython-312.pyc +0 -0
  22. custom_nodes/ComfyUI-CogVideoXWrapper/cogvideo_controlnet.py +220 -0
  23. custom_nodes/ComfyUI-CogVideoXWrapper/cogvideox_fun/utils.py +43 -0
  24. custom_nodes/ComfyUI-CogVideoXWrapper/configs/scheduler_config_2b.json +18 -0
  25. custom_nodes/ComfyUI-CogVideoXWrapper/configs/scheduler_config_5b.json +18 -0
  26. custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_2b.json +26 -0
  27. custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_5b.json +26 -0
  28. custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_I2V_5b.json +27 -0
  29. custom_nodes/ComfyUI-CogVideoXWrapper/configs/vae_config.json +39 -0
  30. custom_nodes/ComfyUI-CogVideoXWrapper/context.py +184 -0
  31. custom_nodes/ComfyUI-CogVideoXWrapper/custom_cogvideox_transformer_3d.py +779 -0
  32. custom_nodes/ComfyUI-CogVideoXWrapper/embeddings.py +226 -0
  33. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__init__.py +0 -0
  34. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/__init__.cpython-311.pyc +0 -0
  35. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/__init__.cpython-312.pyc +0 -0
  36. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/enhance.cpython-311.pyc +0 -0
  37. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/enhance.cpython-312.pyc +0 -0
  38. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/globals.cpython-311.pyc +0 -0
  39. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/globals.cpython-312.pyc +0 -0
  40. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/enhance.py +82 -0
  41. custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/globals.py +31 -0
  42. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1.0_5b_vid2vid_02.json +1061 -0
  43. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_2b_controlnet_02.json +1003 -0
  44. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_02.json +688 -0
  45. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_Tora_02.json +0 -0
  46. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_noise_warp_01.json +1291 -0
  47. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_T2V_02.json +529 -0
  48. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_interpolation_02.json +864 -0
  49. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_vid2vid_02.json +1061 -0
  50. custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_5_5b_I2V_01.json +688 -0
.gitattributes CHANGED
@@ -16,3 +16,4 @@ custom_nodes/ComfyUI-N-Nodes/libs/rifle/demo/I2_0.png filter=lfs diff=lfs merge=
16
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/demo/I2_1.png filter=lfs diff=lfs merge=lfs -text
17
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/demo/I2_slomo_clipped.gif filter=lfs diff=lfs merge=lfs -text
18
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/train_log/flownet.pkl filter=lfs diff=lfs merge=lfs -text
 
 
16
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/demo/I2_1.png filter=lfs diff=lfs merge=lfs -text
17
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/demo/I2_slomo_clipped.gif filter=lfs diff=lfs merge=lfs -text
18
  custom_nodes/ComfyUI-N-Nodes/libs/rifle/train_log/flownet.pkl filter=lfs diff=lfs merge=lfs -text
19
+ custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/noise_warp_example_input_video.mp4 filter=lfs diff=lfs merge=lfs -text
custom_nodes/ComfyUI-CogVideoXWrapper/.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Auto detect text files and perform LF normalization
2
+ * text=auto
custom_nodes/ComfyUI-CogVideoXWrapper/.github/FUNDING.yml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ github: [kijai]
2
+ custom: ["https://www.paypal.me/kijaidesign"]
custom_nodes/ComfyUI-CogVideoXWrapper/.github/workflows/publish.yml ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Publish to Comfy registry
2
+ on:
3
+ workflow_dispatch:
4
+ push:
5
+ branches:
6
+ - main
7
+ - master
8
+ paths:
9
+ - "pyproject.toml"
10
+
11
+ jobs:
12
+ publish-node:
13
+ name: Publish Custom Node to registry
14
+ runs-on: ubuntu-latest
15
+ # Skip this workflow when the repository is a fork.
16
+ if: github.event.repository.fork == false
17
+ steps:
18
+ - name: Check out code
19
+ uses: actions/checkout@v4
20
+ - name: Publish Custom Node
21
+ uses: Comfy-Org/publish-node-action@main
22
+ with:
23
+ ## Add your own personal access token to your Github Repository secrets and reference it here.
24
+ personal_access_token: ${{ secrets.REGISTRY_ACCESS_TOKEN }}
custom_nodes/ComfyUI-CogVideoXWrapper/.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ output/
2
+ *__pycache__/
3
+ samples*/
4
+ runs/
5
+ checkpoints/
6
+ master_ip
7
+ logs/
8
+ *.DS_Store
9
+ .idea
10
+ *.pt
11
+ tools/
custom_nodes/ComfyUI-CogVideoXWrapper/LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
custom_nodes/ComfyUI-CogVideoXWrapper/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
# Package entry point: combines the node registries exported by the `nodes`
# and `model_loading` submodules into a single pair of mappings.
from .nodes import (
    NODE_CLASS_MAPPINGS as NODES_CLASS,
    NODE_DISPLAY_NAME_MAPPINGS as NODES_DISPLAY,
)
from .model_loading import (
    NODE_CLASS_MAPPINGS as MODEL_CLASS,
    NODE_DISPLAY_NAME_MAPPINGS as MODEL_DISPLAY,
)

# Entries from `model_loading` are applied last, so they win on a key collision.
NODE_CLASS_MAPPINGS = {}
NODE_CLASS_MAPPINGS.update(NODES_CLASS)
NODE_CLASS_MAPPINGS.update(MODEL_CLASS)

NODE_DISPLAY_NAME_MAPPINGS = {}
NODE_DISPLAY_NAME_MAPPINGS.update(NODES_DISPLAY)
NODE_DISPLAY_NAME_MAPPINGS.update(MODEL_DISPLAY)

__all__ = ["NODE_CLASS_MAPPINGS", "NODE_DISPLAY_NAME_MAPPINGS"]
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (498 Bytes). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (508 Bytes). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/custom_cogvideox_transformer_3d.cpython-311.pyc ADDED
Binary file (37.6 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/custom_cogvideox_transformer_3d.cpython-312.pyc ADDED
Binary file (34.9 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/embeddings.cpython-311.pyc ADDED
Binary file (11.1 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/embeddings.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/model_loading.cpython-311.pyc ADDED
Binary file (52.7 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/model_loading.cpython-312.pyc ADDED
Binary file (47 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/nodes.cpython-311.pyc ADDED
Binary file (53.1 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/nodes.cpython-312.pyc ADDED
Binary file (47.6 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/pipeline_cogvideox.cpython-311.pyc ADDED
Binary file (43.1 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/pipeline_cogvideox.cpython-312.pyc ADDED
Binary file (40.9 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/utils.cpython-311.pyc ADDED
Binary file (2.83 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.56 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/cogvideo_controlnet.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # https://github.com/TheDenk/cogvideox-controlnet/blob/main/cogvideo_controlnet.py
2
+ from typing import Any, Dict, Optional, Tuple, Union
3
+
4
+ import torch
5
+ from torch import nn
6
+ from einops import rearrange
7
+ import torch.nn.functional as F
8
+ from .custom_cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock
9
+ from diffusers.utils import is_torch_version
10
+ from diffusers.loaders import PeftAdapterMixin
11
+ from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps
12
+ from diffusers.models.modeling_utils import ModelMixin
13
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
14
+
15
+
16
class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin):
    """Control branch that produces one hidden-state tensor per transformer block.

    The control video is pixel-unshuffled, passed through two 1x1-conv encoders
    (each followed by a temporal average-pool), concatenated with the latent
    ``hidden_states`` along the channel axis, patch-embedded together with the
    text embeddings and run through ``num_layers`` ``CogVideoXBlock`` layers.
    The video hidden states after every block are collected (optionally through
    a per-block linear projection) and returned as a tuple.
    """

    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 30,
        attention_head_dim: int = 64,
        vae_channels: int = 16,
        in_channels: int = 3,
        downscale_coef: int = 8,
        flip_sin_to_cos: bool = True,
        freq_shift: int = 0,
        time_embed_dim: int = 512,
        num_layers: int = 8,
        dropout: float = 0.0,
        attention_bias: bool = True,
        sample_width: int = 90,
        sample_height: int = 60,
        sample_frames: int = 49,
        patch_size: int = 2,
        temporal_compression_ratio: int = 4,
        max_text_seq_length: int = 226,
        activation_fn: str = "gelu-approximate",
        timestep_activation_fn: str = "silu",
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        spatial_interpolation_scale: float = 1.875,
        temporal_interpolation_scale: float = 1.0,
        use_rotary_positional_embeddings: bool = False,
        use_learned_positional_embeddings: bool = False,
        out_proj_dim: Optional[int] = None,
    ):
        """Build the control encoder, patch embedding, time embedding and blocks.

        Args:
            num_attention_heads / attention_head_dim: their product is the
                transformer width (``inner_dim``).
            vae_channels: channel count of the latent ``hidden_states``.
            in_channels: channel count of the control frames.
            downscale_coef: factor used by ``nn.PixelUnshuffle`` on the control frames.
            num_layers: number of ``CogVideoXBlock`` layers (and of returned states).
            out_proj_dim: when set, every block output goes through its own
                ``nn.Linear(inner_dim, out_proj_dim)`` before being returned.
            The remaining arguments are forwarded unchanged to the patch
            embedding, timestep embedding and transformer blocks.

        Raises:
            ValueError: learned positional embeddings requested without rotary
                positional embeddings.
        """
        super().__init__()
        inner_dim = num_attention_heads * attention_head_dim

        if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
            raise ValueError(
                "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
                "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
                "issue at https://github.com/huggingface/diffusers/issues."
            )

        # PixelUnshuffle moves each (downscale_coef x downscale_coef) spatial tile
        # into channels; the two encoders then halve the channel count twice.
        start_channels = in_channels * (downscale_coef ** 2)
        input_channels = [start_channels, start_channels // 2, start_channels // 4]
        self.unshuffle = nn.PixelUnshuffle(downscale_coef)

        self.controlnet_encode_first = nn.Sequential(
            nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0),
            nn.GroupNorm(2, input_channels[1]),
            nn.ReLU(),
        )

        self.controlnet_encode_second = nn.Sequential(
            nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0),
            nn.GroupNorm(2, input_channels[2]),
            nn.ReLU(),
        )

        # 1. Patch embedding (latent channels + encoded control channels)
        self.patch_embed = CogVideoXPatchEmbed(
            patch_size=patch_size,
            in_channels=vae_channels + input_channels[2],
            embed_dim=inner_dim,
            bias=True,
            sample_width=sample_width,
            sample_height=sample_height,
            sample_frames=sample_frames,
            temporal_compression_ratio=temporal_compression_ratio,
            spatial_interpolation_scale=spatial_interpolation_scale,
            temporal_interpolation_scale=temporal_interpolation_scale,
            use_positional_embeddings=not use_rotary_positional_embeddings,
            use_learned_positional_embeddings=use_learned_positional_embeddings,
        )

        self.embedding_dropout = nn.Dropout(dropout)

        # 2. Time embeddings
        self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
        self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)

        # 3. Define spatio-temporal transformers blocks
        self.transformer_blocks = nn.ModuleList(
            [
                CogVideoXBlock(
                    dim=inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                    time_embed_dim=time_embed_dim,
                    dropout=dropout,
                    activation_fn=activation_fn,
                    attention_bias=attention_bias,
                    norm_elementwise_affine=norm_elementwise_affine,
                    norm_eps=norm_eps,
                )
                for _ in range(num_layers)
            ]
        )

        # Optional per-block projection of the returned hidden states.
        self.out_projectors = None
        if out_proj_dim is not None:
            self.out_projectors = nn.ModuleList(
                [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)]
            )

        self.gradient_checkpointing = False

    def _set_gradient_checkpointing(self, module, value=False):
        """Turn gradient checkpointing on or off for the whole model."""
        self.gradient_checkpointing = value

    def compress_time(self, x, num_frames):
        """Halve the temporal length of ``x`` by average-pooling frame pairs.

        Args:
            x: tensor laid out as ``(batch * frames, channels, height, width)``.
            num_frames: number of frames folded into the leading axis.

        Returns:
            Tensor in the same ``(batch * frames', channels, height, width)``
            layout. With an even frame count ``frames' = frames // 2``; with an
            odd count the first frame is kept untouched and the remaining ones
            are pooled in pairs.
        """
        x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames)
        batch_size, _, _, height, width = x.shape
        # Put time last so avg_pool1d pools over frames for every pixel.
        x = rearrange(x, 'b f c h w -> (b h w) c f')

        if x.shape[-1] % 2 == 1:
            x_first, x_rest = x[..., 0], x[..., 1:]
            if x_rest.shape[-1] > 0:
                x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)

            x = torch.cat([x_first[..., None], x_rest], dim=-1)
        else:
            x = F.avg_pool1d(x, kernel_size=2, stride=2)
        x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width)
        return x

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        controlnet_states: torch.Tensor,
        timestep: Union[int, float, torch.LongTensor],
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        timestep_cond: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ):
        """Encode the control frames and run the control transformer blocks.

        Args:
            hidden_states: latent video; concatenated with the encoded control
                frames along ``dim=2``.
            encoder_hidden_states: text embeddings fed to the patch embedding.
            controlnet_states: control frames shaped
                ``(batch, frames, channels, height, width)``.
            timestep: diffusion timestep. A 1-D tensor is used as is; a Python
                scalar or 0-dim tensor is promoted to a 1-element 1-D tensor.
            image_rotary_emb: optional rotary embedding pair passed to each block.
            timestep_cond: optional conditioning passed to the timestep embedding.
            return_dict: when ``False`` return a plain 1-tuple.

        Returns:
            ``Transformer2DModelOutput`` whose ``sample`` is a tuple with one
            tensor per transformer block, or ``(that_tuple,)`` when
            ``return_dict`` is ``False``.
        """
        batch_size, num_frames, _, _, _ = controlnet_states.shape
        # 0. Controlnet encoder
        controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w')
        controlnet_states = self.unshuffle(controlnet_states)
        controlnet_states = self.controlnet_encode_first(controlnet_states)
        controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames)
        # compress_time changed the frame count; recover it from the folded axis.
        num_frames = controlnet_states.shape[0] // batch_size

        controlnet_states = self.controlnet_encode_second(controlnet_states)
        controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames)
        controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size)

        hidden_states = torch.cat([hidden_states, controlnet_states], dim=2)
        # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep)
        # 1. Time embedding
        timesteps = timestep
        # The signature accepts plain numbers, but the timestep projection
        # indexes a 1-D tensor, so promote scalars before projecting.
        if not torch.is_tensor(timesteps):
            timesteps = torch.tensor([timesteps], device=hidden_states.device)
        elif timesteps.dim() == 0:
            timesteps = timesteps[None]
        t_emb = self.time_proj(timesteps)

        # timesteps does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=hidden_states.dtype)
        emb = self.time_embedding(t_emb, timestep_cond)

        # 2. Patch embedding; the result holds the text tokens first, then the video tokens.
        hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
        hidden_states = self.embedding_dropout(hidden_states)

        text_seq_length = encoder_hidden_states.shape[1]
        encoder_hidden_states = hidden_states[:, :text_seq_length]
        hidden_states = hidden_states[:, text_seq_length:]

        # Checkpointing setup does not depend on the block, so build it once.
        use_checkpointing = self.training and self.gradient_checkpointing
        if use_checkpointing:

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    return module(*inputs)

                return custom_forward

            ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}

        controlnet_hidden_states = ()
        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            if use_checkpointing:
                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    encoder_hidden_states,
                    emb,
                    image_rotary_emb,
                    **ckpt_kwargs,
                )
            else:
                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=emb,
                    image_rotary_emb=image_rotary_emb,
                )

            if self.out_projectors is not None:
                controlnet_hidden_states += (self.out_projectors[i](hidden_states),)
            else:
                controlnet_hidden_states += (hidden_states,)

        if not return_dict:
            return (controlnet_hidden_states,)
        return Transformer2DModelOutput(sample=controlnet_hidden_states)
custom_nodes/ComfyUI-CogVideoXWrapper/cogvideox_fun/utils.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from PIL import Image
3
+
4
# Aspect-ratio buckets for a ~512x512 pixel budget.
# Each key is a ratio written as a string; dividing the first number of its
# value by the second gives approximately that ratio.
ASPECT_RATIO_512 = {
    "0.25": [256.0, 1024.0],
    "0.26": [256.0, 992.0],
    "0.27": [256.0, 960.0],
    "0.28": [256.0, 928.0],
    "0.32": [288.0, 896.0],
    "0.33": [288.0, 864.0],
    "0.35": [288.0, 832.0],
    "0.4": [320.0, 800.0],
    "0.42": [320.0, 768.0],
    "0.48": [352.0, 736.0],
    "0.5": [352.0, 704.0],
    "0.52": [352.0, 672.0],
    "0.57": [384.0, 672.0],
    "0.6": [384.0, 640.0],
    "0.68": [416.0, 608.0],
    "0.72": [416.0, 576.0],
    "0.78": [448.0, 576.0],
    "0.82": [448.0, 544.0],
    "0.88": [480.0, 544.0],
    "0.94": [480.0, 512.0],
    "1.0": [512.0, 512.0],
    "1.07": [512.0, 480.0],
    "1.13": [544.0, 480.0],
    "1.21": [544.0, 448.0],
    "1.29": [576.0, 448.0],
    "1.38": [576.0, 416.0],
    "1.46": [608.0, 416.0],
    "1.67": [640.0, 384.0],
    "1.75": [672.0, 384.0],
    "2.0": [704.0, 352.0],
    "2.09": [736.0, 352.0],
    "2.4": [768.0, 320.0],
    "2.5": [800.0, 320.0],
    "2.89": [832.0, 288.0],
    "3.0": [864.0, 288.0],
    "3.11": [896.0, 288.0],
    "3.62": [928.0, 256.0],
    "3.75": [960.0, 256.0],
    "3.88": [992.0, 256.0],
    "4.0": [1024.0, 256.0],
}

# Subset of the buckets above; it has one entry per weight in
# ASPECT_RATIO_RANDOM_CROP_PROB below, in the same order.
ASPECT_RATIO_RANDOM_CROP_512 = {
    "0.42": [320.0, 768.0],
    "0.5": [352.0, 704.0],
    "0.57": [384.0, 672.0],
    "0.68": [416.0, 608.0],
    "0.78": [448.0, 576.0],
    "0.88": [480.0, 544.0],
    "0.94": [480.0, 512.0],
    "1.0": [512.0, 512.0],
    "1.07": [512.0, 480.0],
    "1.13": [544.0, 480.0],
    "1.29": [576.0, 448.0],
    "1.46": [608.0, 416.0],
    "1.75": [672.0, 384.0],
    "2.0": [704.0, 352.0],
    "2.4": [768.0, 320.0],
}

# Integer weights, normalised on the next statement so they sum to 1.
ASPECT_RATIO_RANDOM_CROP_PROB = np.array(
    [1, 2, 4, 4, 4, 4, 8, 8, 8, 4, 4, 4, 4, 2, 1]
)
ASPECT_RATIO_RANDOM_CROP_PROB = ASPECT_RATIO_RANDOM_CROP_PROB / ASPECT_RATIO_RANDOM_CROP_PROB.sum()
31
+
32
def get_closest_ratio(height: float, width: float, ratios: dict = ASPECT_RATIO_512):
    """Pick the bucket whose ratio key is nearest to ``height / width``.

    Args:
        height: height of the input.
        width: width of the input; it is used as the divisor.
        ratios: mapping from a ratio written as a string to its bucket value.

    Returns:
        ``(ratios[key], float(key))`` for the nearest key. On a tie the key
        that comes first in ``ratios`` is chosen.
    """
    target = height / width

    def _gap(key):
        # Distance between a bucket's ratio and the requested ratio.
        return abs(float(key) - target)

    best_key = min(ratios, key=_gap)
    return ratios[best_key], float(best_key)
36
+
37
def get_width_and_height_from_image_and_base_resolution(image, base_resolution):
    """Scale an image's dimensions to about ``base_resolution ** 2`` pixels.

    The aspect ratio of the image is kept; only its size is read, no pixel
    data is returned.

    Args:
        image: anything ``PIL.Image.open`` accepts.
        base_resolution: side length of the square whose area is the target
            pixel count; converted with ``int()``.

    Returns:
        ``(height, width)`` rounded to integers. Note the order: height comes
        first even though the function name mentions width first.
    """
    target_pixels = int(base_resolution) * int(base_resolution)
    # Close the image deterministically instead of leaving the handle to the
    # garbage collector, which is what a bare Image.open(...).size did.
    with Image.open(image) as opened_image:
        original_width, original_height = opened_image.size
    ratio = (target_pixels / (original_width * original_height)) ** 0.5
    width_slider = round(original_width * ratio)
    height_slider = round(original_height * ratio)
    return height_slider, width_slider
custom_nodes/ComfyUI-CogVideoXWrapper/configs/scheduler_config_2b.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "CogVideoXDDIMScheduler",
3
+ "_diffusers_version": "0.30.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "num_train_timesteps": 1000,
10
+ "prediction_type": "v_prediction",
11
+ "rescale_betas_zero_snr": true,
12
+ "sample_max_value": 1.0,
13
+ "set_alpha_to_one": true,
14
+ "snr_shift_scale": 3.0,
15
+ "steps_offset": 0,
16
+ "timestep_spacing": "trailing",
17
+ "trained_betas": null
18
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/configs/scheduler_config_5b.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "CogVideoXDDIMScheduler",
3
+ "_diffusers_version": "0.31.0.dev0",
4
+ "beta_end": 0.012,
5
+ "beta_schedule": "scaled_linear",
6
+ "beta_start": 0.00085,
7
+ "clip_sample": false,
8
+ "clip_sample_range": 1.0,
9
+ "num_train_timesteps": 1000,
10
+ "prediction_type": "v_prediction",
11
+ "rescale_betas_zero_snr": true,
12
+ "sample_max_value": 1.0,
13
+ "set_alpha_to_one": true,
14
+ "snr_shift_scale": 1.0,
15
+ "steps_offset": 0,
16
+ "timestep_spacing": "trailing",
17
+ "trained_betas": null
18
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_2b.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_fn": "gelu-approximate",
3
+ "attention_bias": true,
4
+ "attention_head_dim": 64,
5
+ "dropout": 0.0,
6
+ "flip_sin_to_cos": true,
7
+ "freq_shift": 0,
8
+ "in_channels": 16,
9
+ "max_text_seq_length": 226,
10
+ "norm_elementwise_affine": true,
11
+ "norm_eps": 1e-05,
12
+ "num_attention_heads": 30,
13
+ "num_layers": 30,
14
+ "out_channels": 16,
15
+ "patch_size": 2,
16
+ "sample_frames": 49,
17
+ "sample_height": 60,
18
+ "sample_width": 90,
19
+ "spatial_interpolation_scale": 1.875,
20
+ "temporal_compression_ratio": 4,
21
+ "temporal_interpolation_scale": 1.0,
22
+ "text_embed_dim": 4096,
23
+ "time_embed_dim": 512,
24
+ "timestep_activation_fn": "silu",
25
+ "use_rotary_positional_embeddings": false
26
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_5b.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_fn": "gelu-approximate",
3
+ "attention_bias": true,
4
+ "attention_head_dim": 64,
5
+ "dropout": 0.0,
6
+ "flip_sin_to_cos": true,
7
+ "freq_shift": 0,
8
+ "in_channels": 16,
9
+ "max_text_seq_length": 226,
10
+ "norm_elementwise_affine": true,
11
+ "norm_eps": 1e-05,
12
+ "num_attention_heads": 48,
13
+ "num_layers": 42,
14
+ "out_channels": 16,
15
+ "patch_size": 2,
16
+ "sample_frames": 49,
17
+ "sample_height": 60,
18
+ "sample_width": 90,
19
+ "spatial_interpolation_scale": 1.875,
20
+ "temporal_compression_ratio": 4,
21
+ "temporal_interpolation_scale": 1.0,
22
+ "text_embed_dim": 4096,
23
+ "time_embed_dim": 512,
24
+ "timestep_activation_fn": "silu",
25
+ "use_rotary_positional_embeddings": true
26
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/configs/transformer_config_I2V_5b.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_fn": "gelu-approximate",
3
+ "attention_bias": true,
4
+ "attention_head_dim": 64,
5
+ "dropout": 0.0,
6
+ "flip_sin_to_cos": true,
7
+ "freq_shift": 0,
8
+ "in_channels": 32,
9
+ "max_text_seq_length": 226,
10
+ "norm_elementwise_affine": true,
11
+ "norm_eps": 1e-05,
12
+ "num_attention_heads": 48,
13
+ "num_layers": 42,
14
+ "out_channels": 16,
15
+ "patch_size": 2,
16
+ "sample_frames": 49,
17
+ "sample_height": 60,
18
+ "sample_width": 90,
19
+ "spatial_interpolation_scale": 1.875,
20
+ "temporal_compression_ratio": 4,
21
+ "temporal_interpolation_scale": 1.0,
22
+ "text_embed_dim": 4096,
23
+ "time_embed_dim": 512,
24
+ "timestep_activation_fn": "silu",
25
+ "use_learned_positional_embeddings": true,
26
+ "use_rotary_positional_embeddings": true
27
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/configs/vae_config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKLCogVideoX",
3
+ "_diffusers_version": "0.31.0.dev0",
4
+ "act_fn": "silu",
5
+ "block_out_channels": [
6
+ 128,
7
+ 256,
8
+ 256,
9
+ 512
10
+ ],
11
+ "down_block_types": [
12
+ "CogVideoXDownBlock3D",
13
+ "CogVideoXDownBlock3D",
14
+ "CogVideoXDownBlock3D",
15
+ "CogVideoXDownBlock3D"
16
+ ],
17
+ "force_upcast": true,
18
+ "in_channels": 3,
19
+ "latent_channels": 16,
20
+ "latents_mean": null,
21
+ "latents_std": null,
22
+ "layers_per_block": 3,
23
+ "norm_eps": 1e-06,
24
+ "norm_num_groups": 32,
25
+ "out_channels": 3,
26
+ "sample_height": 480,
27
+ "sample_width": 720,
28
+ "scaling_factor": 0.7,
29
+ "shift_factor": null,
30
+ "temporal_compression_ratio": 4,
31
+ "up_block_types": [
32
+ "CogVideoXUpBlock3D",
33
+ "CogVideoXUpBlock3D",
34
+ "CogVideoXUpBlock3D",
35
+ "CogVideoXUpBlock3D"
36
+ ],
37
+ "use_post_quant_conv": false,
38
+ "use_quant_conv": false
39
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/context.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import Callable, Optional, List
3
+
4
+
5
def ordered_halving(val):
    """Map an integer to a fraction in [0, 1) by reversing its 64-bit binary form.

    The value is rendered as a zero-padded 64-character binary string, the string is
    reversed, and the result is read back as an integer and divided by ``2 ** 64``.
    For example 1 -> 0.5, 2 -> 0.25, 3 -> 0.75.
    """
    reversed_bits = format(val, "064b")[::-1]
    return int(reversed_bits, 2) / (1 << 64)
11
+
12
def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]:
    """Report whether a window wraps around past the last frame.

    Each entry is reduced modulo ``num_frames``; the window "rolls over" at the first
    position whose reduced value is smaller than the previous one.

    Returns:
        ``(True, position)`` for the first position where the wrap happens, or
        ``(False, -1)`` when the reduced values never decrease.
    """
    previous = -1
    for position in range(len(window)):
        wrapped = window[position] % num_frames
        if wrapped < previous:
            return True, position
        previous = wrapped
    return False, -1
20
+
21
def shift_window_to_start(window: list[int], num_frames: int):
    """Shift a window in place so that its first entry becomes 0.

    Every entry is made relative to the first one, then ``num_frames`` is added and
    the result is taken modulo ``num_frames`` so wrapped entries stay non-negative.
    The list is modified in place; nothing is returned.
    """
    origin = window[0]
    window[:] = [((frame - origin) + num_frames) % num_frames for frame in window]
27
+
28
def shift_window_to_end(window: list[int], num_frames: int):
    """Slide a window in place so that its last entry becomes ``num_frames - 1``.

    The window is first re-based to start at 0 via ``shift_window_to_start``; every
    entry is then moved forward by the gap between the last entry and the final
    frame index. The list is modified in place; nothing is returned.
    """
    shift_window_to_start(window, num_frames)
    # Distance from the (re-based) last entry to the final frame index.
    offset = num_frames - window[-1] - 1
    window[:] = [frame + offset for frame in window]
36
+
37
def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]:
    """Return the frame indexes in ``range(num_frames)`` that no window covers.

    Args:
        windows: Lists of frame indexes; values outside ``range(num_frames)`` are ignored.
        num_frames: Total number of frames.

    Returns:
        The uncovered indexes in ascending order.
    """
    # Collect covered indexes once so each membership test is O(1), instead of
    # calling list.remove (a linear scan) for every value in every window.
    covered = {val for w in windows for val in w}
    return [idx for idx in range(num_frames) if idx not in covered]
46
+
47
def uniform_looped(
    step: int = ...,
    num_steps: Optional[int] = None,
    num_frames: int = ...,
    context_size: Optional[int] = None,
    context_stride: int = 3,
    context_overlap: int = 4,
    closed_loop: bool = True,
):
    """Yield context windows (lists of frame indexes) that wrap around the clip.

    When the clip fits in one context (``num_frames <= context_size``) a single window
    covering every frame is yielded. Otherwise windows are produced for frame spacings
    1, 2, 4, ... (up to ``context_stride`` spacings, capped by how many fit the clip),
    each set offset by an amount derived from ``ordered_halving(step)``. Indexes are
    taken modulo ``num_frames``, so windows may wrap past the end.

    Args:
        step: Current sampling step; selects the offset of the windows.
        num_steps: Not used by this function.
        num_frames: Total number of frames.
        context_size: Number of frames per window.
        context_stride: Maximum number of frame spacings to generate.
        context_overlap: Number of positions consecutive windows share.
        closed_loop: When False, the start range is shortened by ``context_overlap``.

    Yields:
        Lists of frame indexes, one list per window.
    """
    if num_frames <= context_size:
        yield list(range(num_frames))
        return

    max_strides = int(np.ceil(np.log2(num_frames / context_size))) + 1
    context_stride = min(context_stride, max_strides)

    for context_step in 1 << np.arange(context_stride):
        fraction = ordered_halving(step)
        pad = int(round(num_frames * fraction))
        first_start = int(fraction * context_step) + pad
        last_start = num_frames + pad + (0 if closed_loop else -context_overlap)
        advance = context_size * context_step - context_overlap
        for j in range(first_start, last_start, advance):
            window_end = j + context_size * context_step
            yield [e % num_frames for e in range(j, window_end, context_step)]
70
+
71
+ #from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
72
def uniform_standard(
    step: int = ...,
    num_steps: Optional[int] = None,
    num_frames: int = ...,
    context_size: Optional[int] = None,
    context_stride: int = 3,
    context_overlap: int = 4,
    closed_loop: bool = True,
):
    """Build context windows like ``uniform_looped`` but without wrapped windows.

    Windows are first generated with the same arithmetic as ``uniform_looped``. Any
    window that rolls over the end of the clip is then shifted so it ends on the last
    frame, an extra window is inserted when the following window does not contain the
    frame where the roll-over happened, and exact duplicates of earlier windows are
    removed.

    Args:
        step: Current sampling step; selects the offset of the windows.
        num_steps: Not used by this function.
        num_frames: Total number of frames.
        context_size: Number of frames per window.
        context_stride: Maximum number of frame spacings (1, 2, 4, ...) to generate.
        context_overlap: Number of positions consecutive windows share.
        closed_loop: When False, the start range is shortened by ``context_overlap``.

    Returns:
        A list of windows, each a list of frame indexes.
    """
    windows = []
    # The whole clip fits into a single context: one window with every frame.
    if num_frames <= context_size:
        windows.append(list(range(num_frames)))
        return windows

    # Cap the number of spacings by how many doublings of context_size fit in the clip.
    context_stride = min(context_stride, int(np.ceil(np.log2(num_frames / context_size))) + 1)

    for context_step in 1 << np.arange(context_stride):
        pad = int(round(num_frames * ordered_halving(step)))
        for j in range(
            int(ordered_halving(step) * context_step) + pad,
            num_frames + pad + (0 if closed_loop else -context_overlap),
            (context_size * context_step - context_overlap),
        ):
            windows.append([e % num_frames for e in range(j, j + context_size * context_step, context_step)])

    # now that windows are created, shift any windows that loop, and delete duplicate windows
    delete_idxs = []
    win_i = 0
    # A while loop is used because windows may be inserted during the walk, which
    # changes len(windows) between iterations.
    while win_i < len(windows):
        # if the window rolls over itself, it needs to be shifted
        is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames)
        if is_roll:
            roll_val = windows[win_i][roll_idx]  # roll_val might not be 0 for windows of higher strides
            shift_window_to_end(windows[win_i], num_frames=num_frames)
            # check if next window (cyclical) is missing roll_val
            if roll_val not in windows[(win_i+1) % len(windows)]:
                # need to insert new window here - just insert window starting at roll_val
                windows.insert(win_i+1, list(range(roll_val, roll_val + context_size)))
        # delete window if it's not unique (only compared against earlier windows)
        for pre_i in range(0, win_i):
            if windows[win_i] == windows[pre_i]:
                delete_idxs.append(win_i)
                break
        win_i += 1

    # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation
    delete_idxs.reverse()
    for i in delete_idxs:
        windows.pop(i)
    return windows
122
+
123
def static_standard(
    step: int = ...,
    num_steps: Optional[int] = None,
    num_frames: int = ...,
    context_size: Optional[int] = None,
    context_stride: int = 3,
    context_overlap: int = 4,
    closed_loop: bool = True,
):
    """Return a fixed set of contiguous windows that does not depend on ``step``.

    Windows of ``context_size`` consecutive frames start every
    ``context_size - context_overlap`` frames. The first window that would reach or
    pass the end of the clip is moved back so it ends exactly on the last frame, and
    it is the final window returned.

    Args:
        step: Not used by this function.
        num_steps: Not used by this function.
        num_frames: Total number of frames.
        context_size: Number of frames per window.
        context_stride: Not used by this function.
        context_overlap: Number of frames shared by consecutive windows.
        closed_loop: Not used by this function.

    Returns:
        A list of windows, each a list of consecutive frame indexes.
    """
    if num_frames <= context_size:
        return [list(range(num_frames))]

    windows = []
    hop = context_size - context_overlap
    for begin in range(0, num_frames, hop):
        if begin + context_size >= num_frames:
            # Last window: pull it back so it keeps the full length and ends on the
            # final frame.
            windows.append(list(range(num_frames - context_size, num_frames)))
            break
        windows.append(list(range(begin, begin + context_size)))
    return windows
148
+
149
def get_context_scheduler(name: str) -> Callable:
    """Look up a context-window scheduler function by its name.

    Args:
        name: One of ``"uniform_looped"``, ``"uniform_standard"`` or ``"static_standard"``.

    Returns:
        The scheduler function with that name.

    Raises:
        ValueError: If ``name`` is not one of the known scheduler names.
    """
    known_schedulers = (
        ("uniform_looped", uniform_looped),
        ("uniform_standard", uniform_standard),
        ("static_standard", static_standard),
    )
    for scheduler_name, scheduler in known_schedulers:
        if name == scheduler_name:
            return scheduler
    raise ValueError(f"Unknown context_overlap policy {name}")
158
+
159
+
160
def get_total_steps(
    scheduler,
    timesteps: List[int],
    num_steps: Optional[int] = None,
    num_frames: int = ...,
    context_size: Optional[int] = None,
    context_stride: int = 3,
    context_overlap: int = 4,
    closed_loop: bool = True,
):
    """Count the context windows a scheduler produces across all timesteps.

    The scheduler is called once per index in ``range(len(timesteps))`` and the number
    of windows it returns (or yields) is summed.

    Args:
        scheduler: Callable taking ``(step, num_steps, num_frames, context_size,
            context_stride, context_overlap, closed_loop)`` and returning an iterable
            of windows.
        timesteps: Only its length is used; the step index is passed to the scheduler.
        num_steps: Forwarded to the scheduler.
        num_frames: Forwarded to the scheduler.
        context_size: Forwarded to the scheduler.
        context_stride: Forwarded to the scheduler.
        context_overlap: Forwarded to the scheduler.
        closed_loop: Forwarded to the scheduler so the count matches the windows that
            are produced when looping is disabled.

    Returns:
        Total number of windows over all steps.
    """
    total = 0
    for i in range(len(timesteps)):
        windows = scheduler(
            i,
            num_steps,
            num_frames,
            context_size,
            context_stride,
            context_overlap,
            # Previously omitted, which made the scheduler always use its own default.
            closed_loop,
        )
        # list() so generator-based schedulers can be counted too.
        total += len(list(windows))
    return total
custom_nodes/ComfyUI-CogVideoXWrapper/custom_cogvideox_transformer_3d.py ADDED
@@ -0,0 +1,779 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The CogVideoX team, Tsinghua University & ZhipuAI and The HuggingFace Team.
2
+ # All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Any, Dict, Optional, Tuple, Union
17
+
18
+ import torch
19
+ from torch import nn
20
+ import torch.nn.functional as F
21
+
22
+ import numpy as np
23
+ from einops import rearrange
24
+
25
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
26
+ from diffusers.utils import logging
27
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
28
+ from diffusers.models.attention import Attention, FeedForward
29
+ from diffusers.models.attention_processor import AttentionProcessor
30
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
31
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
32
+ from diffusers.models.modeling_utils import ModelMixin
33
+ from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero
34
+ from diffusers.loaders import PeftAdapterMixin
35
+ from diffusers.models.embeddings import apply_rotary_emb
36
+ from .embeddings import CogVideoXPatchEmbed
37
+
38
+ from .enhance_a_video.enhance import get_feta_scores
39
+ from .enhance_a_video.globals import is_enhance_enabled, set_num_frames
40
+
41
+
42
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
+
44
# sageattention is an optional dependency: record whether it could be imported so the
# rest of the module can check the flag.
try:
    from sageattention import sageattn
    SAGEATTN_IS_AVAILABLE = True
except Exception:
    # Catch Exception rather than using a bare `except:` so KeyboardInterrupt and
    # SystemExit are not swallowed. This is kept broader than ImportError because an
    # installed-but-broken package may fail with other error types on import.
    SAGEATTN_IS_AVAILABLE = False
49
+
50
+ from comfy.ldm.modules.attention import optimized_attention
51
+
52
+
53
def set_attention_func(attention_mode, heads):
    """Build the attention callable that matches ``attention_mode``.

    Every returned callable has the signature
    ``func(q, k, v, is_causal=False, attn_mask=None)``.

    Args:
        attention_mode: Name of the backend. Handled values are ``"sdpa"``,
            ``"fused_sdpa"``, ``"comfy"``, ``"sageattn"``, ``"fused_sageattn"``,
            ``"sageattn_qk_int8_pv_fp16_cuda"``, ``"sageattn_qk_int8_pv_fp16_triton"``
            and ``"sageattn_qk_int8_pv_fp8_cuda"``.
        heads: Number of attention heads; only used by the ``"comfy"`` backend.

    Returns:
        The attention callable, or ``None`` when ``attention_mode`` is not one of the
        handled values.
    """
    if attention_mode in ("sdpa", "fused_sdpa"):
        def func(q, k, v, is_causal=False, attn_mask=None):
            return F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0, is_causal=is_causal)
        return func

    if attention_mode == "comfy":
        # is_causal is accepted for signature parity but not forwarded to this backend.
        def func(q, k, v, is_causal=False, attn_mask=None):
            return optimized_attention(q, k, v, mask=attn_mask, heads=heads, skip_reshape=True)
        return func

    # The sageattention variants are excluded from torch.compile and cast q and k to
    # match v before the call.
    if attention_mode in ("sageattn", "fused_sageattn"):
        @torch.compiler.disable()
        def func(q, k, v, is_causal=False, attn_mask=None):
            return sageattn(q.to(v), k.to(v), v, is_causal=is_causal, attn_mask=attn_mask)
        return func

    if attention_mode == "sageattn_qk_int8_pv_fp16_cuda":
        from sageattention import sageattn_qk_int8_pv_fp16_cuda
        @torch.compiler.disable()
        def func(q, k, v, is_causal=False, attn_mask=None):
            return sageattn_qk_int8_pv_fp16_cuda(q.to(v), k.to(v), v, is_causal=is_causal, attn_mask=attn_mask, pv_accum_dtype="fp32")
        return func

    if attention_mode == "sageattn_qk_int8_pv_fp16_triton":
        from sageattention import sageattn_qk_int8_pv_fp16_triton
        @torch.compiler.disable()
        def func(q, k, v, is_causal=False, attn_mask=None):
            return sageattn_qk_int8_pv_fp16_triton(q.to(v), k.to(v), v, is_causal=is_causal, attn_mask=attn_mask)
        return func

    if attention_mode == "sageattn_qk_int8_pv_fp8_cuda":
        from sageattention import sageattn_qk_int8_pv_fp8_cuda
        @torch.compiler.disable()
        def func(q, k, v, is_causal=False, attn_mask=None):
            return sageattn_qk_int8_pv_fp8_cuda(q.to(v), k.to(v), v, is_causal=is_causal, attn_mask=attn_mask, pv_accum_dtype="fp32+fp32")
        return func

    # Unhandled mode: fall through and return None, as before.
    return None
86
+
87
+ #for fastercache
88
def fft(tensor):
    """Split a 4-D tensor's 2-D spectrum into low- and high-frequency parts.

    The 2-D FFT is taken over the last two dimensions and shifted with
    ``torch.fft.fftshift``. A circular mask of radius ``min(H, W) // 5`` around the
    centre ``(H // 2, W // 2)`` selects the low frequencies; everything outside the
    circle is treated as high frequency.

    Args:
        tensor: Tensor whose ``size()`` unpacks into four values ``(B, C, H, W)``.

    Returns:
        ``(low_freq_fft, high_freq_fft)``: the shifted spectrum multiplied by the
        circular mask and by its complement; their sum is the full shifted spectrum.
    """
    tensor_fft = torch.fft.fft2(tensor)
    # NOTE(review): fftshift is called without `dim`, so it shifts every dimension,
    # including the first two. Left unchanged because callers may undo it with a
    # matching ifftshift - confirm before restricting it to the spatial dimensions.
    tensor_fft_shifted = torch.fft.fftshift(tensor_fft)
    _, _, H, W = tensor.size()
    radius = min(H, W) // 5

    # Pass indexing="ij" explicitly: it is the layout the code relied on implicitly
    # (Y varies along rows, X along columns) and avoids the deprecation warning
    # emitted when the argument is omitted. The grid is built on the tensor's
    # device so no host-to-device copy of the mask is needed.
    Y, X = torch.meshgrid(
        torch.arange(H, device=tensor.device),
        torch.arange(W, device=tensor.device),
        indexing="ij",
    )
    center_x, center_y = W // 2, H // 2
    mask = (X - center_x) ** 2 + (Y - center_y) ** 2 <= radius ** 2
    # Add two leading singleton dimensions so the (H, W) mask broadcasts over them.
    low_freq_mask = mask.unsqueeze(0).unsqueeze(0)
    high_freq_mask = ~low_freq_mask

    low_freq_fft = tensor_fft_shifted * low_freq_mask
    high_freq_fft = tensor_fft_shifted * high_freq_mask

    return low_freq_fft, high_freq_fft
104
+
105
+ #for teacache
106
def poly1d(coefficients, x):
    """Evaluate a polynomial element-wise on ``x`` and return its absolute value.

    Args:
        coefficients: Coefficients ordered from the highest power down to the constant.
        x: Tensor of evaluation points.

    Returns:
        A tensor shaped like ``x`` holding ``|sum(c_i * x ** p_i)|``.
    """
    degree = len(coefficients) - 1
    total = torch.zeros_like(x)
    # Pair each coefficient with its power, highest power first.
    for coeff, power in zip(coefficients, range(degree, -1, -1)):
        total += coeff * (x ** power)
    return total.abs()
111
+
112
+ #region Attention
113
class CogVideoXAttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
    query and key vectors, but does not include spatial normalization.

    Parameters:
        attn_func (`Callable`):
            Attention kernel called as `attn_func(query, key, value, attn_mask=..., is_causal=False)`.
        attention_mode (`str`, *optional*):
            Name of the attention backend. A mode containing `"fused"` uses the fused `attn.to_qkv` projection, and
            `"comfy"` skips the head-merging reshape after attention. `None` behaves like a non-fused, non-comfy mode.
    """

    def __init__(self, attn_func, attention_mode: Optional[str] = None):
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("CogVideoXAttnProcessor requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
        self.attention_mode = attention_mode
        self.attn_func = attn_func

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run joint attention over the concatenated text and video tokens.

        Args:
            attn: Attention module providing the projections, norms and output layers.
            hidden_states: Video tokens; concatenated after the text tokens along dim 1.
            encoder_hidden_states: Text tokens; their length along dim 1 determines
                where the sequence is split again after attention.
            attention_mask: Optional mask, prepared with `attn.prepare_attention_mask`.
            image_rotary_emb: Optional rotary embedding applied to the non-text part of
                the query (and of the key unless `attn.is_cross_attention`).

        Returns:
            `(hidden_states, encoder_hidden_states)` after attention and output projection.
        """
        text_seq_length = encoder_hidden_states.size(1)

        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        # encoder_hidden_states has already been dereferenced above, so it cannot be
        # None here; read its shape directly.
        batch_size, sequence_length, _ = encoder_hidden_states.shape

        if attention_mask is not None:
            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])

        # Match the projection weights when they are stored in half precision.
        if attn.to_q.weight.dtype == torch.float16 or attn.to_q.weight.dtype == torch.bfloat16:
            hidden_states = hidden_states.to(attn.to_q.weight.dtype)

        # Treat a missing mode (the constructor default of None) as non-fused instead
        # of failing with a TypeError on `"fused" in None`.
        if "fused" not in (self.attention_mode or ""):
            query = attn.to_q(hidden_states)
            key = attn.to_k(hidden_states)
            value = attn.to_v(hidden_states)
        else:
            # Single fused projection; split its last dimension into three equal parts.
            qkv = attn.to_qkv(hidden_states)
            split_size = qkv.shape[-1] // 3
            query, key, value = torch.split(qkv, split_size, dim=-1)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads

        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        # Apply RoPE if needed (only to the tokens after the text prefix)
        if image_rotary_emb is not None:
            query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
            if not attn.is_cross_attention:
                key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)

        # feta: scores are computed before attention and applied to the output below
        if is_enhance_enabled():
            feta_scores = get_feta_scores(attn, query, key, head_dim, text_seq_length)

        hidden_states = self.attn_func(query, key, value, attn_mask=attention_mask, is_causal=False)

        # Every backend except "comfy" returns per-head output that must be merged back.
        if self.attention_mode != "comfy":
            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)

        # linear proj
        hidden_states = attn.to_out[0](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

        # Undo the concatenation: text tokens first, video tokens after.
        encoder_hidden_states, hidden_states = hidden_states.split(
            [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
        )

        if is_enhance_enabled():
            hidden_states *= feta_scores

        return hidden_states, encoder_hidden_states
196
+
197
+ #region Blocks
198
@maybe_allow_in_graph
class CogVideoXBlock(nn.Module):

    r"""
    Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.

    Parameters:
        dim (`int`):
            The number of channels in the input and output.
        num_attention_heads (`int`):
            The number of heads to use for multi-head attention.
        attention_head_dim (`int`):
            The number of channels in each head.
        time_embed_dim (`int`):
            The number of channels in timestep embedding.
        dropout (`float`, defaults to `0.0`):
            The dropout probability to use.
        activation_fn (`str`, defaults to `"gelu-approximate"`):
            Activation function to be used in feed-forward.
        attention_bias (`bool`, defaults to `False`):
            Whether or not to use bias in attention projection layers.
        qk_norm (`bool`, defaults to `True`):
            Whether or not to use normalization after query and key projections in Attention.
        norm_elementwise_affine (`bool`, defaults to `True`):
            Whether to use learnable elementwise affine parameters for normalization.
        norm_eps (`float`, defaults to `1e-5`):
            Epsilon value for normalization layers.
        final_dropout (`bool` defaults to `True`):
            Whether to apply a final dropout after the last feed-forward layer.
        ff_inner_dim (`int`, *optional*, defaults to `None`):
            Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
        ff_bias (`bool`, defaults to `True`):
            Whether or not to use bias in Feed-forward layer.
        attention_out_bias (`bool`, defaults to `True`):
            Whether or not to use bias in Attention output projection layer.
        attention_mode (`str`, *optional*, defaults to `"sdpa"`):
            Name of the attention backend; passed to `set_attention_func` and to the attention processor.
    """

    def __init__(
        self,
        dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
        time_embed_dim: int,
        dropout: float = 0.0,
        activation_fn: str = "gelu-approximate",
        attention_bias: bool = False,
        qk_norm: bool = True,
        norm_elementwise_affine: bool = True,
        norm_eps: float = 1e-5,
        final_dropout: bool = True,
        ff_inner_dim: Optional[int] = None,
        ff_bias: bool = True,
        attention_out_bias: bool = True,
        attention_mode: Optional[str] = "sdpa",
    ):
        super().__init__()

        # 1. Self Attention
        self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        # Attention kernel selected once at construction time for this block.
        attn_func = set_attention_func(attention_mode, num_attention_heads)

        self.attn1 = Attention(
            query_dim=dim,
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            qk_norm="layer_norm" if qk_norm else None,
            eps=1e-6,
            bias=attention_bias,
            out_bias=attention_out_bias,
            processor=CogVideoXAttnProcessor2_0(attn_func, attention_mode=attention_mode),
        )

        # 2. Feed Forward
        self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)

        self.ff = FeedForward(
            dim,
            dropout=dropout,
            activation_fn=activation_fn,
            final_dropout=final_dropout,
            inner_dim=ff_inner_dim,
            bias=ff_bias,
        )
        # FasterCache state: filled by forward() with two attention outputs each
        # (video tokens and text tokens respectively) once caching starts.
        self.cached_hidden_states = []
        self.cached_encoder_hidden_states = []

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        video_flow_feature: Optional[torch.Tensor] = None,
        fuser=None,
        block_use_fastercache=False,
        fastercache_counter=0,
        fastercache_start_step=15,
        fastercache_device="cuda:0",
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Apply gated self-attention and feed-forward to video and text tokens.

        Args:
            hidden_states: Video tokens.
            encoder_hidden_states: Text tokens; their length along dim 1 is used to
                split the feed-forward output.
            temb: Timestep embedding passed to both modulated norm layers.
            image_rotary_emb: Optional rotary embedding forwarded to the attention layer.
            video_flow_feature: Optional Tora motion feature; its last two dimensions
                are read as H and W. When given, `fuser` must be provided.
            fuser: Callable invoked as `fuser(h, video_flow_feature, T=T)` on the
                normalized video tokens.
            block_use_fastercache: Enables the FasterCache path for this block.
            fastercache_counter: Current step counter used by the caching schedule.
            fastercache_start_step: Counter value at which the cache is first filled.
            fastercache_device: Device on which cached attention outputs are stored.

        Returns:
            `(hidden_states, encoder_hidden_states)` after the block.
        """
        #print("hidden_states in block: ", hidden_states.shape)  #1.5: torch.Size([2, 3200, 3072]) 10.: torch.Size([2, 6400, 3072])
        text_seq_length = encoder_hidden_states.size(1)

        # norm & modulate
        norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
            hidden_states, encoder_hidden_states, temb
        )
        #print("norm_hidden_states in block: ", norm_hidden_states.shape) #torch.Size([2, 3200, 3072])

        # Tora Motion-guidance Fuser
        if video_flow_feature is not None:
            H, W = video_flow_feature.shape[-2:]
            # Number of frames implied by the token count and the spatial size.
            T = norm_hidden_states.shape[1] // H // W
            # Unflatten tokens into per-frame feature maps for the fuser, then flatten back.
            h = rearrange(norm_hidden_states, "B (T H W) C -> (B T) C H W", H=H, W=W)
            h = fuser(h, video_flow_feature.to(h), T=T)
            norm_hidden_states = rearrange(h, "(B T) C H W -> B (T H W) C", T=T)
            del h, fuser

        #region fastercache
        if block_use_fastercache:
            B = norm_hidden_states.shape[0]
            # Skip attention on steps at least 3 past the start step whose counter is
            # not a multiple of 3, provided the cache holds at least B batch entries.
            if fastercache_counter >= fastercache_start_step + 3 and fastercache_counter%3!=0 and self.cached_hidden_states[-1].shape[0] >= B:
                # Linear extrapolation from the two cached outputs: newest + 0.3 * (newest - oldest).
                attn_hidden_states = (
                    self.cached_hidden_states[1][:B] +
                    (self.cached_hidden_states[1][:B] - self.cached_hidden_states[0][:B])
                    * 0.3
                ).to(norm_hidden_states.device, non_blocking=True)
                attn_encoder_hidden_states = (
                    self.cached_encoder_hidden_states[1][:B] +
                    (self.cached_encoder_hidden_states[1][:B] - self.cached_encoder_hidden_states[0][:B])
                    * 0.3
                ).to(norm_hidden_states.device, non_blocking=True)
            else:
                attn_hidden_states, attn_encoder_hidden_states = self.attn1(
                    hidden_states=norm_hidden_states,
                    encoder_hidden_states=norm_encoder_hidden_states,
                    image_rotary_emb=image_rotary_emb,
                )
                if fastercache_counter == fastercache_start_step:
                    # First cached step: both slots start from the same attention output.
                    self.cached_hidden_states = [attn_hidden_states.to(fastercache_device), attn_hidden_states.to(fastercache_device)]
                    self.cached_encoder_hidden_states = [attn_encoder_hidden_states.to(fastercache_device), attn_encoder_hidden_states.to(fastercache_device)]
                elif fastercache_counter > fastercache_start_step:
                    # Later computed steps overwrite only the last slot in place.
                    # NOTE(review): slot 0 is not updated anywhere in this method, so it
                    # keeps the start-step output - confirm this is intended.
                    self.cached_hidden_states[-1].copy_(attn_hidden_states.to(fastercache_device))
                    self.cached_encoder_hidden_states[-1].copy_(attn_encoder_hidden_states.to(fastercache_device))
        else:
            attn_hidden_states, attn_encoder_hidden_states = self.attn1(
                hidden_states=norm_hidden_states,
                encoder_hidden_states=norm_encoder_hidden_states,
                image_rotary_emb=image_rotary_emb
            )

        # Gated residual connection around attention.
        hidden_states = hidden_states + gate_msa * attn_hidden_states
        encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states

        # norm & modulate

        norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
            hidden_states, encoder_hidden_states, temb
        )

        # feed-forward (text tokens first, then video tokens, in one pass)
        norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
        ff_output = self.ff(norm_hidden_states)

        # Gated residual connection around the feed-forward, split back by text length.
        hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
        encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]

        return hidden_states, encoder_hidden_states
366
+
367
+ #region Transformer
368
+ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
369
+ """
370
+ A Transformer model for video-like data in [CogVideoX](https://github.com/THUDM/CogVideo).
371
+
372
+ Parameters:
373
+ num_attention_heads (`int`, defaults to `30`):
374
+ The number of heads to use for multi-head attention.
375
+ attention_head_dim (`int`, defaults to `64`):
376
+ The number of channels in each head.
377
+ in_channels (`int`, defaults to `16`):
378
+ The number of channels in the input.
379
+ out_channels (`int`, *optional*, defaults to `16`):
380
+ The number of channels in the output.
381
+ flip_sin_to_cos (`bool`, defaults to `True`):
382
+ Whether to flip the sin to cos in the time embedding.
383
+ time_embed_dim (`int`, defaults to `512`):
384
+ Output dimension of timestep embeddings.
385
+ text_embed_dim (`int`, defaults to `4096`):
386
+ Input dimension of text embeddings from the text encoder.
387
+ num_layers (`int`, defaults to `30`):
388
+ The number of layers of Transformer blocks to use.
389
+ dropout (`float`, defaults to `0.0`):
390
+ The dropout probability to use.
391
+ attention_bias (`bool`, defaults to `True`):
392
+ Whether or not to use bias in the attention projection layers.
393
+ sample_width (`int`, defaults to `90`):
394
+ The width of the input latents.
395
+ sample_height (`int`, defaults to `60`):
396
+ The height of the input latents.
397
+ sample_frames (`int`, defaults to `49`):
398
+ The number of frames in the input latents. Note that this parameter was incorrectly initialized to 49
399
+ instead of 13 because CogVideoX processed 13 latent frames at once in its default and recommended settings,
400
+ but cannot be changed to the correct value to ensure backwards compatibility. To create a transformer with
401
+ K latent frames, the correct value to pass here would be: ((K - 1) * temporal_compression_ratio + 1).
402
+ patch_size (`int`, defaults to `2`):
403
+ The size of the patches to use in the patch embedding layer.
404
+ temporal_compression_ratio (`int`, defaults to `4`):
405
+ The compression ratio across the temporal dimension. See documentation for `sample_frames`.
406
+ max_text_seq_length (`int`, defaults to `226`):
407
+ The maximum sequence length of the input text embeddings.
408
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
409
+ Activation function to use in feed-forward.
410
+ timestep_activation_fn (`str`, defaults to `"silu"`):
411
+ Activation function to use when generating the timestep embeddings.
412
+ norm_elementwise_affine (`bool`, defaults to `True`):
413
+ Whether or not to use elementwise affine in normalization layers.
414
+ norm_eps (`float`, defaults to `1e-5`):
415
+ The epsilon value to use in normalization layers.
416
+ spatial_interpolation_scale (`float`, defaults to `1.875`):
417
+ Scaling factor to apply in 3D positional embeddings across spatial dimensions.
418
+ temporal_interpolation_scale (`float`, defaults to `1.0`):
419
+ Scaling factor to apply in 3D positional embeddings across temporal dimensions.
420
+ """
421
+
422
+ _supports_gradient_checkpointing = True
423
+
424
@register_to_config
def __init__(
    self,
    num_attention_heads: int = 30,
    attention_head_dim: int = 64,
    in_channels: int = 16,
    out_channels: Optional[int] = 16,
    flip_sin_to_cos: bool = True,
    freq_shift: int = 0,
    time_embed_dim: int = 512,
    ofs_embed_dim: Optional[int] = None,
    text_embed_dim: int = 4096,
    num_layers: int = 30,
    dropout: float = 0.0,
    attention_bias: bool = True,
    sample_width: int = 90,
    sample_height: int = 60,
    sample_frames: int = 49,
    patch_size: int = 2,
    patch_size_t: Optional[int] = None,
    temporal_compression_ratio: int = 4,
    max_text_seq_length: int = 226,
    activation_fn: str = "gelu-approximate",
    timestep_activation_fn: str = "silu",
    norm_elementwise_affine: bool = True,
    norm_eps: float = 1e-5,
    spatial_interpolation_scale: float = 1.875,
    temporal_interpolation_scale: float = 1.0,
    use_rotary_positional_embeddings: bool = False,
    use_learned_positional_embeddings: bool = False,
    patch_bias: bool = True,
    attention_mode: Optional[str] = "sdpa",
):
    """
    Build the transformer: patch/text embedding, timestep (and optional `ofs`) embeddings, the stack of
    `CogVideoXBlock`s, the output norm/projection, and the runtime state used by the Tora, FasterCache
    and TeaCache code paths in `forward`.

    The keyword arguments are recorded in `self.config` by `@register_to_config`.

    Raises:
        ValueError: if learned positional embeddings are requested without rotary embeddings.
    """
    super().__init__()
    inner_dim = num_attention_heads * attention_head_dim

    if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
        raise ValueError(
            "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
            "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
            "issue at https://github.com/huggingface/diffusers/issues."
        )

    # 1. Patch embedding
    self.patch_embed = CogVideoXPatchEmbed(
        patch_size=patch_size,
        patch_size_t=patch_size_t,
        in_channels=in_channels,
        embed_dim=inner_dim,
        text_embed_dim=text_embed_dim,
        bias=patch_bias,
        sample_width=sample_width,
        sample_height=sample_height,
        sample_frames=sample_frames,
        temporal_compression_ratio=temporal_compression_ratio,
        max_text_seq_length=max_text_seq_length,
        spatial_interpolation_scale=spatial_interpolation_scale,
        temporal_interpolation_scale=temporal_interpolation_scale,
        # Sin-cos positional embeddings are only added when rotary embeddings are not used.
        use_positional_embeddings=not use_rotary_positional_embeddings,
        use_learned_positional_embeddings=use_learned_positional_embeddings,
    )
    self.embedding_dropout = nn.Dropout(dropout)

    # 2. Time embeddings
    self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
    self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)

    # Optional `ofs` embedding; built the same way as the time embedding and only created
    # when `ofs_embed_dim` is truthy. `forward` checks `self.ofs_embedding is not None`.
    self.ofs_proj = None
    self.ofs_embedding = None

    if ofs_embed_dim:
        self.ofs_proj = Timesteps(ofs_embed_dim, flip_sin_to_cos, freq_shift)
        self.ofs_embedding = TimestepEmbedding(ofs_embed_dim, ofs_embed_dim, timestep_activation_fn)

    # 3. Define spatio-temporal transformers blocks
    self.transformer_blocks = nn.ModuleList(
        [
            CogVideoXBlock(
                dim=inner_dim,
                num_attention_heads=num_attention_heads,
                attention_head_dim=attention_head_dim,
                time_embed_dim=time_embed_dim,
                dropout=dropout,
                activation_fn=activation_fn,
                attention_bias=attention_bias,
                attention_mode=attention_mode,
                norm_elementwise_affine=norm_elementwise_affine,
                norm_eps=norm_eps,
            )
            for _ in range(num_layers)
        ]
    )
    self.norm_final = nn.LayerNorm(inner_dim, norm_eps, norm_elementwise_affine)

    # 4. Output blocks
    self.norm_out = AdaLayerNorm(
        embedding_dim=time_embed_dim,
        output_dim=2 * inner_dim,
        norm_elementwise_affine=norm_elementwise_affine,
        norm_eps=norm_eps,
        chunk_dim=1,
    )
    if patch_size_t is None:
        # For CogVideoX 1.0
        output_dim = patch_size * patch_size * out_channels
    else:
        # For CogVideoX 1.5
        output_dim = patch_size * patch_size * patch_size_t * out_channels

    self.proj_out = nn.Linear(inner_dim, output_dim)

    self.gradient_checkpointing = False

    self.attention_mode = attention_mode

    # tora: per-block fuser modules; `forward` passes `fuser_list[i]` to block i when this is set.
    self.fuser_list = None

    # fastercache: step counter and thresholds read by `forward`.
    self.use_fastercache = False
    self.fastercache_counter = 0
    self.fastercache_start_step = 15
    self.fastercache_lf_step = 40
    self.fastercache_hf_step = 30
    self.fastercache_device = "cuda"
    self.fastercache_num_blocks_to_cache = len(self.transformer_blocks)

    # teacache
    self.use_teacache = False
    self.teacache_rel_l1_thresh = 0.0
    # `forward` increments this counter every time it skips the transformer blocks; initialise it
    # here so the first skip cannot fail with AttributeError when no caller has set it.
    self.teacache_counter = 0
    if not self.config.use_rotary_positional_embeddings:
        # CogVideoX-2B
        self.teacache_coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
    else:
        # CogVideoX-5B
        self.teacache_coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
560
+
561
+
562
+ def _set_gradient_checkpointing(self, module, value=False):
563
+ self.gradient_checkpointing = value
564
+ #region forward
565
def forward(
    self,
    hidden_states: torch.Tensor,
    encoder_hidden_states: torch.Tensor,
    timestep: Union[int, float, torch.LongTensor],
    timestep_cond: Optional[torch.Tensor] = None,
    ofs: Optional[Union[int, float, torch.LongTensor]] = None,
    image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
    controlnet_states: torch.Tensor = None,
    controlnet_weights: Optional[Union[float, int, list, np.ndarray, torch.FloatTensor]] = 1.0,
    video_flow_features: Optional[torch.Tensor] = None,
    return_dict: bool = True,
):
    """
    Run the transformer on a batch of video latents.

    Args:
        hidden_states: latents, unpacked as (batch_size, num_frames, channels, height, width).
        encoder_hidden_states: text embeddings; dimension 1 is used as the text sequence length.
        timestep: value passed to `self.time_proj`.
        timestep_cond: optional conditioning forwarded to `self.time_embedding`.
        ofs: value passed to `self.ofs_proj`; only used when `self.ofs_embedding` is not None.
        image_rotary_emb: rotary embeddings forwarded unchanged to every block.
        controlnet_states: optional per-block residuals; `controlnet_states[i]` is added after block i.
        controlnet_weights: scalar applied to every residual, or an indexable (list / ndarray / tensor)
            giving one weight per block. Any other type falls back to a weight of 1.0.
        video_flow_features: optional per-block features forwarded to each block together with
            `self.fuser_list[i]`.
        return_dict: when False, return a 1-tuple instead of `Transformer2DModelOutput`.

    Returns:
        `Transformer2DModelOutput(sample=output)` or `(output,)`.

    Two caching paths are implemented:
      * FasterCache: on selected steps only the first batch element is run through the blocks and the
        second is reconstructed from frequency-domain deltas stored on a previous full step
        (assumes the batch is laid out as [cond, uncond], as the variable names suggest - TODO confirm).
      * TeaCache: the blocks are skipped and the previous residual is re-applied while the accumulated
        relative L1 change of the timestep embedding stays below `self.teacache_rel_l1_thresh`.
    """
    batch_size, num_frames, channels, height, width = hidden_states.shape

    set_num_frames(num_frames)  # enhance-a-video global

    # 1. Time embedding
    timesteps = timestep
    t_emb = self.time_proj(timesteps)

    # timesteps does not contain any weights and will always return f32 tensors
    # but time_embedding might actually be running in fp16. so we need to cast here.
    # there might be better ways to encapsulate this.
    t_emb = t_emb.to(dtype=hidden_states.dtype)

    emb = self.time_embedding(t_emb, timestep_cond)
    if self.ofs_embedding is not None:  # 1.5 I2V
        ofs_emb = self.ofs_proj(ofs)
        ofs_emb = ofs_emb.to(dtype=hidden_states.dtype)
        ofs_emb = self.ofs_embedding(ofs_emb)
        emb = emb + ofs_emb

    # 2. Patch embedding
    p = self.config.patch_size
    p_t = self.config.patch_size_t

    # Shapes noted by the original author:
    #   before patch embedding: [2, 4, 16, 60, 90]
    #   after patch embedding:  1.5: [2, 2926, 3072]   1.0: [2, 5626, 3072]
    hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
    hidden_states = self.embedding_dropout(hidden_states)

    # The patch embedding returns text tokens followed by video tokens; split them again.
    text_seq_length = encoder_hidden_states.shape[1]
    encoder_hidden_states = hidden_states[:, :text_seq_length]
    hidden_states = hidden_states[:, text_seq_length:]
    # after split (author's note): 1.5: [2, 2700, 3072]   1.0: [2, 5400, 3072]

    if self.use_fastercache:
        self.fastercache_counter += 1
    if self.fastercache_counter >= self.fastercache_start_step + 3 and self.fastercache_counter % 5 != 0:
        # ---- FasterCache step: compute the first batch element only ----
        # 3. Transformer blocks
        for i, block in enumerate(self.transformer_blocks):
            hidden_states, encoder_hidden_states = block(
                hidden_states=hidden_states[:1],
                encoder_hidden_states=encoder_hidden_states[:1],
                temb=emb[:1],
                image_rotary_emb=image_rotary_emb,
                video_flow_feature=video_flow_features[i][:1] if video_flow_features is not None else None,
                fuser=self.fuser_list[i] if self.fuser_list is not None else None,
                # Blocks 0 .. num_blocks_to_cache-1 use the cache (strict `<`, so exactly that many).
                block_use_fastercache=i < self.fastercache_num_blocks_to_cache,
                fastercache_counter=self.fastercache_counter,
                fastercache_start_step=self.fastercache_start_step,
                fastercache_device=self.fastercache_device,
            )

            if (controlnet_states is not None) and (i < len(controlnet_states)):
                controlnet_states_block = controlnet_states[i]
                controlnet_block_weight = 1.0
                if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights):
                    controlnet_block_weight = controlnet_weights[i]
                elif isinstance(controlnet_weights, (float, int)):
                    controlnet_block_weight = controlnet_weights

                hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight

        if not self.config.use_rotary_positional_embeddings:
            # CogVideoX-2B
            hidden_states = self.norm_final(hidden_states)
        else:
            # CogVideoX-5B: normalise text and video tokens together, then drop the text tokens.
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
            hidden_states = self.norm_final(hidden_states)
            hidden_states = hidden_states[:, text_seq_length:]

        # 4. Final block
        hidden_states = self.norm_out(hidden_states, temb=emb[:1])
        hidden_states = self.proj_out(hidden_states)

        # 5. Unpatchify
        # Note: we use `-1` instead of `channels`:
        #   - It is okay to use `channels` for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
        #   - However, CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
        # The batch dimension is 1 here because only the first batch element was computed.
        if p_t is None:
            output = hidden_states.reshape(1, num_frames, height // p, width // p, -1, p, p)
            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
        else:
            output = hidden_states.reshape(
                1, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
            )
            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)

        # Rebuild the second batch element from the computed one plus the stored frequency deltas.
        (bb, tt, cc, hh, ww) = output.shape
        cond = rearrange(output, "B T C H W -> (B T) C H W", B=bb, C=cc, T=tt, H=hh, W=ww)
        lf_c, hf_c = fft(cond.float())
        # The stored deltas are scaled by 1.1 on each cached step inside these step windows.
        if self.fastercache_counter <= self.fastercache_lf_step:
            self.delta_lf = self.delta_lf * 1.1
        if self.fastercache_counter >= self.fastercache_hf_step:
            self.delta_hf = self.delta_hf * 1.1

        new_hf_uc = self.delta_hf + hf_c
        new_lf_uc = self.delta_lf + lf_c

        combine_uc = new_lf_uc + new_hf_uc
        combined_fft = torch.fft.ifftshift(combine_uc)
        recovered_uncond = torch.fft.ifft2(combined_fft).real
        recovered_uncond = rearrange(
            recovered_uncond.to(output.dtype), "(B T) C H W -> B T C H W", B=bb, C=cc, T=tt, H=hh, W=ww
        )
        output = torch.cat([output, recovered_uncond])
    else:
        # ---- Full step (optionally short-circuited by TeaCache) ----
        if self.use_teacache:
            if not hasattr(self, 'accumulated_rel_l1_distance'):
                # First call: nothing cached yet, so the blocks must run.
                should_calc = True
                self.accumulated_rel_l1_distance = 0
            else:
                # Accumulate a polynomial of the relative L1 change of the timestep embedding.
                self.accumulated_rel_l1_distance += poly1d(
                    self.teacache_coefficients,
                    ((emb - self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()),
                )
                if self.accumulated_rel_l1_distance < self.teacache_rel_l1_thresh:
                    should_calc = False
                    self.teacache_counter += 1
                else:
                    should_calc = True
                    self.accumulated_rel_l1_distance = 0
            self.previous_modulated_input = emb
            if not should_calc:
                # Skip the blocks and re-apply the residual recorded on the last computed step.
                hidden_states += self.previous_residual
                encoder_hidden_states += self.previous_residual_encoder

        if not self.use_teacache or (self.use_teacache and should_calc):
            if self.use_teacache:
                ori_hidden_states = hidden_states.clone()
                ori_encoder_hidden_states = encoder_hidden_states.clone()
            for i, block in enumerate(self.transformer_blocks):
                hidden_states, encoder_hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=emb,
                    image_rotary_emb=image_rotary_emb,
                    video_flow_feature=video_flow_features[i] if video_flow_features is not None else None,
                    fuser=self.fuser_list[i] if self.fuser_list is not None else None,
                    # Blocks 0 .. num_blocks_to_cache-1 use the cache (strict `<`, so exactly that many).
                    block_use_fastercache=i < self.fastercache_num_blocks_to_cache,
                    fastercache_counter=self.fastercache_counter,
                    fastercache_start_step=self.fastercache_start_step,
                    fastercache_device=self.fastercache_device,
                )

                # controlnet
                if (controlnet_states is not None) and (i < len(controlnet_states)):
                    controlnet_states_block = controlnet_states[i]
                    controlnet_block_weight = 1.0
                    if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights):
                        controlnet_block_weight = controlnet_weights[i]
                    elif isinstance(controlnet_weights, (float, int)):
                        controlnet_block_weight = controlnet_weights
                    hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight

            if self.use_teacache:
                # Record what the blocks added so a later skipped step can reuse it.
                self.previous_residual = hidden_states - ori_hidden_states
                self.previous_residual_encoder = encoder_hidden_states - ori_encoder_hidden_states

        if not self.config.use_rotary_positional_embeddings:
            # CogVideoX-2B
            hidden_states = self.norm_final(hidden_states)
        else:
            # CogVideoX-5B: normalise text and video tokens together, then drop the text tokens.
            hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
            hidden_states = self.norm_final(hidden_states)
            hidden_states = hidden_states[:, text_seq_length:]

        # 4. Final block
        hidden_states = self.norm_out(hidden_states, temb=emb)
        hidden_states = self.proj_out(hidden_states)

        # 5. Unpatchify
        # Note: we use `-1` instead of `channels`:
        #   - It is okay to use `channels` for CogVideoX-2b and CogVideoX-5b (number of input channels is equal to output channels)
        #   - However, CogVideoX-5b-I2V also takes concatenated input image latents (number of input channels is twice the output channels)
        if p_t is None:
            output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
            output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
        else:
            output = hidden_states.reshape(
                batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
            )
            output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)

        if self.fastercache_counter >= self.fastercache_start_step + 1:
            # Store the frequency-domain difference between batch elements 1 and 0 so that
            # later FasterCache steps can reconstruct element 1 from element 0.
            (bb, tt, cc, hh, ww) = output.shape
            cond = rearrange(output[0:1].float(), "B T C H W -> (B T) C H W", B=bb//2, C=cc, T=tt, H=hh, W=ww)
            uncond = rearrange(output[1:2].float(), "B T C H W -> (B T) C H W", B=bb//2, C=cc, T=tt, H=hh, W=ww)

            lf_c, hf_c = fft(cond)
            lf_uc, hf_uc = fft(uncond)

            self.delta_lf = lf_uc - lf_c
            self.delta_hf = hf_uc - hf_c

    if not return_dict:
        return (output,)
    return Transformer2DModelOutput(sample=output)
779
+
custom_nodes/ComfyUI-CogVideoXWrapper/embeddings.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import numpy as np
4
+ from typing import Tuple, Union, Optional
5
+ from diffusers.models.embeddings import get_3d_sincos_pos_embed, get_1d_rotary_pos_embed
6
+
7
+
8
class CogVideoXPatchEmbed(nn.Module):
    """
    Projects text embeddings and video latents into one joint token sequence (text tokens first,
    then video patch tokens) and optionally adds sin-cos positional embeddings to it.

    With `patch_size_t=None` (CogVideoX 1.0) each frame is patchified by a strided `nn.Conv2d`;
    otherwise (CogVideoX 1.5) spatio-temporal patches are flattened and projected by an `nn.Linear`.
    """

    def __init__(
        self,
        patch_size: int = 2,
        patch_size_t: Optional[int] = None,
        in_channels: int = 16,
        embed_dim: int = 1920,
        text_embed_dim: int = 4096,
        bias: bool = True,
        sample_width: int = 90,
        sample_height: int = 60,
        sample_frames: int = 49,
        temporal_compression_ratio: int = 4,
        max_text_seq_length: int = 226,
        spatial_interpolation_scale: float = 1.875,
        temporal_interpolation_scale: float = 1.0,
        use_positional_embeddings: bool = True,
        use_learned_positional_embeddings: bool = True,
    ) -> None:
        super().__init__()

        self.patch_size = patch_size
        self.patch_size_t = patch_size_t
        self.embed_dim = embed_dim
        self.sample_height = sample_height
        self.sample_width = sample_width
        self.sample_frames = sample_frames
        self.temporal_compression_ratio = temporal_compression_ratio
        self.max_text_seq_length = max_text_seq_length
        self.spatial_interpolation_scale = spatial_interpolation_scale
        self.temporal_interpolation_scale = temporal_interpolation_scale
        self.use_positional_embeddings = use_positional_embeddings
        self.use_learned_positional_embeddings = use_learned_positional_embeddings

        if patch_size_t is None:
            # CogVideoX 1.0 checkpoints
            self.proj = nn.Conv2d(
                in_channels, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias
            )
        else:
            # CogVideoX 1.5 checkpoints
            self.proj = nn.Linear(in_channels * patch_size * patch_size * patch_size_t, embed_dim)

        self.text_proj = nn.Linear(text_embed_dim, embed_dim)

        if use_positional_embeddings or use_learned_positional_embeddings:
            # The buffer is only saved with the state dict when the embeddings are "learned".
            persistent = use_learned_positional_embeddings
            pos_embedding = self._get_positional_embeddings(sample_height, sample_width, sample_frames)
            self.register_buffer("pos_embedding", pos_embedding, persistent=persistent)

    def _get_positional_embeddings(self, sample_height: int, sample_width: int, sample_frames: int) -> torch.Tensor:
        """
        Build the joint positional embedding of shape
        (1, max_text_seq_length + num_patches, embed_dim): zeros for the text positions followed by
        3D sin-cos embeddings for the video patches.
        """
        post_patch_height = sample_height // self.patch_size
        post_patch_width = sample_width // self.patch_size
        post_time_compression_frames = (sample_frames - 1) // self.temporal_compression_ratio + 1
        num_patches = post_patch_height * post_patch_width * post_time_compression_frames

        pos_embedding = get_3d_sincos_pos_embed(
            self.embed_dim,
            (post_patch_width, post_patch_height),
            post_time_compression_frames,
            self.spatial_interpolation_scale,
            self.temporal_interpolation_scale,
        )
        pos_embedding = torch.from_numpy(pos_embedding).flatten(0, 1)
        joint_pos_embedding = torch.zeros(
            1, self.max_text_seq_length + num_patches, self.embed_dim, requires_grad=False
        )
        # Text positions stay zero; only the video positions receive sin-cos values.
        joint_pos_embedding.data[:, self.max_text_seq_length :].copy_(pos_embedding)

        return joint_pos_embedding

    def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor):
        r"""
        Args:
            text_embeds (`torch.Tensor`):
                Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
            image_embeds (`torch.Tensor`):
                Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).

        Returns:
            `torch.Tensor`: text tokens followed by video patch tokens, concatenated along dimension 1.

        Raises:
            ValueError: if learned positional embeddings are enabled and the input height/width differ
                from `sample_height`/`sample_width`.
        """
        text_embeds = self.text_proj(text_embeds)

        batch_size, num_frames, channels, height, width = image_embeds.shape

        if self.patch_size_t is None:
            # Fold frames into the batch so the 2D conv patchifies every frame independently.
            image_embeds = image_embeds.reshape(-1, channels, height, width)
            image_embeds = self.proj(image_embeds)
            image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
            image_embeds = image_embeds.flatten(3).transpose(2, 3)  # [batch, num_frames, height x width, channels]
            image_embeds = image_embeds.flatten(1, 2)  # [batch, num_frames x height x width, channels]
        else:
            p = self.patch_size
            p_t = self.patch_size_t

            # Channels last, then cut into (p_t, p, p) patches and flatten each patch for the linear layer.
            image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
            image_embeds = image_embeds.reshape(
                batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
            )
            image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
            image_embeds = self.proj(image_embeds)

        embeds = torch.cat(
            [text_embeds, image_embeds], dim=1
        ).contiguous()  # [batch, seq_length + num_frames x height x width, channels]

        if self.use_positional_embeddings or self.use_learned_positional_embeddings:
            if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height):
                raise ValueError(
                    "It is currently not possible to generate videos at a different resolution than the defaults. "
                    "This should only be the case with 'THUDM/CogVideoX-5b-I2V'. "
                    "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
                )

            pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1

            if (
                self.sample_height != height
                or self.sample_width != width
                or self.sample_frames != pre_time_compression_frames
            ):
                # Input size differs from the configured sample size: rebuild the embeddings on the fly.
                pos_embedding = self._get_positional_embeddings(height, width, pre_time_compression_frames)
                pos_embedding = pos_embedding.to(embeds.device, dtype=embeds.dtype)
            else:
                pos_embedding = self.pos_embedding

            embeds = embeds + pos_embedding

        return embeds
134
+
135
def get_3d_rotary_pos_embed(
    embed_dim,
    crops_coords,
    grid_size,
    temporal_size,
    theta: int = 10000,
    use_real: bool = True,
    grid_type: str = "linspace",
    max_size: Optional[Tuple[int, int]] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """
    RoPE for video tokens with 3D structure.

    Args:
        embed_dim: (`int`):
            The embedding dimension size, corresponding to hidden_size_head.
        crops_coords (`Tuple[int]`):
            The top-left and bottom-right coordinates of the crop. Only used when `grid_type == "linspace"`.
        grid_size (`Tuple[int]`):
            The grid size of the spatial positional embedding (height, width).
        temporal_size (`int`):
            The size of the temporal dimension.
        theta (`float`):
            Scaling factor for frequency computation; forwarded to `get_1d_rotary_pos_embed`.
        use_real (`bool`):
            Must be `True`; the complex form is not supported.
        grid_type (`str`):
            Whether to use "linspace" or "slice" to compute grids.
        max_size (`Tuple[int, int]`, *optional*):
            Maximum (height, width) of the position grid. Required when `grid_type == "slice"`.

    Returns:
        `Tuple[torch.Tensor, torch.Tensor]`: `(cos, sin)`, each with
        `temporal_size * grid_size[0] * grid_size[1]` rows; the last dimension is the concatenation of the
        temporal, height and width frequency components.

    Raises:
        ValueError: if `use_real` is not `True`, `grid_type` is unknown, or `grid_type == "slice"` is used
            without `max_size`.
    """
    if use_real is not True:
        raise ValueError(" `use_real = False` is not currently supported for get_3d_rotary_pos_embed")

    if grid_type == "linspace":
        start, stop = crops_coords
        grid_size_h, grid_size_w = grid_size
        grid_h = np.linspace(start[0], stop[0], grid_size_h, endpoint=False, dtype=np.float32)
        grid_w = np.linspace(start[1], stop[1], grid_size_w, endpoint=False, dtype=np.float32)
        grid_t = np.linspace(0, temporal_size, temporal_size, endpoint=False, dtype=np.float32)
    elif grid_type == "slice":
        if max_size is None:
            raise ValueError("`max_size` must be provided when `grid_type` is \"slice\".")
        max_h, max_w = max_size
        grid_size_h, grid_size_w = grid_size
        # Frequencies are computed for the full max grid and cut down to `grid_size` below.
        grid_h = np.arange(max_h, dtype=np.float32)
        grid_w = np.arange(max_w, dtype=np.float32)
        grid_t = np.arange(temporal_size, dtype=np.float32)
    else:
        raise ValueError("Invalid value passed for `grid_type`.")

    # Compute dimensions for each axis
    dim_t = embed_dim // 4
    dim_h = embed_dim // 8 * 3
    dim_w = embed_dim // 8 * 3

    # Temporal frequencies
    freqs_t = get_1d_rotary_pos_embed(dim_t, grid_t, theta=theta, use_real=True)
    # Spatial frequencies for height and width
    freqs_h = get_1d_rotary_pos_embed(dim_h, grid_h, theta=theta, use_real=True)
    freqs_w = get_1d_rotary_pos_embed(dim_w, grid_w, theta=theta, use_real=True)

    # Broadcast and concatenate temporal and spatial frequencies (height and width) into a 3d tensor
    def combine_time_height_width(freqs_t, freqs_h, freqs_w):
        freqs_t = freqs_t[:, None, None, :].expand(
            -1, grid_size_h, grid_size_w, -1
        )  # temporal_size, grid_size_h, grid_size_w, dim_t
        freqs_h = freqs_h[None, :, None, :].expand(
            temporal_size, -1, grid_size_w, -1
        )  # temporal_size, grid_size_h, grid_size_w, dim_h
        freqs_w = freqs_w[None, None, :, :].expand(
            temporal_size, grid_size_h, -1, -1
        )  # temporal_size, grid_size_h, grid_size_w, dim_w

        freqs = torch.cat(
            [freqs_t, freqs_h, freqs_w], dim=-1
        )  # temporal_size, grid_size_h, grid_size_w, (dim_t + dim_h + dim_w)
        freqs = freqs.view(
            temporal_size * grid_size_h * grid_size_w, -1
        )  # (temporal_size * grid_size_h * grid_size_w), (dim_t + dim_h + dim_w)
        return freqs

    t_cos, t_sin = freqs_t  # both t_cos and t_sin have shape: temporal_size, dim_t
    h_cos, h_sin = freqs_h  # both h_cos and h_sin have shape: grid_size_h, dim_h
    w_cos, w_sin = freqs_w  # both w_cos and w_sin have shape: grid_size_w, dim_w

    if grid_type == "slice":
        # Keep only the leading positions that fall inside the requested grid.
        t_cos, t_sin = t_cos[:temporal_size], t_sin[:temporal_size]
        h_cos, h_sin = h_cos[:grid_size_h], h_sin[:grid_size_h]
        w_cos, w_sin = w_cos[:grid_size_w], w_sin[:grid_size_w]

    cos = combine_time_height_width(t_cos, h_cos, w_cos)
    sin = combine_time_height_width(t_sin, h_sin, w_sin)
    return cos, sin
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__init__.py ADDED
File without changes
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (205 Bytes). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (251 Bytes). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/enhance.cpython-311.pyc ADDED
Binary file (2.81 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/enhance.cpython-312.pyc ADDED
Binary file (2.63 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/globals.cpython-311.pyc ADDED
Binary file (1.36 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/__pycache__/globals.cpython-312.pyc ADDED
Binary file (1.28 kB). View file
 
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/enhance.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from einops import rearrange
3
+ from diffusers.models.attention import Attention
4
+ from .globals import get_enhance_weight, get_num_frames
5
+
6
+ # def get_feta_scores(query, key):
7
+ # img_q, img_k = query, key
8
+
9
+ # num_frames = get_num_frames()
10
+
11
+ # B, S, N, C = img_q.shape
12
+
13
+ # # Calculate spatial dimension
14
+ # spatial_dim = S // num_frames
15
+
16
+ # # Add time dimension between spatial and head dims
17
+ # query_image = img_q.reshape(B, spatial_dim, num_frames, N, C)
18
+ # key_image = img_k.reshape(B, spatial_dim, num_frames, N, C)
19
+
20
+ # # Expand time dimension
21
+ # query_image = query_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C]
22
+ # key_image = key_image.expand(-1, -1, num_frames, -1, -1) # [B, S, T, N, C]
23
+
24
+ # # Reshape to match feta_score input format: [(B S) N T C]
25
+ # query_image = rearrange(query_image, "b s t n c -> (b s) n t c") #torch.Size([3200, 24, 5, 128])
26
+ # key_image = rearrange(key_image, "b s t n c -> (b s) n t c")
27
+
28
+ # return feta_score(query_image, key_image, C, num_frames)
29
+
30
def get_feta_scores(
    attn: Attention,
    query: torch.Tensor,
    key: torch.Tensor,
    head_dim: int,
    text_seq_length: int,
) -> torch.Tensor:
    """
    Compute the enhance-a-video score from attention queries and keys.

    The first `text_seq_length` tokens along dimension 2 are dropped, the remaining tokens are regrouped
    from "B N (T S) C" to "(B S) N T C" using the globally registered frame count, and the result is
    handed to `feta_score`.
    """
    frame_count = get_num_frames()
    tokens_per_frame = int((query.shape[2] - text_seq_length) / frame_count)

    def _frames_per_position(tensor: torch.Tensor) -> torch.Tensor:
        # Strip the text tokens, then make every spatial position its own batch entry.
        return rearrange(
            tensor[:, :, text_seq_length:],
            "B N (T S) C -> (B S) N T C",
            N=attn.heads,
            T=frame_count,
            S=tokens_per_frame,
            C=head_dim,
        )

    image_queries = _frames_per_position(query)
    image_keys = _frames_per_position(key)
    return feta_score(image_queries, image_keys, head_dim, frame_count)
57
+
58
def feta_score(query_image, key_image, head_dim, num_frames):
    """
    Compute a scalar enhancement score from cross-frame attention.

    Args:
        query_image: queries whose last two dimensions are (num_frames, head_dim).
        key_image: keys with the same layout as `query_image`.
        head_dim: per-head channel size, used for the 1/sqrt(head_dim) attention scale.
        num_frames: number of frames along the second-to-last dimension.

    Returns:
        A 0-dim float32 tensor: the mean off-diagonal (frame-to-other-frame) attention, multiplied by
        `num_frames + get_enhance_weight()` and clamped to be at least 1.
    """
    if num_frames <= 1:
        # With a single frame there are no off-diagonal entries; the original code divided by zero
        # here and returned NaN (which `clamp` does not repair). Return the clamp floor instead.
        return torch.ones((), dtype=torch.float32, device=query_image.device)

    scale = head_dim**-0.5
    query_image = query_image * scale
    attn_temp = query_image @ key_image.transpose(-2, -1)
    # Run the softmax in float32.
    attn_temp = attn_temp.to(torch.float32)
    attn_temp = attn_temp.softmax(dim=-1)

    # Reshape to [batch_size * num_tokens, num_frames, num_frames]
    attn_temp = attn_temp.reshape(-1, num_frames, num_frames)

    # Create a mask for diagonal elements
    diag_mask = torch.eye(num_frames, device=attn_temp.device).bool()
    diag_mask = diag_mask.unsqueeze(0).expand(attn_temp.shape[0], -1, -1)

    # Zero out diagonal elements
    attn_wo_diag = attn_temp.masked_fill(diag_mask, 0)

    # Calculate mean for each token's attention matrix
    # Number of off-diagonal elements per matrix is n*n - n
    num_off_diag = num_frames * num_frames - num_frames
    mean_scores = attn_wo_diag.sum(dim=(1, 2)) / num_off_diag

    enhance_scores = mean_scores.mean() * (num_frames + get_enhance_weight())
    enhance_scores = enhance_scores.clamp(min=1)
    return enhance_scores
custom_nodes/ComfyUI-CogVideoXWrapper/enhance_a_video/globals.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Module-level state shared between the transformer forward pass and the enhance-a-video scoring code.
NUM_FRAMES = None
FETA_WEIGHT = None
ENABLE_FETA = False


# --- frame count -----------------------------------------------------------

def set_num_frames(num_frames: int):
    """Remember the frame count of the latents currently being processed."""
    global NUM_FRAMES
    NUM_FRAMES = num_frames


def get_num_frames() -> int:
    """Return the last value given to `set_num_frames` (None until it has been called)."""
    return NUM_FRAMES


# --- on/off switch ---------------------------------------------------------

def enable_enhance():
    """Turn the enhance flag on."""
    global ENABLE_FETA
    ENABLE_FETA = True


def disable_enhance():
    """Turn the enhance flag off."""
    global ENABLE_FETA
    ENABLE_FETA = False


def is_enhance_enabled() -> bool:
    """Report whether the enhance flag is currently on."""
    return ENABLE_FETA


# --- enhance weight --------------------------------------------------------

def set_enhance_weight(feta_weight: float):
    """Store the enhance weight."""
    global FETA_WEIGHT
    FETA_WEIGHT = feta_weight


def get_enhance_weight() -> float:
    """Return the last value given to `set_enhance_weight` (None until it has been called)."""
    return FETA_WEIGHT
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1.0_5b_vid2vid_02.json ADDED
@@ -0,0 +1,1061 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 78,
3
+ "last_link_id": 218,
4
+ "nodes": [
5
+ {
6
+ "id": 20,
7
+ "type": "CLIPLoader",
8
+ "pos": {
9
+ "0": -29,
10
+ "1": 407
11
+ },
12
+ "size": {
13
+ "0": 451.30548095703125,
14
+ "1": 82
15
+ },
16
+ "flags": {},
17
+ "order": 0,
18
+ "mode": 0,
19
+ "inputs": [],
20
+ "outputs": [
21
+ {
22
+ "name": "CLIP",
23
+ "type": "CLIP",
24
+ "links": [
25
+ 54
26
+ ],
27
+ "slot_index": 0,
28
+ "shape": 3
29
+ }
30
+ ],
31
+ "properties": {
32
+ "Node name for S&R": "CLIPLoader"
33
+ },
34
+ "widgets_values": [
35
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
36
+ "sd3"
37
+ ]
38
+ },
39
+ {
40
+ "id": 41,
41
+ "type": "ImageResizeKJ",
42
+ "pos": {
43
+ "0": 206,
44
+ "1": -69
45
+ },
46
+ "size": {
47
+ "0": 315,
48
+ "1": 242
49
+ },
50
+ "flags": {},
51
+ "order": 7,
52
+ "mode": 0,
53
+ "inputs": [
54
+ {
55
+ "name": "image",
56
+ "type": "IMAGE",
57
+ "link": 180
58
+ },
59
+ {
60
+ "name": "get_image_size",
61
+ "type": "IMAGE",
62
+ "link": null,
63
+ "shape": 7
64
+ },
65
+ {
66
+ "name": "width_input",
67
+ "type": "INT",
68
+ "link": null,
69
+ "widget": {
70
+ "name": "width_input"
71
+ }
72
+ },
73
+ {
74
+ "name": "height_input",
75
+ "type": "INT",
76
+ "link": null,
77
+ "widget": {
78
+ "name": "height_input"
79
+ }
80
+ }
81
+ ],
82
+ "outputs": [
83
+ {
84
+ "name": "IMAGE",
85
+ "type": "IMAGE",
86
+ "links": [
87
+ 126
88
+ ],
89
+ "slot_index": 0,
90
+ "shape": 3
91
+ },
92
+ {
93
+ "name": "width",
94
+ "type": "INT",
95
+ "links": null,
96
+ "shape": 3
97
+ },
98
+ {
99
+ "name": "height",
100
+ "type": "INT",
101
+ "links": null,
102
+ "shape": 3
103
+ }
104
+ ],
105
+ "properties": {
106
+ "Node name for S&R": "ImageResizeKJ"
107
+ },
108
+ "widgets_values": [
109
+ 720,
110
+ 480,
111
+ "lanczos",
112
+ false,
113
+ 2,
114
+ 0,
115
+ 0,
116
+ "disabled"
117
+ ]
118
+ },
119
+ {
120
+ "id": 45,
121
+ "type": "VHS_LoadVideo",
122
+ "pos": {
123
+ "0": -93,
124
+ "1": -153
125
+ },
126
+ "size": [
127
+ 247.455078125,
128
+ 365.7275390625
129
+ ],
130
+ "flags": {},
131
+ "order": 4,
132
+ "mode": 0,
133
+ "inputs": [
134
+ {
135
+ "name": "meta_batch",
136
+ "type": "VHS_BatchManager",
137
+ "link": null,
138
+ "shape": 7
139
+ },
140
+ {
141
+ "name": "vae",
142
+ "type": "VAE",
143
+ "link": null,
144
+ "shape": 7
145
+ },
146
+ {
147
+ "name": "frame_load_cap",
148
+ "type": "INT",
149
+ "link": 177,
150
+ "widget": {
151
+ "name": "frame_load_cap"
152
+ }
153
+ }
154
+ ],
155
+ "outputs": [
156
+ {
157
+ "name": "IMAGE",
158
+ "type": "IMAGE",
159
+ "links": [
160
+ 179
161
+ ],
162
+ "slot_index": 0,
163
+ "shape": 3
164
+ },
165
+ {
166
+ "name": "frame_count",
167
+ "type": "INT",
168
+ "links": null,
169
+ "shape": 3
170
+ },
171
+ {
172
+ "name": "audio",
173
+ "type": "AUDIO",
174
+ "links": null,
175
+ "shape": 3
176
+ },
177
+ {
178
+ "name": "video_info",
179
+ "type": "VHS_VIDEOINFO",
180
+ "links": null,
181
+ "shape": 3
182
+ }
183
+ ],
184
+ "properties": {
185
+ "Node name for S&R": "VHS_LoadVideo"
186
+ },
187
+ "widgets_values": {
188
+ "video": "jeep.mp4",
189
+ "force_rate": 0,
190
+ "force_size": "Disabled",
191
+ "custom_width": 512,
192
+ "custom_height": 512,
193
+ "frame_load_cap": 20,
194
+ "skip_first_frames": 0,
195
+ "select_every_nth": 1,
196
+ "choose video to upload": "image",
197
+ "videopreview": {
198
+ "hidden": false,
199
+ "paused": false,
200
+ "params": {
201
+ "frame_load_cap": 20,
202
+ "skip_first_frames": 0,
203
+ "force_rate": 0,
204
+ "filename": "jeep.mp4",
205
+ "type": "input",
206
+ "format": "video/mp4",
207
+ "select_every_nth": 1
208
+ }
209
+ }
210
+ }
211
+ },
212
+ {
213
+ "id": 70,
214
+ "type": "GetImageSizeAndCount",
215
+ "pos": {
216
+ "0": 214,
217
+ "1": -234
218
+ },
219
+ "size": {
220
+ "0": 202.2143096923828,
221
+ "1": 99.23601531982422
222
+ },
223
+ "flags": {},
224
+ "order": 6,
225
+ "mode": 0,
226
+ "inputs": [
227
+ {
228
+ "name": "image",
229
+ "type": "IMAGE",
230
+ "link": 179,
231
+ "slot_index": 0
232
+ }
233
+ ],
234
+ "outputs": [
235
+ {
236
+ "name": "image",
237
+ "type": "IMAGE",
238
+ "links": [
239
+ 180
240
+ ],
241
+ "slot_index": 0,
242
+ "shape": 3
243
+ },
244
+ {
245
+ "name": "512 width",
246
+ "type": "INT",
247
+ "links": [],
248
+ "slot_index": 1,
249
+ "shape": 3
250
+ },
251
+ {
252
+ "name": "256 height",
253
+ "type": "INT",
254
+ "links": [],
255
+ "slot_index": 2,
256
+ "shape": 3
257
+ },
258
+ {
259
+ "name": "33 count",
260
+ "type": "INT",
261
+ "links": [],
262
+ "slot_index": 3,
263
+ "shape": 3
264
+ }
265
+ ],
266
+ "properties": {
267
+ "Node name for S&R": "GetImageSizeAndCount"
268
+ },
269
+ "widgets_values": []
270
+ },
271
+ {
272
+ "id": 69,
273
+ "type": "INTConstant",
274
+ "pos": {
275
+ "0": -90,
276
+ "1": -305
277
+ },
278
+ "size": {
279
+ "0": 210,
280
+ "1": 58
281
+ },
282
+ "flags": {},
283
+ "order": 1,
284
+ "mode": 0,
285
+ "inputs": [],
286
+ "outputs": [
287
+ {
288
+ "name": "value",
289
+ "type": "INT",
290
+ "links": [
291
+ 177
292
+ ],
293
+ "shape": 3
294
+ }
295
+ ],
296
+ "title": "Frames to load",
297
+ "properties": {
298
+ "Node name for S&R": "INTConstant"
299
+ },
300
+ "widgets_values": [
301
+ 33
302
+ ],
303
+ "color": "#1b4669",
304
+ "bgcolor": "#29699c"
305
+ },
306
+ {
307
+ "id": 58,
308
+ "type": "ImageConcanate",
309
+ "pos": {
310
+ "0": 1594,
311
+ "1": 230
312
+ },
313
+ "size": {
314
+ "0": 315,
315
+ "1": 102
316
+ },
317
+ "flags": {},
318
+ "order": 13,
319
+ "mode": 0,
320
+ "inputs": [
321
+ {
322
+ "name": "image1",
323
+ "type": "IMAGE",
324
+ "link": 191
325
+ },
326
+ {
327
+ "name": "image2",
328
+ "type": "IMAGE",
329
+ "link": 170
330
+ }
331
+ ],
332
+ "outputs": [
333
+ {
334
+ "name": "IMAGE",
335
+ "type": "IMAGE",
336
+ "links": [
337
+ 132
338
+ ],
339
+ "slot_index": 0,
340
+ "shape": 3
341
+ }
342
+ ],
343
+ "properties": {
344
+ "Node name for S&R": "ImageConcanate"
345
+ },
346
+ "widgets_values": [
347
+ "right",
348
+ false
349
+ ]
350
+ },
351
+ {
352
+ "id": 55,
353
+ "type": "GetImageSizeAndCount",
354
+ "pos": {
355
+ "0": 1654,
356
+ "1": 77
357
+ },
358
+ "size": {
359
+ "0": 210,
360
+ "1": 86
361
+ },
362
+ "flags": {},
363
+ "order": 12,
364
+ "mode": 0,
365
+ "inputs": [
366
+ {
367
+ "name": "image",
368
+ "type": "IMAGE",
369
+ "link": 208,
370
+ "slot_index": 0
371
+ }
372
+ ],
373
+ "outputs": [
374
+ {
375
+ "name": "image",
376
+ "type": "IMAGE",
377
+ "links": [
378
+ 170
379
+ ],
380
+ "slot_index": 0,
381
+ "shape": 3
382
+ },
383
+ {
384
+ "name": "720 width",
385
+ "type": "INT",
386
+ "links": null,
387
+ "shape": 3
388
+ },
389
+ {
390
+ "name": "480 height",
391
+ "type": "INT",
392
+ "links": null,
393
+ "shape": 3
394
+ },
395
+ {
396
+ "name": "33 count",
397
+ "type": "INT",
398
+ "links": [],
399
+ "slot_index": 3,
400
+ "shape": 3
401
+ }
402
+ ],
403
+ "properties": {
404
+ "Node name for S&R": "GetImageSizeAndCount"
405
+ },
406
+ "widgets_values": []
407
+ },
408
+ {
409
+ "id": 77,
410
+ "type": "CogVideoImageEncode",
411
+ "pos": {
412
+ "0": 952,
413
+ "1": -118
414
+ },
415
+ "size": {
416
+ "0": 315,
417
+ "1": 122
418
+ },
419
+ "flags": {},
420
+ "order": 9,
421
+ "mode": 0,
422
+ "inputs": [
423
+ {
424
+ "name": "vae",
425
+ "type": "VAE",
426
+ "link": 209
427
+ },
428
+ {
429
+ "name": "start_image",
430
+ "type": "IMAGE",
431
+ "link": 210
432
+ },
433
+ {
434
+ "name": "end_image",
435
+ "type": "IMAGE",
436
+ "link": null,
437
+ "shape": 7
438
+ }
439
+ ],
440
+ "outputs": [
441
+ {
442
+ "name": "samples",
443
+ "type": "LATENT",
444
+ "links": [
445
+ 215
446
+ ]
447
+ }
448
+ ],
449
+ "properties": {
450
+ "Node name for S&R": "CogVideoImageEncode"
451
+ },
452
+ "widgets_values": [
453
+ false,
454
+ 0
455
+ ]
456
+ },
457
+ {
458
+ "id": 76,
459
+ "type": "CogVideoDecode",
460
+ "pos": {
461
+ "0": 1335,
462
+ "1": -123
463
+ },
464
+ "size": {
465
+ "0": 315,
466
+ "1": 198
467
+ },
468
+ "flags": {},
469
+ "order": 11,
470
+ "mode": 0,
471
+ "inputs": [
472
+ {
473
+ "name": "vae",
474
+ "type": "VAE",
475
+ "link": 206
476
+ },
477
+ {
478
+ "name": "samples",
479
+ "type": "LATENT",
480
+ "link": 216
481
+ }
482
+ ],
483
+ "outputs": [
484
+ {
485
+ "name": "images",
486
+ "type": "IMAGE",
487
+ "links": [
488
+ 208
489
+ ]
490
+ }
491
+ ],
492
+ "properties": {
493
+ "Node name for S&R": "CogVideoDecode"
494
+ },
495
+ "widgets_values": [
496
+ true,
497
+ 240,
498
+ 360,
499
+ 0.2,
500
+ 0.2,
501
+ true
502
+ ]
503
+ },
504
+ {
505
+ "id": 30,
506
+ "type": "CogVideoTextEncode",
507
+ "pos": {
508
+ "0": 491,
509
+ "1": 372
510
+ },
511
+ "size": [
512
+ 478.6890949595422,
513
+ 215.66308749666905
514
+ ],
515
+ "flags": {},
516
+ "order": 3,
517
+ "mode": 0,
518
+ "inputs": [
519
+ {
520
+ "name": "clip",
521
+ "type": "CLIP",
522
+ "link": 54
523
+ }
524
+ ],
525
+ "outputs": [
526
+ {
527
+ "name": "conditioning",
528
+ "type": "CONDITIONING",
529
+ "links": [
530
+ 213
531
+ ],
532
+ "slot_index": 0,
533
+ "shape": 3
534
+ },
535
+ {
536
+ "name": "clip",
537
+ "type": "CLIP",
538
+ "links": [
539
+ 217
540
+ ],
541
+ "slot_index": 1
542
+ }
543
+ ],
544
+ "properties": {
545
+ "Node name for S&R": "CogVideoTextEncode"
546
+ },
547
+ "widgets_values": [
548
+ "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness.",
549
+ 1,
550
+ false
551
+ ]
552
+ },
553
+ {
554
+ "id": 31,
555
+ "type": "CogVideoTextEncode",
556
+ "pos": {
557
+ "0": 504,
558
+ "1": 651
559
+ },
560
+ "size": {
561
+ "0": 463.01251220703125,
562
+ "1": 144
563
+ },
564
+ "flags": {},
565
+ "order": 5,
566
+ "mode": 0,
567
+ "inputs": [
568
+ {
569
+ "name": "clip",
570
+ "type": "CLIP",
571
+ "link": 217
572
+ }
573
+ ],
574
+ "outputs": [
575
+ {
576
+ "name": "conditioning",
577
+ "type": "CONDITIONING",
578
+ "links": [
579
+ 214
580
+ ],
581
+ "slot_index": 0,
582
+ "shape": 3
583
+ },
584
+ {
585
+ "name": "clip",
586
+ "type": "CLIP",
587
+ "links": null
588
+ }
589
+ ],
590
+ "properties": {
591
+ "Node name for S&R": "CogVideoTextEncode"
592
+ },
593
+ "widgets_values": [
594
+ "",
595
+ 1,
596
+ true
597
+ ]
598
+ },
599
+ {
600
+ "id": 78,
601
+ "type": "CogVideoSampler",
602
+ "pos": {
603
+ "0": 1083,
604
+ "1": 255
605
+ },
606
+ "size": [
607
+ 330,
608
+ 574
609
+ ],
610
+ "flags": {},
611
+ "order": 10,
612
+ "mode": 0,
613
+ "inputs": [
614
+ {
615
+ "name": "model",
616
+ "type": "COGVIDEOMODEL",
617
+ "link": 212
618
+ },
619
+ {
620
+ "name": "positive",
621
+ "type": "CONDITIONING",
622
+ "link": 213
623
+ },
624
+ {
625
+ "name": "negative",
626
+ "type": "CONDITIONING",
627
+ "link": 214
628
+ },
629
+ {
630
+ "name": "samples",
631
+ "type": "LATENT",
632
+ "link": 215,
633
+ "shape": 7
634
+ },
635
+ {
636
+ "name": "image_cond_latents",
637
+ "type": "LATENT",
638
+ "link": null,
639
+ "shape": 7
640
+ },
641
+ {
642
+ "name": "context_options",
643
+ "type": "COGCONTEXT",
644
+ "link": null,
645
+ "shape": 7
646
+ },
647
+ {
648
+ "name": "controlnet",
649
+ "type": "COGVIDECONTROLNET",
650
+ "link": null,
651
+ "shape": 7
652
+ },
653
+ {
654
+ "name": "tora_trajectory",
655
+ "type": "TORAFEATURES",
656
+ "link": null,
657
+ "shape": 7
658
+ },
659
+ {
660
+ "name": "fastercache",
661
+ "type": "FASTERCACHEARGS",
662
+ "link": null,
663
+ "shape": 7
664
+ },
665
+ {
666
+ "name": "num_frames",
667
+ "type": "INT",
668
+ "link": 218,
669
+ "widget": {
670
+ "name": "num_frames"
671
+ }
672
+ }
673
+ ],
674
+ "outputs": [
675
+ {
676
+ "name": "samples",
677
+ "type": "LATENT",
678
+ "links": [
679
+ 216
680
+ ]
681
+ }
682
+ ],
683
+ "properties": {
684
+ "Node name for S&R": "CogVideoSampler"
685
+ },
686
+ "widgets_values": [
687
+ 49,
688
+ 25,
689
+ 6,
690
+ 0,
691
+ "fixed",
692
+ "CogVideoXDDIM",
693
+ 0.8
694
+ ]
695
+ },
696
+ {
697
+ "id": 57,
698
+ "type": "GetImageSizeAndCount",
699
+ "pos": {
700
+ "0": 595,
701
+ "1": -79
702
+ },
703
+ "size": {
704
+ "0": 202.2143096923828,
705
+ "1": 99.23601531982422
706
+ },
707
+ "flags": {},
708
+ "order": 8,
709
+ "mode": 0,
710
+ "inputs": [
711
+ {
712
+ "name": "image",
713
+ "type": "IMAGE",
714
+ "link": 126,
715
+ "slot_index": 0
716
+ }
717
+ ],
718
+ "outputs": [
719
+ {
720
+ "name": "image",
721
+ "type": "IMAGE",
722
+ "links": [
723
+ 191,
724
+ 210
725
+ ],
726
+ "slot_index": 0,
727
+ "shape": 3
728
+ },
729
+ {
730
+ "name": "720 width",
731
+ "type": "INT",
732
+ "links": [],
733
+ "slot_index": 1,
734
+ "shape": 3
735
+ },
736
+ {
737
+ "name": "480 height",
738
+ "type": "INT",
739
+ "links": [],
740
+ "slot_index": 2,
741
+ "shape": 3
742
+ },
743
+ {
744
+ "name": "33 count",
745
+ "type": "INT",
746
+ "links": [
747
+ 218
748
+ ],
749
+ "slot_index": 3,
750
+ "shape": 3
751
+ }
752
+ ],
753
+ "properties": {
754
+ "Node name for S&R": "GetImageSizeAndCount"
755
+ },
756
+ "widgets_values": []
757
+ },
758
+ {
759
+ "id": 75,
760
+ "type": "DownloadAndLoadCogVideoModel",
761
+ "pos": {
762
+ "0": 606,
763
+ "1": 85
764
+ },
765
+ "size": {
766
+ "0": 315,
767
+ "1": 218
768
+ },
769
+ "flags": {},
770
+ "order": 2,
771
+ "mode": 0,
772
+ "inputs": [
773
+ {
774
+ "name": "block_edit",
775
+ "type": "TRANSFORMERBLOCKS",
776
+ "link": null,
777
+ "shape": 7
778
+ },
779
+ {
780
+ "name": "lora",
781
+ "type": "COGLORA",
782
+ "link": null,
783
+ "shape": 7
784
+ },
785
+ {
786
+ "name": "compile_args",
787
+ "type": "COMPILEARGS",
788
+ "link": null,
789
+ "shape": 7
790
+ }
791
+ ],
792
+ "outputs": [
793
+ {
794
+ "name": "model",
795
+ "type": "COGVIDEOMODEL",
796
+ "links": [
797
+ 212
798
+ ]
799
+ },
800
+ {
801
+ "name": "vae",
802
+ "type": "VAE",
803
+ "links": [
804
+ 206,
805
+ 209
806
+ ]
807
+ }
808
+ ],
809
+ "properties": {
810
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
811
+ },
812
+ "widgets_values": [
813
+ "THUDM/CogVideoX-5b",
814
+ "bf16",
815
+ "disabled",
816
+ false,
817
+ "sdpa",
818
+ "main_device"
819
+ ]
820
+ },
821
+ {
822
+ "id": 47,
823
+ "type": "VHS_VideoCombine",
824
+ "pos": {
825
+ "0": 1946,
826
+ "1": -172
827
+ },
828
+ "size": [
829
+ 1110,
830
+ 687.3333333333333
831
+ ],
832
+ "flags": {},
833
+ "order": 14,
834
+ "mode": 0,
835
+ "inputs": [
836
+ {
837
+ "name": "images",
838
+ "type": "IMAGE",
839
+ "link": 132
840
+ },
841
+ {
842
+ "name": "audio",
843
+ "type": "AUDIO",
844
+ "link": null,
845
+ "shape": 7
846
+ },
847
+ {
848
+ "name": "meta_batch",
849
+ "type": "VHS_BatchManager",
850
+ "link": null,
851
+ "shape": 7
852
+ },
853
+ {
854
+ "name": "vae",
855
+ "type": "VAE",
856
+ "link": null,
857
+ "shape": 7
858
+ }
859
+ ],
860
+ "outputs": [
861
+ {
862
+ "name": "Filenames",
863
+ "type": "VHS_FILENAMES",
864
+ "links": null,
865
+ "shape": 3
866
+ }
867
+ ],
868
+ "properties": {
869
+ "Node name for S&R": "VHS_VideoCombine"
870
+ },
871
+ "widgets_values": {
872
+ "frame_rate": 8,
873
+ "loop_count": 0,
874
+ "filename_prefix": "CogVideoX_vid2vid",
875
+ "format": "video/h264-mp4",
876
+ "pix_fmt": "yuv420p",
877
+ "crf": 19,
878
+ "save_metadata": true,
879
+ "pingpong": false,
880
+ "save_output": true,
881
+ "videopreview": {
882
+ "hidden": false,
883
+ "paused": false,
884
+ "params": {
885
+ "filename": "CogVideoX_vid2vid_00003.mp4",
886
+ "subfolder": "",
887
+ "type": "temp",
888
+ "format": "video/h264-mp4",
889
+ "frame_rate": 8
890
+ }
891
+ }
892
+ }
893
+ }
894
+ ],
895
+ "links": [
896
+ [
897
+ 54,
898
+ 20,
899
+ 0,
900
+ 30,
901
+ 0,
902
+ "CLIP"
903
+ ],
904
+ [
905
+ 126,
906
+ 41,
907
+ 0,
908
+ 57,
909
+ 0,
910
+ "IMAGE"
911
+ ],
912
+ [
913
+ 132,
914
+ 58,
915
+ 0,
916
+ 47,
917
+ 0,
918
+ "IMAGE"
919
+ ],
920
+ [
921
+ 170,
922
+ 55,
923
+ 0,
924
+ 58,
925
+ 1,
926
+ "IMAGE"
927
+ ],
928
+ [
929
+ 177,
930
+ 69,
931
+ 0,
932
+ 45,
933
+ 2,
934
+ "INT"
935
+ ],
936
+ [
937
+ 179,
938
+ 45,
939
+ 0,
940
+ 70,
941
+ 0,
942
+ "IMAGE"
943
+ ],
944
+ [
945
+ 180,
946
+ 70,
947
+ 0,
948
+ 41,
949
+ 0,
950
+ "IMAGE"
951
+ ],
952
+ [
953
+ 191,
954
+ 57,
955
+ 0,
956
+ 58,
957
+ 0,
958
+ "IMAGE"
959
+ ],
960
+ [
961
+ 206,
962
+ 75,
963
+ 1,
964
+ 76,
965
+ 0,
966
+ "VAE"
967
+ ],
968
+ [
969
+ 208,
970
+ 76,
971
+ 0,
972
+ 55,
973
+ 0,
974
+ "IMAGE"
975
+ ],
976
+ [
977
+ 209,
978
+ 75,
979
+ 1,
980
+ 77,
981
+ 0,
982
+ "VAE"
983
+ ],
984
+ [
985
+ 210,
986
+ 57,
987
+ 0,
988
+ 77,
989
+ 1,
990
+ "IMAGE"
991
+ ],
992
+ [
993
+ 212,
994
+ 75,
995
+ 0,
996
+ 78,
997
+ 0,
998
+ "COGVIDEOMODEL"
999
+ ],
1000
+ [
1001
+ 213,
1002
+ 30,
1003
+ 0,
1004
+ 78,
1005
+ 1,
1006
+ "CONDITIONING"
1007
+ ],
1008
+ [
1009
+ 214,
1010
+ 31,
1011
+ 0,
1012
+ 78,
1013
+ 2,
1014
+ "CONDITIONING"
1015
+ ],
1016
+ [
1017
+ 215,
1018
+ 77,
1019
+ 0,
1020
+ 78,
1021
+ 3,
1022
+ "LATENT"
1023
+ ],
1024
+ [
1025
+ 216,
1026
+ 78,
1027
+ 0,
1028
+ 76,
1029
+ 1,
1030
+ "LATENT"
1031
+ ],
1032
+ [
1033
+ 217,
1034
+ 30,
1035
+ 1,
1036
+ 31,
1037
+ 0,
1038
+ "CLIP"
1039
+ ],
1040
+ [
1041
+ 218,
1042
+ 57,
1043
+ 3,
1044
+ 78,
1045
+ 9,
1046
+ "INT"
1047
+ ]
1048
+ ],
1049
+ "groups": [],
1050
+ "config": {},
1051
+ "extra": {
1052
+ "ds": {
1053
+ "scale": 0.8390545288825798,
1054
+ "offset": [
1055
+ -318.82552550589344,
1056
+ 331.70430573737934
1057
+ ]
1058
+ }
1059
+ },
1060
+ "version": 0.4
1061
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_2b_controlnet_02.json ADDED
@@ -0,0 +1,1003 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 48,
3
+ "last_link_id": 90,
4
+ "nodes": [
5
+ {
6
+ "id": 41,
7
+ "type": "HEDPreprocessor",
8
+ "pos": {
9
+ "0": -570,
10
+ "1": -76
11
+ },
12
+ "size": {
13
+ "0": 315,
14
+ "1": 82
15
+ },
16
+ "flags": {},
17
+ "order": 4,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "image",
22
+ "type": "IMAGE",
23
+ "link": 73
24
+ }
25
+ ],
26
+ "outputs": [
27
+ {
28
+ "name": "IMAGE",
29
+ "type": "IMAGE",
30
+ "links": [
31
+ 74
32
+ ],
33
+ "slot_index": 0
34
+ }
35
+ ],
36
+ "properties": {
37
+ "Node name for S&R": "HEDPreprocessor"
38
+ },
39
+ "widgets_values": [
40
+ "enable",
41
+ 768
42
+ ]
43
+ },
44
+ {
45
+ "id": 38,
46
+ "type": "VHS_LoadVideo",
47
+ "pos": {
48
+ "0": -847,
49
+ "1": -78
50
+ },
51
+ "size": [
52
+ 247.455078125,
53
+ 427.63671875
54
+ ],
55
+ "flags": {},
56
+ "order": 0,
57
+ "mode": 0,
58
+ "inputs": [
59
+ {
60
+ "name": "meta_batch",
61
+ "type": "VHS_BatchManager",
62
+ "link": null,
63
+ "shape": 7
64
+ },
65
+ {
66
+ "name": "vae",
67
+ "type": "VAE",
68
+ "link": null,
69
+ "shape": 7
70
+ }
71
+ ],
72
+ "outputs": [
73
+ {
74
+ "name": "IMAGE",
75
+ "type": "IMAGE",
76
+ "links": [
77
+ 73
78
+ ],
79
+ "slot_index": 0
80
+ },
81
+ {
82
+ "name": "frame_count",
83
+ "type": "INT",
84
+ "links": null
85
+ },
86
+ {
87
+ "name": "audio",
88
+ "type": "AUDIO",
89
+ "links": null
90
+ },
91
+ {
92
+ "name": "video_info",
93
+ "type": "VHS_VIDEOINFO",
94
+ "links": null
95
+ }
96
+ ],
97
+ "properties": {
98
+ "Node name for S&R": "VHS_LoadVideo"
99
+ },
100
+ "widgets_values": {
101
+ "video": "car.mp4",
102
+ "force_rate": 0,
103
+ "force_size": "Disabled",
104
+ "custom_width": 512,
105
+ "custom_height": 512,
106
+ "frame_load_cap": 49,
107
+ "skip_first_frames": 0,
108
+ "select_every_nth": 1,
109
+ "choose video to upload": "image",
110
+ "videopreview": {
111
+ "hidden": false,
112
+ "paused": false,
113
+ "params": {
114
+ "frame_load_cap": 49,
115
+ "skip_first_frames": 0,
116
+ "force_rate": 0,
117
+ "filename": "car.mp4",
118
+ "type": "input",
119
+ "format": "video/mp4",
120
+ "select_every_nth": 1
121
+ },
122
+ "muted": false
123
+ }
124
+ }
125
+ },
126
+ {
127
+ "id": 39,
128
+ "type": "ImageResizeKJ",
129
+ "pos": {
130
+ "0": -563,
131
+ "1": 63
132
+ },
133
+ "size": {
134
+ "0": 315,
135
+ "1": 266
136
+ },
137
+ "flags": {},
138
+ "order": 6,
139
+ "mode": 0,
140
+ "inputs": [
141
+ {
142
+ "name": "image",
143
+ "type": "IMAGE",
144
+ "link": 74
145
+ },
146
+ {
147
+ "name": "get_image_size",
148
+ "type": "IMAGE",
149
+ "link": null,
150
+ "shape": 7
151
+ },
152
+ {
153
+ "name": "width_input",
154
+ "type": "INT",
155
+ "link": null,
156
+ "widget": {
157
+ "name": "width_input"
158
+ },
159
+ "shape": 7
160
+ },
161
+ {
162
+ "name": "height_input",
163
+ "type": "INT",
164
+ "link": null,
165
+ "widget": {
166
+ "name": "height_input"
167
+ },
168
+ "shape": 7
169
+ }
170
+ ],
171
+ "outputs": [
172
+ {
173
+ "name": "IMAGE",
174
+ "type": "IMAGE",
175
+ "links": [
176
+ 71
177
+ ],
178
+ "slot_index": 0
179
+ },
180
+ {
181
+ "name": "width",
182
+ "type": "INT",
183
+ "links": null
184
+ },
185
+ {
186
+ "name": "height",
187
+ "type": "INT",
188
+ "links": null
189
+ }
190
+ ],
191
+ "properties": {
192
+ "Node name for S&R": "ImageResizeKJ"
193
+ },
194
+ "widgets_values": [
195
+ 720,
196
+ 480,
197
+ "lanczos",
198
+ false,
199
+ 2,
200
+ 0,
201
+ 0,
202
+ "disabled"
203
+ ]
204
+ },
205
+ {
206
+ "id": 30,
207
+ "type": "CogVideoTextEncode",
208
+ "pos": {
209
+ "0": 130,
210
+ "1": 350
211
+ },
212
+ "size": {
213
+ "0": 475.7875061035156,
214
+ "1": 231.29896545410156
215
+ },
216
+ "flags": {},
217
+ "order": 5,
218
+ "mode": 0,
219
+ "inputs": [
220
+ {
221
+ "name": "clip",
222
+ "type": "CLIP",
223
+ "link": 54
224
+ }
225
+ ],
226
+ "outputs": [
227
+ {
228
+ "name": "conditioning",
229
+ "type": "CONDITIONING",
230
+ "links": [
231
+ 84
232
+ ],
233
+ "slot_index": 0,
234
+ "shape": 3
235
+ },
236
+ {
237
+ "name": "clip",
238
+ "type": "CLIP",
239
+ "links": [
240
+ 78
241
+ ],
242
+ "slot_index": 1
243
+ }
244
+ ],
245
+ "properties": {
246
+ "Node name for S&R": "CogVideoTextEncode"
247
+ },
248
+ "widgets_values": [
249
+ "car is moving among mountains",
250
+ 1,
251
+ false
252
+ ]
253
+ },
254
+ {
255
+ "id": 31,
256
+ "type": "CogVideoTextEncode",
257
+ "pos": {
258
+ "0": 139,
259
+ "1": 643
260
+ },
261
+ "size": {
262
+ "0": 463.01251220703125,
263
+ "1": 144
264
+ },
265
+ "flags": {},
266
+ "order": 7,
267
+ "mode": 0,
268
+ "inputs": [
269
+ {
270
+ "name": "clip",
271
+ "type": "CLIP",
272
+ "link": 78
273
+ }
274
+ ],
275
+ "outputs": [
276
+ {
277
+ "name": "conditioning",
278
+ "type": "CONDITIONING",
279
+ "links": [
280
+ 85
281
+ ],
282
+ "slot_index": 0,
283
+ "shape": 3
284
+ },
285
+ {
286
+ "name": "clip",
287
+ "type": "CLIP",
288
+ "links": null
289
+ }
290
+ ],
291
+ "properties": {
292
+ "Node name for S&R": "CogVideoTextEncode"
293
+ },
294
+ "widgets_values": [
295
+ "",
296
+ 1,
297
+ true
298
+ ]
299
+ },
300
+ {
301
+ "id": 44,
302
+ "type": "DownloadAndLoadCogVideoModel",
303
+ "pos": {
304
+ "0": 326,
305
+ "1": -319
306
+ },
307
+ "size": {
308
+ "0": 315,
309
+ "1": 218
310
+ },
311
+ "flags": {},
312
+ "order": 1,
313
+ "mode": 0,
314
+ "inputs": [
315
+ {
316
+ "name": "block_edit",
317
+ "type": "TRANSFORMERBLOCKS",
318
+ "link": null,
319
+ "shape": 7
320
+ },
321
+ {
322
+ "name": "lora",
323
+ "type": "COGLORA",
324
+ "link": null,
325
+ "shape": 7
326
+ },
327
+ {
328
+ "name": "compile_args",
329
+ "type": "COMPILEARGS",
330
+ "link": null,
331
+ "shape": 7
332
+ }
333
+ ],
334
+ "outputs": [
335
+ {
336
+ "name": "model",
337
+ "type": "COGVIDEOMODEL",
338
+ "links": [
339
+ 83
340
+ ]
341
+ },
342
+ {
343
+ "name": "vae",
344
+ "type": "VAE",
345
+ "links": [
346
+ 82
347
+ ],
348
+ "slot_index": 1
349
+ }
350
+ ],
351
+ "properties": {
352
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
353
+ },
354
+ "widgets_values": [
355
+ "THUDM/CogVideoX-2b",
356
+ "bf16",
357
+ "disabled",
358
+ false,
359
+ "sdpa",
360
+ "main_device"
361
+ ]
362
+ },
363
+ {
364
+ "id": 20,
365
+ "type": "CLIPLoader",
366
+ "pos": {
367
+ "0": -175,
368
+ "1": -317
369
+ },
370
+ "size": {
371
+ "0": 452.912353515625,
372
+ "1": 82
373
+ },
374
+ "flags": {},
375
+ "order": 2,
376
+ "mode": 0,
377
+ "inputs": [],
378
+ "outputs": [
379
+ {
380
+ "name": "CLIP",
381
+ "type": "CLIP",
382
+ "links": [
383
+ 54
384
+ ],
385
+ "slot_index": 0,
386
+ "shape": 3
387
+ }
388
+ ],
389
+ "properties": {
390
+ "Node name for S&R": "CLIPLoader"
391
+ },
392
+ "widgets_values": [
393
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
394
+ "sd3"
395
+ ]
396
+ },
397
+ {
398
+ "id": 35,
399
+ "type": "DownloadAndLoadCogVideoControlNet",
400
+ "pos": {
401
+ "0": -105,
402
+ "1": -182
403
+ },
404
+ "size": {
405
+ "0": 378,
406
+ "1": 58
407
+ },
408
+ "flags": {},
409
+ "order": 3,
410
+ "mode": 0,
411
+ "inputs": [],
412
+ "outputs": [
413
+ {
414
+ "name": "cogvideo_controlnet",
415
+ "type": "COGVIDECONTROLNETMODEL",
416
+ "links": [
417
+ 67
418
+ ]
419
+ }
420
+ ],
421
+ "properties": {
422
+ "Node name for S&R": "DownloadAndLoadCogVideoControlNet"
423
+ },
424
+ "widgets_values": [
425
+ "TheDenk/cogvideox-2b-controlnet-hed-v1"
426
+ ]
427
+ },
428
+ {
429
+ "id": 37,
430
+ "type": "CogVideoControlNet",
431
+ "pos": {
432
+ "0": 220,
433
+ "1": 155
434
+ },
435
+ "size": {
436
+ "0": 367.79998779296875,
437
+ "1": 126
438
+ },
439
+ "flags": {},
440
+ "order": 9,
441
+ "mode": 0,
442
+ "inputs": [
443
+ {
444
+ "name": "controlnet",
445
+ "type": "COGVIDECONTROLNETMODEL",
446
+ "link": 67
447
+ },
448
+ {
449
+ "name": "images",
450
+ "type": "IMAGE",
451
+ "link": 72
452
+ }
453
+ ],
454
+ "outputs": [
455
+ {
456
+ "name": "cogvideo_controlnet",
457
+ "type": "COGVIDECONTROLNET",
458
+ "links": [
459
+ 86
460
+ ],
461
+ "slot_index": 0
462
+ }
463
+ ],
464
+ "properties": {
465
+ "Node name for S&R": "CogVideoControlNet"
466
+ },
467
+ "widgets_values": [
468
+ 1,
469
+ 0,
470
+ 1
471
+ ]
472
+ },
473
+ {
474
+ "id": 40,
475
+ "type": "GetImageSizeAndCount",
476
+ "pos": {
477
+ "0": -123,
478
+ "1": -34
479
+ },
480
+ "size": {
481
+ "0": 277.20001220703125,
482
+ "1": 86
483
+ },
484
+ "flags": {},
485
+ "order": 8,
486
+ "mode": 0,
487
+ "inputs": [
488
+ {
489
+ "name": "image",
490
+ "type": "IMAGE",
491
+ "link": 71
492
+ }
493
+ ],
494
+ "outputs": [
495
+ {
496
+ "name": "image",
497
+ "type": "IMAGE",
498
+ "links": [
499
+ 72,
500
+ 75
501
+ ],
502
+ "slot_index": 0
503
+ },
504
+ {
505
+ "name": "720 width",
506
+ "type": "INT",
507
+ "links": [
508
+ 89
509
+ ]
510
+ },
511
+ {
512
+ "name": "480 height",
513
+ "type": "INT",
514
+ "links": [
515
+ 90
516
+ ],
517
+ "slot_index": 2
518
+ },
519
+ {
520
+ "name": "49 count",
521
+ "type": "INT",
522
+ "links": null
523
+ }
524
+ ],
525
+ "properties": {
526
+ "Node name for S&R": "GetImageSizeAndCount"
527
+ },
528
+ "widgets_values": []
529
+ },
530
+ {
531
+ "id": 47,
532
+ "type": "EmptyLatentImage",
533
+ "pos": {
534
+ "0": 409,
535
+ "1": 77
536
+ },
537
+ "size": {
538
+ "0": 315,
539
+ "1": 106
540
+ },
541
+ "flags": {
542
+ "collapsed": true
543
+ },
544
+ "order": 10,
545
+ "mode": 0,
546
+ "inputs": [
547
+ {
548
+ "name": "width",
549
+ "type": "INT",
550
+ "link": 89,
551
+ "widget": {
552
+ "name": "width"
553
+ }
554
+ },
555
+ {
556
+ "name": "height",
557
+ "type": "INT",
558
+ "link": 90,
559
+ "widget": {
560
+ "name": "height"
561
+ }
562
+ }
563
+ ],
564
+ "outputs": [
565
+ {
566
+ "name": "LATENT",
567
+ "type": "LATENT",
568
+ "links": [
569
+ 88
570
+ ]
571
+ }
572
+ ],
573
+ "properties": {
574
+ "Node name for S&R": "EmptyLatentImage"
575
+ },
576
+ "widgets_values": [
577
+ 720,
578
+ 480,
579
+ 1
580
+ ]
581
+ },
582
+ {
583
+ "id": 46,
584
+ "type": "CogVideoSampler",
585
+ "pos": {
586
+ "0": 743,
587
+ "1": 49
588
+ },
589
+ "size": {
590
+ "0": 330,
591
+ "1": 574
592
+ },
593
+ "flags": {},
594
+ "order": 11,
595
+ "mode": 0,
596
+ "inputs": [
597
+ {
598
+ "name": "model",
599
+ "type": "COGVIDEOMODEL",
600
+ "link": 83
601
+ },
602
+ {
603
+ "name": "positive",
604
+ "type": "CONDITIONING",
605
+ "link": 84
606
+ },
607
+ {
608
+ "name": "negative",
609
+ "type": "CONDITIONING",
610
+ "link": 85
611
+ },
612
+ {
613
+ "name": "samples",
614
+ "type": "LATENT",
615
+ "link": 88,
616
+ "shape": 7
617
+ },
618
+ {
619
+ "name": "image_cond_latents",
620
+ "type": "LATENT",
621
+ "link": null,
622
+ "shape": 7
623
+ },
624
+ {
625
+ "name": "context_options",
626
+ "type": "COGCONTEXT",
627
+ "link": null,
628
+ "shape": 7
629
+ },
630
+ {
631
+ "name": "controlnet",
632
+ "type": "COGVIDECONTROLNET",
633
+ "link": 86,
634
+ "shape": 7
635
+ },
636
+ {
637
+ "name": "tora_trajectory",
638
+ "type": "TORAFEATURES",
639
+ "link": null,
640
+ "shape": 7
641
+ },
642
+ {
643
+ "name": "fastercache",
644
+ "type": "FASTERCACHEARGS",
645
+ "link": null,
646
+ "shape": 7
647
+ }
648
+ ],
649
+ "outputs": [
650
+ {
651
+ "name": "samples",
652
+ "type": "LATENT",
653
+ "links": [
654
+ 87
655
+ ]
656
+ }
657
+ ],
658
+ "properties": {
659
+ "Node name for S&R": "CogVideoSampler"
660
+ },
661
+ "widgets_values": [
662
+ 49,
663
+ 40,
664
+ 6,
665
+ 0,
666
+ "fixed",
667
+ "CogVideoXDDIM",
668
+ 1
669
+ ]
670
+ },
671
+ {
672
+ "id": 45,
673
+ "type": "CogVideoDecode",
674
+ "pos": {
675
+ "0": 758,
676
+ "1": 685
677
+ },
678
+ "size": {
679
+ "0": 315,
680
+ "1": 198
681
+ },
682
+ "flags": {},
683
+ "order": 12,
684
+ "mode": 0,
685
+ "inputs": [
686
+ {
687
+ "name": "vae",
688
+ "type": "VAE",
689
+ "link": 82
690
+ },
691
+ {
692
+ "name": "samples",
693
+ "type": "LATENT",
694
+ "link": 87
695
+ }
696
+ ],
697
+ "outputs": [
698
+ {
699
+ "name": "images",
700
+ "type": "IMAGE",
701
+ "links": [
702
+ 81
703
+ ]
704
+ }
705
+ ],
706
+ "properties": {
707
+ "Node name for S&R": "CogVideoDecode"
708
+ },
709
+ "widgets_values": [
710
+ true,
711
+ 240,
712
+ 360,
713
+ 0.2,
714
+ 0.2,
715
+ true
716
+ ]
717
+ },
718
+ {
719
+ "id": 42,
720
+ "type": "ImageConcatMulti",
721
+ "pos": {
722
+ "0": 1145,
723
+ "1": -24
724
+ },
725
+ "size": {
726
+ "0": 210,
727
+ "1": 150
728
+ },
729
+ "flags": {},
730
+ "order": 13,
731
+ "mode": 0,
732
+ "inputs": [
733
+ {
734
+ "name": "image_1",
735
+ "type": "IMAGE",
736
+ "link": 75
737
+ },
738
+ {
739
+ "name": "image_2",
740
+ "type": "IMAGE",
741
+ "link": 81
742
+ }
743
+ ],
744
+ "outputs": [
745
+ {
746
+ "name": "images",
747
+ "type": "IMAGE",
748
+ "links": [
749
+ 77
750
+ ],
751
+ "slot_index": 0
752
+ }
753
+ ],
754
+ "properties": {},
755
+ "widgets_values": [
756
+ 2,
757
+ "right",
758
+ false,
759
+ null
760
+ ]
761
+ },
762
+ {
763
+ "id": 43,
764
+ "type": "VHS_VideoCombine",
765
+ "pos": {
766
+ "0": 1154,
767
+ "1": 202
768
+ },
769
+ "size": [
770
+ 778.7022705078125,
771
+ 576.9007568359375
772
+ ],
773
+ "flags": {},
774
+ "order": 14,
775
+ "mode": 0,
776
+ "inputs": [
777
+ {
778
+ "name": "images",
779
+ "type": "IMAGE",
780
+ "link": 77
781
+ },
782
+ {
783
+ "name": "audio",
784
+ "type": "AUDIO",
785
+ "link": null,
786
+ "shape": 7
787
+ },
788
+ {
789
+ "name": "meta_batch",
790
+ "type": "VHS_BatchManager",
791
+ "link": null,
792
+ "shape": 7
793
+ },
794
+ {
795
+ "name": "vae",
796
+ "type": "VAE",
797
+ "link": null,
798
+ "shape": 7
799
+ }
800
+ ],
801
+ "outputs": [
802
+ {
803
+ "name": "Filenames",
804
+ "type": "VHS_FILENAMES",
805
+ "links": null,
806
+ "shape": 3
807
+ }
808
+ ],
809
+ "properties": {
810
+ "Node name for S&R": "VHS_VideoCombine"
811
+ },
812
+ "widgets_values": {
813
+ "frame_rate": 8,
814
+ "loop_count": 0,
815
+ "filename_prefix": "CogVideoX_2b_controlnet",
816
+ "format": "video/h264-mp4",
817
+ "pix_fmt": "yuv420p",
818
+ "crf": 19,
819
+ "save_metadata": true,
820
+ "pingpong": false,
821
+ "save_output": true,
822
+ "videopreview": {
823
+ "hidden": false,
824
+ "paused": false,
825
+ "params": {
826
+ "filename": "CogVideoX2B_controlnet_00003.mp4",
827
+ "subfolder": "",
828
+ "type": "temp",
829
+ "format": "video/h264-mp4",
830
+ "frame_rate": 8
831
+ },
832
+ "muted": false
833
+ }
834
+ }
835
+ }
836
+ ],
837
+ "links": [
838
+ [
839
+ 54,
840
+ 20,
841
+ 0,
842
+ 30,
843
+ 0,
844
+ "CLIP"
845
+ ],
846
+ [
847
+ 67,
848
+ 35,
849
+ 0,
850
+ 37,
851
+ 0,
852
+ "COGVIDECONTROLNETMODEL"
853
+ ],
854
+ [
855
+ 71,
856
+ 39,
857
+ 0,
858
+ 40,
859
+ 0,
860
+ "IMAGE"
861
+ ],
862
+ [
863
+ 72,
864
+ 40,
865
+ 0,
866
+ 37,
867
+ 1,
868
+ "IMAGE"
869
+ ],
870
+ [
871
+ 73,
872
+ 38,
873
+ 0,
874
+ 41,
875
+ 0,
876
+ "IMAGE"
877
+ ],
878
+ [
879
+ 74,
880
+ 41,
881
+ 0,
882
+ 39,
883
+ 0,
884
+ "IMAGE"
885
+ ],
886
+ [
887
+ 75,
888
+ 40,
889
+ 0,
890
+ 42,
891
+ 0,
892
+ "IMAGE"
893
+ ],
894
+ [
895
+ 77,
896
+ 42,
897
+ 0,
898
+ 43,
899
+ 0,
900
+ "IMAGE"
901
+ ],
902
+ [
903
+ 78,
904
+ 30,
905
+ 1,
906
+ 31,
907
+ 0,
908
+ "CLIP"
909
+ ],
910
+ [
911
+ 81,
912
+ 45,
913
+ 0,
914
+ 42,
915
+ 1,
916
+ "IMAGE"
917
+ ],
918
+ [
919
+ 82,
920
+ 44,
921
+ 1,
922
+ 45,
923
+ 0,
924
+ "VAE"
925
+ ],
926
+ [
927
+ 83,
928
+ 44,
929
+ 0,
930
+ 46,
931
+ 0,
932
+ "COGVIDEOMODEL"
933
+ ],
934
+ [
935
+ 84,
936
+ 30,
937
+ 0,
938
+ 46,
939
+ 1,
940
+ "CONDITIONING"
941
+ ],
942
+ [
943
+ 85,
944
+ 31,
945
+ 0,
946
+ 46,
947
+ 2,
948
+ "CONDITIONING"
949
+ ],
950
+ [
951
+ 86,
952
+ 37,
953
+ 0,
954
+ 46,
955
+ 6,
956
+ "COGVIDECONTROLNET"
957
+ ],
958
+ [
959
+ 87,
960
+ 46,
961
+ 0,
962
+ 45,
963
+ 1,
964
+ "LATENT"
965
+ ],
966
+ [
967
+ 88,
968
+ 47,
969
+ 0,
970
+ 46,
971
+ 3,
972
+ "LATENT"
973
+ ],
974
+ [
975
+ 89,
976
+ 40,
977
+ 1,
978
+ 47,
979
+ 0,
980
+ "INT"
981
+ ],
982
+ [
983
+ 90,
984
+ 40,
985
+ 2,
986
+ 47,
987
+ 1,
988
+ "INT"
989
+ ]
990
+ ],
991
+ "groups": [],
992
+ "config": {},
993
+ "extra": {
994
+ "ds": {
995
+ "scale": 0.7627768444387069,
996
+ "offset": [
997
+ 1075.4957551311677,
998
+ 398.4420252790512
999
+ ]
1000
+ }
1001
+ },
1002
+ "version": 0.4
1003
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_02.json ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 63,
3
+ "last_link_id": 149,
4
+ "nodes": [
5
+ {
6
+ "id": 31,
7
+ "type": "CogVideoTextEncode",
8
+ "pos": {
9
+ "0": 497,
10
+ "1": 520
11
+ },
12
+ "size": {
13
+ "0": 463.01251220703125,
14
+ "1": 144
15
+ },
16
+ "flags": {},
17
+ "order": 6,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 149
24
+ }
25
+ ],
26
+ "outputs": [
27
+ {
28
+ "name": "conditioning",
29
+ "type": "CONDITIONING",
30
+ "links": [
31
+ 146
32
+ ],
33
+ "slot_index": 0,
34
+ "shape": 3
35
+ },
36
+ {
37
+ "name": "clip",
38
+ "type": "CLIP",
39
+ "links": null
40
+ }
41
+ ],
42
+ "properties": {
43
+ "Node name for S&R": "CogVideoTextEncode"
44
+ },
45
+ "widgets_values": [
46
+ "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
47
+ 1,
48
+ true
49
+ ]
50
+ },
51
+ {
52
+ "id": 63,
53
+ "type": "CogVideoSampler",
54
+ "pos": {
55
+ "0": 1142,
56
+ "1": 74
57
+ },
58
+ "size": [
59
+ 330,
60
+ 574
61
+ ],
62
+ "flags": {},
63
+ "order": 7,
64
+ "mode": 0,
65
+ "inputs": [
66
+ {
67
+ "name": "model",
68
+ "type": "COGVIDEOMODEL",
69
+ "link": 144
70
+ },
71
+ {
72
+ "name": "positive",
73
+ "type": "CONDITIONING",
74
+ "link": 145
75
+ },
76
+ {
77
+ "name": "negative",
78
+ "type": "CONDITIONING",
79
+ "link": 146
80
+ },
81
+ {
82
+ "name": "samples",
83
+ "type": "LATENT",
84
+ "link": null,
85
+ "shape": 7
86
+ },
87
+ {
88
+ "name": "image_cond_latents",
89
+ "type": "LATENT",
90
+ "link": 147,
91
+ "shape": 7
92
+ },
93
+ {
94
+ "name": "context_options",
95
+ "type": "COGCONTEXT",
96
+ "link": null,
97
+ "shape": 7
98
+ },
99
+ {
100
+ "name": "controlnet",
101
+ "type": "COGVIDECONTROLNET",
102
+ "link": null,
103
+ "shape": 7
104
+ },
105
+ {
106
+ "name": "tora_trajectory",
107
+ "type": "TORAFEATURES",
108
+ "link": null,
109
+ "shape": 7
110
+ },
111
+ {
112
+ "name": "fastercache",
113
+ "type": "FASTERCACHEARGS",
114
+ "link": null,
115
+ "shape": 7
116
+ }
117
+ ],
118
+ "outputs": [
119
+ {
120
+ "name": "samples",
121
+ "type": "LATENT",
122
+ "links": [
123
+ 148
124
+ ]
125
+ }
126
+ ],
127
+ "properties": {
128
+ "Node name for S&R": "CogVideoSampler"
129
+ },
130
+ "widgets_values": [
131
+ 49,
132
+ 25,
133
+ 6,
134
+ 0,
135
+ "fixed",
136
+ "CogVideoXDDIM",
137
+ 1
138
+ ]
139
+ },
140
+ {
141
+ "id": 62,
142
+ "type": "CogVideoImageEncode",
143
+ "pos": {
144
+ "0": 1149,
145
+ "1": 711
146
+ },
147
+ "size": {
148
+ "0": 315,
149
+ "1": 122
150
+ },
151
+ "flags": {},
152
+ "order": 5,
153
+ "mode": 0,
154
+ "inputs": [
155
+ {
156
+ "name": "vae",
157
+ "type": "VAE",
158
+ "link": 141
159
+ },
160
+ {
161
+ "name": "start_image",
162
+ "type": "IMAGE",
163
+ "link": 142
164
+ },
165
+ {
166
+ "name": "end_image",
167
+ "type": "IMAGE",
168
+ "link": null,
169
+ "shape": 7
170
+ }
171
+ ],
172
+ "outputs": [
173
+ {
174
+ "name": "samples",
175
+ "type": "LATENT",
176
+ "links": [
177
+ 147
178
+ ]
179
+ }
180
+ ],
181
+ "properties": {
182
+ "Node name for S&R": "CogVideoImageEncode"
183
+ },
184
+ "widgets_values": [
185
+ false,
186
+ 0
187
+ ]
188
+ },
189
+ {
190
+ "id": 59,
191
+ "type": "DownloadAndLoadCogVideoModel",
192
+ "pos": {
193
+ "0": 622,
194
+ "1": -25
195
+ },
196
+ "size": {
197
+ "0": 315,
198
+ "1": 218
199
+ },
200
+ "flags": {},
201
+ "order": 0,
202
+ "mode": 0,
203
+ "inputs": [
204
+ {
205
+ "name": "block_edit",
206
+ "type": "TRANSFORMERBLOCKS",
207
+ "link": null,
208
+ "shape": 7
209
+ },
210
+ {
211
+ "name": "lora",
212
+ "type": "COGLORA",
213
+ "link": null,
214
+ "shape": 7
215
+ },
216
+ {
217
+ "name": "compile_args",
218
+ "type": "COMPILEARGS",
219
+ "link": null,
220
+ "shape": 7
221
+ }
222
+ ],
223
+ "outputs": [
224
+ {
225
+ "name": "model",
226
+ "type": "COGVIDEOMODEL",
227
+ "links": [
228
+ 144
229
+ ]
230
+ },
231
+ {
232
+ "name": "vae",
233
+ "type": "VAE",
234
+ "links": [
235
+ 132,
236
+ 141
237
+ ],
238
+ "slot_index": 1
239
+ }
240
+ ],
241
+ "properties": {
242
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
243
+ },
244
+ "widgets_values": [
245
+ "THUDM/CogVideoX-5b-I2V",
246
+ "bf16",
247
+ "disabled",
248
+ false,
249
+ "sdpa",
250
+ "main_device"
251
+ ]
252
+ },
253
+ {
254
+ "id": 30,
255
+ "type": "CogVideoTextEncode",
256
+ "pos": {
257
+ "0": 493,
258
+ "1": 303
259
+ },
260
+ "size": {
261
+ "0": 471.90142822265625,
262
+ "1": 168.08047485351562
263
+ },
264
+ "flags": {},
265
+ "order": 4,
266
+ "mode": 0,
267
+ "inputs": [
268
+ {
269
+ "name": "clip",
270
+ "type": "CLIP",
271
+ "link": 54
272
+ }
273
+ ],
274
+ "outputs": [
275
+ {
276
+ "name": "conditioning",
277
+ "type": "CONDITIONING",
278
+ "links": [
279
+ 145
280
+ ],
281
+ "slot_index": 0,
282
+ "shape": 3
283
+ },
284
+ {
285
+ "name": "clip",
286
+ "type": "CLIP",
287
+ "links": [
288
+ 149
289
+ ],
290
+ "slot_index": 1
291
+ }
292
+ ],
293
+ "properties": {
294
+ "Node name for S&R": "CogVideoTextEncode"
295
+ },
296
+ "widgets_values": [
297
+ "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
298
+ 1,
299
+ false
300
+ ]
301
+ },
302
+ {
303
+ "id": 37,
304
+ "type": "ImageResizeKJ",
305
+ "pos": {
306
+ "0": 784,
307
+ "1": 731
308
+ },
309
+ "size": {
310
+ "0": 315,
311
+ "1": 266
312
+ },
313
+ "flags": {},
314
+ "order": 3,
315
+ "mode": 0,
316
+ "inputs": [
317
+ {
318
+ "name": "image",
319
+ "type": "IMAGE",
320
+ "link": 71
321
+ },
322
+ {
323
+ "name": "get_image_size",
324
+ "type": "IMAGE",
325
+ "link": null,
326
+ "shape": 7
327
+ },
328
+ {
329
+ "name": "width_input",
330
+ "type": "INT",
331
+ "link": null,
332
+ "widget": {
333
+ "name": "width_input"
334
+ }
335
+ },
336
+ {
337
+ "name": "height_input",
338
+ "type": "INT",
339
+ "link": null,
340
+ "widget": {
341
+ "name": "height_input"
342
+ }
343
+ }
344
+ ],
345
+ "outputs": [
346
+ {
347
+ "name": "IMAGE",
348
+ "type": "IMAGE",
349
+ "links": [
350
+ 142
351
+ ],
352
+ "slot_index": 0,
353
+ "shape": 3
354
+ },
355
+ {
356
+ "name": "width",
357
+ "type": "INT",
358
+ "links": null,
359
+ "shape": 3
360
+ },
361
+ {
362
+ "name": "height",
363
+ "type": "INT",
364
+ "links": null,
365
+ "shape": 3
366
+ }
367
+ ],
368
+ "properties": {
369
+ "Node name for S&R": "ImageResizeKJ"
370
+ },
371
+ "widgets_values": [
372
+ 720,
373
+ 480,
374
+ "lanczos",
375
+ false,
376
+ 16,
377
+ 0,
378
+ 0,
379
+ "disabled"
380
+ ]
381
+ },
382
+ {
383
+ "id": 36,
384
+ "type": "LoadImage",
385
+ "pos": {
386
+ "0": 335,
387
+ "1": 731
388
+ },
389
+ "size": {
390
+ "0": 402.06353759765625,
391
+ "1": 396.6225891113281
392
+ },
393
+ "flags": {},
394
+ "order": 1,
395
+ "mode": 0,
396
+ "inputs": [],
397
+ "outputs": [
398
+ {
399
+ "name": "IMAGE",
400
+ "type": "IMAGE",
401
+ "links": [
402
+ 71
403
+ ],
404
+ "slot_index": 0,
405
+ "shape": 3
406
+ },
407
+ {
408
+ "name": "MASK",
409
+ "type": "MASK",
410
+ "links": null,
411
+ "shape": 3
412
+ }
413
+ ],
414
+ "properties": {
415
+ "Node name for S&R": "LoadImage"
416
+ },
417
+ "widgets_values": [
418
+ "sd3stag.png",
419
+ "image"
420
+ ]
421
+ },
422
+ {
423
+ "id": 20,
424
+ "type": "CLIPLoader",
425
+ "pos": {
426
+ "0": -2,
427
+ "1": 304
428
+ },
429
+ "size": {
430
+ "0": 451.30548095703125,
431
+ "1": 82
432
+ },
433
+ "flags": {},
434
+ "order": 2,
435
+ "mode": 0,
436
+ "inputs": [],
437
+ "outputs": [
438
+ {
439
+ "name": "CLIP",
440
+ "type": "CLIP",
441
+ "links": [
442
+ 54
443
+ ],
444
+ "slot_index": 0,
445
+ "shape": 3
446
+ }
447
+ ],
448
+ "properties": {
449
+ "Node name for S&R": "CLIPLoader"
450
+ },
451
+ "widgets_values": [
452
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
453
+ "sd3"
454
+ ]
455
+ },
456
+ {
457
+ "id": 60,
458
+ "type": "CogVideoDecode",
459
+ "pos": {
460
+ "0": 1523,
461
+ "1": -6
462
+ },
463
+ "size": {
464
+ "0": 315,
465
+ "1": 198
466
+ },
467
+ "flags": {},
468
+ "order": 8,
469
+ "mode": 0,
470
+ "inputs": [
471
+ {
472
+ "name": "vae",
473
+ "type": "VAE",
474
+ "link": 132
475
+ },
476
+ {
477
+ "name": "samples",
478
+ "type": "LATENT",
479
+ "link": 148
480
+ }
481
+ ],
482
+ "outputs": [
483
+ {
484
+ "name": "images",
485
+ "type": "IMAGE",
486
+ "links": [
487
+ 134
488
+ ]
489
+ }
490
+ ],
491
+ "properties": {
492
+ "Node name for S&R": "CogVideoDecode"
493
+ },
494
+ "widgets_values": [
495
+ true,
496
+ 240,
497
+ 360,
498
+ 0.2,
499
+ 0.2,
500
+ true
501
+ ]
502
+ },
503
+ {
504
+ "id": 44,
505
+ "type": "VHS_VideoCombine",
506
+ "pos": {
507
+ "0": 1884,
508
+ "1": -6
509
+ },
510
+ "size": [
511
+ 605.3909912109375,
512
+ 714.2606608072917
513
+ ],
514
+ "flags": {},
515
+ "order": 9,
516
+ "mode": 0,
517
+ "inputs": [
518
+ {
519
+ "name": "images",
520
+ "type": "IMAGE",
521
+ "link": 134
522
+ },
523
+ {
524
+ "name": "audio",
525
+ "type": "AUDIO",
526
+ "link": null,
527
+ "shape": 7
528
+ },
529
+ {
530
+ "name": "meta_batch",
531
+ "type": "VHS_BatchManager",
532
+ "link": null,
533
+ "shape": 7
534
+ },
535
+ {
536
+ "name": "vae",
537
+ "type": "VAE",
538
+ "link": null,
539
+ "shape": 7
540
+ }
541
+ ],
542
+ "outputs": [
543
+ {
544
+ "name": "Filenames",
545
+ "type": "VHS_FILENAMES",
546
+ "links": null,
547
+ "shape": 3
548
+ }
549
+ ],
550
+ "properties": {
551
+ "Node name for S&R": "VHS_VideoCombine"
552
+ },
553
+ "widgets_values": {
554
+ "frame_rate": 8,
555
+ "loop_count": 0,
556
+ "filename_prefix": "CogVideoX-I2V",
557
+ "format": "video/h264-mp4",
558
+ "pix_fmt": "yuv420p",
559
+ "crf": 19,
560
+ "save_metadata": true,
561
+ "pingpong": false,
562
+ "save_output": true,
563
+ "videopreview": {
564
+ "hidden": false,
565
+ "paused": false,
566
+ "params": {
567
+ "filename": "CogVideoX-I2V_00001.mp4",
568
+ "subfolder": "",
569
+ "type": "temp",
570
+ "format": "video/h264-mp4",
571
+ "frame_rate": 8
572
+ },
573
+ "muted": false
574
+ }
575
+ }
576
+ }
577
+ ],
578
+ "links": [
579
+ [
580
+ 54,
581
+ 20,
582
+ 0,
583
+ 30,
584
+ 0,
585
+ "CLIP"
586
+ ],
587
+ [
588
+ 71,
589
+ 36,
590
+ 0,
591
+ 37,
592
+ 0,
593
+ "IMAGE"
594
+ ],
595
+ [
596
+ 132,
597
+ 59,
598
+ 1,
599
+ 60,
600
+ 0,
601
+ "VAE"
602
+ ],
603
+ [
604
+ 134,
605
+ 60,
606
+ 0,
607
+ 44,
608
+ 0,
609
+ "IMAGE"
610
+ ],
611
+ [
612
+ 141,
613
+ 59,
614
+ 1,
615
+ 62,
616
+ 0,
617
+ "VAE"
618
+ ],
619
+ [
620
+ 142,
621
+ 37,
622
+ 0,
623
+ 62,
624
+ 1,
625
+ "IMAGE"
626
+ ],
627
+ [
628
+ 144,
629
+ 59,
630
+ 0,
631
+ 63,
632
+ 0,
633
+ "COGVIDEOMODEL"
634
+ ],
635
+ [
636
+ 145,
637
+ 30,
638
+ 0,
639
+ 63,
640
+ 1,
641
+ "CONDITIONING"
642
+ ],
643
+ [
644
+ 146,
645
+ 31,
646
+ 0,
647
+ 63,
648
+ 2,
649
+ "CONDITIONING"
650
+ ],
651
+ [
652
+ 147,
653
+ 62,
654
+ 0,
655
+ 63,
656
+ 4,
657
+ "LATENT"
658
+ ],
659
+ [
660
+ 148,
661
+ 63,
662
+ 0,
663
+ 60,
664
+ 1,
665
+ "LATENT"
666
+ ],
667
+ [
668
+ 149,
669
+ 30,
670
+ 1,
671
+ 31,
672
+ 0,
673
+ "CLIP"
674
+ ]
675
+ ],
676
+ "groups": [],
677
+ "config": {},
678
+ "extra": {
679
+ "ds": {
680
+ "scale": 0.7627768444387059,
681
+ "offset": [
682
+ 648.7113591814891,
683
+ 185.9907078691075
684
+ ]
685
+ }
686
+ },
687
+ "version": 0.4
688
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_Tora_02.json ADDED
The diff for this file is too large to render. See raw diff
 
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_I2V_noise_warp_01.json ADDED
@@ -0,0 +1,1291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 84,
3
+ "last_link_id": 190,
4
+ "nodes": [
5
+ {
6
+ "id": 31,
7
+ "type": "CogVideoTextEncode",
8
+ "pos": [
9
+ 497,
10
+ 520
11
+ ],
12
+ "size": [
13
+ 463.01251220703125,
14
+ 144
15
+ ],
16
+ "flags": {},
17
+ "order": 10,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 149
24
+ }
25
+ ],
26
+ "outputs": [
27
+ {
28
+ "name": "conditioning",
29
+ "type": "CONDITIONING",
30
+ "links": [
31
+ 146
32
+ ],
33
+ "slot_index": 0,
34
+ "shape": 3
35
+ },
36
+ {
37
+ "name": "clip",
38
+ "type": "CLIP",
39
+ "links": null
40
+ }
41
+ ],
42
+ "properties": {
43
+ "Node name for S&R": "CogVideoTextEncode"
44
+ },
45
+ "widgets_values": [
46
+ "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
47
+ 1,
48
+ true
49
+ ]
50
+ },
51
+ {
52
+ "id": 20,
53
+ "type": "CLIPLoader",
54
+ "pos": [
55
+ -2,
56
+ 304
57
+ ],
58
+ "size": [
59
+ 451.30548095703125,
60
+ 82
61
+ ],
62
+ "flags": {},
63
+ "order": 0,
64
+ "mode": 0,
65
+ "inputs": [],
66
+ "outputs": [
67
+ {
68
+ "name": "CLIP",
69
+ "type": "CLIP",
70
+ "links": [
71
+ 54
72
+ ],
73
+ "slot_index": 0,
74
+ "shape": 3
75
+ }
76
+ ],
77
+ "properties": {
78
+ "Node name for S&R": "CLIPLoader"
79
+ },
80
+ "widgets_values": [
81
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
82
+ "sd3",
83
+ "default"
84
+ ]
85
+ },
86
+ {
87
+ "id": 74,
88
+ "type": "ImageConcatMulti",
89
+ "pos": [
90
+ 1787.351318359375,
91
+ 513.0852661132812
92
+ ],
93
+ "size": [
94
+ 210,
95
+ 150
96
+ ],
97
+ "flags": {},
98
+ "order": 19,
99
+ "mode": 0,
100
+ "inputs": [
101
+ {
102
+ "name": "image_1",
103
+ "type": "IMAGE",
104
+ "link": 171
105
+ },
106
+ {
107
+ "name": "image_2",
108
+ "type": "IMAGE",
109
+ "link": 184
110
+ }
111
+ ],
112
+ "outputs": [
113
+ {
114
+ "name": "images",
115
+ "type": "IMAGE",
116
+ "links": [
117
+ 170
118
+ ],
119
+ "slot_index": 0
120
+ }
121
+ ],
122
+ "properties": {},
123
+ "widgets_values": [
124
+ 2,
125
+ "right",
126
+ false,
127
+ null
128
+ ]
129
+ },
130
+ {
131
+ "id": 60,
132
+ "type": "CogVideoDecode",
133
+ "pos": [
134
+ 1518.4959716796875,
135
+ -16.81044578552246
136
+ ],
137
+ "size": [
138
+ 315,
139
+ 198
140
+ ],
141
+ "flags": {},
142
+ "order": 18,
143
+ "mode": 0,
144
+ "inputs": [
145
+ {
146
+ "name": "vae",
147
+ "type": "VAE",
148
+ "link": 132
149
+ },
150
+ {
151
+ "name": "samples",
152
+ "type": "LATENT",
153
+ "link": 148
154
+ }
155
+ ],
156
+ "outputs": [
157
+ {
158
+ "name": "images",
159
+ "type": "IMAGE",
160
+ "links": [
161
+ 184
162
+ ],
163
+ "slot_index": 0
164
+ }
165
+ ],
166
+ "properties": {
167
+ "Node name for S&R": "CogVideoDecode"
168
+ },
169
+ "widgets_values": [
170
+ true,
171
+ 240,
172
+ 360,
173
+ 0.2,
174
+ 0.2,
175
+ true
176
+ ]
177
+ },
178
+ {
179
+ "id": 72,
180
+ "type": "CogVideoLoraSelect",
181
+ "pos": [
182
+ 149.58236694335938,
183
+ -19.5003604888916
184
+ ],
185
+ "size": [
186
+ 429.9602355957031,
187
+ 108.1800765991211
188
+ ],
189
+ "flags": {},
190
+ "order": 1,
191
+ "mode": 0,
192
+ "inputs": [
193
+ {
194
+ "name": "prev_lora",
195
+ "type": "COGLORA",
196
+ "link": null,
197
+ "shape": 7
198
+ }
199
+ ],
200
+ "outputs": [
201
+ {
202
+ "name": "lora",
203
+ "type": "COGLORA",
204
+ "links": [
205
+ 174
206
+ ],
207
+ "slot_index": 0
208
+ }
209
+ ],
210
+ "properties": {
211
+ "Node name for S&R": "CogVideoLoraSelect"
212
+ },
213
+ "widgets_values": [
214
+ "I2V5B_final_i30000_lora_weights.safetensors",
215
+ 1,
216
+ false
217
+ ]
218
+ },
219
+ {
220
+ "id": 59,
221
+ "type": "DownloadAndLoadCogVideoModel",
222
+ "pos": [
223
+ 620.1983032226562,
224
+ -39.41391372680664
225
+ ],
226
+ "size": [
227
+ 315,
228
+ 218
229
+ ],
230
+ "flags": {},
231
+ "order": 8,
232
+ "mode": 0,
233
+ "inputs": [
234
+ {
235
+ "name": "block_edit",
236
+ "type": "TRANSFORMERBLOCKS",
237
+ "link": null,
238
+ "shape": 7
239
+ },
240
+ {
241
+ "name": "lora",
242
+ "type": "COGLORA",
243
+ "link": 174,
244
+ "shape": 7
245
+ },
246
+ {
247
+ "name": "compile_args",
248
+ "type": "COMPILEARGS",
249
+ "link": null,
250
+ "shape": 7
251
+ }
252
+ ],
253
+ "outputs": [
254
+ {
255
+ "name": "model",
256
+ "type": "COGVIDEOMODEL",
257
+ "links": [
258
+ 144
259
+ ]
260
+ },
261
+ {
262
+ "name": "vae",
263
+ "type": "VAE",
264
+ "links": [
265
+ 132,
266
+ 141,
267
+ 165
268
+ ],
269
+ "slot_index": 1
270
+ }
271
+ ],
272
+ "properties": {
273
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
274
+ },
275
+ "widgets_values": [
276
+ "THUDM/CogVideoX-5b-I2V",
277
+ "bf16",
278
+ "disabled",
279
+ false,
280
+ "sageattn",
281
+ "main_device"
282
+ ]
283
+ },
284
+ {
285
+ "id": 30,
286
+ "type": "CogVideoTextEncode",
287
+ "pos": [
288
+ 493,
289
+ 303
290
+ ],
291
+ "size": [
292
+ 471.90142822265625,
293
+ 168.08047485351562
294
+ ],
295
+ "flags": {},
296
+ "order": 7,
297
+ "mode": 0,
298
+ "inputs": [
299
+ {
300
+ "name": "clip",
301
+ "type": "CLIP",
302
+ "link": 54
303
+ }
304
+ ],
305
+ "outputs": [
306
+ {
307
+ "name": "conditioning",
308
+ "type": "CONDITIONING",
309
+ "links": [
310
+ 145
311
+ ],
312
+ "slot_index": 0,
313
+ "shape": 3
314
+ },
315
+ {
316
+ "name": "clip",
317
+ "type": "CLIP",
318
+ "links": [
319
+ 149
320
+ ],
321
+ "slot_index": 1
322
+ }
323
+ ],
324
+ "properties": {
325
+ "Node name for S&R": "CogVideoTextEncode"
326
+ },
327
+ "widgets_values": [
328
+ "mouse knight walking in a forest",
329
+ 1,
330
+ false
331
+ ]
332
+ },
333
+ {
334
+ "id": 63,
335
+ "type": "CogVideoSampler",
336
+ "pos": [
337
+ 1144.7025146484375,
338
+ 55.98257064819336
339
+ ],
340
+ "size": [
341
+ 330,
342
+ 594
343
+ ],
344
+ "flags": {},
345
+ "order": 17,
346
+ "mode": 0,
347
+ "inputs": [
348
+ {
349
+ "name": "model",
350
+ "type": "COGVIDEOMODEL",
351
+ "link": 144
352
+ },
353
+ {
354
+ "name": "positive",
355
+ "type": "CONDITIONING",
356
+ "link": 145
357
+ },
358
+ {
359
+ "name": "negative",
360
+ "type": "CONDITIONING",
361
+ "link": 146
362
+ },
363
+ {
364
+ "name": "samples",
365
+ "type": "LATENT",
366
+ "link": 164,
367
+ "shape": 7
368
+ },
369
+ {
370
+ "name": "image_cond_latents",
371
+ "type": "LATENT",
372
+ "link": 147,
373
+ "shape": 7
374
+ },
375
+ {
376
+ "name": "context_options",
377
+ "type": "COGCONTEXT",
378
+ "link": null,
379
+ "shape": 7
380
+ },
381
+ {
382
+ "name": "controlnet",
383
+ "type": "COGVIDECONTROLNET",
384
+ "link": null,
385
+ "shape": 7
386
+ },
387
+ {
388
+ "name": "tora_trajectory",
389
+ "type": "TORAFEATURES",
390
+ "link": null,
391
+ "shape": 7
392
+ },
393
+ {
394
+ "name": "fastercache",
395
+ "type": "FASTERCACHEARGS",
396
+ "link": null,
397
+ "shape": 7
398
+ },
399
+ {
400
+ "name": "feta_args",
401
+ "type": "FETAARGS",
402
+ "link": null,
403
+ "shape": 7
404
+ }
405
+ ],
406
+ "outputs": [
407
+ {
408
+ "name": "samples",
409
+ "type": "LATENT",
410
+ "links": [
411
+ 148
412
+ ]
413
+ }
414
+ ],
415
+ "properties": {
416
+ "Node name for S&R": "CogVideoSampler"
417
+ },
418
+ "widgets_values": [
419
+ 49,
420
+ 25,
421
+ 6,
422
+ 0,
423
+ "fixed",
424
+ "CogVideoXDDIM",
425
+ 1
426
+ ]
427
+ },
428
+ {
429
+ "id": 79,
430
+ "type": "Note",
431
+ "pos": [
432
+ 141.44003295898438,
433
+ -129.33815002441406
434
+ ],
435
+ "size": [
436
+ 436.1673889160156,
437
+ 58
438
+ ],
439
+ "flags": {},
440
+ "order": 2,
441
+ "mode": 0,
442
+ "inputs": [],
443
+ "outputs": [],
444
+ "properties": {},
445
+ "widgets_values": [
446
+ "https://huggingface.co/VGenAI-Netflix-Eyeline-Research/Go-with-the-Flow/blob/main/I2V5B_final_i38800_nearest_lora_weights.safetensors"
447
+ ],
448
+ "color": "#432",
449
+ "bgcolor": "#653"
450
+ },
451
+ {
452
+ "id": 76,
453
+ "type": "VHS_VideoCombine",
454
+ "pos": [
455
+ 1955.22119140625,
456
+ 841.7718505859375
457
+ ],
458
+ "size": [
459
+ 1141.2095947265625,
460
+ 1095.4730224609375
461
+ ],
462
+ "flags": {},
463
+ "order": 16,
464
+ "mode": 2,
465
+ "inputs": [
466
+ {
467
+ "name": "images",
468
+ "type": "IMAGE",
469
+ "link": 185
470
+ },
471
+ {
472
+ "name": "audio",
473
+ "type": "AUDIO",
474
+ "link": null,
475
+ "shape": 7
476
+ },
477
+ {
478
+ "name": "meta_batch",
479
+ "type": "VHS_BatchManager",
480
+ "link": null,
481
+ "shape": 7
482
+ },
483
+ {
484
+ "name": "vae",
485
+ "type": "VAE",
486
+ "link": null,
487
+ "shape": 7
488
+ }
489
+ ],
490
+ "outputs": [
491
+ {
492
+ "name": "Filenames",
493
+ "type": "VHS_FILENAMES",
494
+ "links": null,
495
+ "shape": 3
496
+ }
497
+ ],
498
+ "properties": {
499
+ "Node name for S&R": "VHS_VideoCombine"
500
+ },
501
+ "widgets_values": {
502
+ "frame_rate": 8,
503
+ "loop_count": 0,
504
+ "filename_prefix": "CogVideoX-I2V",
505
+ "format": "video/h264-mp4",
506
+ "pix_fmt": "yuv420p",
507
+ "crf": 19,
508
+ "save_metadata": true,
509
+ "trim_to_audio": false,
510
+ "pingpong": false,
511
+ "save_output": false,
512
+ "videopreview": {
513
+ "hidden": false,
514
+ "paused": false,
515
+ "params": {
516
+ "filename": "CogVideoX-I2V_00001.mp4",
517
+ "subfolder": "",
518
+ "type": "temp",
519
+ "format": "video/h264-mp4",
520
+ "frame_rate": 8,
521
+ "workflow": "CogVideoX-I2V_00001.png",
522
+ "fullpath": "N:\\AI\\ComfyUI\\temp\\CogVideoX-I2V_00001.mp4"
523
+ },
524
+ "muted": false
525
+ }
526
+ }
527
+ },
528
+ {
529
+ "id": 80,
530
+ "type": "Note",
531
+ "pos": [
532
+ 1648.847900390625,
533
+ 1100.5545654296875
534
+ ],
535
+ "size": [
536
+ 249.00543212890625,
537
+ 58
538
+ ],
539
+ "flags": {},
540
+ "order": 3,
541
+ "mode": 0,
542
+ "inputs": [],
543
+ "outputs": [],
544
+ "properties": {},
545
+ "widgets_values": [
546
+ "This is just for testing the noise"
547
+ ],
548
+ "color": "#432",
549
+ "bgcolor": "#653"
550
+ },
551
+ {
552
+ "id": 73,
553
+ "type": "CogVideoDecode",
554
+ "pos": [
555
+ 1567.16064453125,
556
+ 842.2813110351562
557
+ ],
558
+ "size": [
559
+ 315,
560
+ 198
561
+ ],
562
+ "flags": {},
563
+ "order": 14,
564
+ "mode": 2,
565
+ "inputs": [
566
+ {
567
+ "name": "vae",
568
+ "type": "VAE",
569
+ "link": 165
570
+ },
571
+ {
572
+ "name": "samples",
573
+ "type": "LATENT",
574
+ "link": 167
575
+ }
576
+ ],
577
+ "outputs": [
578
+ {
579
+ "name": "images",
580
+ "type": "IMAGE",
581
+ "links": [
582
+ 185
583
+ ],
584
+ "slot_index": 0
585
+ }
586
+ ],
587
+ "properties": {
588
+ "Node name for S&R": "CogVideoDecode"
589
+ },
590
+ "widgets_values": [
591
+ true,
592
+ 240,
593
+ 360,
594
+ 0.2,
595
+ 0.2,
596
+ true
597
+ ]
598
+ },
599
+ {
600
+ "id": 68,
601
+ "type": "GetImageSizeAndCount",
602
+ "pos": [
603
+ -195.5599822998047,
604
+ 1273.8702392578125
605
+ ],
606
+ "size": [
607
+ 277.20001220703125,
608
+ 86
609
+ ],
610
+ "flags": {},
611
+ "order": 9,
612
+ "mode": 0,
613
+ "inputs": [
614
+ {
615
+ "name": "image",
616
+ "type": "IMAGE",
617
+ "link": 181
618
+ }
619
+ ],
620
+ "outputs": [
621
+ {
622
+ "name": "image",
623
+ "type": "IMAGE",
624
+ "links": [
625
+ 178
626
+ ],
627
+ "slot_index": 0
628
+ },
629
+ {
630
+ "name": "1024 width",
631
+ "type": "INT",
632
+ "links": null
633
+ },
634
+ {
635
+ "name": "768 height",
636
+ "type": "INT",
637
+ "links": null
638
+ },
639
+ {
640
+ "name": "49 count",
641
+ "type": "INT",
642
+ "links": null
643
+ }
644
+ ],
645
+ "properties": {
646
+ "Node name for S&R": "GetImageSizeAndCount"
647
+ }
648
+ },
649
+ {
650
+ "id": 62,
651
+ "type": "CogVideoImageEncode",
652
+ "pos": [
653
+ 612.8922729492188,
654
+ 751.6295776367188
655
+ ],
656
+ "size": [
657
+ 315,
658
+ 194
659
+ ],
660
+ "flags": {},
661
+ "order": 15,
662
+ "mode": 0,
663
+ "inputs": [
664
+ {
665
+ "name": "vae",
666
+ "type": "VAE",
667
+ "link": 141
668
+ },
669
+ {
670
+ "name": "start_image",
671
+ "type": "IMAGE",
672
+ "link": 190
673
+ },
674
+ {
675
+ "name": "end_image",
676
+ "type": "IMAGE",
677
+ "link": null,
678
+ "shape": 7
679
+ }
680
+ ],
681
+ "outputs": [
682
+ {
683
+ "name": "samples",
684
+ "type": "LATENT",
685
+ "links": [
686
+ 147
687
+ ]
688
+ }
689
+ ],
690
+ "properties": {
691
+ "Node name for S&R": "CogVideoImageEncode"
692
+ },
693
+ "widgets_values": [
694
+ false,
695
+ 0,
696
+ 1,
697
+ 0,
698
+ 1
699
+ ]
700
+ },
701
+ {
702
+ "id": 82,
703
+ "type": "Note",
704
+ "pos": [
705
+ -533.0764770507812,
706
+ 1158.188232421875
707
+ ],
708
+ "size": [
709
+ 364.71002197265625,
710
+ 58
711
+ ],
712
+ "flags": {},
713
+ "order": 4,
714
+ "mode": 0,
715
+ "inputs": [],
716
+ "outputs": [],
717
+ "properties": {},
718
+ "widgets_values": [
719
+ "Input video that's used to create the noise"
720
+ ],
721
+ "color": "#432",
722
+ "bgcolor": "#653"
723
+ },
724
+ {
725
+ "id": 64,
726
+ "type": "GetWarpedNoiseFromVideo",
727
+ "pos": [
728
+ 674.1111450195312,
729
+ 1289.6090087890625
730
+ ],
731
+ "size": [
732
+ 315,
733
+ 222
734
+ ],
735
+ "flags": {},
736
+ "order": 12,
737
+ "mode": 0,
738
+ "inputs": [
739
+ {
740
+ "name": "images",
741
+ "type": "IMAGE",
742
+ "link": 161
743
+ }
744
+ ],
745
+ "outputs": [
746
+ {
747
+ "name": "noise",
748
+ "type": "LATENT",
749
+ "links": [
750
+ 164,
751
+ 167
752
+ ],
753
+ "slot_index": 0
754
+ },
755
+ {
756
+ "name": "visualization",
757
+ "type": "IMAGE",
758
+ "links": null
759
+ }
760
+ ],
761
+ "properties": {
762
+ "Node name for S&R": "GetWarpedNoiseFromVideo"
763
+ },
764
+ "widgets_values": [
765
+ 16,
766
+ "nearest",
767
+ 13,
768
+ 0.5,
769
+ "BCTHW",
770
+ 99026504067718,
771
+ "fixed"
772
+ ]
773
+ },
774
+ {
775
+ "id": 83,
776
+ "type": "Note",
777
+ "pos": [
778
+ 679.4560546875,
779
+ 1179.797607421875
780
+ ],
781
+ "size": [
782
+ 293.1480407714844,
783
+ 58
784
+ ],
785
+ "flags": {},
786
+ "order": 5,
787
+ "mode": 0,
788
+ "inputs": [],
789
+ "outputs": [],
790
+ "properties": {},
791
+ "widgets_values": [
792
+ "https://github.com/kijai/ComfyUI-VideoNoiseWarp"
793
+ ],
794
+ "color": "#432",
795
+ "bgcolor": "#653"
796
+ },
797
+ {
798
+ "id": 69,
799
+ "type": "VHS_LoadVideo",
800
+ "pos": [
801
+ -536.2808837890625,
802
+ 1265.4254150390625
803
+ ],
804
+ "size": [
805
+ 247.455078125,
806
+ 446.3408203125
807
+ ],
808
+ "flags": {},
809
+ "order": 6,
810
+ "mode": 0,
811
+ "inputs": [
812
+ {
813
+ "name": "meta_batch",
814
+ "type": "VHS_BatchManager",
815
+ "link": null,
816
+ "shape": 7
817
+ },
818
+ {
819
+ "name": "vae",
820
+ "type": "VAE",
821
+ "link": null,
822
+ "shape": 7
823
+ }
824
+ ],
825
+ "outputs": [
826
+ {
827
+ "name": "IMAGE",
828
+ "type": "IMAGE",
829
+ "links": [
830
+ 181
831
+ ],
832
+ "slot_index": 0
833
+ },
834
+ {
835
+ "name": "frame_count",
836
+ "type": "INT",
837
+ "links": null
838
+ },
839
+ {
840
+ "name": "audio",
841
+ "type": "AUDIO",
842
+ "links": null
843
+ },
844
+ {
845
+ "name": "video_info",
846
+ "type": "VHS_VIDEOINFO",
847
+ "links": null
848
+ }
849
+ ],
850
+ "properties": {
851
+ "Node name for S&R": "VHS_LoadVideo"
852
+ },
853
+ "widgets_values": {
854
+ "video": "AnimateDiff_00023 (16).mp4",
855
+ "force_rate": 0,
856
+ "force_size": "Disabled",
857
+ "custom_width": 512,
858
+ "custom_height": 512,
859
+ "frame_load_cap": 0,
860
+ "skip_first_frames": 0,
861
+ "select_every_nth": 1,
862
+ "choose video to upload": "image",
863
+ "videopreview": {
864
+ "hidden": false,
865
+ "paused": false,
866
+ "params": {
867
+ "force_rate": 0,
868
+ "frame_load_cap": 0,
869
+ "skip_first_frames": 0,
870
+ "select_every_nth": 1,
871
+ "filename": "AnimateDiff_00023 (16).mp4",
872
+ "type": "input",
873
+ "format": "video/mp4"
874
+ },
875
+ "muted": false
876
+ }
877
+ }
878
+ },
879
+ {
880
+ "id": 44,
881
+ "type": "VHS_VideoCombine",
882
+ "pos": [
883
+ 2071.7626953125,
884
+ -69.11408233642578
885
+ ],
886
+ "size": [
887
+ 1141.2095947265625,
888
+ 721.7365112304688
889
+ ],
890
+ "flags": {},
891
+ "order": 20,
892
+ "mode": 0,
893
+ "inputs": [
894
+ {
895
+ "name": "images",
896
+ "type": "IMAGE",
897
+ "link": 170
898
+ },
899
+ {
900
+ "name": "audio",
901
+ "type": "AUDIO",
902
+ "link": null,
903
+ "shape": 7
904
+ },
905
+ {
906
+ "name": "meta_batch",
907
+ "type": "VHS_BatchManager",
908
+ "link": null,
909
+ "shape": 7
910
+ },
911
+ {
912
+ "name": "vae",
913
+ "type": "VAE",
914
+ "link": null,
915
+ "shape": 7
916
+ }
917
+ ],
918
+ "outputs": [
919
+ {
920
+ "name": "Filenames",
921
+ "type": "VHS_FILENAMES",
922
+ "links": null,
923
+ "shape": 3
924
+ }
925
+ ],
926
+ "properties": {
927
+ "Node name for S&R": "VHS_VideoCombine"
928
+ },
929
+ "widgets_values": {
930
+ "frame_rate": 8,
931
+ "loop_count": 0,
932
+ "filename_prefix": "CogVideoX_I2V_NoiseWarp",
933
+ "format": "video/h264-mp4",
934
+ "pix_fmt": "yuv420p",
935
+ "crf": 19,
936
+ "save_metadata": true,
937
+ "trim_to_audio": false,
938
+ "pingpong": false,
939
+ "save_output": false,
940
+ "videopreview": {
941
+ "hidden": false,
942
+ "paused": false,
943
+ "params": {
944
+ "filename": "CogVideoX-I2V_00002.mp4",
945
+ "subfolder": "",
946
+ "type": "temp",
947
+ "format": "video/h264-mp4",
948
+ "frame_rate": 8,
949
+ "workflow": "CogVideoX-I2V_00002.png",
950
+ "fullpath": "N:\\AI\\ComfyUI\\temp\\CogVideoX-I2V_00002.mp4"
951
+ },
952
+ "muted": false
953
+ }
954
+ }
955
+ },
956
+ {
957
+ "id": 71,
958
+ "type": "ImageResizeKJ",
959
+ "pos": [
960
+ 204.58009338378906,
961
+ 1289.261474609375
962
+ ],
963
+ "size": [
964
+ 315,
965
+ 266
966
+ ],
967
+ "flags": {},
968
+ "order": 11,
969
+ "mode": 0,
970
+ "inputs": [
971
+ {
972
+ "name": "image",
973
+ "type": "IMAGE",
974
+ "link": 178
975
+ },
976
+ {
977
+ "name": "get_image_size",
978
+ "type": "IMAGE",
979
+ "link": null,
980
+ "shape": 7
981
+ },
982
+ {
983
+ "name": "width_input",
984
+ "type": "INT",
985
+ "link": null,
986
+ "widget": {
987
+ "name": "width_input"
988
+ },
989
+ "shape": 7
990
+ },
991
+ {
992
+ "name": "height_input",
993
+ "type": "INT",
994
+ "link": null,
995
+ "widget": {
996
+ "name": "height_input"
997
+ },
998
+ "shape": 7
999
+ }
1000
+ ],
1001
+ "outputs": [
1002
+ {
1003
+ "name": "IMAGE",
1004
+ "type": "IMAGE",
1005
+ "links": [
1006
+ 161,
1007
+ 171,
1008
+ 189
1009
+ ],
1010
+ "slot_index": 0,
1011
+ "shape": 3
1012
+ },
1013
+ {
1014
+ "name": "width",
1015
+ "type": "INT",
1016
+ "links": null,
1017
+ "shape": 3
1018
+ },
1019
+ {
1020
+ "name": "height",
1021
+ "type": "INT",
1022
+ "links": null,
1023
+ "shape": 3
1024
+ }
1025
+ ],
1026
+ "properties": {
1027
+ "Node name for S&R": "ImageResizeKJ"
1028
+ },
1029
+ "widgets_values": [
1030
+ 720,
1031
+ 480,
1032
+ "lanczos",
1033
+ false,
1034
+ 16,
1035
+ 0,
1036
+ 0,
1037
+ "disabled"
1038
+ ]
1039
+ },
1040
+ {
1041
+ "id": 84,
1042
+ "type": "GetImageRangeFromBatch",
1043
+ "pos": [
1044
+ 197.0398712158203,
1045
+ 1077.9952392578125
1046
+ ],
1047
+ "size": [
1048
+ 340.2047424316406,
1049
+ 102
1050
+ ],
1051
+ "flags": {},
1052
+ "order": 13,
1053
+ "mode": 0,
1054
+ "inputs": [
1055
+ {
1056
+ "name": "images",
1057
+ "type": "IMAGE",
1058
+ "link": 189,
1059
+ "shape": 7
1060
+ },
1061
+ {
1062
+ "name": "masks",
1063
+ "type": "MASK",
1064
+ "link": null,
1065
+ "shape": 7
1066
+ }
1067
+ ],
1068
+ "outputs": [
1069
+ {
1070
+ "name": "IMAGE",
1071
+ "type": "IMAGE",
1072
+ "links": [
1073
+ 190
1074
+ ],
1075
+ "slot_index": 0
1076
+ },
1077
+ {
1078
+ "name": "MASK",
1079
+ "type": "MASK",
1080
+ "links": null
1081
+ }
1082
+ ],
1083
+ "properties": {
1084
+ "Node name for S&R": "GetImageRangeFromBatch"
1085
+ },
1086
+ "widgets_values": [
1087
+ 0,
1088
+ 1
1089
+ ]
1090
+ }
1091
+ ],
1092
+ "links": [
1093
+ [
1094
+ 54,
1095
+ 20,
1096
+ 0,
1097
+ 30,
1098
+ 0,
1099
+ "CLIP"
1100
+ ],
1101
+ [
1102
+ 132,
1103
+ 59,
1104
+ 1,
1105
+ 60,
1106
+ 0,
1107
+ "VAE"
1108
+ ],
1109
+ [
1110
+ 141,
1111
+ 59,
1112
+ 1,
1113
+ 62,
1114
+ 0,
1115
+ "VAE"
1116
+ ],
1117
+ [
1118
+ 144,
1119
+ 59,
1120
+ 0,
1121
+ 63,
1122
+ 0,
1123
+ "COGVIDEOMODEL"
1124
+ ],
1125
+ [
1126
+ 145,
1127
+ 30,
1128
+ 0,
1129
+ 63,
1130
+ 1,
1131
+ "CONDITIONING"
1132
+ ],
1133
+ [
1134
+ 146,
1135
+ 31,
1136
+ 0,
1137
+ 63,
1138
+ 2,
1139
+ "CONDITIONING"
1140
+ ],
1141
+ [
1142
+ 147,
1143
+ 62,
1144
+ 0,
1145
+ 63,
1146
+ 4,
1147
+ "LATENT"
1148
+ ],
1149
+ [
1150
+ 148,
1151
+ 63,
1152
+ 0,
1153
+ 60,
1154
+ 1,
1155
+ "LATENT"
1156
+ ],
1157
+ [
1158
+ 149,
1159
+ 30,
1160
+ 1,
1161
+ 31,
1162
+ 0,
1163
+ "CLIP"
1164
+ ],
1165
+ [
1166
+ 161,
1167
+ 71,
1168
+ 0,
1169
+ 64,
1170
+ 0,
1171
+ "IMAGE"
1172
+ ],
1173
+ [
1174
+ 164,
1175
+ 64,
1176
+ 0,
1177
+ 63,
1178
+ 3,
1179
+ "LATENT"
1180
+ ],
1181
+ [
1182
+ 165,
1183
+ 59,
1184
+ 1,
1185
+ 73,
1186
+ 0,
1187
+ "VAE"
1188
+ ],
1189
+ [
1190
+ 167,
1191
+ 64,
1192
+ 0,
1193
+ 73,
1194
+ 1,
1195
+ "LATENT"
1196
+ ],
1197
+ [
1198
+ 170,
1199
+ 74,
1200
+ 0,
1201
+ 44,
1202
+ 0,
1203
+ "IMAGE"
1204
+ ],
1205
+ [
1206
+ 171,
1207
+ 71,
1208
+ 0,
1209
+ 74,
1210
+ 0,
1211
+ "IMAGE"
1212
+ ],
1213
+ [
1214
+ 174,
1215
+ 72,
1216
+ 0,
1217
+ 59,
1218
+ 1,
1219
+ "COGLORA"
1220
+ ],
1221
+ [
1222
+ 178,
1223
+ 68,
1224
+ 0,
1225
+ 71,
1226
+ 0,
1227
+ "IMAGE"
1228
+ ],
1229
+ [
1230
+ 181,
1231
+ 69,
1232
+ 0,
1233
+ 68,
1234
+ 0,
1235
+ "IMAGE"
1236
+ ],
1237
+ [
1238
+ 184,
1239
+ 60,
1240
+ 0,
1241
+ 74,
1242
+ 1,
1243
+ "IMAGE"
1244
+ ],
1245
+ [
1246
+ 185,
1247
+ 73,
1248
+ 0,
1249
+ 76,
1250
+ 0,
1251
+ "IMAGE"
1252
+ ],
1253
+ [
1254
+ 189,
1255
+ 71,
1256
+ 0,
1257
+ 84,
1258
+ 0,
1259
+ "IMAGE"
1260
+ ],
1261
+ [
1262
+ 190,
1263
+ 84,
1264
+ 0,
1265
+ 62,
1266
+ 1,
1267
+ "IMAGE"
1268
+ ]
1269
+ ],
1270
+ "groups": [],
1271
+ "config": {},
1272
+ "extra": {
1273
+ "ds": {
1274
+ "scale": 0.6115909044841579,
1275
+ "offset": [
1276
+ 1276.2661497783536,
1277
+ -1.7440717555266154
1278
+ ]
1279
+ },
1280
+ "node_versions": {
1281
+ "ComfyUI-CogVideoXWrapper": "8c5e4f812d869653a6c201af0dcd6249c18b3231",
1282
+ "comfy-core": "0.3.12",
1283
+ "ComfyUI-KJNodes": "c9c8dcd5e7ed2f7669f130a5ced1e3005264a2de",
1284
+ "ComfyUI-VideoHelperSuite": "c47b10ca1798b4925ff5a5f07d80c51ca80a837d",
1285
+ "ComfyUI-NoiseWarp": "8c5e4f812d869653a6c201af0dcd6249c18b3231"
1286
+ },
1287
+ "VHS_latentpreview": true,
1288
+ "VHS_latentpreviewrate": 0
1289
+ },
1290
+ "version": 0.4
1291
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_T2V_02.json ADDED
@@ -0,0 +1,529 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 37,
3
+ "last_link_id": 72,
4
+ "nodes": [
5
+ {
6
+ "id": 30,
7
+ "type": "CogVideoTextEncode",
8
+ "pos": {
9
+ "0": 500,
10
+ "1": 308
11
+ },
12
+ "size": [
13
+ 470.99399664051055,
14
+ 237.5088638951354
15
+ ],
16
+ "flags": {},
17
+ "order": 3,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 54
24
+ }
25
+ ],
26
+ "outputs": [
27
+ {
28
+ "name": "conditioning",
29
+ "type": "CONDITIONING",
30
+ "links": [
31
+ 67
32
+ ],
33
+ "slot_index": 0,
34
+ "shape": 3
35
+ },
36
+ {
37
+ "name": "clip",
38
+ "type": "CLIP",
39
+ "links": [
40
+ 65
41
+ ],
42
+ "slot_index": 1
43
+ }
44
+ ],
45
+ "properties": {
46
+ "Node name for S&R": "CogVideoTextEncode"
47
+ },
48
+ "widgets_values": [
49
+ "A golden retriever, sporting sleek black sunglasses, with its lengthy fur flowing in the breeze, sprints playfully across a rooftop terrace, recently refreshed by a light rain. The scene unfolds from a distance, the dog's energetic bounds growing larger as it approaches the camera, its tail wagging with unrestrained joy, while droplets of water glisten on the concrete behind it. The overcast sky provides a dramatic backdrop, emphasizing the vibrant golden coat of the canine as it dashes towards the viewer.\n\n",
50
+ 1,
51
+ false
52
+ ]
53
+ },
54
+ {
55
+ "id": 31,
56
+ "type": "CogVideoTextEncode",
57
+ "pos": {
58
+ "0": 503,
59
+ "1": 602
60
+ },
61
+ "size": [
62
+ 464.4980515341475,
63
+ 169.87479027400514
64
+ ],
65
+ "flags": {},
66
+ "order": 4,
67
+ "mode": 0,
68
+ "inputs": [
69
+ {
70
+ "name": "clip",
71
+ "type": "CLIP",
72
+ "link": 65
73
+ }
74
+ ],
75
+ "outputs": [
76
+ {
77
+ "name": "conditioning",
78
+ "type": "CONDITIONING",
79
+ "links": [
80
+ 68
81
+ ],
82
+ "slot_index": 0,
83
+ "shape": 3
84
+ },
85
+ {
86
+ "name": "clip",
87
+ "type": "CLIP",
88
+ "links": null
89
+ }
90
+ ],
91
+ "properties": {
92
+ "Node name for S&R": "CogVideoTextEncode"
93
+ },
94
+ "widgets_values": [
95
+ "",
96
+ 1,
97
+ true
98
+ ]
99
+ },
100
+ {
101
+ "id": 11,
102
+ "type": "CogVideoDecode",
103
+ "pos": {
104
+ "0": 1416,
105
+ "1": 40
106
+ },
107
+ "size": {
108
+ "0": 300.396484375,
109
+ "1": 198
110
+ },
111
+ "flags": {},
112
+ "order": 6,
113
+ "mode": 0,
114
+ "inputs": [
115
+ {
116
+ "name": "vae",
117
+ "type": "VAE",
118
+ "link": 71
119
+ },
120
+ {
121
+ "name": "samples",
122
+ "type": "LATENT",
123
+ "link": 69
124
+ }
125
+ ],
126
+ "outputs": [
127
+ {
128
+ "name": "images",
129
+ "type": "IMAGE",
130
+ "links": [
131
+ 59
132
+ ],
133
+ "slot_index": 0,
134
+ "shape": 3
135
+ }
136
+ ],
137
+ "properties": {
138
+ "Node name for S&R": "CogVideoDecode"
139
+ },
140
+ "widgets_values": [
141
+ false,
142
+ 240,
143
+ 360,
144
+ 0.2,
145
+ 0.2,
146
+ true
147
+ ]
148
+ },
149
+ {
150
+ "id": 36,
151
+ "type": "DownloadAndLoadCogVideoModel",
152
+ "pos": {
153
+ "0": 645,
154
+ "1": 17
155
+ },
156
+ "size": {
157
+ "0": 315,
158
+ "1": 218
159
+ },
160
+ "flags": {},
161
+ "order": 0,
162
+ "mode": 0,
163
+ "inputs": [
164
+ {
165
+ "name": "block_edit",
166
+ "type": "TRANSFORMERBLOCKS",
167
+ "link": null,
168
+ "shape": 7
169
+ },
170
+ {
171
+ "name": "lora",
172
+ "type": "COGLORA",
173
+ "link": null,
174
+ "shape": 7
175
+ },
176
+ {
177
+ "name": "compile_args",
178
+ "type": "COMPILEARGS",
179
+ "link": null,
180
+ "shape": 7
181
+ }
182
+ ],
183
+ "outputs": [
184
+ {
185
+ "name": "model",
186
+ "type": "COGVIDEOMODEL",
187
+ "links": [
188
+ 70
189
+ ]
190
+ },
191
+ {
192
+ "name": "vae",
193
+ "type": "VAE",
194
+ "links": [
195
+ 71
196
+ ],
197
+ "slot_index": 1
198
+ }
199
+ ],
200
+ "properties": {
201
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
202
+ },
203
+ "widgets_values": [
204
+ "THUDM/CogVideoX-5b",
205
+ "bf16",
206
+ "disabled",
207
+ false,
208
+ "sdpa",
209
+ "main_device"
210
+ ]
211
+ },
212
+ {
213
+ "id": 20,
214
+ "type": "CLIPLoader",
215
+ "pos": {
216
+ "0": 5,
217
+ "1": 308
218
+ },
219
+ "size": {
220
+ "0": 451.30548095703125,
221
+ "1": 82
222
+ },
223
+ "flags": {},
224
+ "order": 1,
225
+ "mode": 0,
226
+ "inputs": [],
227
+ "outputs": [
228
+ {
229
+ "name": "CLIP",
230
+ "type": "CLIP",
231
+ "links": [
232
+ 54
233
+ ],
234
+ "slot_index": 0,
235
+ "shape": 3
236
+ }
237
+ ],
238
+ "properties": {
239
+ "Node name for S&R": "CLIPLoader"
240
+ },
241
+ "widgets_values": [
242
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
243
+ "sd3"
244
+ ]
245
+ },
246
+ {
247
+ "id": 37,
248
+ "type": "EmptyLatentImage",
249
+ "pos": {
250
+ "0": 643,
251
+ "1": 827
252
+ },
253
+ "size": {
254
+ "0": 315,
255
+ "1": 106
256
+ },
257
+ "flags": {},
258
+ "order": 2,
259
+ "mode": 0,
260
+ "inputs": [],
261
+ "outputs": [
262
+ {
263
+ "name": "LATENT",
264
+ "type": "LATENT",
265
+ "links": [
266
+ 72
267
+ ]
268
+ }
269
+ ],
270
+ "properties": {
271
+ "Node name for S&R": "EmptyLatentImage"
272
+ },
273
+ "widgets_values": [
274
+ 720,
275
+ 480,
276
+ 1
277
+ ]
278
+ },
279
+ {
280
+ "id": 35,
281
+ "type": "CogVideoSampler",
282
+ "pos": {
283
+ "0": 1042,
284
+ "1": 291
285
+ },
286
+ "size": [
287
+ 330,
288
+ 574
289
+ ],
290
+ "flags": {},
291
+ "order": 5,
292
+ "mode": 0,
293
+ "inputs": [
294
+ {
295
+ "name": "model",
296
+ "type": "COGVIDEOMODEL",
297
+ "link": 70
298
+ },
299
+ {
300
+ "name": "positive",
301
+ "type": "CONDITIONING",
302
+ "link": 67
303
+ },
304
+ {
305
+ "name": "negative",
306
+ "type": "CONDITIONING",
307
+ "link": 68
308
+ },
309
+ {
310
+ "name": "samples",
311
+ "type": "LATENT",
312
+ "link": 72,
313
+ "shape": 7
314
+ },
315
+ {
316
+ "name": "image_cond_latents",
317
+ "type": "LATENT",
318
+ "link": null,
319
+ "shape": 7
320
+ },
321
+ {
322
+ "name": "context_options",
323
+ "type": "COGCONTEXT",
324
+ "link": null,
325
+ "shape": 7
326
+ },
327
+ {
328
+ "name": "controlnet",
329
+ "type": "COGVIDECONTROLNET",
330
+ "link": null,
331
+ "shape": 7
332
+ },
333
+ {
334
+ "name": "tora_trajectory",
335
+ "type": "TORAFEATURES",
336
+ "link": null,
337
+ "shape": 7
338
+ },
339
+ {
340
+ "name": "fastercache",
341
+ "type": "FASTERCACHEARGS",
342
+ "link": null,
343
+ "shape": 7
344
+ }
345
+ ],
346
+ "outputs": [
347
+ {
348
+ "name": "samples",
349
+ "type": "LATENT",
350
+ "links": [
351
+ 69
352
+ ]
353
+ }
354
+ ],
355
+ "properties": {
356
+ "Node name for S&R": "CogVideoSampler"
357
+ },
358
+ "widgets_values": [
359
+ 49,
360
+ 50,
361
+ 6,
362
+ 0,
363
+ "fixed",
364
+ "CogVideoXDDIM",
365
+ 1
366
+ ]
367
+ },
368
+ {
369
+ "id": 33,
370
+ "type": "VHS_VideoCombine",
371
+ "pos": {
372
+ "0": 1767,
373
+ "1": 39
374
+ },
375
+ "size": [
376
+ 778.7022705078125,
377
+ 829.801513671875
378
+ ],
379
+ "flags": {},
380
+ "order": 7,
381
+ "mode": 0,
382
+ "inputs": [
383
+ {
384
+ "name": "images",
385
+ "type": "IMAGE",
386
+ "link": 59
387
+ },
388
+ {
389
+ "name": "audio",
390
+ "type": "AUDIO",
391
+ "link": null,
392
+ "shape": 7
393
+ },
394
+ {
395
+ "name": "meta_batch",
396
+ "type": "VHS_BatchManager",
397
+ "link": null,
398
+ "shape": 7
399
+ },
400
+ {
401
+ "name": "vae",
402
+ "type": "VAE",
403
+ "link": null,
404
+ "shape": 7
405
+ }
406
+ ],
407
+ "outputs": [
408
+ {
409
+ "name": "Filenames",
410
+ "type": "VHS_FILENAMES",
411
+ "links": null,
412
+ "shape": 3
413
+ }
414
+ ],
415
+ "properties": {
416
+ "Node name for S&R": "VHS_VideoCombine"
417
+ },
418
+ "widgets_values": {
419
+ "frame_rate": 8,
420
+ "loop_count": 0,
421
+ "filename_prefix": "CogVideoX5B-T2V",
422
+ "format": "video/h264-mp4",
423
+ "pix_fmt": "yuv420p",
424
+ "crf": 19,
425
+ "save_metadata": true,
426
+ "pingpong": false,
427
+ "save_output": false,
428
+ "videopreview": {
429
+ "hidden": false,
430
+ "paused": false,
431
+ "params": {
432
+ "filename": "CogVideoX5B_00001.mp4",
433
+ "subfolder": "",
434
+ "type": "temp",
435
+ "format": "video/h264-mp4",
436
+ "frame_rate": 8
437
+ },
438
+ "muted": false
439
+ }
440
+ }
441
+ }
442
+ ],
443
+ "links": [
444
+ [
445
+ 54,
446
+ 20,
447
+ 0,
448
+ 30,
449
+ 0,
450
+ "CLIP"
451
+ ],
452
+ [
453
+ 59,
454
+ 11,
455
+ 0,
456
+ 33,
457
+ 0,
458
+ "IMAGE"
459
+ ],
460
+ [
461
+ 65,
462
+ 30,
463
+ 1,
464
+ 31,
465
+ 0,
466
+ "CLIP"
467
+ ],
468
+ [
469
+ 67,
470
+ 30,
471
+ 0,
472
+ 35,
473
+ 1,
474
+ "CONDITIONING"
475
+ ],
476
+ [
477
+ 68,
478
+ 31,
479
+ 0,
480
+ 35,
481
+ 2,
482
+ "CONDITIONING"
483
+ ],
484
+ [
485
+ 69,
486
+ 35,
487
+ 0,
488
+ 11,
489
+ 1,
490
+ "LATENT"
491
+ ],
492
+ [
493
+ 70,
494
+ 36,
495
+ 0,
496
+ 35,
497
+ 0,
498
+ "COGVIDEOMODEL"
499
+ ],
500
+ [
501
+ 71,
502
+ 36,
503
+ 1,
504
+ 11,
505
+ 0,
506
+ "VAE"
507
+ ],
508
+ [
509
+ 72,
510
+ 37,
511
+ 0,
512
+ 35,
513
+ 3,
514
+ "LATENT"
515
+ ]
516
+ ],
517
+ "groups": [],
518
+ "config": {},
519
+ "extra": {
520
+ "ds": {
521
+ "scale": 0.7627768444387061,
522
+ "offset": [
523
+ 734.1791945221892,
524
+ 237.29437844909364
525
+ ]
526
+ }
527
+ },
528
+ "version": 0.4
529
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_interpolation_02.json ADDED
@@ -0,0 +1,864 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 68,
3
+ "last_link_id": 155,
4
+ "nodes": [
5
+ {
6
+ "id": 31,
7
+ "type": "CogVideoTextEncode",
8
+ "pos": {
9
+ "0": 497,
10
+ "1": 520
11
+ },
12
+ "size": {
13
+ "0": 463.01251220703125,
14
+ "1": 144
15
+ },
16
+ "flags": {},
17
+ "order": 6,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "clip",
22
+ "type": "CLIP",
23
+ "link": 149
24
+ }
25
+ ],
26
+ "outputs": [
27
+ {
28
+ "name": "conditioning",
29
+ "type": "CONDITIONING",
30
+ "links": [
31
+ 146
32
+ ],
33
+ "slot_index": 0,
34
+ "shape": 3
35
+ },
36
+ {
37
+ "name": "clip",
38
+ "type": "CLIP",
39
+ "links": null
40
+ }
41
+ ],
42
+ "properties": {
43
+ "Node name for S&R": "CogVideoTextEncode"
44
+ },
45
+ "widgets_values": [
46
+ "The video is not of a high quality, it has a low resolution. Watermark present in each frame. Strange motion trajectory. ",
47
+ 1,
48
+ true
49
+ ]
50
+ },
51
+ {
52
+ "id": 63,
53
+ "type": "CogVideoSampler",
54
+ "pos": {
55
+ "0": 1142,
56
+ "1": 74
57
+ },
58
+ "size": [
59
+ 330,
60
+ 574
61
+ ],
62
+ "flags": {},
63
+ "order": 9,
64
+ "mode": 0,
65
+ "inputs": [
66
+ {
67
+ "name": "model",
68
+ "type": "COGVIDEOMODEL",
69
+ "link": 144
70
+ },
71
+ {
72
+ "name": "positive",
73
+ "type": "CONDITIONING",
74
+ "link": 145
75
+ },
76
+ {
77
+ "name": "negative",
78
+ "type": "CONDITIONING",
79
+ "link": 146
80
+ },
81
+ {
82
+ "name": "samples",
83
+ "type": "LATENT",
84
+ "link": null,
85
+ "shape": 7
86
+ },
87
+ {
88
+ "name": "image_cond_latents",
89
+ "type": "LATENT",
90
+ "link": 147,
91
+ "shape": 7
92
+ },
93
+ {
94
+ "name": "context_options",
95
+ "type": "COGCONTEXT",
96
+ "link": null,
97
+ "shape": 7
98
+ },
99
+ {
100
+ "name": "controlnet",
101
+ "type": "COGVIDECONTROLNET",
102
+ "link": null,
103
+ "shape": 7
104
+ },
105
+ {
106
+ "name": "tora_trajectory",
107
+ "type": "TORAFEATURES",
108
+ "link": null,
109
+ "shape": 7
110
+ },
111
+ {
112
+ "name": "fastercache",
113
+ "type": "FASTERCACHEARGS",
114
+ "link": null,
115
+ "shape": 7
116
+ }
117
+ ],
118
+ "outputs": [
119
+ {
120
+ "name": "samples",
121
+ "type": "LATENT",
122
+ "links": [
123
+ 148
124
+ ]
125
+ }
126
+ ],
127
+ "properties": {
128
+ "Node name for S&R": "CogVideoSampler"
129
+ },
130
+ "widgets_values": [
131
+ 49,
132
+ 25,
133
+ 6,
134
+ 0,
135
+ "fixed",
136
+ "CogVideoXDDIM",
137
+ 1
138
+ ]
139
+ },
140
+ {
141
+ "id": 30,
142
+ "type": "CogVideoTextEncode",
143
+ "pos": {
144
+ "0": 493,
145
+ "1": 303
146
+ },
147
+ "size": {
148
+ "0": 471.90142822265625,
149
+ "1": 168.08047485351562
150
+ },
151
+ "flags": {},
152
+ "order": 4,
153
+ "mode": 0,
154
+ "inputs": [
155
+ {
156
+ "name": "clip",
157
+ "type": "CLIP",
158
+ "link": 54
159
+ }
160
+ ],
161
+ "outputs": [
162
+ {
163
+ "name": "conditioning",
164
+ "type": "CONDITIONING",
165
+ "links": [
166
+ 145
167
+ ],
168
+ "slot_index": 0,
169
+ "shape": 3
170
+ },
171
+ {
172
+ "name": "clip",
173
+ "type": "CLIP",
174
+ "links": [
175
+ 149
176
+ ],
177
+ "slot_index": 1
178
+ }
179
+ ],
180
+ "properties": {
181
+ "Node name for S&R": "CogVideoTextEncode"
182
+ },
183
+ "widgets_values": [
184
+ "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
185
+ 1,
186
+ false
187
+ ]
188
+ },
189
+ {
190
+ "id": 20,
191
+ "type": "CLIPLoader",
192
+ "pos": {
193
+ "0": -2,
194
+ "1": 304
195
+ },
196
+ "size": {
197
+ "0": 451.30548095703125,
198
+ "1": 82
199
+ },
200
+ "flags": {},
201
+ "order": 0,
202
+ "mode": 0,
203
+ "inputs": [],
204
+ "outputs": [
205
+ {
206
+ "name": "CLIP",
207
+ "type": "CLIP",
208
+ "links": [
209
+ 54
210
+ ],
211
+ "slot_index": 0,
212
+ "shape": 3
213
+ }
214
+ ],
215
+ "properties": {
216
+ "Node name for S&R": "CLIPLoader"
217
+ },
218
+ "widgets_values": [
219
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
220
+ "sd3"
221
+ ]
222
+ },
223
+ {
224
+ "id": 36,
225
+ "type": "LoadImage",
226
+ "pos": {
227
+ "0": 105,
228
+ "1": 732
229
+ },
230
+ "size": {
231
+ "0": 402.06353759765625,
232
+ "1": 396.6225891113281
233
+ },
234
+ "flags": {},
235
+ "order": 1,
236
+ "mode": 0,
237
+ "inputs": [],
238
+ "outputs": [
239
+ {
240
+ "name": "IMAGE",
241
+ "type": "IMAGE",
242
+ "links": [
243
+ 71
244
+ ],
245
+ "slot_index": 0,
246
+ "shape": 3
247
+ },
248
+ {
249
+ "name": "MASK",
250
+ "type": "MASK",
251
+ "links": null,
252
+ "shape": 3
253
+ }
254
+ ],
255
+ "properties": {
256
+ "Node name for S&R": "LoadImage"
257
+ },
258
+ "widgets_values": [
259
+ "sd3stag.png",
260
+ "image"
261
+ ]
262
+ },
263
+ {
264
+ "id": 64,
265
+ "type": "LoadImage",
266
+ "pos": {
267
+ "0": 105,
268
+ "1": 1189
269
+ },
270
+ "size": {
271
+ "0": 402.06353759765625,
272
+ "1": 396.6225891113281
273
+ },
274
+ "flags": {},
275
+ "order": 2,
276
+ "mode": 0,
277
+ "inputs": [],
278
+ "outputs": [
279
+ {
280
+ "name": "IMAGE",
281
+ "type": "IMAGE",
282
+ "links": [
283
+ 151
284
+ ],
285
+ "slot_index": 0,
286
+ "shape": 3
287
+ },
288
+ {
289
+ "name": "MASK",
290
+ "type": "MASK",
291
+ "links": null,
292
+ "shape": 3
293
+ }
294
+ ],
295
+ "properties": {
296
+ "Node name for S&R": "LoadImage"
297
+ },
298
+ "widgets_values": [
299
+ "sd3stag.png",
300
+ "image"
301
+ ]
302
+ },
303
+ {
304
+ "id": 65,
305
+ "type": "ImageResizeKJ",
306
+ "pos": {
307
+ "0": 607,
308
+ "1": 1188
309
+ },
310
+ "size": [
311
+ 315,
312
+ 266
313
+ ],
314
+ "flags": {},
315
+ "order": 7,
316
+ "mode": 0,
317
+ "inputs": [
318
+ {
319
+ "name": "image",
320
+ "type": "IMAGE",
321
+ "link": 151
322
+ },
323
+ {
324
+ "name": "get_image_size",
325
+ "type": "IMAGE",
326
+ "link": null,
327
+ "shape": 7
328
+ },
329
+ {
330
+ "name": "width_input",
331
+ "type": "INT",
332
+ "link": null,
333
+ "widget": {
334
+ "name": "width_input"
335
+ },
336
+ "shape": 7
337
+ },
338
+ {
339
+ "name": "height_input",
340
+ "type": "INT",
341
+ "link": null,
342
+ "widget": {
343
+ "name": "height_input"
344
+ },
345
+ "shape": 7
346
+ },
347
+ {
348
+ "name": "width",
349
+ "type": "INT",
350
+ "link": 152,
351
+ "widget": {
352
+ "name": "width"
353
+ }
354
+ },
355
+ {
356
+ "name": "height",
357
+ "type": "INT",
358
+ "link": 153,
359
+ "widget": {
360
+ "name": "height"
361
+ }
362
+ }
363
+ ],
364
+ "outputs": [
365
+ {
366
+ "name": "IMAGE",
367
+ "type": "IMAGE",
368
+ "links": [
369
+ 155
370
+ ],
371
+ "slot_index": 0,
372
+ "shape": 3
373
+ },
374
+ {
375
+ "name": "width",
376
+ "type": "INT",
377
+ "links": null,
378
+ "shape": 3
379
+ },
380
+ {
381
+ "name": "height",
382
+ "type": "INT",
383
+ "links": null,
384
+ "shape": 3
385
+ }
386
+ ],
387
+ "properties": {
388
+ "Node name for S&R": "ImageResizeKJ"
389
+ },
390
+ "widgets_values": [
391
+ 720,
392
+ 480,
393
+ "lanczos",
394
+ false,
395
+ 16,
396
+ 0,
397
+ 0,
398
+ "disabled"
399
+ ]
400
+ },
401
+ {
402
+ "id": 37,
403
+ "type": "ImageResizeKJ",
404
+ "pos": {
405
+ "0": 593,
406
+ "1": 731
407
+ },
408
+ "size": {
409
+ "0": 315,
410
+ "1": 266
411
+ },
412
+ "flags": {},
413
+ "order": 5,
414
+ "mode": 0,
415
+ "inputs": [
416
+ {
417
+ "name": "image",
418
+ "type": "IMAGE",
419
+ "link": 71
420
+ },
421
+ {
422
+ "name": "get_image_size",
423
+ "type": "IMAGE",
424
+ "link": null,
425
+ "shape": 7
426
+ },
427
+ {
428
+ "name": "width_input",
429
+ "type": "INT",
430
+ "link": null,
431
+ "widget": {
432
+ "name": "width_input"
433
+ }
434
+ },
435
+ {
436
+ "name": "height_input",
437
+ "type": "INT",
438
+ "link": null,
439
+ "widget": {
440
+ "name": "height_input"
441
+ }
442
+ }
443
+ ],
444
+ "outputs": [
445
+ {
446
+ "name": "IMAGE",
447
+ "type": "IMAGE",
448
+ "links": [
449
+ 142
450
+ ],
451
+ "slot_index": 0,
452
+ "shape": 3
453
+ },
454
+ {
455
+ "name": "width",
456
+ "type": "INT",
457
+ "links": [
458
+ 152
459
+ ],
460
+ "shape": 3,
461
+ "slot_index": 1
462
+ },
463
+ {
464
+ "name": "height",
465
+ "type": "INT",
466
+ "links": [
467
+ 153
468
+ ],
469
+ "shape": 3,
470
+ "slot_index": 2
471
+ }
472
+ ],
473
+ "properties": {
474
+ "Node name for S&R": "ImageResizeKJ"
475
+ },
476
+ "widgets_values": [
477
+ 720,
478
+ 480,
479
+ "lanczos",
480
+ false,
481
+ 16,
482
+ 0,
483
+ 0,
484
+ "disabled"
485
+ ]
486
+ },
487
+ {
488
+ "id": 60,
489
+ "type": "CogVideoDecode",
490
+ "pos": {
491
+ "0": 1526,
492
+ "1": -4
493
+ },
494
+ "size": {
495
+ "0": 315,
496
+ "1": 198
497
+ },
498
+ "flags": {},
499
+ "order": 10,
500
+ "mode": 0,
501
+ "inputs": [
502
+ {
503
+ "name": "vae",
504
+ "type": "VAE",
505
+ "link": 132
506
+ },
507
+ {
508
+ "name": "samples",
509
+ "type": "LATENT",
510
+ "link": 148
511
+ }
512
+ ],
513
+ "outputs": [
514
+ {
515
+ "name": "images",
516
+ "type": "IMAGE",
517
+ "links": [
518
+ 134
519
+ ]
520
+ }
521
+ ],
522
+ "properties": {
523
+ "Node name for S&R": "CogVideoDecode"
524
+ },
525
+ "widgets_values": [
526
+ true,
527
+ 240,
528
+ 360,
529
+ 0.2,
530
+ 0.2,
531
+ true
532
+ ]
533
+ },
534
+ {
535
+ "id": 62,
536
+ "type": "CogVideoImageEncode",
537
+ "pos": {
538
+ "0": 1152,
539
+ "1": 706
540
+ },
541
+ "size": {
542
+ "0": 315,
543
+ "1": 122
544
+ },
545
+ "flags": {},
546
+ "order": 8,
547
+ "mode": 0,
548
+ "inputs": [
549
+ {
550
+ "name": "vae",
551
+ "type": "VAE",
552
+ "link": 141
553
+ },
554
+ {
555
+ "name": "start_image",
556
+ "type": "IMAGE",
557
+ "link": 142
558
+ },
559
+ {
560
+ "name": "end_image",
561
+ "type": "IMAGE",
562
+ "link": 155,
563
+ "shape": 7
564
+ }
565
+ ],
566
+ "outputs": [
567
+ {
568
+ "name": "samples",
569
+ "type": "LATENT",
570
+ "links": [
571
+ 147
572
+ ]
573
+ }
574
+ ],
575
+ "properties": {
576
+ "Node name for S&R": "CogVideoImageEncode"
577
+ },
578
+ "widgets_values": [
579
+ false,
580
+ 0
581
+ ]
582
+ },
583
+ {
584
+ "id": 44,
585
+ "type": "VHS_VideoCombine",
586
+ "pos": {
587
+ "0": 1884,
588
+ "1": -3
589
+ },
590
+ "size": [
591
+ 605.3909912109375,
592
+ 714.2606608072917
593
+ ],
594
+ "flags": {},
595
+ "order": 11,
596
+ "mode": 0,
597
+ "inputs": [
598
+ {
599
+ "name": "images",
600
+ "type": "IMAGE",
601
+ "link": 134
602
+ },
603
+ {
604
+ "name": "audio",
605
+ "type": "AUDIO",
606
+ "link": null,
607
+ "shape": 7
608
+ },
609
+ {
610
+ "name": "meta_batch",
611
+ "type": "VHS_BatchManager",
612
+ "link": null,
613
+ "shape": 7
614
+ },
615
+ {
616
+ "name": "vae",
617
+ "type": "VAE",
618
+ "link": null,
619
+ "shape": 7
620
+ }
621
+ ],
622
+ "outputs": [
623
+ {
624
+ "name": "Filenames",
625
+ "type": "VHS_FILENAMES",
626
+ "links": null,
627
+ "shape": 3
628
+ }
629
+ ],
630
+ "properties": {
631
+ "Node name for S&R": "VHS_VideoCombine"
632
+ },
633
+ "widgets_values": {
634
+ "frame_rate": 8,
635
+ "loop_count": 0,
636
+ "filename_prefix": "CogVideoX-Interpolation",
637
+ "format": "video/h264-mp4",
638
+ "pix_fmt": "yuv420p",
639
+ "crf": 19,
640
+ "save_metadata": true,
641
+ "pingpong": false,
642
+ "save_output": true,
643
+ "videopreview": {
644
+ "hidden": false,
645
+ "paused": false,
646
+ "params": {
647
+ "filename": "CogVideoX-I2V_00003.mp4",
648
+ "subfolder": "",
649
+ "type": "temp",
650
+ "format": "video/h264-mp4",
651
+ "frame_rate": 8
652
+ },
653
+ "muted": false
654
+ }
655
+ }
656
+ },
657
+ {
658
+ "id": 59,
659
+ "type": "DownloadAndLoadCogVideoModel",
660
+ "pos": {
661
+ "0": 622,
662
+ "1": -25
663
+ },
664
+ "size": [
665
+ 347.24594407027485,
666
+ 218
667
+ ],
668
+ "flags": {},
669
+ "order": 3,
670
+ "mode": 0,
671
+ "inputs": [
672
+ {
673
+ "name": "block_edit",
674
+ "type": "TRANSFORMERBLOCKS",
675
+ "link": null,
676
+ "shape": 7
677
+ },
678
+ {
679
+ "name": "lora",
680
+ "type": "COGLORA",
681
+ "link": null,
682
+ "shape": 7
683
+ },
684
+ {
685
+ "name": "compile_args",
686
+ "type": "COMPILEARGS",
687
+ "link": null,
688
+ "shape": 7
689
+ }
690
+ ],
691
+ "outputs": [
692
+ {
693
+ "name": "model",
694
+ "type": "COGVIDEOMODEL",
695
+ "links": [
696
+ 144
697
+ ]
698
+ },
699
+ {
700
+ "name": "vae",
701
+ "type": "VAE",
702
+ "links": [
703
+ 132,
704
+ 141
705
+ ],
706
+ "slot_index": 1
707
+ }
708
+ ],
709
+ "properties": {
710
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
711
+ },
712
+ "widgets_values": [
713
+ "feizhengcong/CogvideoX-Interpolation",
714
+ "bf16",
715
+ "disabled",
716
+ false,
717
+ "sdpa",
718
+ "main_device"
719
+ ]
720
+ }
721
+ ],
722
+ "links": [
723
+ [
724
+ 54,
725
+ 20,
726
+ 0,
727
+ 30,
728
+ 0,
729
+ "CLIP"
730
+ ],
731
+ [
732
+ 71,
733
+ 36,
734
+ 0,
735
+ 37,
736
+ 0,
737
+ "IMAGE"
738
+ ],
739
+ [
740
+ 132,
741
+ 59,
742
+ 1,
743
+ 60,
744
+ 0,
745
+ "VAE"
746
+ ],
747
+ [
748
+ 134,
749
+ 60,
750
+ 0,
751
+ 44,
752
+ 0,
753
+ "IMAGE"
754
+ ],
755
+ [
756
+ 141,
757
+ 59,
758
+ 1,
759
+ 62,
760
+ 0,
761
+ "VAE"
762
+ ],
763
+ [
764
+ 142,
765
+ 37,
766
+ 0,
767
+ 62,
768
+ 1,
769
+ "IMAGE"
770
+ ],
771
+ [
772
+ 144,
773
+ 59,
774
+ 0,
775
+ 63,
776
+ 0,
777
+ "COGVIDEOMODEL"
778
+ ],
779
+ [
780
+ 145,
781
+ 30,
782
+ 0,
783
+ 63,
784
+ 1,
785
+ "CONDITIONING"
786
+ ],
787
+ [
788
+ 146,
789
+ 31,
790
+ 0,
791
+ 63,
792
+ 2,
793
+ "CONDITIONING"
794
+ ],
795
+ [
796
+ 147,
797
+ 62,
798
+ 0,
799
+ 63,
800
+ 4,
801
+ "LATENT"
802
+ ],
803
+ [
804
+ 148,
805
+ 63,
806
+ 0,
807
+ 60,
808
+ 1,
809
+ "LATENT"
810
+ ],
811
+ [
812
+ 149,
813
+ 30,
814
+ 1,
815
+ 31,
816
+ 0,
817
+ "CLIP"
818
+ ],
819
+ [
820
+ 151,
821
+ 64,
822
+ 0,
823
+ 65,
824
+ 0,
825
+ "IMAGE"
826
+ ],
827
+ [
828
+ 152,
829
+ 37,
830
+ 1,
831
+ 65,
832
+ 4,
833
+ "INT"
834
+ ],
835
+ [
836
+ 153,
837
+ 37,
838
+ 2,
839
+ 65,
840
+ 5,
841
+ "INT"
842
+ ],
843
+ [
844
+ 155,
845
+ 65,
846
+ 0,
847
+ 62,
848
+ 2,
849
+ "IMAGE"
850
+ ]
851
+ ],
852
+ "groups": [],
853
+ "config": {},
854
+ "extra": {
855
+ "ds": {
856
+ "scale": 0.7627768444387061,
857
+ "offset": [
858
+ 630.1733472923837,
859
+ 148.14641794691272
860
+ ]
861
+ }
862
+ },
863
+ "version": 0.4
864
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_0_5b_vid2vid_02.json ADDED
@@ -0,0 +1,1061 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 78,
3
+ "last_link_id": 218,
4
+ "nodes": [
5
+ {
6
+ "id": 20,
7
+ "type": "CLIPLoader",
8
+ "pos": {
9
+ "0": -29,
10
+ "1": 407
11
+ },
12
+ "size": {
13
+ "0": 451.30548095703125,
14
+ "1": 82
15
+ },
16
+ "flags": {},
17
+ "order": 0,
18
+ "mode": 0,
19
+ "inputs": [],
20
+ "outputs": [
21
+ {
22
+ "name": "CLIP",
23
+ "type": "CLIP",
24
+ "links": [
25
+ 54
26
+ ],
27
+ "slot_index": 0,
28
+ "shape": 3
29
+ }
30
+ ],
31
+ "properties": {
32
+ "Node name for S&R": "CLIPLoader"
33
+ },
34
+ "widgets_values": [
35
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
36
+ "sd3"
37
+ ]
38
+ },
39
+ {
40
+ "id": 41,
41
+ "type": "ImageResizeKJ",
42
+ "pos": {
43
+ "0": 206,
44
+ "1": -69
45
+ },
46
+ "size": {
47
+ "0": 315,
48
+ "1": 242
49
+ },
50
+ "flags": {},
51
+ "order": 7,
52
+ "mode": 0,
53
+ "inputs": [
54
+ {
55
+ "name": "image",
56
+ "type": "IMAGE",
57
+ "link": 180
58
+ },
59
+ {
60
+ "name": "get_image_size",
61
+ "type": "IMAGE",
62
+ "link": null,
63
+ "shape": 7
64
+ },
65
+ {
66
+ "name": "width_input",
67
+ "type": "INT",
68
+ "link": null,
69
+ "widget": {
70
+ "name": "width_input"
71
+ }
72
+ },
73
+ {
74
+ "name": "height_input",
75
+ "type": "INT",
76
+ "link": null,
77
+ "widget": {
78
+ "name": "height_input"
79
+ }
80
+ }
81
+ ],
82
+ "outputs": [
83
+ {
84
+ "name": "IMAGE",
85
+ "type": "IMAGE",
86
+ "links": [
87
+ 126
88
+ ],
89
+ "slot_index": 0,
90
+ "shape": 3
91
+ },
92
+ {
93
+ "name": "width",
94
+ "type": "INT",
95
+ "links": null,
96
+ "shape": 3
97
+ },
98
+ {
99
+ "name": "height",
100
+ "type": "INT",
101
+ "links": null,
102
+ "shape": 3
103
+ }
104
+ ],
105
+ "properties": {
106
+ "Node name for S&R": "ImageResizeKJ"
107
+ },
108
+ "widgets_values": [
109
+ 720,
110
+ 480,
111
+ "lanczos",
112
+ false,
113
+ 2,
114
+ 0,
115
+ 0,
116
+ "disabled"
117
+ ]
118
+ },
119
+ {
120
+ "id": 45,
121
+ "type": "VHS_LoadVideo",
122
+ "pos": {
123
+ "0": -93,
124
+ "1": -153
125
+ },
126
+ "size": [
127
+ 247.455078125,
128
+ 365.7275390625
129
+ ],
130
+ "flags": {},
131
+ "order": 4,
132
+ "mode": 0,
133
+ "inputs": [
134
+ {
135
+ "name": "meta_batch",
136
+ "type": "VHS_BatchManager",
137
+ "link": null,
138
+ "shape": 7
139
+ },
140
+ {
141
+ "name": "vae",
142
+ "type": "VAE",
143
+ "link": null,
144
+ "shape": 7
145
+ },
146
+ {
147
+ "name": "frame_load_cap",
148
+ "type": "INT",
149
+ "link": 177,
150
+ "widget": {
151
+ "name": "frame_load_cap"
152
+ }
153
+ }
154
+ ],
155
+ "outputs": [
156
+ {
157
+ "name": "IMAGE",
158
+ "type": "IMAGE",
159
+ "links": [
160
+ 179
161
+ ],
162
+ "slot_index": 0,
163
+ "shape": 3
164
+ },
165
+ {
166
+ "name": "frame_count",
167
+ "type": "INT",
168
+ "links": null,
169
+ "shape": 3
170
+ },
171
+ {
172
+ "name": "audio",
173
+ "type": "AUDIO",
174
+ "links": null,
175
+ "shape": 3
176
+ },
177
+ {
178
+ "name": "video_info",
179
+ "type": "VHS_VIDEOINFO",
180
+ "links": null,
181
+ "shape": 3
182
+ }
183
+ ],
184
+ "properties": {
185
+ "Node name for S&R": "VHS_LoadVideo"
186
+ },
187
+ "widgets_values": {
188
+ "video": "jeep.mp4",
189
+ "force_rate": 0,
190
+ "force_size": "Disabled",
191
+ "custom_width": 512,
192
+ "custom_height": 512,
193
+ "frame_load_cap": 20,
194
+ "skip_first_frames": 0,
195
+ "select_every_nth": 1,
196
+ "choose video to upload": "image",
197
+ "videopreview": {
198
+ "hidden": false,
199
+ "paused": false,
200
+ "params": {
201
+ "frame_load_cap": 20,
202
+ "skip_first_frames": 0,
203
+ "force_rate": 0,
204
+ "filename": "jeep.mp4",
205
+ "type": "input",
206
+ "format": "video/mp4",
207
+ "select_every_nth": 1
208
+ }
209
+ }
210
+ }
211
+ },
212
+ {
213
+ "id": 70,
214
+ "type": "GetImageSizeAndCount",
215
+ "pos": {
216
+ "0": 214,
217
+ "1": -234
218
+ },
219
+ "size": {
220
+ "0": 202.2143096923828,
221
+ "1": 99.23601531982422
222
+ },
223
+ "flags": {},
224
+ "order": 6,
225
+ "mode": 0,
226
+ "inputs": [
227
+ {
228
+ "name": "image",
229
+ "type": "IMAGE",
230
+ "link": 179,
231
+ "slot_index": 0
232
+ }
233
+ ],
234
+ "outputs": [
235
+ {
236
+ "name": "image",
237
+ "type": "IMAGE",
238
+ "links": [
239
+ 180
240
+ ],
241
+ "slot_index": 0,
242
+ "shape": 3
243
+ },
244
+ {
245
+ "name": "512 width",
246
+ "type": "INT",
247
+ "links": [],
248
+ "slot_index": 1,
249
+ "shape": 3
250
+ },
251
+ {
252
+ "name": "256 height",
253
+ "type": "INT",
254
+ "links": [],
255
+ "slot_index": 2,
256
+ "shape": 3
257
+ },
258
+ {
259
+ "name": "33 count",
260
+ "type": "INT",
261
+ "links": [],
262
+ "slot_index": 3,
263
+ "shape": 3
264
+ }
265
+ ],
266
+ "properties": {
267
+ "Node name for S&R": "GetImageSizeAndCount"
268
+ },
269
+ "widgets_values": []
270
+ },
271
+ {
272
+ "id": 69,
273
+ "type": "INTConstant",
274
+ "pos": {
275
+ "0": -90,
276
+ "1": -305
277
+ },
278
+ "size": {
279
+ "0": 210,
280
+ "1": 58
281
+ },
282
+ "flags": {},
283
+ "order": 1,
284
+ "mode": 0,
285
+ "inputs": [],
286
+ "outputs": [
287
+ {
288
+ "name": "value",
289
+ "type": "INT",
290
+ "links": [
291
+ 177
292
+ ],
293
+ "shape": 3
294
+ }
295
+ ],
296
+ "title": "Frames to load",
297
+ "properties": {
298
+ "Node name for S&R": "INTConstant"
299
+ },
300
+ "widgets_values": [
301
+ 33
302
+ ],
303
+ "color": "#1b4669",
304
+ "bgcolor": "#29699c"
305
+ },
306
+ {
307
+ "id": 58,
308
+ "type": "ImageConcanate",
309
+ "pos": {
310
+ "0": 1594,
311
+ "1": 230
312
+ },
313
+ "size": {
314
+ "0": 315,
315
+ "1": 102
316
+ },
317
+ "flags": {},
318
+ "order": 13,
319
+ "mode": 0,
320
+ "inputs": [
321
+ {
322
+ "name": "image1",
323
+ "type": "IMAGE",
324
+ "link": 191
325
+ },
326
+ {
327
+ "name": "image2",
328
+ "type": "IMAGE",
329
+ "link": 170
330
+ }
331
+ ],
332
+ "outputs": [
333
+ {
334
+ "name": "IMAGE",
335
+ "type": "IMAGE",
336
+ "links": [
337
+ 132
338
+ ],
339
+ "slot_index": 0,
340
+ "shape": 3
341
+ }
342
+ ],
343
+ "properties": {
344
+ "Node name for S&R": "ImageConcanate"
345
+ },
346
+ "widgets_values": [
347
+ "right",
348
+ false
349
+ ]
350
+ },
351
+ {
352
+ "id": 55,
353
+ "type": "GetImageSizeAndCount",
354
+ "pos": {
355
+ "0": 1654,
356
+ "1": 77
357
+ },
358
+ "size": {
359
+ "0": 210,
360
+ "1": 86
361
+ },
362
+ "flags": {},
363
+ "order": 12,
364
+ "mode": 0,
365
+ "inputs": [
366
+ {
367
+ "name": "image",
368
+ "type": "IMAGE",
369
+ "link": 208,
370
+ "slot_index": 0
371
+ }
372
+ ],
373
+ "outputs": [
374
+ {
375
+ "name": "image",
376
+ "type": "IMAGE",
377
+ "links": [
378
+ 170
379
+ ],
380
+ "slot_index": 0,
381
+ "shape": 3
382
+ },
383
+ {
384
+ "name": "720 width",
385
+ "type": "INT",
386
+ "links": null,
387
+ "shape": 3
388
+ },
389
+ {
390
+ "name": "480 height",
391
+ "type": "INT",
392
+ "links": null,
393
+ "shape": 3
394
+ },
395
+ {
396
+ "name": "33 count",
397
+ "type": "INT",
398
+ "links": [],
399
+ "slot_index": 3,
400
+ "shape": 3
401
+ }
402
+ ],
403
+ "properties": {
404
+ "Node name for S&R": "GetImageSizeAndCount"
405
+ },
406
+ "widgets_values": []
407
+ },
408
+ {
409
+ "id": 77,
410
+ "type": "CogVideoImageEncode",
411
+ "pos": {
412
+ "0": 952,
413
+ "1": -118
414
+ },
415
+ "size": {
416
+ "0": 315,
417
+ "1": 122
418
+ },
419
+ "flags": {},
420
+ "order": 9,
421
+ "mode": 0,
422
+ "inputs": [
423
+ {
424
+ "name": "vae",
425
+ "type": "VAE",
426
+ "link": 209
427
+ },
428
+ {
429
+ "name": "start_image",
430
+ "type": "IMAGE",
431
+ "link": 210
432
+ },
433
+ {
434
+ "name": "end_image",
435
+ "type": "IMAGE",
436
+ "link": null,
437
+ "shape": 7
438
+ }
439
+ ],
440
+ "outputs": [
441
+ {
442
+ "name": "samples",
443
+ "type": "LATENT",
444
+ "links": [
445
+ 215
446
+ ]
447
+ }
448
+ ],
449
+ "properties": {
450
+ "Node name for S&R": "CogVideoImageEncode"
451
+ },
452
+ "widgets_values": [
453
+ false,
454
+ 0
455
+ ]
456
+ },
457
+ {
458
+ "id": 76,
459
+ "type": "CogVideoDecode",
460
+ "pos": {
461
+ "0": 1335,
462
+ "1": -123
463
+ },
464
+ "size": {
465
+ "0": 315,
466
+ "1": 198
467
+ },
468
+ "flags": {},
469
+ "order": 11,
470
+ "mode": 0,
471
+ "inputs": [
472
+ {
473
+ "name": "vae",
474
+ "type": "VAE",
475
+ "link": 206
476
+ },
477
+ {
478
+ "name": "samples",
479
+ "type": "LATENT",
480
+ "link": 216
481
+ }
482
+ ],
483
+ "outputs": [
484
+ {
485
+ "name": "images",
486
+ "type": "IMAGE",
487
+ "links": [
488
+ 208
489
+ ]
490
+ }
491
+ ],
492
+ "properties": {
493
+ "Node name for S&R": "CogVideoDecode"
494
+ },
495
+ "widgets_values": [
496
+ true,
497
+ 240,
498
+ 360,
499
+ 0.2,
500
+ 0.2,
501
+ true
502
+ ]
503
+ },
504
+ {
505
+ "id": 30,
506
+ "type": "CogVideoTextEncode",
507
+ "pos": {
508
+ "0": 491,
509
+ "1": 372
510
+ },
511
+ "size": [
512
+ 478.6890949595422,
513
+ 215.66308749666905
514
+ ],
515
+ "flags": {},
516
+ "order": 3,
517
+ "mode": 0,
518
+ "inputs": [
519
+ {
520
+ "name": "clip",
521
+ "type": "CLIP",
522
+ "link": 54
523
+ }
524
+ ],
525
+ "outputs": [
526
+ {
527
+ "name": "conditioning",
528
+ "type": "CONDITIONING",
529
+ "links": [
530
+ 213
531
+ ],
532
+ "slot_index": 0,
533
+ "shape": 3
534
+ },
535
+ {
536
+ "name": "clip",
537
+ "type": "CLIP",
538
+ "links": [
539
+ 217
540
+ ],
541
+ "slot_index": 1
542
+ }
543
+ ],
544
+ "properties": {
545
+ "Node name for S&R": "CogVideoTextEncode"
546
+ },
547
+ "widgets_values": [
548
+ "A high-definition nature video showcasing a brown bear as it gracefully runs down a crystal-clear stream, surrounded by the serene ambiance of a dense, verdant forest. The sunlight filters through the canopy of tall trees, casting dappled light on the forest floor, while the gentle sound of flowing water and rustling leaves creates a peaceful atmosphere. The brown bear's fur glistens in the sunlight, highlighting its striking red and white markings as it navigates the stream with agility and playfulness.",
549
+ 1,
550
+ false
551
+ ]
552
+ },
553
+ {
554
+ "id": 31,
555
+ "type": "CogVideoTextEncode",
556
+ "pos": {
557
+ "0": 504,
558
+ "1": 651
559
+ },
560
+ "size": {
561
+ "0": 463.01251220703125,
562
+ "1": 144
563
+ },
564
+ "flags": {},
565
+ "order": 5,
566
+ "mode": 0,
567
+ "inputs": [
568
+ {
569
+ "name": "clip",
570
+ "type": "CLIP",
571
+ "link": 217
572
+ }
573
+ ],
574
+ "outputs": [
575
+ {
576
+ "name": "conditioning",
577
+ "type": "CONDITIONING",
578
+ "links": [
579
+ 214
580
+ ],
581
+ "slot_index": 0,
582
+ "shape": 3
583
+ },
584
+ {
585
+ "name": "clip",
586
+ "type": "CLIP",
587
+ "links": null
588
+ }
589
+ ],
590
+ "properties": {
591
+ "Node name for S&R": "CogVideoTextEncode"
592
+ },
593
+ "widgets_values": [
594
+ "",
595
+ 1,
596
+ true
597
+ ]
598
+ },
599
+ {
600
+ "id": 78,
601
+ "type": "CogVideoSampler",
602
+ "pos": {
603
+ "0": 1083,
604
+ "1": 255
605
+ },
606
+ "size": [
607
+ 330,
608
+ 574
609
+ ],
610
+ "flags": {},
611
+ "order": 10,
612
+ "mode": 0,
613
+ "inputs": [
614
+ {
615
+ "name": "model",
616
+ "type": "COGVIDEOMODEL",
617
+ "link": 212
618
+ },
619
+ {
620
+ "name": "positive",
621
+ "type": "CONDITIONING",
622
+ "link": 213
623
+ },
624
+ {
625
+ "name": "negative",
626
+ "type": "CONDITIONING",
627
+ "link": 214
628
+ },
629
+ {
630
+ "name": "samples",
631
+ "type": "LATENT",
632
+ "link": 215,
633
+ "shape": 7
634
+ },
635
+ {
636
+ "name": "image_cond_latents",
637
+ "type": "LATENT",
638
+ "link": null,
639
+ "shape": 7
640
+ },
641
+ {
642
+ "name": "context_options",
643
+ "type": "COGCONTEXT",
644
+ "link": null,
645
+ "shape": 7
646
+ },
647
+ {
648
+ "name": "controlnet",
649
+ "type": "COGVIDECONTROLNET",
650
+ "link": null,
651
+ "shape": 7
652
+ },
653
+ {
654
+ "name": "tora_trajectory",
655
+ "type": "TORAFEATURES",
656
+ "link": null,
657
+ "shape": 7
658
+ },
659
+ {
660
+ "name": "fastercache",
661
+ "type": "FASTERCACHEARGS",
662
+ "link": null,
663
+ "shape": 7
664
+ },
665
+ {
666
+ "name": "num_frames",
667
+ "type": "INT",
668
+ "link": 218,
669
+ "widget": {
670
+ "name": "num_frames"
671
+ }
672
+ }
673
+ ],
674
+ "outputs": [
675
+ {
676
+ "name": "samples",
677
+ "type": "LATENT",
678
+ "links": [
679
+ 216
680
+ ]
681
+ }
682
+ ],
683
+ "properties": {
684
+ "Node name for S&R": "CogVideoSampler"
685
+ },
686
+ "widgets_values": [
687
+ 49,
688
+ 25,
689
+ 6,
690
+ 0,
691
+ "fixed",
692
+ "CogVideoXDDIM",
693
+ 0.8
694
+ ]
695
+ },
696
+ {
697
+ "id": 57,
698
+ "type": "GetImageSizeAndCount",
699
+ "pos": {
700
+ "0": 595,
701
+ "1": -79
702
+ },
703
+ "size": {
704
+ "0": 202.2143096923828,
705
+ "1": 99.23601531982422
706
+ },
707
+ "flags": {},
708
+ "order": 8,
709
+ "mode": 0,
710
+ "inputs": [
711
+ {
712
+ "name": "image",
713
+ "type": "IMAGE",
714
+ "link": 126,
715
+ "slot_index": 0
716
+ }
717
+ ],
718
+ "outputs": [
719
+ {
720
+ "name": "image",
721
+ "type": "IMAGE",
722
+ "links": [
723
+ 191,
724
+ 210
725
+ ],
726
+ "slot_index": 0,
727
+ "shape": 3
728
+ },
729
+ {
730
+ "name": "720 width",
731
+ "type": "INT",
732
+ "links": [],
733
+ "slot_index": 1,
734
+ "shape": 3
735
+ },
736
+ {
737
+ "name": "480 height",
738
+ "type": "INT",
739
+ "links": [],
740
+ "slot_index": 2,
741
+ "shape": 3
742
+ },
743
+ {
744
+ "name": "33 count",
745
+ "type": "INT",
746
+ "links": [
747
+ 218
748
+ ],
749
+ "slot_index": 3,
750
+ "shape": 3
751
+ }
752
+ ],
753
+ "properties": {
754
+ "Node name for S&R": "GetImageSizeAndCount"
755
+ },
756
+ "widgets_values": []
757
+ },
758
+ {
759
+ "id": 75,
760
+ "type": "DownloadAndLoadCogVideoModel",
761
+ "pos": {
762
+ "0": 606,
763
+ "1": 85
764
+ },
765
+ "size": {
766
+ "0": 315,
767
+ "1": 218
768
+ },
769
+ "flags": {},
770
+ "order": 2,
771
+ "mode": 0,
772
+ "inputs": [
773
+ {
774
+ "name": "block_edit",
775
+ "type": "TRANSFORMERBLOCKS",
776
+ "link": null,
777
+ "shape": 7
778
+ },
779
+ {
780
+ "name": "lora",
781
+ "type": "COGLORA",
782
+ "link": null,
783
+ "shape": 7
784
+ },
785
+ {
786
+ "name": "compile_args",
787
+ "type": "COMPILEARGS",
788
+ "link": null,
789
+ "shape": 7
790
+ }
791
+ ],
792
+ "outputs": [
793
+ {
794
+ "name": "model",
795
+ "type": "COGVIDEOMODEL",
796
+ "links": [
797
+ 212
798
+ ]
799
+ },
800
+ {
801
+ "name": "vae",
802
+ "type": "VAE",
803
+ "links": [
804
+ 206,
805
+ 209
806
+ ]
807
+ }
808
+ ],
809
+ "properties": {
810
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
811
+ },
812
+ "widgets_values": [
813
+ "THUDM/CogVideoX-5b",
814
+ "bf16",
815
+ "disabled",
816
+ false,
817
+ "sdpa",
818
+ "main_device"
819
+ ]
820
+ },
821
+ {
822
+ "id": 47,
823
+ "type": "VHS_VideoCombine",
824
+ "pos": {
825
+ "0": 1946,
826
+ "1": -172
827
+ },
828
+ "size": [
829
+ 1110,
830
+ 687.3333333333333
831
+ ],
832
+ "flags": {},
833
+ "order": 14,
834
+ "mode": 0,
835
+ "inputs": [
836
+ {
837
+ "name": "images",
838
+ "type": "IMAGE",
839
+ "link": 132
840
+ },
841
+ {
842
+ "name": "audio",
843
+ "type": "AUDIO",
844
+ "link": null,
845
+ "shape": 7
846
+ },
847
+ {
848
+ "name": "meta_batch",
849
+ "type": "VHS_BatchManager",
850
+ "link": null,
851
+ "shape": 7
852
+ },
853
+ {
854
+ "name": "vae",
855
+ "type": "VAE",
856
+ "link": null,
857
+ "shape": 7
858
+ }
859
+ ],
860
+ "outputs": [
861
+ {
862
+ "name": "Filenames",
863
+ "type": "VHS_FILENAMES",
864
+ "links": null,
865
+ "shape": 3
866
+ }
867
+ ],
868
+ "properties": {
869
+ "Node name for S&R": "VHS_VideoCombine"
870
+ },
871
+ "widgets_values": {
872
+ "frame_rate": 8,
873
+ "loop_count": 0,
874
+ "filename_prefix": "CogVideoX_vid2vid",
875
+ "format": "video/h264-mp4",
876
+ "pix_fmt": "yuv420p",
877
+ "crf": 19,
878
+ "save_metadata": true,
879
+ "pingpong": false,
880
+ "save_output": true,
881
+ "videopreview": {
882
+ "hidden": false,
883
+ "paused": false,
884
+ "params": {
885
+ "filename": "CogVideoX_vid2vid_00003.mp4",
886
+ "subfolder": "",
887
+ "type": "temp",
888
+ "format": "video/h264-mp4",
889
+ "frame_rate": 8
890
+ }
891
+ }
892
+ }
893
+ }
894
+ ],
895
+ "links": [
896
+ [
897
+ 54,
898
+ 20,
899
+ 0,
900
+ 30,
901
+ 0,
902
+ "CLIP"
903
+ ],
904
+ [
905
+ 126,
906
+ 41,
907
+ 0,
908
+ 57,
909
+ 0,
910
+ "IMAGE"
911
+ ],
912
+ [
913
+ 132,
914
+ 58,
915
+ 0,
916
+ 47,
917
+ 0,
918
+ "IMAGE"
919
+ ],
920
+ [
921
+ 170,
922
+ 55,
923
+ 0,
924
+ 58,
925
+ 1,
926
+ "IMAGE"
927
+ ],
928
+ [
929
+ 177,
930
+ 69,
931
+ 0,
932
+ 45,
933
+ 2,
934
+ "INT"
935
+ ],
936
+ [
937
+ 179,
938
+ 45,
939
+ 0,
940
+ 70,
941
+ 0,
942
+ "IMAGE"
943
+ ],
944
+ [
945
+ 180,
946
+ 70,
947
+ 0,
948
+ 41,
949
+ 0,
950
+ "IMAGE"
951
+ ],
952
+ [
953
+ 191,
954
+ 57,
955
+ 0,
956
+ 58,
957
+ 0,
958
+ "IMAGE"
959
+ ],
960
+ [
961
+ 206,
962
+ 75,
963
+ 1,
964
+ 76,
965
+ 0,
966
+ "VAE"
967
+ ],
968
+ [
969
+ 208,
970
+ 76,
971
+ 0,
972
+ 55,
973
+ 0,
974
+ "IMAGE"
975
+ ],
976
+ [
977
+ 209,
978
+ 75,
979
+ 1,
980
+ 77,
981
+ 0,
982
+ "VAE"
983
+ ],
984
+ [
985
+ 210,
986
+ 57,
987
+ 0,
988
+ 77,
989
+ 1,
990
+ "IMAGE"
991
+ ],
992
+ [
993
+ 212,
994
+ 75,
995
+ 0,
996
+ 78,
997
+ 0,
998
+ "COGVIDEOMODEL"
999
+ ],
1000
+ [
1001
+ 213,
1002
+ 30,
1003
+ 0,
1004
+ 78,
1005
+ 1,
1006
+ "CONDITIONING"
1007
+ ],
1008
+ [
1009
+ 214,
1010
+ 31,
1011
+ 0,
1012
+ 78,
1013
+ 2,
1014
+ "CONDITIONING"
1015
+ ],
1016
+ [
1017
+ 215,
1018
+ 77,
1019
+ 0,
1020
+ 78,
1021
+ 3,
1022
+ "LATENT"
1023
+ ],
1024
+ [
1025
+ 216,
1026
+ 78,
1027
+ 0,
1028
+ 76,
1029
+ 1,
1030
+ "LATENT"
1031
+ ],
1032
+ [
1033
+ 217,
1034
+ 30,
1035
+ 1,
1036
+ 31,
1037
+ 0,
1038
+ "CLIP"
1039
+ ],
1040
+ [
1041
+ 218,
1042
+ 57,
1043
+ 3,
1044
+ 78,
1045
+ 9,
1046
+ "INT"
1047
+ ]
1048
+ ],
1049
+ "groups": [],
1050
+ "config": {},
1051
+ "extra": {
1052
+ "ds": {
1053
+ "scale": 0.8390545288825798,
1054
+ "offset": [
1055
+ -318.82552550589344,
1056
+ 331.70430573737934
1057
+ ]
1058
+ }
1059
+ },
1060
+ "version": 0.4
1061
+ }
custom_nodes/ComfyUI-CogVideoXWrapper/example_workflows/cogvideox_1_5_5b_I2V_01.json ADDED
@@ -0,0 +1,688 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "last_node_id": 64,
3
+ "last_link_id": 149,
4
+ "nodes": [
5
+ {
6
+ "id": 63,
7
+ "type": "CogVideoSampler",
8
+ "pos": {
9
+ "0": 1142,
10
+ "1": 74
11
+ },
12
+ "size": {
13
+ "0": 330,
14
+ "1": 574
15
+ },
16
+ "flags": {},
17
+ "order": 7,
18
+ "mode": 0,
19
+ "inputs": [
20
+ {
21
+ "name": "model",
22
+ "type": "COGVIDEOMODEL",
23
+ "link": 144
24
+ },
25
+ {
26
+ "name": "positive",
27
+ "type": "CONDITIONING",
28
+ "link": 145
29
+ },
30
+ {
31
+ "name": "negative",
32
+ "type": "CONDITIONING",
33
+ "link": 146
34
+ },
35
+ {
36
+ "name": "samples",
37
+ "type": "LATENT",
38
+ "link": null,
39
+ "shape": 7
40
+ },
41
+ {
42
+ "name": "image_cond_latents",
43
+ "type": "LATENT",
44
+ "link": 147,
45
+ "shape": 7
46
+ },
47
+ {
48
+ "name": "context_options",
49
+ "type": "COGCONTEXT",
50
+ "link": null,
51
+ "shape": 7
52
+ },
53
+ {
54
+ "name": "controlnet",
55
+ "type": "COGVIDECONTROLNET",
56
+ "link": null,
57
+ "shape": 7
58
+ },
59
+ {
60
+ "name": "tora_trajectory",
61
+ "type": "TORAFEATURES",
62
+ "link": null,
63
+ "shape": 7
64
+ },
65
+ {
66
+ "name": "fastercache",
67
+ "type": "FASTERCACHEARGS",
68
+ "link": null,
69
+ "shape": 7
70
+ }
71
+ ],
72
+ "outputs": [
73
+ {
74
+ "name": "samples",
75
+ "type": "LATENT",
76
+ "links": [
77
+ 148
78
+ ]
79
+ }
80
+ ],
81
+ "properties": {
82
+ "Node name for S&R": "CogVideoSampler"
83
+ },
84
+ "widgets_values": [
85
+ 49,
86
+ 25,
87
+ 6,
88
+ 0,
89
+ "fixed",
90
+ "CogVideoXDDIM",
91
+ 1
92
+ ]
93
+ },
94
+ {
95
+ "id": 62,
96
+ "type": "CogVideoImageEncode",
97
+ "pos": {
98
+ "0": 1149,
99
+ "1": 711
100
+ },
101
+ "size": {
102
+ "0": 315,
103
+ "1": 122
104
+ },
105
+ "flags": {},
106
+ "order": 5,
107
+ "mode": 0,
108
+ "inputs": [
109
+ {
110
+ "name": "vae",
111
+ "type": "VAE",
112
+ "link": 141
113
+ },
114
+ {
115
+ "name": "start_image",
116
+ "type": "IMAGE",
117
+ "link": 142
118
+ },
119
+ {
120
+ "name": "end_image",
121
+ "type": "IMAGE",
122
+ "link": null,
123
+ "shape": 7
124
+ }
125
+ ],
126
+ "outputs": [
127
+ {
128
+ "name": "samples",
129
+ "type": "LATENT",
130
+ "links": [
131
+ 147
132
+ ]
133
+ }
134
+ ],
135
+ "properties": {
136
+ "Node name for S&R": "CogVideoImageEncode"
137
+ },
138
+ "widgets_values": [
139
+ false,
140
+ 0
141
+ ]
142
+ },
143
+ {
144
+ "id": 30,
145
+ "type": "CogVideoTextEncode",
146
+ "pos": {
147
+ "0": 493,
148
+ "1": 303
149
+ },
150
+ "size": {
151
+ "0": 471.90142822265625,
152
+ "1": 168.08047485351562
153
+ },
154
+ "flags": {},
155
+ "order": 4,
156
+ "mode": 0,
157
+ "inputs": [
158
+ {
159
+ "name": "clip",
160
+ "type": "CLIP",
161
+ "link": 54
162
+ }
163
+ ],
164
+ "outputs": [
165
+ {
166
+ "name": "conditioning",
167
+ "type": "CONDITIONING",
168
+ "links": [
169
+ 145
170
+ ],
171
+ "slot_index": 0,
172
+ "shape": 3
173
+ },
174
+ {
175
+ "name": "clip",
176
+ "type": "CLIP",
177
+ "links": [
178
+ 149
179
+ ],
180
+ "slot_index": 1
181
+ }
182
+ ],
183
+ "properties": {
184
+ "Node name for S&R": "CogVideoTextEncode"
185
+ },
186
+ "widgets_values": [
187
+ "a majestic stag is grazing in an enhanced forest, basking in the setting sun filtered by the trees",
188
+ 1,
189
+ false
190
+ ]
191
+ },
192
+ {
193
+ "id": 36,
194
+ "type": "LoadImage",
195
+ "pos": {
196
+ "0": 335,
197
+ "1": 731
198
+ },
199
+ "size": {
200
+ "0": 402.06353759765625,
201
+ "1": 396.6225891113281
202
+ },
203
+ "flags": {},
204
+ "order": 0,
205
+ "mode": 0,
206
+ "inputs": [],
207
+ "outputs": [
208
+ {
209
+ "name": "IMAGE",
210
+ "type": "IMAGE",
211
+ "links": [
212
+ 71
213
+ ],
214
+ "slot_index": 0,
215
+ "shape": 3
216
+ },
217
+ {
218
+ "name": "MASK",
219
+ "type": "MASK",
220
+ "links": null,
221
+ "shape": 3
222
+ }
223
+ ],
224
+ "properties": {
225
+ "Node name for S&R": "LoadImage"
226
+ },
227
+ "widgets_values": [
228
+ "sd3stag.png",
229
+ "image"
230
+ ]
231
+ },
232
+ {
233
+ "id": 20,
234
+ "type": "CLIPLoader",
235
+ "pos": {
236
+ "0": -2,
237
+ "1": 304
238
+ },
239
+ "size": {
240
+ "0": 451.30548095703125,
241
+ "1": 82
242
+ },
243
+ "flags": {},
244
+ "order": 1,
245
+ "mode": 0,
246
+ "inputs": [],
247
+ "outputs": [
248
+ {
249
+ "name": "CLIP",
250
+ "type": "CLIP",
251
+ "links": [
252
+ 54
253
+ ],
254
+ "slot_index": 0,
255
+ "shape": 3
256
+ }
257
+ ],
258
+ "properties": {
259
+ "Node name for S&R": "CLIPLoader"
260
+ },
261
+ "widgets_values": [
262
+ "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
263
+ "sd3"
264
+ ]
265
+ },
266
+ {
267
+ "id": 60,
268
+ "type": "CogVideoDecode",
269
+ "pos": {
270
+ "0": 1523,
271
+ "1": -6
272
+ },
273
+ "size": {
274
+ "0": 315,
275
+ "1": 198
276
+ },
277
+ "flags": {},
278
+ "order": 8,
279
+ "mode": 0,
280
+ "inputs": [
281
+ {
282
+ "name": "vae",
283
+ "type": "VAE",
284
+ "link": 132
285
+ },
286
+ {
287
+ "name": "samples",
288
+ "type": "LATENT",
289
+ "link": 148
290
+ }
291
+ ],
292
+ "outputs": [
293
+ {
294
+ "name": "images",
295
+ "type": "IMAGE",
296
+ "links": [
297
+ 134
298
+ ]
299
+ }
300
+ ],
301
+ "properties": {
302
+ "Node name for S&R": "CogVideoDecode"
303
+ },
304
+ "widgets_values": [
305
+ true,
306
+ 240,
307
+ 360,
308
+ 0.2,
309
+ 0.2,
310
+ true
311
+ ]
312
+ },
313
+ {
314
+ "id": 37,
315
+ "type": "ImageResizeKJ",
316
+ "pos": {
317
+ "0": 784,
318
+ "1": 731
319
+ },
320
+ "size": {
321
+ "0": 315,
322
+ "1": 266
323
+ },
324
+ "flags": {},
325
+ "order": 3,
326
+ "mode": 0,
327
+ "inputs": [
328
+ {
329
+ "name": "image",
330
+ "type": "IMAGE",
331
+ "link": 71
332
+ },
333
+ {
334
+ "name": "get_image_size",
335
+ "type": "IMAGE",
336
+ "link": null,
337
+ "shape": 7
338
+ },
339
+ {
340
+ "name": "width_input",
341
+ "type": "INT",
342
+ "link": null,
343
+ "widget": {
344
+ "name": "width_input"
345
+ }
346
+ },
347
+ {
348
+ "name": "height_input",
349
+ "type": "INT",
350
+ "link": null,
351
+ "widget": {
352
+ "name": "height_input"
353
+ }
354
+ }
355
+ ],
356
+ "outputs": [
357
+ {
358
+ "name": "IMAGE",
359
+ "type": "IMAGE",
360
+ "links": [
361
+ 142
362
+ ],
363
+ "slot_index": 0,
364
+ "shape": 3
365
+ },
366
+ {
367
+ "name": "width",
368
+ "type": "INT",
369
+ "links": null,
370
+ "shape": 3
371
+ },
372
+ {
373
+ "name": "height",
374
+ "type": "INT",
375
+ "links": null,
376
+ "shape": 3
377
+ }
378
+ ],
379
+ "properties": {
380
+ "Node name for S&R": "ImageResizeKJ"
381
+ },
382
+ "widgets_values": [
383
+ 1360,
384
+ 768,
385
+ "lanczos",
386
+ false,
387
+ 16,
388
+ 0,
389
+ 0,
390
+ "disabled"
391
+ ]
392
+ },
393
+ {
394
+ "id": 31,
395
+ "type": "CogVideoTextEncode",
396
+ "pos": {
397
+ "0": 497,
398
+ "1": 520
399
+ },
400
+ "size": {
401
+ "0": 463.01251220703125,
402
+ "1": 144
403
+ },
404
+ "flags": {},
405
+ "order": 6,
406
+ "mode": 0,
407
+ "inputs": [
408
+ {
409
+ "name": "clip",
410
+ "type": "CLIP",
411
+ "link": 149
412
+ }
413
+ ],
414
+ "outputs": [
415
+ {
416
+ "name": "conditioning",
417
+ "type": "CONDITIONING",
418
+ "links": [
419
+ 146
420
+ ],
421
+ "slot_index": 0,
422
+ "shape": 3
423
+ },
424
+ {
425
+ "name": "clip",
426
+ "type": "CLIP",
427
+ "links": null
428
+ }
429
+ ],
430
+ "properties": {
431
+ "Node name for S&R": "CogVideoTextEncode"
432
+ },
433
+ "widgets_values": [
434
+ "",
435
+ 1,
436
+ true
437
+ ]
438
+ },
439
+ {
440
+ "id": 59,
441
+ "type": "DownloadAndLoadCogVideoModel",
442
+ "pos": {
443
+ "0": 622,
444
+ "1": -25
445
+ },
446
+ "size": {
447
+ "0": 315,
448
+ "1": 218
449
+ },
450
+ "flags": {},
451
+ "order": 2,
452
+ "mode": 0,
453
+ "inputs": [
454
+ {
455
+ "name": "block_edit",
456
+ "type": "TRANSFORMERBLOCKS",
457
+ "link": null,
458
+ "shape": 7
459
+ },
460
+ {
461
+ "name": "lora",
462
+ "type": "COGLORA",
463
+ "link": null,
464
+ "shape": 7
465
+ },
466
+ {
467
+ "name": "compile_args",
468
+ "type": "COMPILEARGS",
469
+ "link": null,
470
+ "shape": 7
471
+ }
472
+ ],
473
+ "outputs": [
474
+ {
475
+ "name": "model",
476
+ "type": "COGVIDEOMODEL",
477
+ "links": [
478
+ 144
479
+ ]
480
+ },
481
+ {
482
+ "name": "vae",
483
+ "type": "VAE",
484
+ "links": [
485
+ 132,
486
+ 141
487
+ ],
488
+ "slot_index": 1
489
+ }
490
+ ],
491
+ "properties": {
492
+ "Node name for S&R": "DownloadAndLoadCogVideoModel"
493
+ },
494
+ "widgets_values": [
495
+ "kijai/CogVideoX-5b-1.5-I2V",
496
+ "bf16",
497
+ "disabled",
498
+ false,
499
+ "sdpa",
500
+ "main_device"
501
+ ]
502
+ },
503
+ {
504
+ "id": 44,
505
+ "type": "VHS_VideoCombine",
506
+ "pos": {
507
+ "0": 1884,
508
+ "1": -6
509
+ },
510
+ "size": [
511
+ 605.3909912109375,
512
+ 310
513
+ ],
514
+ "flags": {},
515
+ "order": 9,
516
+ "mode": 0,
517
+ "inputs": [
518
+ {
519
+ "name": "images",
520
+ "type": "IMAGE",
521
+ "link": 134
522
+ },
523
+ {
524
+ "name": "audio",
525
+ "type": "AUDIO",
526
+ "link": null,
527
+ "shape": 7
528
+ },
529
+ {
530
+ "name": "meta_batch",
531
+ "type": "VHS_BatchManager",
532
+ "link": null,
533
+ "shape": 7
534
+ },
535
+ {
536
+ "name": "vae",
537
+ "type": "VAE",
538
+ "link": null,
539
+ "shape": 7
540
+ }
541
+ ],
542
+ "outputs": [
543
+ {
544
+ "name": "Filenames",
545
+ "type": "VHS_FILENAMES",
546
+ "links": null,
547
+ "shape": 3
548
+ }
549
+ ],
550
+ "properties": {
551
+ "Node name for S&R": "VHS_VideoCombine"
552
+ },
553
+ "widgets_values": {
554
+ "frame_rate": 16,
555
+ "loop_count": 0,
556
+ "filename_prefix": "CogVideoX_1_5_I2V",
557
+ "format": "video/h264-mp4",
558
+ "pix_fmt": "yuv420p",
559
+ "crf": 19,
560
+ "save_metadata": true,
561
+ "pingpong": false,
562
+ "save_output": true,
563
+ "videopreview": {
564
+ "hidden": false,
565
+ "paused": false,
566
+ "params": {
567
+ "filename": "CogVideoX-I2V_00004.mp4",
568
+ "subfolder": "",
569
+ "type": "temp",
570
+ "format": "video/h264-mp4",
571
+ "frame_rate": 8
572
+ },
573
+ "muted": false
574
+ }
575
+ }
576
+ }
577
+ ],
578
+ "links": [
579
+ [
580
+ 54,
581
+ 20,
582
+ 0,
583
+ 30,
584
+ 0,
585
+ "CLIP"
586
+ ],
587
+ [
588
+ 71,
589
+ 36,
590
+ 0,
591
+ 37,
592
+ 0,
593
+ "IMAGE"
594
+ ],
595
+ [
596
+ 132,
597
+ 59,
598
+ 1,
599
+ 60,
600
+ 0,
601
+ "VAE"
602
+ ],
603
+ [
604
+ 134,
605
+ 60,
606
+ 0,
607
+ 44,
608
+ 0,
609
+ "IMAGE"
610
+ ],
611
+ [
612
+ 141,
613
+ 59,
614
+ 1,
615
+ 62,
616
+ 0,
617
+ "VAE"
618
+ ],
619
+ [
620
+ 142,
621
+ 37,
622
+ 0,
623
+ 62,
624
+ 1,
625
+ "IMAGE"
626
+ ],
627
+ [
628
+ 144,
629
+ 59,
630
+ 0,
631
+ 63,
632
+ 0,
633
+ "COGVIDEOMODEL"
634
+ ],
635
+ [
636
+ 145,
637
+ 30,
638
+ 0,
639
+ 63,
640
+ 1,
641
+ "CONDITIONING"
642
+ ],
643
+ [
644
+ 146,
645
+ 31,
646
+ 0,
647
+ 63,
648
+ 2,
649
+ "CONDITIONING"
650
+ ],
651
+ [
652
+ 147,
653
+ 62,
654
+ 0,
655
+ 63,
656
+ 4,
657
+ "LATENT"
658
+ ],
659
+ [
660
+ 148,
661
+ 63,
662
+ 0,
663
+ 60,
664
+ 1,
665
+ "LATENT"
666
+ ],
667
+ [
668
+ 149,
669
+ 30,
670
+ 1,
671
+ 31,
672
+ 0,
673
+ "CLIP"
674
+ ]
675
+ ],
676
+ "groups": [],
677
+ "config": {},
678
+ "extra": {
679
+ "ds": {
680
+ "scale": 0.7627768444387097,
681
+ "offset": [
682
+ 716.7143770104391,
683
+ 291.75859557289965
684
+ ]
685
+ }
686
+ },
687
+ "version": 0.4
688
+ }