# pytorch_lightning==2.0.9
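# Training configuration for the t2v_enhanced ControlNet video model.
# A config like this is typically consumed by a LightningCLI entry point;
# the exact script name is repo-specific, e.g. (assumed invocation):
#   python train.py fit --config <this_file>.yaml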
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '8'
  num_nodes: 1
  precision: 16-mixed
  logger: null
  callbacks:
  - class_path: pytorch_lightning.callbacks.RichModelSummary
    init_args:
      max_depth: 1
  - class_path: pytorch_lightning.callbacks.RichProgressBar
    init_args:
      refresh_rate: 1
      leave: false
      theme:
        description: white
        progress_bar: '#6206E0'
        progress_bar_finished: '#6206E0'
        progress_bar_pulse: '#6206E0'
        batch_progress: white
        time: grey54
        processing_speed: grey70
        metrics: white
      console_kwargs: null
  fast_dev_run: false
  max_epochs: 5000
  min_epochs: null
  max_steps: 2020000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: 512
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
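  # With an integer val_check_interval, Lightning runs validation every
  # 8000 training batches within an epoch, on at most 512 val batches
  # (limit_val_batches above).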
  val_check_interval: 8000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: 10
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
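  # Gradients are accumulated over 8 batches; combined with 8 devices, the
  # effective batch size is 64x the per-device dataloader batch size.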
  accumulate_grad_batches: 8
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  inference_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.InferenceParams
    init_args:
      width: 256
      height: 256
      video_length: 16
      guidance_scale: 7.5
      use_dec_scaling: true
      frame_rate: 8
      num_inference_steps: 50
      eta: 1.0
      n_autoregressive_generations: 1
      mode: long_video
      start_from_real_input: true
      eval_loss_metrics: false
      scheduler_cls: ''
      negative_prompt: ''
      conditioning_from_all_past: false
      validation_samples: 80
      conditioning_type: last_chunk
      result_formats:
      - eval_gif
      - gif
      - mp4
      concat_video: true
  opt_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.OptimizerParams
    init_args:
      learning_rate: 5.0e-05
      layers_config:
        class_path: t2v_enhanced.model.requires_grad_setter.LayerConfig
        init_args:
          gradient_setup:
          - - false
            - - vae
          - - false
            - - text_encoder
          - - false
            - - image_encoder
          - - true
            - - resampler
          - - true
            - - unet
          - - true
            - - base_model
          - - false
            - - base_model
              - transformer_in
          - - false
            - - base_model
              - temp_attentions
          - - false
            - - base_model
              - temp_convs
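          # Each gradient_setup entry above is a [requires_grad, [module path]]
          # pair: vae, text_encoder, and image_encoder stay frozen; resampler,
          # unet, and base_model train, except the base_model's transformer_in,
          # temp_attentions, and temp_convs submodules, which are frozen again.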
      layers_config_base: null
      use_warmup: false
      warmup_steps: 10000
      warmup_start_factor: 1.0e-05
      learning_rate_spatial: 0.0
      use_8_bit_adam: false
      noise_generator: null
      noise_decomposition: null
      perceptual_loss: false
      noise_offset: 0.0
      split_opt_by_node: false
      reset_prediction_type_to_eps: false
      train_val_sampler_may_differ: true
      measure_similarity: false
      similarity_loss: false
      similarity_loss_weight: 1.0
      loss_conditional_weight: 0.0
      loss_conditional_weight_convex: false
      loss_conditional_change_after_step: 0
      mask_conditional_frames: false
      sample_from_noise: true
      mask_alternating: false
      uncondition_freq: -1
      no_text_condition_control: false
      inject_image_into_input: false
      inject_at_T: false
      resampling_steps: 1
      control_freq_in_resample: 1
      resample_to_T: false
      adaptive_loss_reweight: false
      load_resampler_from_ckpt: ''
      skip_controlnet_branch: false
      use_fps_conditioning: false
      num_frame_embeddings_range: 16
      start_frame_training: 16
      start_frame_ctrl: 16
      load_trained_base_model_and_resampler_from_ckpt: ''
      load_trained_controlnet_from_ckpt: ''
  unet_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.UNetParams
    init_args:
      conditioning_embedding_out_channels:
      - 32
      - 96
      - 256
      - 512
      ckpt_spatial_layers: ''
      pipeline_repo: damo-vilab/text-to-video-ms-1.7b
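      # Base weights come from the ModelScope text-to-video model hosted on
      # the Hugging Face Hub under this repo id.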
      unet_from_diffusers: true
      spatial_latent_input: false
      num_frame_conditioning: 1
      pipeline_class: t2v_enhanced.model.model.controlnet.pipeline_text_to_video_w_controlnet_synth.TextToVideoSDPipeline
      frame_expansion: none
      downsample_controlnet_cond: true
      num_frames: 16
      pre_transformer_in_cond: false
      num_tranformers: 1
      zero_conv_3d: false
      merging_mode: addition
      compute_only_conditioned_frames: false
      condition_encoder: ''
      zero_conv_mode: Identity
      clean_model: true
      merging_mode_base: attention_cross_attention
      attention_mask_params: null
      attention_mask_params_base: null
      modelscope_input_format: true
      temporal_self_attention_only_on_conditioning: false
      temporal_self_attention_mask_included_itself: false
      use_post_merger_zero_conv: false
      weight_control_sample: 1.0
      use_controlnet_mask: false
      random_mask_shift: false
      random_mask: false
      use_resampler: true
      unet_from_pipe: false
      unet_operates_on_2d: false
      image_encoder: CLIP
      use_standard_attention_processor: false
      num_frames_before_chunk: 0
      resampler_type: single_frame
      resampler_cls: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.ImgEmbContextResampler
      resampler_merging_layers: 4
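      # Image conditioning uses a frozen OpenCLIP ViT-H-14 encoder
      # (laion2b_s32b_b79k weights, freeze: true below); its embeddings are
      # merged into context tokens by the ImgEmbContextResampler above.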
      image_encoder_obj:
        class_path: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.FrozenOpenCLIPImageEmbedder
        init_args:
          arch: ViT-H-14
          version: laion2b_s32b_b79k
          device: cuda
          max_length: 77
          freeze: true
          antialias: true
          ucg_rate: 0.0
          unsqueeze_dim: false
          repeat_to_max_len: false
          num_image_crops: 0
          output_tokens: false
      cfg_text_image: false
      aggregation: last_out
      resampler_random_shift: true
      img_cond_alpha_per_frame: false
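      # 8 of the 16 frames in each chunk are fed to the control branch as
      # conditioning from the previous chunk (conditioning_type: last_chunk
      # in inference_params).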
      num_control_input_frames: 8
      use_image_encoder_normalization: false
      use_of: false
      ema_param: -1.0
      concat: false
      use_image_tokens_main: true
      use_image_tokens_ctrl: false
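# The top-level options below are consumed by the repo's training script
# rather than by Lightning itself; matmul_precision presumably maps to
# torch.set_float32_matmul_precision.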
result_fol: results
exp_name: my_exp_name
run_name: my_run_name
scale_lr: false
matmul_precision: high