model:
  target: vtdm.vtdm_gen_v01.VideoLDM
  base_learning_rate: 1.0e-05
  params:
    input_key: video
    scale_factor: 0.18215
    log_keys: caption
    num_samples: 25  # frame_rate
    trained_param_keys:
      - diffusion_model.label_emb.0.0.weight
      - .emb_layers.
      - .time_stack.
    en_and_decode_n_samples_a_time: 25  # frame_rate
    disable_first_stage_autocast: true

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.Denoiser
      params:
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.VScalingWithEDMcNoise

    network_config:
      target: sgm.modules.diffusionmodules.video_model.VideoUNet
      params:
        adm_in_channels: 768
        num_classes: sequential
        use_checkpoint: true
        in_channels: 8
        out_channels: 4
        model_channels: 320
        attention_resolutions:
          - 4
          - 2
          - 1
        num_res_blocks: 2
        channel_mult:
          - 1
          - 2
          - 4
          - 4
        num_head_channels: 64
        use_linear_in_transformer: true
        transformer_depth: 1
        context_dim: 1024
        spatial_transformer_attn_type: softmax-xformers
        extra_ff_mix_layer: true
        use_spatial_context: true
        merge_strategy: learned_with_images
        video_kernel_size:
          - 3
          - 1
          - 1

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: false
            input_key: cond_frames_without_noise
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.FrozenOpenCLIPImagePredictionEmbedder
            params:
              n_cond_frames: 1
              n_copies: 1
              open_clip_embedding_config:
                target: sgm.modules.encoders.modules.FrozenOpenCLIPImageEmbedder
                params:
                  version: ckpts/open_clip_pytorch_model.bin
                  freeze: true
          - is_trainable: false
            input_key: video
            ucg_rate: 0.0
            target: vtdm.encoders.AesEmbedder
          - is_trainable: false
            input_key: elevation
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - input_key: cond_frames
            is_trainable: false
            ucg_rate: 0.1
            target: sgm.modules.encoders.modules.VideoPredictionEmbedderWithEncoder
            params:
              disable_encoder_autocast: true
              n_cond_frames: 1
              n_copies: 25  # frame_rate
              is_ae: true
              encoder_config:
                target: sgm.models.autoencoder.AutoencoderKLModeOnly
                params:
                  embed_dim: 4
                  monitor: val/rec_loss
                  ddconfig:
                    attn_type: vanilla-xformers
                    double_z: true
                    z_channels: 4
                    resolution: 256
                    in_channels: 3
                    out_ch: 3
                    ch: 128
                    ch_mult:
                      - 1
                      - 2
                      - 4
                      - 4
                    num_res_blocks: 2
                    attn_resolutions: []
                    dropout: 0.0
                  lossconfig:
                    target: torch.nn.Identity
          - input_key: cond_aug
            is_trainable: false
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
            - 1
            - 2
            - 4
            - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    loss_fn_config:
      target: sgm.modules.diffusionmodules.loss.StandardDiffusionLoss
      params:
        num_frames: 25  # frame_rate
        batch2model_keys:
          - num_video_frames
          - image_only_indicator
        sigma_sampler_config:
          target: sgm.modules.diffusionmodules.sigma_sampling.EDMSampling
          params:
            p_mean: 1.0
            p_std: 1.6
        loss_weighting_config:
          target: sgm.modules.diffusionmodules.loss_weighting.VWeighting

    sampler_config:
      target: sgm.modules.diffusionmodules.sampling.LinearMultistepSampler
      params:
        num_steps: 50
        verbose: true
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.EDMDiscretization
          params:
            sigma_max: 700.0
        guider_config:
          target: sgm.modules.diffusionmodules.guiders.LinearPredictionGuider
          params:
            num_frames: 25  # frame_rate
            max_scale: 2.5
            min_scale: 1.0
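
# ---------------------------------------------------------------------------
# Usage sketch (an assumption based on the sgm-style `target`/`params` layout
# above, not a documented entry point of this repo): configs like this are
# typically loaded with OmegaConf and instantiated via sgm's
# instantiate_from_config helper, roughly as follows. The config path below
# is hypothetical.
#
#   from omegaconf import OmegaConf
#   from sgm.util import instantiate_from_config
#
#   config = OmegaConf.load("configs/vtdm_gen_v01.yaml")  # hypothetical path
#   model = instantiate_from_config(config.model)
#
# instantiate_from_config imports the dotted `target` path and calls it with
# the sibling `params` mapping as keyword arguments; the nested *_config
# blocks (denoiser, conditioner, sampler, ...) are resolved the same way by
# the modules that receive them.
# ---------------------------------------------------------------------------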