---
# pytorch_lightning==2.0.9
seed_everything: 33
trainer:
  accelerator: auto
  strategy: auto
  devices: '8'  # quoted on purpose: Lightning parses the string; bare 8 would be an int
  num_nodes: 1
  precision: 16-mixed
  logger: null
  callbacks:
    - class_path: pytorch_lightning.callbacks.RichModelSummary
      init_args:
        max_depth: 1
    - class_path: pytorch_lightning.callbacks.RichProgressBar
      init_args:
        refresh_rate: 1
        leave: false
        theme:
          description: white
          # hex colors must stay quoted: an unquoted leading '#' starts a comment
          progress_bar: '#6206E0'
          progress_bar_finished: '#6206E0'
          progress_bar_pulse: '#6206E0'
          batch_progress: white
          time: grey54
          processing_speed: grey70
          metrics: white
        console_kwargs: null
  fast_dev_run: false
  max_epochs: 5000
  min_epochs: null
  max_steps: 2020000
  min_steps: null
  max_time: null
  limit_train_batches: null
  limit_val_batches: 512
  limit_test_batches: null
  limit_predict_batches: null
  overfit_batches: 0.0
  val_check_interval: 8000
  check_val_every_n_epoch: 1
  num_sanity_val_steps: null
  log_every_n_steps: 10
  enable_checkpointing: null
  enable_progress_bar: null
  enable_model_summary: null
  accumulate_grad_batches: 8
  gradient_clip_val: 1
  gradient_clip_algorithm: norm
  deterministic: null
  benchmark: null
  inference_mode: true
  use_distributed_sampler: true
  profiler: null
  detect_anomaly: false
  barebones: false
  plugins: null
  sync_batchnorm: false
  reload_dataloaders_every_n_epochs: 0
  default_root_dir: null
model:
  inference_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.InferenceParams
    init_args:
      width: 256
      height: 256
      video_length: 16
      guidance_scale: 7.5
      use_dec_scaling: true
      frame_rate: 8
      num_inference_steps: 50
      eta: 1.0
      n_autoregressive_generations: 1
      mode: long_video
      start_from_real_input: true
      eval_loss_metrics: false
      scheduler_cls: ''
      negative_prompt: ''
      conditioning_from_all_past: false
      validation_samples: 80
      conditioning_type: last_chunk
      result_formats:
        - eval_gif
        - gif
        - mp4
      concat_video: true
  opt_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.OptimizerParams
    init_args:
      learning_rate: 5.0e-05
      layers_config:
        class_path: t2v_enhanced.model.requires_grad_setter.LayerConfig
        init_args:
          # Each entry is a [requires_grad_flag, [module path components]] pair.
          # NOTE(review): nesting reconstructed from a flattened scrape; the
          # pair grouping is fixed by the 3-line entries ([false, [base_model,
          # transformer_in]] etc.) — verify against requires_grad_setter.LayerConfig.
          gradient_setup:
            - - false
              - - vae
            - - false
              - - text_encoder
            - - false
              - - image_encoder
            - - true
              - - resampler
            - - true
              - - unet
            - - true
              - - base_model
            - - false
              - - base_model
                - transformer_in
            - - false
              - - base_model
                - temp_attentions
            - - false
              - - base_model
                - temp_convs
      layers_config_base: null
      use_warmup: false
      warmup_steps: 10000
      warmup_start_factor: 1.0e-05
      learning_rate_spatial: 0.0
      use_8_bit_adam: false
      noise_generator: null
      noise_decomposition: null
      perceptual_loss: false
      noise_offset: 0.0
      split_opt_by_node: false
      reset_prediction_type_to_eps: false
      train_val_sampler_may_differ: true
      measure_similarity: false
      similarity_loss: false
      similarity_loss_weight: 1.0
      loss_conditional_weight: 0.0
      loss_conditional_weight_convex: false
      loss_conditional_change_after_step: 0
      mask_conditional_frames: false
      sample_from_noise: true
      mask_alternating: false
      uncondition_freq: -1
      no_text_condition_control: false
      inject_image_into_input: false
      inject_at_T: false
      resampling_steps: 1
      control_freq_in_resample: 1
      resample_to_T: false
      adaptive_loss_reweight: false
      load_resampler_from_ckpt: ''
      skip_controlnet_branch: false
      use_fps_conditioning: false
      num_frame_embeddings_range: 16
      start_frame_training: 16
      start_frame_ctrl: 16
      load_trained_base_model_and_resampler_from_ckpt: ''
      load_trained_controlnet_from_ckpt: ''
  unet_params:
    class_path: t2v_enhanced.model.pl_module_params_controlnet.UNetParams
    init_args:
      conditioning_embedding_out_channels:
        - 32
        - 96
        - 256
        - 512
      ckpt_spatial_layers: ''
      pipeline_repo: damo-vilab/text-to-video-ms-1.7b
      unet_from_diffusers: true
      spatial_latent_input: false
      num_frame_conditioning: 1
      pipeline_class: t2v_enhanced.model.model.controlnet.pipeline_text_to_video_w_controlnet_synth.TextToVideoSDPipeline
      # YAML resolves bare 'none' to the STRING "none" (not null) — kept as-is.
      frame_expansion: none
      downsample_controlnet_cond: true
      num_frames: 16
      pre_transformer_in_cond: false
      num_tranformers: 1  # (sic) key name preserved — must match the UNetParams signature
      zero_conv_3d: false
      merging_mode: addition
      compute_only_conditioned_frames: false
      condition_encoder: ''
      zero_conv_mode: Identity
      clean_model: true
      merging_mode_base: attention_cross_attention
      attention_mask_params: null
      attention_mask_params_base: null
      modelscope_input_format: true
      temporal_self_attention_only_on_conditioning: false
      temporal_self_attention_mask_included_itself: false
      use_post_merger_zero_conv: false
      weight_control_sample: 1.0
      use_controlnet_mask: false
      random_mask_shift: false
      random_mask: false
      use_resampler: true
      unet_from_pipe: false
      unet_operates_on_2d: false
      image_encoder: CLIP
      use_standard_attention_processor: false
      num_frames_before_chunk: 0
      resampler_type: single_frame
      resampler_cls: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.ImgEmbContextResampler
      resampler_merging_layers: 4
      image_encoder_obj:
        class_path: t2v_enhanced.model.diffusers_conditional.models.controlnet.image_embedder.FrozenOpenCLIPImageEmbedder
        init_args:
          arch: ViT-H-14
          version: laion2b_s32b_b79k
          device: cuda
          max_length: 77
          freeze: true
          antialias: true
          ucg_rate: 0.0
          unsqueeze_dim: false
          repeat_to_max_len: false
          num_image_crops: 0
          output_tokens: false
      # NOTE(review): keys from here down reconstructed as UNetParams init_args
      # (not FrozenOpenCLIPImageEmbedder args) — confirm against the class signature.
      cfg_text_image: false
      aggregation: last_out
      resampler_random_shift: true
      img_cond_alpha_per_frame: false
      num_control_input_frames: 8
      use_image_encoder_normalization: false
      use_of: false
      ema_param: -1.0
      concat: false
      use_image_tokens_main: true
      use_image_tokens_ctrl: false
# Top-level run/experiment settings (consumed by the training script,
# outside the trainer/model namespaces).
result_fol: results
exp_name: my_exp_name
run_name: my_run_name
scale_lr: false
# torch.set_float32_matmul_precision value — presumably passed through; verify against caller.
matmul_precision: high