Yingqing committed on
Commit
244cb9d
1 Parent(s): ba8bda2

upload t2v-version-1-1 models

Browse files
models/base_t2v/model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8870651704d6bfd2476c37e2bb8296711638fd35812292b2d0e99f98b6427e08
3
+ size 4663057434
models/base_t2v/model_config.yaml ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model:
2
+ load_from_pretrained_img_model: true
3
+ ckpt_path: /apdcephfs_cq2/share_1290939/yingqinghe/dependencies/stable_diffusion/compvis-sd-v1-4-original/sd-v1-4-full-ema.ckpt
4
+ config_path: configs/latent-diffusion/txt2img-1p4B-eval-Clipembedder.yaml
5
+ load_from_checkpoint: /apdcephfs/share_1290939/yingqinghe/results/latent_diffusion/text2video/tv_054_NoFPSEmbd_NoMotionAdapter_FS32_basedon050_2_8nodes_e0_V/checkpoints/trainstep_checkpoints/epoch=000003-step=000020000.ckpt
6
+ base_learning_rate: 5.0e-07
7
+ scale_lr: false
8
+ target: lvdm.models.ddpm3d.LatentDiffusion
9
+ params:
10
+ linear_start: 0.00085
11
+ linear_end: 0.012
12
+ num_timesteps_cond: 1
13
+ log_every_t: 200
14
+ timesteps: 1000
15
+ first_stage_key: video
16
+ cond_stage_key: caption
17
+ image_size:
18
+ - 32
19
+ - 32
20
+ video_length: 16
21
+ channels: 4
22
+ cond_stage_trainable: false
23
+ conditioning_key: crossattn
24
+ monitor: train/loss_simple_step
25
+ scale_by_std: false
26
+ scale_factor: 0.18215
27
+ use_ema: false
28
+ loss_type: l2-consistency
29
+ val_prompt_file: info/prompts/magicvideo_mini.txt
30
+ seed: 23
31
+ val_fvd_interval: 5000
32
+ unet_config:
33
+ target: lvdm.models.modules.openaimodel3d.UNetModel
34
+ params:
35
+ image_size: 32
36
+ in_channels: 4
37
+ out_channels: 4
38
+ model_channels: 320
39
+ attention_resolutions:
40
+ - 4
41
+ - 2
42
+ - 1
43
+ num_res_blocks: 2
44
+ channel_mult:
45
+ - 1
46
+ - 2
47
+ - 4
48
+ - 4
49
+ num_heads: 8
50
+ transformer_depth: 1
51
+ context_dim: 768
52
+ use_checkpoint: true
53
+ legacy: false
54
+ kernel_size_t: 1
55
+ padding_t: 0
56
+ use_temporal_transformer: true
57
+ temporal_length: 16
58
+ use_relative_position: true
59
+ img_video_joint_train: false
60
+ image_length: null
61
+ temporal_crossattn_type: selfattn
62
+ motion_adaptor: false
63
+ n_mix_channels: 1
64
+ first_stage_config:
65
+ target: lvdm.models.autoencoder.AutoencoderKL
66
+ params:
67
+ embed_dim: 4
68
+ monitor: val/rec_loss
69
+ ddconfig:
70
+ double_z: true
71
+ z_channels: 4
72
+ resolution: 256
73
+ in_channels: 3
74
+ out_ch: 3
75
+ ch: 128
76
+ ch_mult:
77
+ - 1
78
+ - 2
79
+ - 4
80
+ - 4
81
+ num_res_blocks: 2
82
+ attn_resolutions: []
83
+ dropout: 0.0
84
+ lossconfig:
85
+ target: torch.nn.Identity
86
+ cond_stage_config:
87
+ target: lvdm.models.modules.condition_modules.FrozenCLIPEmbedder
88
+ logdir: /apdcephfs/share_1290939/yingqinghe/results/latent_diffusion/text2video/tv_056_2_0.01_lr_ConsistencyLoss_resumefrom054_8nodes_e0_V
89
+ ckptdir: /apdcephfs/share_1290939/yingqinghe/results/latent_diffusion/text2video/tv_056_2_0.01_lr_ConsistencyLoss_resumefrom054_8nodes_e0_V/checkpoints
90
+ data:
91
+ auto_cal_bs: true
92
+ target: main.DataModuleFromConfig
93
+ params:
94
+ batch_size: 3
95
+ num_workers: 12
96
+ wrap: false
97
+ train:
98
+ target: lvdm.data.webvid.WebVid
99
+ params:
100
+ data_dir: /apdcephfs/share_1290939/0_public_datasets/WebVid
101
+ meta_path: /apdcephfs/share_1290939/0_public_datasets/WebVid/metadata/results_2M_train.csv
102
+ video_length: 16
103
+ frame_stride: 32
104
+ load_raw_resolution: true
105
+ resolution: 256
106
+ spatial_transform: resize_center_crop
107
+ fps_max: 199
108
+ validation:
109
+ target: lvdm.data.webvid.WebVid
110
+ params:
111
+ data_dir: /apdcephfs/share_1290939/0_public_datasets/WebVid
112
+ meta_path: /apdcephfs_cq2/share_1290939/yingqinghe/datasets/webvid/metadata_2048_val.csv
113
+ video_length: 16
114
+ frame_stride: 32
115
+ load_raw_resolution: true
116
+ resolution: 256
117
+ spatial_transform: resize_center_crop
118
+ fps_max: 199
119
+ --local_rank: 0