# Configuration for a 'UnifiedMotionDiffusion' model wrapping a
# 'LargeMotionModel' backbone.
# NOTE(review): the `type=` keys look like an OpenMMLab-style registry config;
# confirm against whatever loader consumes this file.

# Dataset identifiers known to the model. The order matters to any consumer
# that indexes into this list, so do not reorder entries.
# NOTE(review): the suffixes presumably denote the task (mocap = plain motion
# capture, t2m = text-to-motion, m2d = music-to-dance, s2g = speech-to-gesture,
# v2m = video-to-motion) -- confirm against the dataset registry.
dataset_names = [
    'all',
    # *_mocap
    'amass_mocap', 'motionx_mocap', 'humanact12_mocap', 'uestc_mocap',
    'ntu_mocap', 'aist_mocap',
    'beat_mocap', 'tedg_mocap', 'tedex_mocap', 's2g3d_mocap', 'h36m_mocap',
    'mpi_mocap',
    # *_t2m
    'humanml3d_t2m', 'kitml_t2m', 'babel_t2m', 'motionx_t2m',
    'humanact12_t2m', 'uestc_t2m', 'ntu_t2m',
    # *_m2d
    'aist_m2d',
    # *_s2g
    'beat_s2g', 'tedg_s2g', 'tedex_s2g', 's2g3d_s2g',
    # *_v2m
    'h36m_v2m', 'mpi_v2m',
]

# Derived from the list above so the two can never drift apart; it is passed
# both to the backbone and to its attention block config below.
num_datasets = len(dataset_names)

model = dict(
    type='UnifiedMotionDiffusion',
    # Backbone network settings.
    model=dict(
        type='LargeMotionModel',
        input_feats=669,
        max_seq_len=200,
        num_parts=10,
        latent_part_dim=64,
        time_embed_dim=2048,
        dataset_names=dataset_names,
        num_layers=4,
        num_cond_layers=2,
        num_datasets=num_datasets,
        dropout=0,
        # Attention block config; all four conditioning-modality flags are on.
        # NOTE(review): num_experts/topk/gate_* look like mixture-of-experts
        # routing settings -- confirm in the 'ArtAttention' implementation.
        ca_block_cfg=dict(
            type='ArtAttention',
            num_experts=16,
            topk=4,
            gate_type='cosine_top',
            gate_noise=1.0,
            num_datasets=num_datasets,
            has_text=True,
            has_music=True,
            has_speech=True,
            has_video=True,
        ),
        # Per-modality input feature sizes.
        text_input_dim=1024,
        music_input_dim=768,
        speech_input_dim=768,
        video_input_dim=1024,
        # Guidance settings keyed by dataset name; only 'all' is configured.
        guidance_cfg=dict(
            all=dict(type='linear', scale=5.5),
        ),
        moe_route_loss_weight=10.0,
        template_kl_loss_weight=0.0001,
        use_pos_embedding=False,
        cond_drop_rate=0.1,
    ),
    # Reconstruction loss; reduction='none' leaves reduction to the caller.
    loss_recon=dict(
        type='KinematicLoss',
        loss_type='mse',
        loss_weight=[20],
        reduction='none',
    ),
    train_repeat=1,
    # Diffusion process used during training.
    diffusion_train=dict(
        beta_scheduler='linear',
        diffusion_steps=1000,
        model_mean_type='start_x',
        model_var_type='fixed_large',
    ),
    # Diffusion process used at test time; `base` mirrors `diffusion_train`.
    diffusion_test_dict=dict(
        base=dict(
            beta_scheduler='linear',
            diffusion_steps=1000,
            model_mean_type='start_x',
            model_var_type='fixed_large',
        ),
        # NOTE(review): presumably a comma-separated timestep-respacing spec
        # applied to the 'all' dataset key -- confirm against the sampler.
        all='15,15,8,6,6',
    ),
    inference_type='ddim',
    loss_reduction='batch',
    # Path is kept as a plain string; it is not opened in this file.
    loss_weight='data/motionverse/statistics/loss_weight.npy',
)