# LMM/configs/lmm/lmm.py
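# Unified multi-modal motion generation config for the Large Motion Model
# (LMM): a single diffusion backbone trained jointly across motion capture,
# text-to-motion, music-to-dance, speech-to-gesture, and video-to-motion
# datasets (see the task suffixes in the list below).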
dataset_names = [
    'all',
    # motion capture (mocap)
    'amass_mocap', 'motionx_mocap', 'humanact12_mocap', 'uestc_mocap', 'ntu_mocap', 'aist_mocap',
    'beat_mocap', 'tedg_mocap', 'tedex_mocap', 's2g3d_mocap', 'h36m_mocap', 'mpi_mocap',
    # text-to-motion (t2m)
    'humanml3d_t2m', 'kitml_t2m', 'babel_t2m', 'motionx_t2m',
    'humanact12_t2m', 'uestc_t2m', 'ntu_t2m',
    # music-to-dance (m2d)
    'aist_m2d',
    # speech-to-gesture (s2g)
    'beat_s2g', 'tedg_s2g', 'tedex_s2g', 's2g3d_s2g',
    # video-to-motion (v2m)
    'h36m_v2m', 'mpi_v2m'
]
num_datasets = len(dataset_names)
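# NOTE: `num_datasets` is passed to both the backbone and the attention block
# below, presumably so that per-dataset components stay sized in sync with
# the list above.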
# model settings
model = dict(
    type='UnifiedMotionDiffusion',
    model=dict(
        type='LargeMotionModel',
        input_feats=669,        # per-frame motion feature dimension
        max_seq_len=200,        # maximum motion length in frames
        num_parts=10,           # number of body parts in the part-wise latent
        latent_part_dim=64,     # latent channels per body part
        time_embed_dim=2048,
        dataset_names=dataset_names,
        num_layers=4,
        num_cond_layers=2,
        num_datasets=num_datasets,
        dropout=0,
        # part-aware attention block with mixture-of-experts (MoE) routing
        ca_block_cfg=dict(
            type='ArtAttention',
            num_experts=16,     # total experts
            topk=4,             # experts activated per token
            gate_type='cosine_top',
            gate_noise=1.0,
            num_datasets=num_datasets,
            # conditioning modalities handled by the block
            has_text=True,
            has_music=True,
            has_speech=True,
            has_video=True
        ),
        # feature dimensions of the condition embeddings per modality
        text_input_dim=1024,
        music_input_dim=768,
        speech_input_dim=768,
        video_input_dim=1024,
        # guidance configuration (keys match entries of dataset_names)
        guidance_cfg=dict(
            all=dict(type='linear', scale=5.5),
        ),
        moe_route_loss_weight=10.0,     # auxiliary MoE routing loss
        template_kl_loss_weight=0.0001,
        use_pos_embedding=False,
        cond_drop_rate=0.1              # condition dropout for classifier-free guidance
    ),
    # reconstruction loss on the predicted motion
    loss_recon=dict(
        type='KinematicLoss', loss_type='mse', loss_weight=[20], reduction='none'),
    train_repeat=1,
    # diffusion process used during training
    diffusion_train=dict(
        beta_scheduler='linear',
        diffusion_steps=1000,
        model_mean_type='start_x',      # the denoiser predicts x_0 directly
        model_var_type='fixed_large',
    ),
    # diffusion process used at test time; the per-dataset string (keyed like
    # dataset_names) appears to follow guided-diffusion-style timestep
    # respacing, i.e. steps taken per section of the 1000-step schedule
    # (15+15+8+6+6 = 50 DDIM sampling steps)
    diffusion_test_dict=dict(
        base=dict(
            beta_scheduler='linear',
            diffusion_steps=1000,
            model_mean_type='start_x',
            model_var_type='fixed_large',
        ),
        all='15,15,8,6,6'
    ),
    inference_type='ddim',
    loss_reduction='batch',
    # precomputed loss weights loaded from the MotionVerse statistics
    loss_weight='data/motionverse/statistics/loss_weight.npy'
)
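
# Minimal usage sketch. How this config is consumed is an assumption: dict-style
# configs like this one are conventionally loaded with mmcv and passed to a
# model-building registry; the exact builder in this repo may differ.
#
#   from mmcv import Config
#   cfg = Config.fromfile('configs/lmm/lmm.py')
#   model = MODELS.build(cfg.model)  # hypothetical registry call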