|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Per-task sharing tables: every list below holds 24 entries. |
# For the task entries 0-6 defined further down, a -1 in share_rgb_group / share_text_group / share_sparse_labeling_group coincides with a task that declares no patch_neck or label_neck of that modality. |
# NOTE(review): presumably entry i belongs to task i and equal non-negative ids mean "share these parameters" - confirm against the solver code. |
common: |
|
share_backbone_group: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
0, 0, 0, 0, 0, 0, 0, 0] |
|
share_decoder_group: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
0, 0, 0, 0, 0, 0, 0, 0] |
|
|
|
share_rgb_group: [-1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
|
0, 0, 0, 0, 0, 0, 0, 0] |
|
share_dense_labeling_group: [-1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, |
|
0, 0, 0, 0, 0, 0, 0, 0] |
|
share_text_group: [0, 0, -1, -1, -1, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1] |
|
share_sparse_labeling_group: [ 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1] |
|
# Every entry is -1 in the video table. |
share_video_group: [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, |
|
-1, -1, -1, -1, -1, -1, -1, -1] |
|
|
|
# For tasks 0-6, tasks with the same patch/label neck modality pair carry the same id here (0,1 -> 2; 2 -> 3; 3,4 -> 4; 5,6 -> 0). |
share_modality_group: [ 2, 2, 3, 4, 4, 0, 0, 1, 5, 5, 5, 5, 5, 5, 5, 5, |
|
5, 6, 6, 6, 6, 6, 6, 6 ] |
|
|
|
|
|
# Global training settings: solver type, model entry point, LR schedule, optimizer and runtime flags. |
solver: |
|
type: SolverMAEDev |
|
|
|
model_entry_type: aio_entry_v2mae_shareneck |
|
|
|
|
|
# Cosine schedule with 1500 warmup steps. |
# NOTE(review): warmup_lr (1.e-3) is 100x base_lr (1.e-5) - confirm which of the two the scheduler treats as the post-warmup rate. |
lr_scheduler: |
|
type: 'Cosine' |
|
kwargs: |
|
eta_min: 0. |
|
base_lr: 1.e-5 |
|
warmup_lr: 1.e-3 |
|
warmup_steps: 1500 |
|
|
|
backbone_multiplier: 1. |
|
pos_embed_multiplier: 1. |
|
# NOTE(review): num_layers 12 presumably mirrors the depth of the vit_base_patch16_mask backbone used by every task below - confirm. |
layer_decay: |
|
num_layers: 12 |
|
layer_decay_rate: 0.75 |
|
lpe_lr: True |
|
|
|
# Adafactor variant with relative_step and scale_parameter both disabled. |
optimizer: |
|
type: Adafactor_dev |
|
kwargs: |
|
beta1: 0.9 |
|
clip_beta2: 0.999 |
|
clip_threshold: 1. |
|
decay_rate: -0.8 |
|
scale_parameter: False |
|
relative_step: False |
|
weight_decay: 0.05 |
|
|
|
auto_denan: False |
|
|
|
workers: 2 |
|
max_iter: 60000 |
|
|
|
# NOTE(review): deterministic is True while cudnn_deterministic is False - confirm this combination is intended. |
deterministic: True |
|
cudnn_deterministic: False |
|
worker_rank: True |
|
random_seed: 233 |
|
|
|
# Logging / checkpoint cadence, expressed in iterations (max_iter above is 60000). |
print_freq: 10 |
|
verbose_loss: False |
|
vis_batch: False |
|
save_interval: 10000 |
|
|
|
use_ceph: True |
|
sync: True |
|
collate: det |
|
|
|
|
|
# Per-task definitions start here; entries are keyed by integer task id. |
tasks : |
|
|
|
|
|
# Task 0: skeleton-action task reading three annotation files (ntu60 / ntu120 / gym); patch_neck modality is sparse_labeling, label_neck modality is text. |
0: |
|
name: NUTRGBD_skeleton |
|
loss_weight: 4.4 |
|
gres_ratio: 2 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: False |
|
drop_path_rate: 0.1 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: mmSkeletonDataset |
|
kwargs: |
|
# ann_file, dataset_name and num_classes are parallel three-entry lists, in the same order as description_dict_name further down. |
ann_file: |
|
- /mnt/path...to...//skaction_public/ntu60_hrnet.pkl |
|
- /mnt/path...to...//skaction_public/ntu120_hrnet.pkl |
|
- /mnt/path...to...//skaction_public/gym_hrnet.pkl |
|
dataset_name: |
|
- 2dntu60 |
|
- 2dntu120 |
|
- gym |
|
kp_dim: 2d |
|
one_hot: True |
|
num_classes: |
|
- 60 |
|
- 120 |
|
- 99 |
|
centernorm: False |
|
scale_range: [ 0.75,1.25 ] |
|
# Ordered list of pipeline steps; each entry names a step type plus its kwargs. |
data_pipeline: |
|
- type: PreNormalize2D |
|
kwargs: { } |
|
- type: GenSkeFeat |
|
kwargs: |
|
dataset: coco |
|
feats: [ 'j' ] |
|
- type: UniformSampleGivenFrames |
|
kwargs: |
|
clip_len: 25 |
|
given_len: 7 |
|
- type: PoseDecode |
|
kwargs: { } |
|
- type: FormatGCNInput2D |
|
kwargs: |
|
num_person: 2 |
|
window: False |
|
rotate: True |
|
mode: zero |
|
- type: Collect |
|
kwargs: |
|
keys: [ 'keypoint', 'label' ] |
|
meta_keys: [ ] |
|
- type: ToTensor |
|
kwargs: |
|
keys: [ 'keypoint' ] |
|
flip: True |
|
|
|
|
|
sampler: |
|
batch_size: 120 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: sparse_labeling |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: text |
|
|
|
patch_adapter: |
|
type: sparse_labeling_adapter_skaction |
|
kwargs: |
|
pretrained: True |
|
in_chans: 3 |
|
num_joints: 17 |
|
# NOTE(review): 175 equals clip_len 25 x given_len 7 from UniformSampleGivenFrames above - presumably these must stay in sync; confirm. |
num_frames: 175 |
|
embed_dim: 768 |
|
patch_size: [ 7, 2 ] |
|
stride_level: [ 1, 1 ] |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: learnable_interpolate |
|
type_embed: False |
|
joint_with_text_embedding: True |
|
joint_names: coco_body_17joints |
|
proj_norm: 'LN' |
|
stride_text_embedding: True |
|
is_2d_dataset: True |
|
modality_share_list: [ |
|
'merge_kernel', |
|
'proj_kernel', |
|
'proj', ] |
|
task_sp_list: [ 'text_embedding', 'pos_embed', ] |
|
|
|
|
|
patch_proj: |
|
type: sparse_labeling_projector |
|
kwargs: |
|
task: skeleton |
|
loss_cfg: |
|
type: MaskDetFocalDiceLoss |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
focal_alpha: 0.25 |
|
class_weight: 2.0 |
|
bbox_weight: 5.0 |
|
giou_weight: 2. |
|
ign_thr: 0.7 |
|
# NOTE(review): dec_layers is 6 here, but this task's decoder transformer_predictor_cfg sets dec_layers: 9 - confirm the mismatch is intended. |
dec_layers: 6 |
|
num_classes: 1 |
|
predict3d: True |
|
xyxy: True |
|
in_chans: 3 |
|
num_joints: 17 |
|
num_frames: 175 |
|
modality_share_list: [ |
|
'output_proj', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', |
|
'patch_proj', |
|
'class_proj' |
|
] |
|
task_sp_list: [ |
|
'text_vectors', |
|
'text_features', |
|
] |
|
|
|
label_adapter: |
|
type: text_adapter |
|
kwargs: |
|
pretrained: True |
|
|
|
# Same three-entry list is repeated under label_proj below. |
description_dict_name: |
|
- ntu60_name |
|
- ntu120_name |
|
- gym_cls_name |
|
one_way_semantics: False |
|
skeleton_action: True |
|
skeleton_action_one_hot_label: True |
|
task_sp_list: [ 'text_vectors', ] |
|
|
|
label_proj: |
|
type: text_projector |
|
kwargs: |
|
one_way_semantics: False |
|
description_dict_name: |
|
- ntu60_name |
|
- ntu120_name |
|
- gym_cls_name |
|
skeleton_action: True |
|
skeleton_action_one_hot_label: True |
|
pre_proj_type: 'pool' |
|
replace_post_mul_norm: False |
|
post_mul_norm: True |
|
task_sp_list: [ 'text_vectors', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', ] |
|
loss_cfg: |
|
type: CELoss |
|
kwargs: |
|
loss_weight: 1.0 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token', ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
self_attn_mask_type: patch_diag_label_row |
|
detach_from_peddet: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
# Task 1: skeleton-action task reading three annotation files (diving48 / ucf101 / k400); same section layout as task 0, with batch_size 90 and loss_weight 1. |
1: |
|
name: k400_skeleton |
|
loss_weight: 1 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: False |
|
drop_path_rate: 0.1 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: mmSkeletonDataset |
|
kwargs: |
|
# ann_file, dataset_name and num_classes are parallel three-entry lists, in the same order as description_dict_name further down. |
ann_file: |
|
- /mnt/path...to.../skaction_public/diving48_hrnet.pkl |
|
- /mnt/path...to.../skaction_public/ucf101_hrnet.pkl |
|
- /mnt/path...to.../skaction_public/k400_hrnet.pkl |
|
dataset_name: |
|
- diving |
|
- ucf |
|
- k400 |
|
kp_dim: 2d |
|
one_hot: True |
|
num_classes: |
|
- 48 |
|
- 101 |
|
- 400 |
|
centernorm: False |
|
scale_range: [ 0.75,1.25 ] |
|
# Ordered list of pipeline steps; each entry names a step type plus its kwargs. |
data_pipeline: |
|
- type: PreNormalize2D |
|
kwargs: { } |
|
- type: GenSkeFeat |
|
kwargs: |
|
dataset: coco |
|
feats: [ 'j' ] |
|
- type: UniformSampleGivenFrames |
|
kwargs: |
|
clip_len: 25 |
|
given_len: 7 |
|
- type: PoseDecode |
|
kwargs: { } |
|
- type: FormatGCNInput2D |
|
kwargs: |
|
num_person: 2 |
|
window: False |
|
rotate: True |
|
mode: zero |
|
- type: Collect |
|
kwargs: |
|
keys: [ 'keypoint', 'label' ] |
|
meta_keys: [ ] |
|
- type: ToTensor |
|
kwargs: |
|
keys: [ 'keypoint' ] |
|
flip: True |
|
|
|
|
|
sampler: |
|
batch_size: 90 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: sparse_labeling |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: text |
|
|
|
patch_adapter: |
|
type: sparse_labeling_adapter_skaction |
|
kwargs: |
|
pretrained: True |
|
in_chans: 3 |
|
num_joints: 17 |
|
# NOTE(review): 175 equals clip_len 25 x given_len 7 from UniformSampleGivenFrames above - presumably these must stay in sync; confirm. |
num_frames: 175 |
|
embed_dim: 768 |
|
patch_size: [ 7, 2 ] |
|
stride_level: [ 1, 1 ] |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: learnable_interpolate |
|
type_embed: False |
|
joint_with_text_embedding: True |
|
joint_names: coco_body_17joints |
|
proj_norm: 'LN' |
|
stride_text_embedding: True |
|
is_2d_dataset: True |
|
modality_share_list: [ |
|
'merge_kernel', |
|
'proj_kernel', |
|
'proj', ] |
|
task_sp_list: [ 'text_embedding', 'pos_embed', ] |
|
|
|
|
|
patch_proj: |
|
type: sparse_labeling_projector |
|
kwargs: |
|
task: skeleton |
|
loss_cfg: |
|
type: MaskDetFocalDiceLoss |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
focal_alpha: 0.25 |
|
class_weight: 2.0 |
|
bbox_weight: 5.0 |
|
giou_weight: 2. |
|
ign_thr: 0.7 |
|
# NOTE(review): dec_layers is 6 here, but this task's decoder transformer_predictor_cfg sets dec_layers: 9 - confirm the mismatch is intended. |
dec_layers: 6 |
|
num_classes: 1 |
|
predict3d: True |
|
xyxy: True |
|
in_chans: 3 |
|
num_joints: 17 |
|
num_frames: 175 |
|
modality_share_list: [ |
|
'output_proj', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', |
|
'patch_proj', |
|
'class_proj' |
|
] |
|
task_sp_list: [ |
|
'text_vectors', |
|
'text_features', |
|
] |
|
|
|
label_adapter: |
|
type: text_adapter |
|
kwargs: |
|
pretrained: True |
|
# Same three-entry list is repeated under label_proj below. |
description_dict_name: |
|
- diving48_cls_name |
|
- ucf101_cls_name |
|
- k400_cls_name |
|
one_way_semantics: False |
|
skeleton_action: True |
|
skeleton_action_one_hot_label: True |
|
task_sp_list: [ 'text_vectors', ] |
|
|
|
label_proj: |
|
type: text_projector |
|
kwargs: |
|
one_way_semantics: False |
|
description_dict_name: |
|
- diving48_cls_name |
|
- ucf101_cls_name |
|
- k400_cls_name |
|
skeleton_action: True |
|
skeleton_action_one_hot_label: True |
|
pre_proj_type: 'pool' |
|
replace_post_mul_norm: False |
|
post_mul_norm: True |
|
task_sp_list: [ 'text_vectors', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', ] |
|
loss_cfg: |
|
type: CELoss |
|
kwargs: |
|
loss_weight: 1.0 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: [ 'predictor.mask_token', ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
self_attn_mask_type: patch_diag_label_row |
|
detach_from_peddet: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
# Task 2: SMPL mesh task on 224-pixel images; patch_neck modality is rgb, label_neck modality is sparse_labeling. |
2: |
|
name: smpl |
|
loss_weight: 0.5 |
|
gres_ratio: 3 |
|
dataset: |
|
type: MeshTSVYamlDataset |
|
kwargs: |
|
is_composite: True |
|
is_train: True |
|
cv2_output: False |
|
augmentation: |
|
scale_factor: 0.25 |
|
noise_factor: 0.4 |
|
rot_factor: 30 |
|
img_res: 224 |
|
cfg: |
|
# data_path and root_path each list 13 entries in matching order; the last seven root_path entries all point at the same gta_human directory. |
data_path: |
|
- /mnt/path...to.../Processed_SMPL/3dpw/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/human3.6m/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/coco_smpl/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/muco/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/up3d/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/mpii/dataset.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_1396913.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_200000.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_400000.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_600000.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_800000.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_1000000.pkl |
|
- /mnt/path...to.../Processed_SMPL/gta_human/dataset_pkl/v2_dataset_1200000.pkl |
|
root_path: |
|
- /mnt/path...to.../Processed_SMPL/3dpw/images |
|
- /mnt/path...to.../Processed_SMPL/human3.6m/images |
|
- /mnt/path...to.../Processed_SMPL/coco_smpl/images |
|
- /mnt/path...to.../Processed_SMPL/muco/images |
|
- /mnt/path...to.../Processed_SMPL/up3d/images |
|
- /mnt/path...to.../Processed_SMPL/mpii/images |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
- /mnt/path...to.../Processed_SMPL/gta_human_openxlab/gta_human |
|
|
|
sampler: |
|
batch_size: 165 |
|
shuffle_strategy: 1 |
|
|
|
# Unlike tasks 0 and 1, this backbone entry sets learnable_pos: True and drop_path_rate: 0.2. |
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: sparse_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 224, 224 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
# num_joints (446), num_frames (1) and patch_size here are repeated with the same values under label_proj below. |
label_adapter: |
|
type: sparse_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
in_chans: 3 |
|
num_joints: 446 |
|
num_frames: 1 |
|
embed_dim: 768 |
|
patch_size: [ 1,1 ] |
|
stride_level: [ 1, 1 ] |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: learnable_interpolate |
|
type_embed: False |
|
proj_norm: 'LN' |
|
task_sp_list: [ 'pos_embed', |
|
'text_embedding', |
|
'proj_kernel', |
|
'proj',] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: sparse_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'output_proj', |
|
'text_features', |
|
'loss_fn', |
|
'translate', |
|
'post_mul_norm', |
|
'patch_proj', |
|
'class_proj', |
|
'proj' |
|
] |
|
pre_proj_type: 'fix_text_tokens' |
|
num_classes: 14 |
|
|
|
reference_type: 'smpl' |
|
in_chans: 3 |
|
num_joints: 446 |
|
num_frames: 1 |
|
hidden_dim: 256 |
|
patch_size: [ 1, 1 ] |
|
stride_level: [ 1, 1 ] |
|
replace_post_mul_norm: False |
|
task: smpl |
|
|
|
text_prototype: True |
|
learn_text: True |
|
loss_cfg: |
|
type: SMPL_LOSS_FASTMETRO |
|
kwargs: |
|
|
|
# Individual loss-term weights; values span from 1.e-8 to 1000.0. |
cfg: |
|
use_smpl_param_regressor: True |
|
joints_2d_loss_weight: 100.0 |
|
vertices_3d_loss_weight: 100.0 |
|
edge_normal_loss_weight: 100.0 |
|
joints_3d_loss_weight: 1000.0 |
|
vertices_fine_loss_weight: 0.25 |
|
vertices_intermediate_loss_weight: 0.50 |
|
vertices_coarse_loss_weight: 0.25 |
|
edge_gt_loss_weight: 5.0 |
|
edge_self_loss_weight: 1.e-4 |
|
normal_loss_weight: 0.1 |
|
smpl_param_loss_weight: 1000.0 |
|
except_smpl_param_loss_weight: 1.e-8 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ 'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
|
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
smpl_attention_mask_flag: True |
|
smpl_mae_pe: True |
|
use_adapt_pos2d: True |
|
use_adapt_pos1d: True |
|
self_attn_mask_type: full |
|
adding_per_layer_pe: True |
|
detach_from_peddet: True |
|
use_adapt_position: 'before' |
|
use_smpl_label_attention_mask: True |
|
label_pos_mode: 'smpl_xyz' |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
# Task 3: pedestrian detection on a single CrowdHuman image folder / annotation file; patch_neck modality is rgb, label_neck modality is sparse_labeling. |
3: |
|
name: Peddet |
|
loss_weight: 15 |
|
gres_ratio: 8 |
|
dataset: |
|
type: PedestrainDetectionDataset_v2 |
|
kwargs: |
|
task_spec: |
|
img_folder: |
|
- /mnt/path...to.../PedDet2d/CrowdHuman/Images |
|
ann_file: |
|
- /mnt/path...to.../PedDet2d/CrowdHuman/annotations/train.json |
|
return_masks: False |
|
augmentation: |
|
max_size: 1120 |
|
vit: True |
|
# NOTE(review): 867 also appears as num_joints (label_adapter, label_proj) and peddet_cfgs.num_queries below - presumably these must stay in sync; confirm. |
num_append_fake_boxes: 867 |
|
return_box_xyxy: True |
|
append_z: True |
|
sampler: |
|
batch_size: 4 |
|
shuffle_strategy: 1 |
|
batch_accumulation: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
attn_calcul_method: 'math' |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: sparse_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
use_abs_pos_emb: True |
|
test_pos_mode: interpolate_with_nomask |
|
# NOTE(review): img_size is 1344 while the dataset augmentation max_size above is 1120 - confirm the difference is intended. |
img_size: 1344 |
|
round_padding: True |
|
pad_attn_mask: True |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: sparse_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
in_chans: 3 |
|
num_joints: 867 |
|
num_frames: 2 |
|
embed_dim: 768 |
|
patch_size: [ 2, 1 ] |
|
stride_level: [ 1, 1 ] |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: learnable_interpolate |
|
type_embed: False |
|
proj_norm: 'LN' |
|
task_sp_list: [ 'pos_embed', |
|
'text_embedding', |
|
'proj_kernel', |
|
'proj', |
|
'merge_kernel', |
|
] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: sparse_labeling_projector |
|
kwargs: |
|
# NOTE(review): 'text_vectors' is listed in both task_sp_list and modality_share_list here - confirm which list takes precedence. |
task_sp_list: [ 'text_vectors', |
|
'text_features', |
|
] |
|
modality_share_list: [ |
|
'text_vectors', |
|
'output_proj', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', |
|
'patch_proj', |
|
'class_proj' |
|
] |
|
in_chans: 3 |
|
num_joints: 867 |
|
num_frames: 2 |
|
pre_proj_type: fix_text_tokens |
|
num_classes: 1 |
|
reference_type: four_points |
|
box_mlp: True |
|
replace_post_mul_norm: True |
|
translate_weight_scale: 4 |
|
text_prototype: True |
|
loss_cfg: |
|
type: MaskDetFocalDiceLoss |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
focal_alpha: 0.25 |
|
class_weight: 2.0 |
|
bbox_weight: 5.0 |
|
giou_weight: 2. |
|
ign_thr: 0.7 |
|
# Matches dec_layers: 9 in this task's decoder transformer_predictor_cfg. |
dec_layers: 9 |
|
num_classes: 1 |
|
predict3d: True |
|
xyxy: True |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.anchor', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
patch_pos_mode: interpolate_with_nomask |
|
label_pos_mode: simple_interpolate |
|
self_attn_mask_type: patch_diag_label_row_nested |
|
adding_per_layer_pe: True |
|
mask_token_normal_init: True |
|
intermediate_output: True |
|
# NOTE(review): num_queries 867 = 3 x 289; share_content_query is 3 and the predefined file is named '289_points_3d.npy' - presumably related, confirm. |
peddet_cfgs: |
|
share_content_query: 3 |
|
num_queries: 867 |
|
pre_defined_path: '289_points_3d.npy' |
|
query_pe_dim: 3 |
|
xattn: False |
|
anchor_requires_grad: False |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
|
|
|
|
# Task 4: pedestrian detection over six image folders / annotation files; apart from name, loss_weight, gres_ratio and the dataset lists, the settings mirror task 3. |
4: |
|
name: Peddet_5set |
|
loss_weight: 42.4 |
|
gres_ratio: 20 |
|
dataset: |
|
type: PedestrainDetectionDataset_v2 |
|
kwargs: |
|
task_spec: |
|
# img_folder and ann_file are parallel six-entry lists in matching order. |
# The fifth pair reads images from coco/train2017/ and annotations from cocopersons/. |
img_folder: |
|
- /mnt/path...to.../peddet_public/CrowdHuman/Images |
|
- /mnt/path...to.../peddet_public/ECP/ |
|
- /mnt/path...to.../peddet_public/CityPersons/ |
|
- /mnt/path...to.../peddet_public/WiderPerson/Images |
|
- /mnt/path...to.../peddet_public/coco/train2017/ |
|
- /mnt/path...to.../peddet_public/WIDER_Pedestrian/Images/ |
|
ann_file: |
|
- /mnt/path...to.../peddet_public/CrowdHuman/annotations/train.json |
|
- /mnt/path...to.../peddet_public/ECP/ECP_remove_no_person_img.json |
|
- /mnt/path...to.../peddet_public/CityPersons/CityPersons_remove_no_person_img.json |
|
- /mnt/path...to.../peddet_public/WiderPerson/WiderPerson_remove_no_person_img.json |
|
- /mnt/path...to.../peddet_public/cocopersons/coco_person_remove_no_person_img.json |
|
- /mnt/path...to.../peddet_public/WIDER_Pedestrian/WIDER_Pedestrian_remove_no_person_img.json |
|
return_masks: False |
|
augmentation: |
|
max_size: 1120 |
|
vit: True |
|
# NOTE(review): 867 also appears as num_joints (label_adapter, label_proj) and peddet_cfgs.num_queries below - presumably these must stay in sync; confirm. |
num_append_fake_boxes: 867 |
|
return_box_xyxy: True |
|
append_z: True |
|
sampler: |
|
batch_size: 4 |
|
shuffle_strategy: 1 |
|
batch_accumulation: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
attn_calcul_method: 'math' |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: sparse_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
use_abs_pos_emb: True |
|
test_pos_mode: interpolate_with_nomask |
|
# NOTE(review): img_size is 1344 while the dataset augmentation max_size above is 1120 - confirm the difference is intended. |
img_size: 1344 |
|
round_padding: True |
|
pad_attn_mask: True |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: sparse_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
in_chans: 3 |
|
num_joints: 867 |
|
num_frames: 2 |
|
embed_dim: 768 |
|
patch_size: [ 2, 1 ] |
|
stride_level: [ 1, 1 ] |
|
use_abs_pos_emb: True |
|
learnable_pos: False |
|
test_pos_mode: learnable_interpolate |
|
type_embed: False |
|
proj_norm: 'LN' |
|
task_sp_list: [ 'pos_embed', |
|
'text_embedding', |
|
'proj_kernel', |
|
'proj', |
|
'merge_kernel', |
|
] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: sparse_labeling_projector |
|
kwargs: |
|
# NOTE(review): 'text_vectors' is listed in both task_sp_list and modality_share_list here - confirm which list takes precedence. |
task_sp_list: [ 'text_vectors', |
|
'text_features', |
|
] |
|
modality_share_list: [ |
|
'text_vectors', |
|
'output_proj', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', |
|
'patch_proj', |
|
'class_proj' |
|
] |
|
in_chans: 3 |
|
num_joints: 867 |
|
num_frames: 2 |
|
pre_proj_type: fix_text_tokens |
|
num_classes: 1 |
|
reference_type: four_points |
|
box_mlp: True |
|
replace_post_mul_norm: True |
|
translate_weight_scale: 4 |
|
text_prototype: True |
|
loss_cfg: |
|
type: MaskDetFocalDiceLoss |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
focal_alpha: 0.25 |
|
class_weight: 2.0 |
|
bbox_weight: 5.0 |
|
giou_weight: 2. |
|
ign_thr: 0.7 |
|
# Matches dec_layers: 9 in this task's decoder transformer_predictor_cfg. |
dec_layers: 9 |
|
num_classes: 1 |
|
predict3d: True |
|
xyxy: True |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: [ 'predictor.mask_token' ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.anchor', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
|
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
patch_pos_mode: interpolate_with_nomask |
|
label_pos_mode: simple_interpolate |
|
self_attn_mask_type: patch_diag_label_row_nested |
|
adding_per_layer_pe: True |
|
mask_token_normal_init: True |
|
intermediate_output: True |
|
# NOTE(review): num_queries 867 = 3 x 289; share_content_query is 3 and the predefined file is named '289_points_3d.npy' - presumably related, confirm. |
peddet_cfgs: |
|
share_content_query: 3 |
|
num_queries: 867 |
|
pre_defined_path: '289_points_3d.npy' |
|
query_pe_dim: 3 |
|
xattn: False |
|
anchor_requires_grad: False |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
# Task 5: pedestrian-attribute task over five datasets (rap2, PA_100k, parse27k, market, HARDHC); patch_neck modality is rgb, label_neck modality is text. |
5: |
|
name: pedattr_multi_rap2_PA_100k_parse27k_market_HARDHC |
|
loss_weight: 5 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
# NOTE(review): backbone img_size is 1344 while this task's patch_adapter img_size is [256, 192] and the augmentation is 256 x 192 - confirm intended. |
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
dataset: |
|
type: MultiAttrDataset |
|
kwargs: |
|
text_label_return: True |
|
task_spec: |
|
# dataset, data_path and root_path are parallel five-entry lists in matching order. |
dataset: |
|
- rap2 |
|
- PA_100k |
|
- parse27k |
|
- market |
|
- HARDHC |
|
data_path: |
|
- /mnt/path...to.../pedattr_public/rap2/dataset.pkl |
|
- /mnt/path...to.../pedattr_public/PA-100k/dataset.pkl |
|
- /mnt/path...to.../pedattr_public/Parse27k/parse27k/parse27k/dataset.pkl |
|
- /mnt/path...to.../pedattr_public/market/dataset.pkl |
|
- /mnt/path...to.../pedattr_public/HARDHC/dataset.pkl |
|
root_path: |
|
- /mnt/path...to.../pedattr_public/rap2/RAP_dataset/ |
|
- /mnt/path...to.../pedattr_public/PA-100k/data/ |
|
- /mnt/path...to.../pedattr_public/Parse27k/parse27k/parse27k/images |
|
- /mnt/path...to.../pedattr_public/market/bounding_box_train |
|
- /mnt/path...to.../pedattr_public/HARDHC/croped_image/ |
|
augmentation: |
|
height: 256 |
|
width: 192 |
|
|
|
sampler: |
|
batch_size: 147 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [256, 192] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_adapter: |
|
type: text_adapter |
|
kwargs: |
|
pretrained: True |
|
task_sp_list: ['text_vectors'] |
|
one_way_semantics: True |
|
# The same description_dict_name value is repeated under label_proj below. |
description_dict_name: 'multi_rap2_PA_100k_parse27k_market_HARDHC_attr_name' |
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: text |
|
label_proj: |
|
type: text_projector |
|
kwargs: |
|
task_sp_list: ['text_vectors', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm',] |
|
one_way_semantics: True |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 5 |
|
description_dict_name: 'multi_rap2_PA_100k_parse27k_market_HARDHC_attr_name' |
|
pre_proj_type: '' |
|
loss_cfg: |
|
type: MaskedOneSideBCELoss |
|
kwargs: |
|
use_focal_weight: True |
|
loss_weight: 1. |
|
# dataset_weight holds 168 values: 54 x 0.5, then 26 x 0.25, then 88 x 1.0. |
# NOTE(review): presumably one weight per attribute, grouped by source dataset - confirm against the loss implementation. |
dataset_weight: [ 0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.5, |
|
0.5, 0.5, 0.5, 0.5, 0.25, |
|
0.25, 0.25, 0.25, 0.25, 0.25, |
|
0.25, 0.25, 0.25, 0.25, 0.25, |
|
0.25, 0.25, 0.25, 0.25, 0.25, |
|
0.25, 0.25, 0.25, 0.25, 0.25, |
|
0.25, 0.25, 0.25, 0.25, 0.25, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, 1.0, 1.0, |
|
1.0, 1.0, 1.0, ] |
|
# sample_weight also holds 168 values, matching dataset_weight in length; it is laid out in runs of 54, 26, 44 and 44 values. |
sample_weight: [0.00172477, 0.05791431, 0.2792891 , 0.00459644, 0.01987675, |
|
0.06484867, 0.02327336, 0.01420398, 0.06937013, 0.03476447, |
|
0.08533858, 0.0091179 , 0.0125145 , 0.02894172, 0.00816949, |
|
0.17255632, 0.00890175, 0.00613153, 0.00838123, 0.07975844, |
|
0.03529381, 0.07885856, 0.06067129, 0.02532455, 0.00429207, |
|
0.06790121, 0.02532014, 0.00639179, 0.02070164, 0.00790041, |
|
0.01142935, 0.00823125, 0.00310547, 0.00732696, 0.08890281, |
|
0.00265994, 0.12081324, 0.16404275, 0.010578 , 0.09486231, |
|
0.040896 , 0.23313939, 0.02223673, 0.28135352, 0.01603462, |
|
0.01012806, 0.00799305, 0.01450835, 0.00697848, 0.00314958, |
|
0.00536399, 0.00762692, 0.03982408, 0.00306577, |
|
0.01728739, 0.0714522 , 0.23161312, 0.16539257, 0.01964296, |
|
0.0599655 , 0.04277957, 0.01663895, 0.00187475, 0.00670499, |
|
0.0128674 , 0.28255336, 0.06885843, 0.0455939 , 0.00238203, |
|
0.07344605, 0.07651623, 0.06356061, 0.00378038, 0.00534193, |
|
0.36698324, 0.02468052, 0.18279907, 0.14001068, 0.1169667 , |
|
0.14002832, |
|
0.00080283, 0.04727897, 0.05596016, 0.00868119, 0.00850474, |
|
0.00013234, 0.02891966, 0.0113279 , 0.00466261, 0.00932522, |
|
0.04154444, 0.00932522, 0.00466261, 0.0113279 , 0.0128277 , |
|
0.05136371, 0.05703648, 0.00839005, 0.00951049, 0.10332735, |
|
0.04794505, 0.01736679, 0.05591605, 0.04794505, 0.01736679, |
|
0.05591605, 0.04949779, 0.01482155, 0.05690856, 0.04949779, |
|
0.01482155, 0.05690856, 0.00515225, 0.00014998, 0.11592566, |
|
0.02974014, 0.00336131, 0.08812644, 0.00546986, 0.00292902, |
|
0.11282902, 0.03215746, 0.00087341, 0.08819702, |
|
0.01577436, 0.01377169, 0.00681968, 0.02183531, 0.00826654, |
|
0.00613153, 0.0091179 , 0.00096605, 0.00241732, 0.00012792, |
|
0.00481259, 0.00091752, 0.00754752, 0.00346277, 0.00502433, |
|
0.00635209, 0.00219676, 0.00692113, 0.01726093, 0.00282756, |
|
0.04876553, 0.03532027, 0.05422657, 0.01836813, 0.00129247, |
|
0.0237233 , 0.00093958, 0.04455727, 0.01074562, 0.00082048, |
|
0.07086552, 0.02805507, 0.0062771 , 0.02825357, 0.0273978 , |
|
0.05809076, 0.00874295, 0.01927683, 0.01020305, 0.04525424, |
|
0.01257185, 0.00412004, 0.03352934, 0.00677998, |
|
] |
|
|
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
self_attn_mask_type: patch_diag_label_row |
|
cls_out_dim: 1 |
|
detach_from_peddet: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
6: |
|
name: attr_luperson |
|
loss_weight: 5 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
dataset: |
|
type: MultiAttrDataset |
|
kwargs: |
|
text_label_return: True |
|
task_spec: |
|
dataset: |
|
- lup_0_600w |
|
- lup_600_1200w |
|
data_path: |
|
- /mnt/path...to.../attribute/dataset_0_600w_pjlab.pkl |
|
- /mnt/path...to.../attribute/dataset_600_1200w_pjlab.pkl |
|
root_path: |
|
- /mnt/path...to.../reid/LUPerson-NL/LUPws |
|
- /mnt/path...to.../reid/LUPerson-NL/LUPws |
|
augmentation: |
|
height: 256 |
|
width: 192 |
|
|
|
sampler: |
|
batch_size: 300 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_adapter: |
|
type: text_adapter |
|
kwargs: |
|
pretrained: True |
|
task_sp_list: [ 'text_vectors' ] |
|
one_way_semantics: True |
|
description_dict_name: 'lup_lup_attr_base' |
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: text |
|
label_proj: |
|
type: text_projector |
|
kwargs: |
|
task_sp_list: [ 'text_vectors', |
|
'translate_weight', |
|
'translate_bias', |
|
'post_mul_norm', ] |
|
one_way_semantics: True |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 5 |
|
description_dict_name: 'lup_lup_attr_base' |
|
pre_proj_type: '' |
|
loss_cfg: |
|
type: MaskedOneSideBCELoss |
|
kwargs: |
|
loss_weight: 1. |
|
use_focal_weight: True |
|
sample_weight: [ 3.705390e-01, 6.184500e-03, 6.679500e-03, 9.445730e-01, |
|
3.924500e-02, 4.686065e-01, 7.492855e-01, 6.642300e-02, |
|
7.882115e-01, 1.606450e-02, 1.043025e-01, 8.040050e-02, |
|
1.102100e-02, 5.510935e-01, 4.074950e-02, 1.142160e-01, |
|
3.731000e-02, 5.566250e-02, 1.852115e-01, 1.524850e-02, |
|
5.085000e-04, 9.421990e-01, 1.484350e-02, 3.347200e-02, |
|
5.750000e-03, 3.735500e-03, 1.509560e-01, 3.741515e-01, |
|
3.318200e-02, 2.215850e-02, 4.213145e-01, 5.177550e-02, |
|
3.974550e-02, 3.878800e-01, 1.321270e-01, 1.337740e-01, |
|
9.478400e-02, 3.324350e-02, 1.095815e-01, 2.231600e-02, |
|
1.592250e-02, 2.386005e-01, 1.999500e-01, 1.321300e-02, |
|
7.382405e-01, 4.859650e-02, 2.932510e-01, 8.297100e-02, |
|
9.567325e-01, 2.430700e-02, 3.554500e-03, 1.751500e-03, |
|
3.705390e-01, 6.184500e-03, 6.679500e-03, 9.445730e-01, |
|
3.924500e-02, 4.686065e-01, 7.492855e-01, 6.642300e-02, |
|
7.882115e-01, 1.606450e-02, 1.043025e-01, 8.040050e-02, |
|
1.102100e-02, 5.510935e-01, 4.074950e-02, 1.142160e-01, |
|
3.731000e-02, 5.566250e-02, 1.852115e-01, 1.524850e-02, |
|
5.085000e-04, 9.421990e-01, 1.484350e-02, 3.347200e-02, |
|
5.750000e-03, 3.735500e-03, 1.509560e-01, 3.741515e-01, |
|
3.318200e-02, 2.215850e-02, 4.213145e-01, 5.177550e-02, |
|
3.974550e-02, 3.878800e-01, 1.321270e-01, 1.337740e-01, |
|
9.478400e-02, 3.324350e-02, 1.095815e-01, 2.231600e-02, |
|
1.592250e-02, 2.386005e-01, 1.999500e-01, 1.321300e-02, |
|
7.382405e-01, 4.859650e-02, 2.932510e-01, 8.297100e-02, |
|
9.567325e-01, 2.430700e-02, 3.554500e-03, 1.751500e-03 |
|
] |
|
|
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
|
|
|
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
self_attn_mask_type: patch_diag_label_row |
|
cls_out_dim: 1 |
|
detach_from_peddet: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
7: |
|
name: image_caption_joint |
|
loss_weight: 90 |
|
gres_ratio: 3 |
|
dataset: |
|
type: CocoCaption |
|
kwargs: |
|
bert_dir: /mnt/path...to.../Hulk/experiments/release/bert-base-uncased |
|
max_words: 40 |
|
img_size: 384 |
|
prompt: '' |
|
split_type: train |
|
joint_train: True |
|
joint_train_anno_root: /mnt/path...to.../textreid/joint_reid_caption_train.json |
|
synth_peds_root: /mnt/path...to.../textreid/SYNTH-PEDES/ |
|
cuhk_peds_root: /mnt/path...to.../textreid/CUHK-PEDES/imgs/ |
|
mals_root: /mnt/path...to.../textreid/MALS |
|
luperson_root: /mnt/path...to.../textreid/LUPerson-T/imgs/ |
|
|
|
sampler: |
|
batch_size: 100 |
|
shuffle_strategy: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
|
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: text |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 384, 384 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: text_adapter |
|
kwargs: |
|
image_caption: True |
|
pretrained: True |
|
max_tokens: 40 |
|
task_sp_list: [ ] |
|
|
|
|
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: text_projector |
|
kwargs: |
|
description_dict_name: caption_bert |
|
image_caption: True |
|
one_way_semantics: True |
|
post_mul_norm: True |
|
loss_cfg: |
|
type: LabelSmoothingCrossEntropy |
|
kwargs: |
|
epsilon: 0.1 |
|
loss_weight: 1. |
|
task_sp_list: [ 'post_mul_norm', |
|
'text_vectors', |
|
'loss_fn'] |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.mask_token_buffer', |
|
'predictor.mask_token_proj', |
|
'predictor.captiontoken_ln', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
self_attn_mask_type: caption_mask |
|
caption_cfgs: { nn.parameter: True, vocal_size: 30522, lndo: True ,bert_feats_for_embedding: True } |
|
mask_token_normal_init: True |
|
detach_from_peddet: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
8: |
|
name: cocopose_256x192 |
|
loss_weight: 28000 |
|
gres_ratio: 3 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: COCOPosDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/coco/annotations/person_keypoints_train2017.json |
|
img_prefix: /mnt/path...to.../pose_public/coco/train2017/ |
|
use_udp: True |
|
data_use_ratio: 1 |
|
# Flow-style mapping of pose data settings for the cocopose_256x192 task (17 joints / 17 output channels). |
# Fix: the detection-threshold key was misspelled `det_bqbox_thr`; it is now `det_bbox_thr`, |
# the spelling used by the posetrack / jrdb / MHP / 3DHP data_cfg mappings in this file. |
data_cfg: { |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
'num_output_channels': 17, |
|
'num_joints': 17, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], |
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': False, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': './COCO_val2017_detections_AP_H_56_person.json' |
|
} |
|
sampler: |
|
batch_size: 176 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 17 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed',] |
|
|
|
|
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_coco_name |
|
upsample_hidden_dim: 256 |
|
task: pose |
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 17 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ 0.38647058, 0.33606767, 0.33835369, 0.29253424, 0.29636332, |
|
0.4987484 , 0.49978854, 0.39467358, 0.40091822, 0.36039853, |
|
0.36918446, 0.43343303, 0.4345989 , 0.32999829, 0.33092793, |
|
0.27714171, 0.27754939 ] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
9: |
|
name: aic |
|
loss_weight: 56000 |
|
gres_ratio: 7 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
dataset_name: aic |
|
ann_file: /mnt/path...to.../pose_public/ai_challenge/annotations/aic_train.json |
|
img_prefix: /mnt/path...to.../pose_public/ai_challenge/ai_challenger_keypoint_train_20170902/keypoint_train_images_20170902/ |
|
use_udp: True |
|
data_use_ratio: 1 |
|
# Flow-style mapping of pose data settings for the aic task (14 joints / 14 output channels). |
# Fix: the detection-threshold key was misspelled `det_bqbox_thr`; it is now `det_bbox_thr`, |
# matching the posetrack task in this file, which uses the same MultiPoseDatasetDev dataset type. |
data_cfg: { |
|
'image_size': [ 192, 256 ], |
|
'heatmap_size': [ 48, 64 ], |
|
'num_output_channels': 14, |
|
'num_joints': 14, |
|
'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 ], ], |
|
'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 ], |
|
|
|
'flip_pairs': [ [ 0, 3 ], [ 1, 4 ], [ 2, 5 ], [ 6, 9 ], [ 7, 10 ], [ 8, 11 ], ], |
|
'upper_body_ids': [ 0, 1, 2, 3, 4, 5, 12, 13 ], |
|
'lower_body_ids': [ 6, 7, 8, 9, 10, 11 ], |
|
'use_different_joint_weights': False, |
|
'joint_weights': [ 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1.2, 1.5, 1., 1. ], |
|
|
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': False, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': './COCO_val2017_detections_AP_H_56_person.json' |
|
} |
|
sampler: |
|
batch_size: 189 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 14 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed', ] |
|
|
|
|
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'upsample_network', |
|
'loss_fn', |
|
'text_features', ] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_aic_name |
|
task: pose |
|
upsample_hidden_dim: 256 |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 14 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ 0.98064613, 0.977893565, 0.97715356, 0.98064613, 0.977893565, |
|
0.97715356, 0.9594528200000001, 0.85703431, 0.7504981850000001, |
|
0.9594528200000001, 0.85703431, 0.7504981850000001, 0.97149646, 0.98605877 ] |
|
|
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
10: |
|
name: h36m_pose_256x256 |
|
loss_weight: 3192 |
|
gres_ratio: 2 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: COCOPosDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/h36m/processed/annotation_body2d/h36m_coco_train.json |
|
img_prefix: /mnt/path...to.../pose_public/h36m/processed/images/ |
|
use_udp: True |
|
data_use_ratio: 1 |
|
# Flow-style mapping of pose data settings for the h36m_pose_256x256 task (17 joints / 17 output channels). |
# Fix: the detection-threshold key was misspelled `det_bqbox_thr`; it is now `det_bbox_thr`, |
# the spelling used by the posetrack / jrdb / MHP / 3DHP data_cfg mappings in this file. |
data_cfg: { |
|
'image_size': [ 256, 256 ], |
|
'heatmap_size': [ 64, 64 ], |
|
'num_output_channels': 17, |
|
'num_joints': 17, |
|
'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], ], |
|
'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], |
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': './COCO_val2017_detections_AP_H_56_person.json' |
|
} |
|
sampler: |
|
batch_size: 132 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 256 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 17 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 256 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features', ] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_h3m6_name |
|
upsample_hidden_dim: 256 |
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 17 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. ] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: [ 'predictor.mask_token' ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
11: |
|
name: posetrack_256x192 |
|
loss_weight: 12335 |
|
gres_ratio: 2 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/PoseChallenge2018/annotations/posetrack18_train.json |
|
img_prefix: /mnt/path...to.../pose_public/PoseChallenge2018/ |
|
use_udp: True |
|
dataset_name: 'posetrack' |
|
data_cfg: { |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
'num_output_channels': 15, |
|
'num_joints': 15, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], |
|
|
|
'flip_pairs': [[3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14], ], |
|
'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8,], |
|
'lower_body_ids': [9, 10, 11, 12, 13, 14], |
|
'use_different_joint_weights': False, |
|
'joint_weights': [1., 1., 1., 1., 1., 1.2, 1.2, 1.5, 1.5, 1., 1., 1.2, 1.2, 1.5, 1.5], |
|
|
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
} |
|
sampler: |
|
batch_size: 170 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 15 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed',] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_posetrack_name |
|
upsample_hidden_dim: 256 |
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 15 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
# 15 weights, matching `num_classes: 15` for the posetrack task. |
# Indices 3..14 are the six pairs listed in this task's `flip_pairs`; each pair shares one weight. |
# Fix: the final value was truncated to 0.5956303; restored to 0.59563039 so the last pair matches like the others. |
sample_weight: [ 0.81831569, 0.75692071, 0.74175951, |
|
0.789882655, 0.789882655, 0.659771425, 0.659771425, 0.625614735, |
|
0.625614735, 0.737772405, 0.737772405, 0.665022735, 0.665022735, |
|
0.59563039, 0.59563039 |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
12: |
|
name: jrdb_256x192 |
|
loss_weight: 8223 |
|
gres_ratio: 2 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
# Dataset definition for the jrdb_256x192 task: a MultiPoseDatasetDev fed with 17-joint annotations. |
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
# NOTE(review): ann_file lives under `JRDB2019` while img_prefix and dataset_name say `JRDB2022` - |
# confirm the annotation path is the intended one. |
ann_file: /mnt/path...to.../pose_public/JRDB2019/train.json |
|
img_prefix: /mnt/path...to.../pose_public/JRDB2022/images/ |
|
use_udp: True |
|
dataset_name: 'JRDB2022' |
|
# Flow-style mapping of pose data settings (17 joints / 17 output channels). |
data_cfg: { |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
'num_output_channels': 17, |
|
'num_joints': 17, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,], |
|
'flip_pairs': [[2, 5], [3, 6], [4, 7], [8, 11], [9, 12], [10, 13], ], |
|
'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16,], |
|
'lower_body_ids': [9, 10, 12, 13], |
|
# Per-joint weights are all 1 and `use_different_joint_weights` is False. |
'use_different_joint_weights': False, |
|
'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], |
|
|
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
|
|
} |
|
sampler: |
|
batch_size: 170 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 17 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features', ] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_jrdb_name |
|
upsample_hidden_dim: 256 |
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 17 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ |
|
0.90384634, 0.82524231, 0.89927266, 0.90945538, 0.92796942, 0.89927266, |
|
0.90945538, 0.92796942, 0.9912784, 0.84353379, 0.97898463, 0.9912784, |
|
0.84353379, 0.97898463, 0.97418356, 0.94284516, 0.93372039, |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: [ 'predictor.mask_token' ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
13: |
|
name: MHP_256x192 |
|
loss_weight: 3192 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/pose_MHPv2/train.json |
|
img_prefix: /mnt/path...to.../pose_public/pose_MHPv2/train/images |
|
use_udp: True |
|
dataset_name: 'mhp' |
|
data_cfg: { |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
'num_output_channels': 16, |
|
'num_joints': 16, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,], |
|
|
|
'flip_pairs': [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13], ], |
|
'upper_body_ids': [7, 8, 9, 10, 11, 12, 13, 14, 15], |
|
'lower_body_ids': [0, 1, 2, 3, 4, 5, 6], |
|
'use_different_joint_weights': False, |
|
'joint_weights': [1.5, 1.2, 1., 1., 1.2, 1.5, 1., 1., 1., 1., 1.5, 1.2, 1., 1., 1.2, 1.5], |
|
|
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
|
|
} |
|
sampler: |
|
batch_size: 132 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 16 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed',] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_mhp_name |
|
upsample_hidden_dim: 256 |
|
|
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 16 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ 0.463188095, 0.6055728499999999, 0.732992125, 0.732992125, 0.6055728499999999, |
|
0.463188095, 0.74209784, 0.92598716, 0.9642093, 0.98767263, |
|
0.67156195, 0.6861140800000001, 0.85427203, 0.85427203, 0.6861140800000001, |
|
0.67156195 |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
|
|
14: |
|
name: mpi_inf_3dhp_256x192 |
|
loss_weight: 8223 |
|
gres_ratio: 2 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/mpi_inf_3dhp/train.json |
|
img_prefix: /mnt/path...to.../pose_public/mpi_inf_3dhp/processed/images/ |
|
use_udp: True |
|
dataset_name: '3DHP' |
|
# Keypoint layout and evaluation thresholds for the '3DHP' pose dataset entry (17 joints, ids 0-16). |
|
data_cfg: { |
|
# image_size / heatmap_size are [192, 256] / [48, 64], i.e. the heatmap is 1/4 of the image in each dimension. |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
# NOTE(review): num_output_channels (136) differs from num_joints (17); the 3DPW task sets both to 18 -- confirm 136 is intentional. |
|
'num_output_channels': 136, |
|
'num_joints': 17, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,], |
|
# Index pairs presumably swapped on horizontal flip (left/right joints) -- TODO confirm against the dataset class. |
|
'flip_pairs': [[2, 5], [3, 6], [4, 7], [8, 11], [9, 12], [10, 13], ], |
|
# upper_body_ids and lower_body_ids together list every joint id 0-16 exactly once. |
|
'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 11, 14, 15, 16,], |
|
'lower_body_ids': [9, 10, 12, 13], |
|
# joint_weights has one entry per joint (17); all are 1 and use_different_joint_weights is False. |
|
'use_different_joint_weights': False, |
|
'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], |
|
|
|
# NOTE(review): use_gt_bbox is True, so bbox_file / det_bbox_thr are presumably unused here -- confirm. |
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
|
|
} |
|
sampler: |
|
batch_size: 170 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 17 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed',] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
|
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_mpi_inf_3dhp_name |
|
upsample_hidden_dim: 256 |
|
|
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 17 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ |
|
0.97905498, 0.98151887, 0.98018951, 0.97778281, 0.97704955, |
|
0.98018951, 0.97778281, 0.97704955, 0.98309006, 0.98060388, |
|
0.97209657, 0.98309006, 0.98060388, 0.97209657, 0.98405158, |
|
0.98242514, 0.98066688 |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
|
|
15: |
|
name: 3dpw_256x192 |
|
loss_weight: 2055 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/3DPW/dataset_merged.json |
|
img_prefix: /mnt/path...to.../pose_public/3DPW/imageFiles |
|
use_udp: True |
|
dataset_name: '3DPW' |
|
# Keypoint layout and evaluation thresholds for the '3DPW' pose dataset entry (18 joints, ids 0-17). |
|
data_cfg: { |
|
# image_size / heatmap_size are [192, 256] / [48, 64], i.e. the heatmap is 1/4 of the image in each dimension. |
|
'image_size':[192, 256], |
|
'heatmap_size':[48, 64], |
|
'num_output_channels': 18, |
|
'num_joints': 18, |
|
'dataset_channel': [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17],], |
|
'inference_channel': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], |
|
|
|
# Index pairs presumably swapped on horizontal flip (left/right joints) -- TODO confirm against the dataset class. |
|
'flip_pairs': [[2, 5], [3, 6], [4, 7], [8, 11], [9, 12], [10, 13], [14, 15], [16, 17]], |
|
# Fixed: the list previously contained the out-of-range id 714 (a fused "7, 14"); every id must be < num_joints (18). |
|
# upper_body_ids and lower_body_ids together now list every joint id 0-17 exactly once. |
|
'upper_body_ids': [0, 1, 2, 3, 4, 5, 6, 7, 14, 15, 16, 17], |
|
'lower_body_ids': [8, 9, 10, 11, 12, 13], |
|
# joint_weights has one entry per joint (18); all are 1 and use_different_joint_weights is False. |
|
'use_different_joint_weights': False, |
|
'joint_weights': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ], |
|
|
|
# NOTE(review): use_gt_bbox is True, so bbox_file / det_bbox_thr are presumably unused here -- confirm. |
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
} |
|
sampler: |
|
batch_size: 170 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 18 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed',] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_3dpw_name |
|
|
|
upsample_hidden_dim: 256 |
|
|
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 18 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ 0.81362905, 0.92006165, 0.90966899, 0.83948673, 0.78390512, |
|
0.90966899, 0.83948673, 0.78390512, 0.916771645, 0.895912625, |
|
0.86267757, 0.916771645, 0.895912625, 0.86267757, 0.683630395, |
|
0.683630395, 0.6390913949999999, 0.6390913949999999 |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
16: |
|
name: aist++_256x192 |
|
loss_weight: 2055 |
|
gres_ratio: 1 |
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: False |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
img_size: 1344 |
|
num_encoded_tokens: 192 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
dataset: |
|
type: MultiPoseDatasetDev |
|
kwargs: |
|
ann_file: /mnt/path...to.../pose_public/aistplusplus/merged_train_1m_filter.json |
|
img_prefix: /mnt/path...to.../pose_public/aistplusplus/images/ |
|
use_udp: True |
|
dataset_name: 'AIST' |
|
# Keypoint layout and evaluation thresholds for the 'AIST' pose dataset entry (17 joints, ids 0-16). |
|
data_cfg: { |
|
# image_size / heatmap_size are [192, 256] / [48, 64], i.e. the heatmap is 1/4 of the image in each dimension. |
|
'image_size': [ 192, 256 ], |
|
'heatmap_size': [ 48, 64 ], |
|
# NOTE(review): num_output_channels (136) differs from num_joints (17); the 3DPW task sets both to 18 -- confirm 136 is intentional. |
|
'num_output_channels': 136, |
|
'num_joints': 17, |
|
'dataset_channel': [ [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], ], |
|
'inference_channel': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 ], |
|
# Index pairs presumably swapped on horizontal flip (left/right joints) -- TODO confirm against the dataset class. |
|
'flip_pairs': [ [ 1, 2 ], [ 3, 4 ], [ 5, 6 ], [ 7, 8 ], [ 9, 10 ], [ 11, 12 ], [ 13, 14 ], [ 15, 16 ] ], |
|
# upper_body_ids (0-12) and lower_body_ids (13-16) together list every joint id exactly once. |
|
'upper_body_ids': [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 ], |
|
'lower_body_ids': [ 13, 14, 15, 16 ], |
|
# joint_weights has one entry per joint (17); all are 1 and use_different_joint_weights is False. |
|
'use_different_joint_weights': False, |
|
'joint_weights': [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 ], |
|
|
|
# NOTE(review): use_gt_bbox is True, so bbox_file / det_bbox_thr are presumably unused here -- confirm. |
|
'soft_nms': False, |
|
'nms_thr': 1.0, |
|
'oks_thr': 0.9, |
|
'vis_thr': 0.2, |
|
'use_gt_bbox': True, |
|
'det_bbox_thr': 0.0, |
|
'bbox_file': 'COCO_val2017_detections_AP_H_56_person.json' |
|
|
|
} |
|
sampler: |
|
batch_size: 170 |
|
shuffle_strategy: 1 |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 17 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: [ 256, 192 ] |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', |
|
'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', |
|
'upsample_network', |
|
'text_features', ] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
cls_loss_branch: True |
|
description_dict_name: checked_pose_aist_name |
|
|
|
upsample_hidden_dim: 256 |
|
|
|
task: pose |
|
|
|
loss_cfg: |
|
type: POS_FocalDiceLoss_bce_cls_emb |
|
kwargs: |
|
target_type: GaussianHeatMap |
|
cfg: |
|
num_classes: 17 |
|
deep_supervision: True |
|
ignore_blank: False |
|
class_weight: 0.001 |
|
dice_weight: 0.0 |
|
mask_weight: 1.0 |
|
redundant_queries: 1 |
|
dec_layers: 9 |
|
sample_weight: [ |
|
0.97905498, 0.98151887, 0.98018951, 0.97778281, 0.97704955, |
|
0.98018951, 0.97778281, 0.97704955, 0.98309006, 0.98060388, |
|
0.97209657, 0.98309006, 0.98060388, 0.97209657, 0.98405158, |
|
0.98242514, 0.98066688 |
|
] |
|
eos_coef: 0.1 |
|
|
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: [ 'predictor.mask_token' ] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed', 'predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: full |
|
|
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
loss_cfg: |
|
type: CEL_Sigmoid |
|
|
|
17: |
|
name: LIP_parsing |
|
loss_weight: 1.8 |
|
gres_ratio: 4 |
|
dataset: |
|
type: LIPParsingDataset |
|
kwargs: |
|
data_path: /mnt/path...to.../parsing_public/LIP |
|
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
ignore_value: 255 |
|
num_classes: 20 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] |
|
|
|
|
|
sampler: |
|
batch_size: 27 |
|
shuffle_strategy: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 20 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_lip_name |
|
cls_loss_branch: True |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
task: parsing |
|
# Loss settings for the LIP_parsing label projector (dense_labeling_projector). |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
# ignore_index equals the dataset's num_classes (20); presumably the ignore label is remapped to this index by ignore2endclass -- TODO confirm. |
|
ignore_index: 20 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
# NOTE(review): class_weight is 0.3 here, while the CIHP, human3.6m and ModaNet parsing tasks use 0.1 in the same position -- confirm this is intentional. |
|
class_weight: 0.3 |
|
deep_supervision: True |
|
# Matches transformer_predictor_cfg.dec_layers (9) in this task's decoder. |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
# 20 entries, one per class in label_list (0-19). |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss settings attached to the LIP_parsing decoder (UniHCPv2_Head). |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here, but transformer_predictor_cfg.dec_layers and the label_proj loss cfg of this task both use 9 -- confirm which value this loss expects. |
|
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# 20 entries; identical to the label_proj sample_weight list of this task (20 classes). |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
18: |
|
name: CIHP_parsing |
|
loss_weight: 3.6 |
|
gres_ratio: 4 |
|
dataset: |
|
type: CIHPParsingDataset |
|
kwargs: |
|
data_path: /mnt/path...to.../parsing_public/CIHP |
|
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
ignore_value: 255 |
|
num_classes: 20 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] |
|
|
|
sampler: |
|
batch_size: 26 |
|
shuffle_strategy: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 20 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_cihp_name |
|
cls_loss_branch: True |
|
task: parsing |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
ignore_index: 20 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss settings attached to the CIHP_parsing decoder (UniHCPv2_Head). |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here, but transformer_predictor_cfg.dec_layers and the label_proj loss cfg of this task both use 9 -- confirm which value this loss expects. |
|
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# 20 entries; identical to the label_proj sample_weight list of this task (20 classes). |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
19: |
|
name: human3.6m_parsing |
|
loss_weight: 2.25 |
|
gres_ratio: 7 |
|
dataset: |
|
type: Human3M6ParsingDataset |
|
kwargs: |
|
data_path: /mnt/path...to.../parsing_public/human3.6 |
|
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
ignore_value: 255 |
|
num_classes: 25 |
|
label_list: [0, 1, 2, 3, 6, 7, 8, 17, 18, 19, 25, 26, 27, 32, 33, 34, 38, 39, 43, 44, |
|
46, 49, 50, 56, 58] |
|
|
|
sampler: |
|
batch_size: 31 |
|
shuffle_strategy: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 25 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_human_name |
|
cls_loss_branch: True |
|
task: parsing |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
ignore_index: 25 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
sample_weight: [1.0, 0.97325, 0.96685, 0.9903500000000001, 0.97325, 0.96685, 0.9903500000000001, 0.9929, 0.9459, |
|
0.89645, 0.9929, 0.9459, 0.89645, 0.981, 0.9997, 0.99265, 0.9997, 0.99265, |
|
0.9995, 0.9999, 0.9999, 0.9758, 0.9256500000000001, 0.9758, 0.9256500000000001] |
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 25 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss settings attached to the human3.6m_parsing decoder (UniHCPv2_Head). |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here, but transformer_predictor_cfg.dec_layers and the label_proj loss cfg of this task both use 9 -- confirm which value this loss expects. |
|
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# NOTE(review): this list has 20 entries (same values as the LIP/CIHP tasks) while this task has num_classes: 25 and a 25-entry label_proj sample_weight -- confirm the decoder loss does not index it per class. |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
20: |
|
name: modanet_parsing |
|
loss_weight: 0.021 |
|
gres_ratio: 1 |
|
dataset: |
|
type: ModaNetParsingDataset |
|
kwargs: |
|
data_path: /mnt/path...to.../parsing_public/ModaNet/ |
|
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
ignore_value: 255 |
|
num_classes: 14 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ] |
|
|
|
sampler: |
|
batch_size: 27 |
|
shuffle_strategy: 1 |
|
|
|
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 14 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
|
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
|
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_modanet_name |
|
cls_loss_branch: True |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
task: parsing |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
ignore_index: 14 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
sample_weight: [ 1.0, 0.3933582160972342, 0.2633553450090918, 0.13557278208440998, 0.7506555651258494, 0.45334481768590296, 0.2760455545985262, 0.16753756340319648, 0.4404249210450761, 0.6636233132357163, 0.13457747152837593, 0.25979519571250836, 0.10422049956933678, 0.0956263757297349 ] |
|
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
|
|
|
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss settings attached to the modanet_parsing decoder (UniHCPv2_Head). |
|
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here, but transformer_predictor_cfg.dec_layers and the label_proj loss cfg of this task both use 9 -- confirm which value this loss expects. |
|
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# NOTE(review): this list has 20 entries (same values as the LIP/CIHP tasks) while this task has num_classes: 14 and a 14-entry label_proj sample_weight -- confirm the decoder loss does not index it per class. |
|
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
# Task 21: VIP_parsing, read through VIPParsingDataset; task-level loss_weight is 0.021.
21: |
|
name: VIP_parsing |
|
loss_weight: 0.021 |
|
gres_ratio: 1 |
|
dataset: |
|
type: VIPParsingDataset |
|
kwargs: |
|
# NOTE(review): the path contains a 'path...to...' placeholder segment; it must point to a real location before use.
data_path: /mnt/path...to.../parsing_public/VIP |
|
# Cropping / scaling options; crop_size, base_size and eval_crop_size all use 480.
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
# Photometric distortion and rotation options.
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
# Label space: label_list enumerates ids 0..19, which matches num_classes: 20.
ignore_value: 255 |
|
num_classes: 20 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] |
|
|
|
|
|
# Sampler settings for task 21 (the meaning of shuffle_strategy ids is not shown in this file section).
sampler: |
|
batch_size: 27 |
|
shuffle_strategy: 1 |
|
|
|
# Backbone for task 21: vit_base_patch16_mask.
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
# rel_pos_h / rel_pos_w are listed as task_sp entries (presumably task-specific parameters — TODO confirm).
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
# Patch-token ratio is 1 while label-token ratio is 0.
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
# Neck for the patch branch of task 21: MAEdecoder_proj_neck, rgb modality, mask_dim 256.
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
# Neck for the label branch of task 21: MAEdecoder_proj_neck, dense_labeling modality, mask_dim 256.
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
# Input adapter for the patch branch of task 21: rgb_adapter with 3 input channels.
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
# Same value as the 480x480 crop_size in this task's dataset cfg.
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
# Input adapter for the label branch of task 21: dense_labeling_adapter.
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
# in_chans equals num_classes: 20 in this task's dataset cfg.
in_chans: 20 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
# Same value as ignore_value: 255 in this task's dataset cfg.
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
# Output projector for the patch branch of task 21: rgb_projector with a MaskedMSELoss.
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
# Both pix_loss and norm_pix_loss are enabled, each with weight 1.
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
# Output projector for the label branch of task 21: dense_labeling_projector, task 'parsing'.
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
|
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_vip_name |
|
cls_loss_branch: True |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
task: parsing |
|
# Parsing loss for this projector.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
# ignore_index 20 is one past the last label id (19); presumably tied to ignore2endclass: True in the dataset cfg — TODO confirm.
ignore_index: 20 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
# 20 weights, the same count as num_classes; the last six values form three equal pairs.
sample_weight: [1.0, 0.3266013319616655, 0.9908495316476258, 0.029184038117927337, 0.052466294872489036, 0.991336834695977, 0.10801884238453625, 0.30001624343494504, 0.3465807569440684, 0.9136932156586712, 0.9863555146461639, 0.015810276679841896, 0.11895608858086523, 0.9925821647084303, 0.9789106069630192, 0.9789106069630192, 0.4952081866912123, 0.4952081866912123, 0.7048026422654177, 0.7048026422654177, ] |
|
# Decoder head for task 21: UniHCPv2_Head with predictor 'hulk' and task 'recons'.
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
# Same value as num_classes: 20 in this task's dataset cfg.
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss attached to the decoder head.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here but 9 in transformer_predictor_cfg above — confirm the mismatch is intended.
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# 20 weights, written as four rows of five values.
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
# Task 22: deepfashion_parsing, read through DeepFashionParsingDataset; task-level loss_weight is 0.042.
22: |
|
name: deepfashion_parsing |
|
loss_weight: 0.042 |
|
gres_ratio: 2 |
|
dataset: |
|
type: DeepFashionParsingDataset |
|
kwargs: |
|
# NOTE(review): the path contains a 'path...to...' placeholder segment; it must point to a real location before use.
data_path: /mnt/path...to.../parsing_public/deepfashion2/ |
|
# Cropping / scaling options; crop_size, base_size and eval_crop_size all use 480.
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
# Photometric distortion and rotation options.
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
# Label space: label_list enumerates ids 0..13, which matches num_classes: 14.
ignore_value: 255 |
|
num_classes: 14 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, ] |
|
|
|
# Sampler settings for task 22 (the meaning of shuffle_strategy ids is not shown in this file section).
sampler: |
|
batch_size: 27 |
|
shuffle_strategy: 1 |
|
|
|
# Backbone for task 22: vit_base_patch16_mask.
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
# rel_pos_h / rel_pos_w are listed as task_sp entries (presumably task-specific parameters — TODO confirm).
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
# Patch-token ratio is 1 while label-token ratio is 0.
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
# Neck for the patch branch of task 22: MAEdecoder_proj_neck, rgb modality, mask_dim 256.
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
# Neck for the label branch of task 22: MAEdecoder_proj_neck, dense_labeling modality, mask_dim 256.
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
# Input adapter for the patch branch of task 22: rgb_adapter with 3 input channels.
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
# Same value as the 480x480 crop_size in this task's dataset cfg.
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
# Input adapter for the label branch of task 22: dense_labeling_adapter.
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
# in_chans equals num_classes: 14 in this task's dataset cfg.
in_chans: 14 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
# Same value as ignore_value: 255 in this task's dataset cfg.
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
# Output projector for the patch branch of task 22: rgb_projector with a MaskedMSELoss.
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
# Both pix_loss and norm_pix_loss are enabled, each with weight 1.
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
# Output projector for the label branch of task 22: dense_labeling_projector, task 'parsing'.
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
|
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_deepfashion_name |
|
cls_loss_branch: True |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
task: parsing |
|
# Parsing loss for this projector.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
|
|
cfg: |
|
# ignore_index 14 is one past the last label id (13); presumably tied to ignore2endclass: True in the dataset cfg — TODO confirm.
ignore_index: 14 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
# 14 weights, the same count as num_classes.
sample_weight: [ 1.0, 0.367704898390819, 0.18624095519402378, 0.002807862013638187, 0.06970686754080256, 0.08321481967691353, 0.010231244888284599, 0.18925719286730117, 0.28635504086767627, 0.15953761441126063, 0.0887055183084064, 0.04064888180411646, 0.09255004922874958, 0.03362141268278453, ] |
|
# Decoder head for task 22: UniHCPv2_Head with predictor 'hulk' and task 'recons'.
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
# NOTE(review): num_queries is 20 although this task declares num_classes: 14 — confirm intended.
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss attached to the decoder head.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here but 9 in transformer_predictor_cfg above — confirm the mismatch is intended.
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# NOTE(review): 20 weights (four rows of five) although this task declares num_classes: 14 — confirm intended.
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |
|
|
|
# Task 23: PaperDoll_parsing, read through PaperDollParsingDataset; task-level loss_weight is 0.021.
23: |
|
name: PaperDoll_parsing |
|
loss_weight: 0.021 |
|
gres_ratio: 1 |
|
dataset: |
|
type: PaperDollParsingDataset |
|
kwargs: |
|
# NOTE(review): the path contains a 'path...to...' placeholder segment; it must point to a real location before use.
data_path: /mnt/path...to.../parsing_public/PaperDoll |
|
# Cropping / scaling options; crop_size, base_size and eval_crop_size all use 480.
cfg: |
|
stride_level: 1 |
|
is_flip: True |
|
crop_size: [ 480, 480 ] |
|
is_multi_scale: True |
|
scale_factor: 11 |
|
center_crop_test: False |
|
base_size: 480 |
|
eval_crop_size: [ 480, 480 ] |
|
ignore2endclass: True |
|
|
|
# Photometric distortion and rotation options.
is_photometricdistortion: True |
|
brightness: 32 |
|
contrast_range: [ 0.5, 1.5 ] |
|
saturation_range: [ 0.5, 1.5 ] |
|
hue_delta: 18 |
|
is_rotate: True |
|
|
|
# Label space: label_list enumerates ids 0..19, which matches num_classes: 20.
ignore_value: 255 |
|
num_classes: 20 |
|
label_list: [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19 ] |
|
|
|
# Sampler settings for task 23 (the meaning of shuffle_strategy ids is not shown in this file section).
sampler: |
|
batch_size: 27 |
|
shuffle_strategy: 1 |
|
|
|
# Backbone for task 23: vit_base_patch16_mask.
backbone: |
|
type: vit_base_patch16_mask |
|
kwargs: |
|
# rel_pos_h / rel_pos_w are listed as task_sp entries (presumably task-specific parameters — TODO confirm).
task_sp_list: [ 'rel_pos_h', 'rel_pos_w' ] |
|
pretrained: True |
|
lms_checkpoint_train: fairscale |
|
window: False |
|
test_pos_mode: learnable_interpolate |
|
learnable_pos: True |
|
drop_path_rate: 0.2 |
|
# Patch-token ratio is 1 while label-token ratio is 0.
vis_patch_token_ratio: 1 |
|
vis_label_token_ratio: 0. |
|
|
|
# Neck for the patch branch of task 23: MAEdecoder_proj_neck, rgb modality, mask_dim 256.
patch_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: rgb |
|
|
|
# Neck for the label branch of task 23: MAEdecoder_proj_neck, dense_labeling modality, mask_dim 256.
label_neck: |
|
type: MAEdecoder_proj_neck |
|
kwargs: |
|
mask_dim: 256 |
|
modality: dense_labeling |
|
|
|
# Input adapter for the patch branch of task 23: rgb_adapter with 3 input channels.
patch_adapter: |
|
type: rgb_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
in_chans: 3 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
# Same value as the 480x480 crop_size in this task's dataset cfg.
img_size: 480 |
|
task_sp_list: [ 'pos_embed' ] |
|
|
|
# Input adapter for the label branch of task 23: dense_labeling_adapter.
label_adapter: |
|
type: dense_labeling_adapter |
|
kwargs: |
|
pretrained: True |
|
stride_level: 1 |
|
# in_chans equals num_classes: 20 in this task's dataset cfg.
in_chans: 20 |
|
learnable_pos: False |
|
test_pos_mode: False |
|
img_size: 480 |
|
dim_class_embed: 64 |
|
# Same value as ignore_value: 255 in this task's dataset cfg.
emb_padding_idx: 255 |
|
task_sp_list: [ 'pos_embed', 'class_embed', ] |
|
|
|
# Output projector for the patch branch of task 23: rgb_projector with a MaskedMSELoss.
patch_proj: |
|
type: rgb_projector |
|
kwargs: |
|
loss_cfg: |
|
type: MaskedMSELoss |
|
kwargs: |
|
stride: 1 |
|
# Both pix_loss and norm_pix_loss are enabled, each with weight 1.
norm_pix_loss: True |
|
pix_loss: True |
|
pix_loss_weight: 1. |
|
norm_pix_loss_weight: 1. |
|
|
|
# Output projector for the label branch of task 23: dense_labeling_projector, task 'parsing'.
label_proj: |
|
type: dense_labeling_projector |
|
kwargs: |
|
task_sp_list: [ 'post_mul_norm', |
|
'post_mul_norm_cls', |
|
'loss_fn', 'text_features' ] |
|
modality_share_list: ['upsample_network',] |
|
emb_padding_idx: 255 |
|
post_mul_norm: True |
|
replace_post_mul_norm: False |
|
translate_weight_scale: 1 |
|
description_dict_name: checked_par_paperdoll_name |
|
cls_loss_branch: True |
|
upsample_before_product: True |
|
upsample_hidden_dim: 256 |
|
|
|
task: parsing |
|
# Parsing loss for this projector.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
# ignore_index 20 is one past the last label id (19); presumably tied to ignore2endclass: True in the dataset cfg — TODO confirm.
ignore_index: 20 |
|
loss_weight: 1. |
|
loss_per_class: True |
|
dice_weight: 50.0 |
|
mask_weight: 50.0 |
|
class_weight: 0.1 |
|
deep_supervision: True |
|
dec_layers: 9 |
|
cls_weight_sample: True |
|
# 20 weights, the same count as num_classes; the last six values form three equal pairs.
sample_weight: [ 1.0, 0.12651171233101552, 0.9445288709780197, 0.022596273603759997, 0.1542096228225839, 0.7740073338443981, 0.3171279444960444, 0.38393872629003634, 0.19776277195374156, 0.5762416654276241, 0.932492136102867, 0.0684559727964192, 0.2131960924782717, 0.9246929266441772, 0.9079233711740138, 0.9079233711740138, 0.5743937220129259, 0.5743937220129259, 0.7146935638660443, 0.7146935638660443, ] |
|
# Decoder head for task 23: UniHCPv2_Head with predictor 'hulk' and task 'recons'.
decoder: |
|
type: UniHCPv2_Head |
|
kwargs: |
|
predictor: 'hulk' |
|
task: recons |
|
modality_share_list: ['predictor.mask_token'] |
|
task_sp_list: [ |
|
'predictor.query_embed_patch', |
|
'predictor.query_embed_label', |
|
|
|
|
|
'predictor.class_embed','predictor.fc_bias', |
|
] |
|
loss_weight: 1.0 |
|
transformer_predictor_cfg: |
|
hidden_dim: 256 |
|
# Same value as num_classes: 20 in this task's dataset cfg.
num_queries: 20 |
|
nheads: 8 |
|
dim_feedforward: 2048 |
|
dec_layers: 9 |
|
pre_norm: False |
|
arch: fan_in |
|
enforce_input_project: False |
|
mask_on: False |
|
intermediate_output: True |
|
num_feature_levels: 1 |
|
cross_pos_embed: anchor |
|
cls_out_dim: 1 |
|
patch_pos_mode: False |
|
|
|
|
|
label_pos_mode: False |
|
self_attn_mask_type: patch_diag_label_row_textlabelfull |
|
|
|
detach_from_peddet: True |
|
adding_per_layer_pe: True |
|
use_adapt_pos2d: True |
|
|
|
# Loss attached to the decoder head.
loss_cfg: |
|
type: FocalDiceLoss_bce_cls_emb_sample_weight |
|
kwargs: |
|
cfg: |
|
deep_supervision: True |
|
no_object_weight: 0.1 |
|
|
|
class_weight: 0.25 |
|
dice_weight: 5.0 |
|
mask_weight: 5.0 |
|
redundant_queries: 1 |
|
num_points: 12544 |
|
|
|
# NOTE(review): dec_layers is 6 here but 9 in transformer_predictor_cfg above — confirm the mismatch is intended.
dec_layers: 6 |
|
|
|
oversample_ratio: 3.0 |
|
importance_sample_ratio: 0.75 |
|
# 20 weights, written as four rows of five values.
sample_weight: [ 1.0, 0.25279349, 0.97595474, 0.06368458, 0.08419378, |
|
0.91287129, 0.18341584, 0.50346535, 0.12729844, 0.6937058, |
|
0.96898868, 0.07022631, 0.07464639, 0.99359972, 0.88490099, |
|
0.88490099, 0.27644979000000003, 0.27644979000000003, 0.33016266, 0.33016266 ] |