{
  "activation_dropout": 0.1,
  "activation_fn": "gelu",
  "apply_mask": false,
  "arch": "avsp_llm",
  "architectures": [
    "AVSPLLMModel"
  ],
  "attention_dropout": 0.1,
  "audio_dropout": 0.0,
  "audio_feat_dim": 104,
  "auto_map": {
    "AutoConfig": "configuration.AVSPLLMConfig",
    "AutoModel": "modelling.AVSPLLMModel"
  },
  "conv_bias": false,
  "conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
  "conv_pos": 128,
  "conv_pos_groups": 16,
  "data": null,
  "decoder_activation_dropout": 0.0,
  "decoder_attention_dropout": 0.1,
  "decoder_attention_heads": 4,
  "decoder_dropout": 0.1,
  "decoder_embed_dim": 2560,
  "decoder_ffn_embed_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_learned_pos": false,
  "decoder_normalize_before": false,
  "dropout": 0.1,
  "dropout_features": 0.0,
  "dropout_input": 0.0,
  "encoder_attention_heads": 16,
  "encoder_embed_dim": 1024,
  "encoder_ffn_embed_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 24,
  "extractor_mode": "default",
  "feature_ds_rate": 1,
  "feature_grad_mult": 1.0,
  "final_dim": 256,
  "final_dropout": 0.1,
  "freeze_finetune_updates": 0,
  "ignored_weights": [],
  "input_modality": "video",
  "label_rate": 25,
  "latent_temp": [
    2.0,
    0.5,
    0.999995
  ],
  "layer_norm_first": false,
  "layerdrop": 0.0,
  "llm_ckpt_path": "vilm/vinallama-2.7b",
  "logit_temp": 0.1,
  "mask_channel_length": 64,
  "mask_channel_min_space": 1,
  "mask_channel_other": 0.0,
  "mask_channel_prob": 0.5,
  "mask_channel_selection": "static",
  "mask_length": 10,
  "mask_length_audio": 10,
  "mask_length_image": 10,
  "mask_min_space": 1,
  "mask_other": 0.0,
  "mask_prob": 0.5,
  "mask_prob_audio": 0.65,
  "mask_prob_image": 0.65,
  "mask_selection": "static",
  "masking_type": "input",
  "masking_updates": 0,
  "max_target_positions": 2048,
  "modality_dropout": 0.0,
  "modality_fuse": "concat",
  "model_type": "avsp_llm",
  "no_mask_channel_overlap": false,
  "no_mask_overlap": false,
  "no_pretrained_weights": false,
  "no_scale_embedding": true,
  "no_token_positional_embeddings": false,
  "normalize": false,
  "num_classes": 2004,
  "num_frames": 16,
  "num_frozen_layers": 0,
  "pretrained": "tanthinhdt/ViAVSP-LLM_v1.0",
  "resnet_relu_type": "prelu",
  "resnet_weights": null,
  "sample_rate": 25,
  "selection_type": "same_other_seq",
  "share_decoder_input_output_embed": false,
  "sim_type": "cosine",
  "skip_masked": false,
  "skip_nomask": false,
  "sub_encoder_layers": 0,
  "target_glu": false,
  "torch_dtype": "float32",
  "transformers_version": "4.41.2",
  "untie_final_proj": false,
  "w2v_args": null
}
|