ViAVSP-LLM_v1.0 / config.json
tanthinhdt's picture
Upload model
15e149b verified
raw
history blame
2.68 kB
{
"activation_dropout": 0.1,
"activation_fn": "gelu",
"apply_mask": false,
"arch": "avsp_llm",
"architectures": [
"AVSPLLMModel"
],
"attention_dropout": 0.1,
"audio_dropout": 0.0,
"audio_feat_dim": 104,
"auto_map": {
"AutoConfig": "configuration.AVSPLLMConfig",
"AutoModel": "modelling.AVSPLLMModel"
},
"conv_bias": false,
"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
"conv_pos": 128,
"conv_pos_groups": 16,
"data": null,
"decoder_activation_dropout": 0.0,
"decoder_attention_dropout": 0.1,
"decoder_attention_heads": 4,
"decoder_dropout": 0.1,
"decoder_embed_dim": 2560,
"decoder_ffn_embed_dim": 3072,
"decoder_layerdrop": 0.0,
"decoder_layers": 6,
"decoder_learned_pos": false,
"decoder_normalize_before": false,
"dropout": 0.1,
"dropout_features": 0.0,
"dropout_input": 0.0,
"encoder_attention_heads": 16,
"encoder_embed_dim": 1024,
"encoder_ffn_embed_dim": 4096,
"encoder_layerdrop": 0.0,
"encoder_layers": 24,
"extractor_mode": "default",
"feature_ds_rate": 1,
"feature_grad_mult": 1.0,
"final_dim": 256,
"final_dropout": 0.1,
"freeze_finetune_updates": 0,
"ignored_weights": [],
"input_modality": "video",
"label_rate": 25,
"latent_temp": [
2.0,
0.5,
0.999995
],
"layer_norm_first": false,
"layerdrop": 0.0,
"llm_ckpt_path": "vilm/vinallama-2.7b",
"logit_temp": 0.1,
"mask_channel_length": 64,
"mask_channel_min_space": 1,
"mask_channel_other": 0.0,
"mask_channel_prob": 0.5,
"mask_channel_selection": "static",
"mask_length": 10,
"mask_length_audio": 10,
"mask_length_image": 10,
"mask_min_space": 1,
"mask_other": 0.0,
"mask_prob": 0.5,
"mask_prob_audio": 0.65,
"mask_prob_image": 0.65,
"mask_selection": "static",
"masking_type": "input",
"masking_updates": 0,
"max_target_positions": 2048,
"modality_dropout": 0.0,
"modality_fuse": "concat",
"model_type": "avsp_llm",
"no_mask_channel_overlap": false,
"no_mask_overlap": false,
"no_pretrained_weights": false,
"no_scale_embedding": true,
"no_token_positional_embeddings": false,
"normalize": false,
"num_classes": 2004,
"num_frames": 16,
"num_frozen_layers": 0,
"pretrained": "tanthinhdt/ViAVSP-LLM_v1.0",
"resnet_relu_type": "prelu",
"resnet_weights": null,
"sample_rate": 25,
"selection_type": "same_other_seq",
"share_decoder_input_output_embed": false,
"sim_type": "cosine",
"skip_masked": false,
"skip_nomask": false,
"sub_encoder_layers": 0,
"target_glu": false,
"torch_dtype": "float32",
"transformers_version": "4.41.2",
"untie_final_proj": false,
"w2v_args": null
}