GSU24AI03-SU24AI21
/

ViAVSP-LLM_v1.0

Feature Extraction

8-bit precision

Model card · Files and versions · Training metrics · Community

ViAVSP-LLM_v1.0 / config.json

tanthinhdt's picture

Upload model

15e149b verified 5 months ago

2.68 kB

	{
	"activation_dropout": 0.1,
	"activation_fn": "gelu",
	"apply_mask": false,
	"arch": "avsp_llm",
	"architectures": [
	"AVSPLLMModel"
	],
	"attention_dropout": 0.1,
	"audio_dropout": 0.0,
	"audio_feat_dim": 104,
	"auto_map": {
	"AutoConfig": "configuration.AVSPLLMConfig",
	"AutoModel": "modelling.AVSPLLMModel"
	},
	"conv_bias": false,
	"conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
	"conv_pos": 128,
	"conv_pos_groups": 16,
	"data": null,
	"decoder_activation_dropout": 0.0,
	"decoder_attention_dropout": 0.1,
	"decoder_attention_heads": 4,
	"decoder_dropout": 0.1,
	"decoder_embed_dim": 2560,
	"decoder_ffn_embed_dim": 3072,
	"decoder_layerdrop": 0.0,
	"decoder_layers": 6,
	"decoder_learned_pos": false,
	"decoder_normalize_before": false,
	"dropout": 0.1,
	"dropout_features": 0.0,
	"dropout_input": 0.0,
	"encoder_attention_heads": 16,
	"encoder_embed_dim": 1024,
	"encoder_ffn_embed_dim": 4096,
	"encoder_layerdrop": 0.0,
	"encoder_layers": 24,
	"extractor_mode": "default",
	"feature_ds_rate": 1,
	"feature_grad_mult": 1.0,
	"final_dim": 256,
	"final_dropout": 0.1,
	"freeze_finetune_updates": 0,
	"ignored_weights": [],
	"input_modality": "video",
	"label_rate": 25,
	"latent_temp": [
	2.0,
	0.5,
	0.999995
	],
	"layer_norm_first": false,
	"layerdrop": 0.0,
	"llm_ckpt_path": "vilm/vinallama-2.7b",
	"logit_temp": 0.1,
	"mask_channel_length": 64,
	"mask_channel_min_space": 1,
	"mask_channel_other": 0.0,
	"mask_channel_prob": 0.5,
	"mask_channel_selection": "static",
	"mask_length": 10,
	"mask_length_audio": 10,
	"mask_length_image": 10,
	"mask_min_space": 1,
	"mask_other": 0.0,
	"mask_prob": 0.5,
	"mask_prob_audio": 0.65,
	"mask_prob_image": 0.65,
	"mask_selection": "static",
	"masking_type": "input",
	"masking_updates": 0,
	"max_target_positions": 2048,
	"modality_dropout": 0.0,
	"modality_fuse": "concat",
	"model_type": "avsp_llm",
	"no_mask_channel_overlap": false,
	"no_mask_overlap": false,
	"no_pretrained_weights": false,
	"no_scale_embedding": true,
	"no_token_positional_embeddings": false,
	"normalize": false,
	"num_classes": 2004,
	"num_frames": 16,
	"num_frozen_layers": 0,
	"pretrained": "tanthinhdt/ViAVSP-LLM_v1.0",
	"resnet_relu_type": "prelu",
	"resnet_weights": null,
	"sample_rate": 25,
	"selection_type": "same_other_seq",
	"share_decoder_input_output_embed": false,
	"sim_type": "cosine",
	"skip_masked": false,
	"skip_nomask": false,
	"sub_encoder_layers": 0,
	"target_glu": false,
	"torch_dtype": "float32",
	"transformers_version": "4.41.2",
	"untie_final_proj": false,
	"w2v_args": null
	}