{ "activation_dropout": 0.1, "activation_fn": "gelu", "apply_mask": false, "arch": "avsp_llm", "architectures": [ "AVSPLLMModel" ], "attention_dropout": 0.1, "audio_dropout": 0.0, "audio_feat_dim": 104, "auto_map": { "AutoConfig": "configuration.AVSPLLMConfig", "AutoModel": "modelling.AVSPLLMModel" }, "conv_bias": false, "conv_feature_layers": "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2", "conv_pos": 128, "conv_pos_groups": 16, "data": null, "decoder_activation_dropout": 0.0, "decoder_attention_dropout": 0.1, "decoder_attention_heads": 4, "decoder_dropout": 0.1, "decoder_embed_dim": 2560, "decoder_ffn_embed_dim": 3072, "decoder_layerdrop": 0.0, "decoder_layers": 6, "decoder_learned_pos": false, "decoder_normalize_before": false, "dropout": 0.1, "dropout_features": 0.0, "dropout_input": 0.0, "encoder_attention_heads": 16, "encoder_embed_dim": 1024, "encoder_ffn_embed_dim": 4096, "encoder_layerdrop": 0.0, "encoder_layers": 24, "extractor_mode": "default", "feature_ds_rate": 1, "feature_grad_mult": 1.0, "final_dim": 256, "final_dropout": 0.1, "freeze_finetune_updates": 0, "ignored_weights": [], "input_modality": "video", "label_rate": 25, "latent_temp": [ 2.0, 0.5, 0.999995 ], "layer_norm_first": false, "layerdrop": 0.0, "llm_ckpt_path": "vilm/vinallama-2.7b", "logit_temp": 0.1, "mask_channel_length": 64, "mask_channel_min_space": 1, "mask_channel_other": 0.0, "mask_channel_prob": 0.5, "mask_channel_selection": "static", "mask_length": 10, "mask_length_audio": 10, "mask_length_image": 10, "mask_min_space": 1, "mask_other": 0.0, "mask_prob": 0.5, "mask_prob_audio": 0.65, "mask_prob_image": 0.65, "mask_selection": "static", "masking_type": "input", "masking_updates": 0, "max_target_positions": 2048, "modality_dropout": 0.0, "modality_fuse": "concat", "model_type": "avsp_llm", "no_mask_channel_overlap": false, "no_mask_overlap": false, "no_pretrained_weights": false, "no_scale_embedding": true, "no_token_positional_embeddings": false, "normalize": false, "num_classes": 2004, "num_frames": 16, "num_frozen_layers": 0, "pretrained": "tanthinhdt/ViAVSP-LLM_v1.0", "resnet_relu_type": "prelu", "resnet_weights": null, "sample_rate": 25, "selection_type": "same_other_seq", "share_decoder_input_output_embed": false, "sim_type": "cosine", "skip_masked": false, "skip_nomask": false, "sub_encoder_layers": 0, "target_glu": false, "torch_dtype": "float32", "transformers_version": "4.41.2", "untie_final_proj": false, "w2v_args": null }