from typing import Optional, Tuple

from transformers import PretrainedConfig


class AVHubertConfig(PretrainedConfig):
    """Configuration for the AV-HuBERT audio-visual encoder."""

    model_type = "av_hubert"

    def __init__(
        self,
        label_rate: int = 25,
        sample_rate: int = 25,
        input_modality: str = "video",
        extractor_mode: str = "default",
        encoder_layers: int = 24,
        encoder_embed_dim: int = 1024,
        encoder_ffn_embed_dim: int = 4096,
        encoder_attention_heads: int = 16,
        activation_fn: str = "gelu",
        dropout: float = 0.1,
        attention_dropout: float = 0.1,
        activation_dropout: float = 0.1,
        encoder_layerdrop: float = 0.0,
        dropout_input: float = 0.0,
        dropout_features: float = 0.0,
        final_dim: int = 256,
        untie_final_proj: bool = False,
        layer_norm_first: bool = False,
        conv_feature_layers: str = "[(512,10,5)] + [(512,3,2)] * 4 + [(512,2,2)] * 2",
        conv_bias: bool = False,
        logit_temp: float = 0.1,
        target_glu: bool = False,
        feature_grad_mult: float = 1.0,
        mask_length_audio: int = 10,
        mask_prob_audio: float = 0.65,
        mask_length_image: int = 10,
        mask_prob_image: float = 0.65,
        mask_selection: str = "static",
        mask_other: float = 0.0,
        no_mask_overlap: bool = False,
        mask_min_space: int = 1,
        mask_channel_length: int = 64,
        mask_channel_prob: float = 0.5,
        mask_channel_selection: str = "static",
        mask_channel_other: float = 0.0,
        no_mask_channel_overlap: bool = False,
        mask_channel_min_space: int = 1,
        conv_pos: int = 128,
        conv_pos_groups: int = 16,
        latent_temp: Tuple[float, float, float] = (2.0, 0.5, 0.999995),
        skip_masked: bool = False,
        skip_nomask: bool = False,
        resnet_relu_type: str = "prelu",
        resnet_weights: Optional[str] = None,
        sim_type: str = "cosine",
        sub_encoder_layers: int = 0,
        audio_feat_dim: int = 104,
        modality_dropout: float = 0.0,
        audio_dropout: float = 0.0,
        modality_fuse: str = "concat",
        selection_type: str = "same_other_seq",
        masking_type: str = "input",
        decoder_embed_dim: int = 2560,
        decoder_ffn_embed_dim: int = 3072,
        decoder_layers: int = 6,
        decoder_layerdrop: float = 0.0,
        decoder_attention_heads: int = 4,
        decoder_learned_pos: bool = False,
        decoder_normalize_before: bool = False,
        no_token_positional_embeddings: bool = False,
        decoder_dropout: float = 0.1,
        decoder_attention_dropout: float = 0.1,
        decoder_activation_dropout: float = 0.0,
        max_target_positions: int = 2048,
        share_decoder_input_output_embed: bool = False,
        no_scale_embedding: bool = True,
        num_classes: int = 2004,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.label_rate = label_rate
        self.sample_rate = sample_rate
        self.input_modality = input_modality
        self.extractor_mode = extractor_mode
        self.encoder_layers = encoder_layers
        self.encoder_embed_dim = encoder_embed_dim
        self.encoder_ffn_embed_dim = encoder_ffn_embed_dim
        self.encoder_attention_heads = encoder_attention_heads
        self.activation_fn = activation_fn
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.encoder_layerdrop = encoder_layerdrop
        self.dropout_input = dropout_input
        self.dropout_features = dropout_features
        self.final_dim = final_dim
        self.untie_final_proj = untie_final_proj
        self.layer_norm_first = layer_norm_first
        self.conv_feature_layers = conv_feature_layers
        self.conv_bias = conv_bias
        self.logit_temp = logit_temp
        self.target_glu = target_glu
        self.feature_grad_mult = feature_grad_mult
        self.mask_length_audio = mask_length_audio
        self.mask_prob_audio = mask_prob_audio
        self.mask_length_image = mask_length_image
        self.mask_prob_image = mask_prob_image
        self.mask_selection = mask_selection
        self.mask_other = mask_other
        self.no_mask_overlap = no_mask_overlap
        self.mask_min_space = mask_min_space
        self.mask_channel_length = mask_channel_length
        self.mask_channel_prob = mask_channel_prob
        self.mask_channel_selection = mask_channel_selection
        self.mask_channel_other = mask_channel_other
        self.no_mask_channel_overlap = no_mask_channel_overlap
        self.mask_channel_min_space = mask_channel_min_space
        self.conv_pos = conv_pos
        self.conv_pos_groups = conv_pos_groups
        self.latent_temp = latent_temp
        self.skip_masked = skip_masked
        self.skip_nomask = skip_nomask
        self.resnet_relu_type = resnet_relu_type
        self.resnet_weights = resnet_weights
        self.sim_type = sim_type
        self.sub_encoder_layers = sub_encoder_layers
        self.audio_feat_dim = audio_feat_dim
        self.modality_dropout = modality_dropout
        self.audio_dropout = audio_dropout
        self.modality_fuse = modality_fuse
        self.selection_type = selection_type
        self.masking_type = masking_type
        self.decoder_embed_dim = decoder_embed_dim
        self.decoder_ffn_embed_dim = decoder_ffn_embed_dim
        self.decoder_layers = decoder_layers
        self.decoder_layerdrop = decoder_layerdrop
        self.decoder_attention_heads = decoder_attention_heads
        self.decoder_learned_pos = decoder_learned_pos
        self.decoder_normalize_before = decoder_normalize_before
        self.no_token_positional_embeddings = no_token_positional_embeddings
        self.decoder_dropout = decoder_dropout
        self.decoder_attention_dropout = decoder_attention_dropout
        self.decoder_activation_dropout = decoder_activation_dropout
        self.max_target_positions = max_target_positions
        self.share_decoder_input_output_embed = share_decoder_input_output_embed
        self.no_scale_embedding = no_scale_embedding
        self.num_classes = num_classes
        self.feature_ds_rate = 1  # fixed feature downsampling rate


class AVSPLLMConfig(AVHubertConfig):
    """Configuration for AVSP-LLM: AVHubertConfig plus LLM checkpoint and fine-tuning options."""

    model_type = "avsp_llm"

    def __init__(
        self,
        llm_ckpt_path: str = "vilm/vinallama-2.7b",
        no_pretrained_weights: bool = False,
        final_dropout: float = 0.1,
        apply_mask: bool = False,
        mask_length: int = 10,
        mask_prob: float = 0.5,
        masking_updates: int = 0,
        layerdrop: float = 0.0,
        normalize: bool = False,
        data: Optional[str] = None,
        w2v_args: Optional[dict] = None,
        freeze_finetune_updates: int = 0,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.llm_ckpt_path = llm_ckpt_path
        self.no_pretrained_weights = no_pretrained_weights
        self.final_dropout = final_dropout
        self.apply_mask = apply_mask
        self.mask_length = mask_length
        self.mask_prob = mask_prob
        self.masking_updates = masking_updates
        self.layerdrop = layerdrop
        self.normalize = normalize
        self.data = data
        self.w2v_args = w2v_args
        self.freeze_finetune_updates = freeze_finetune_updates
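

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): shows how these configs
    # round-trip through the standard PretrainedConfig save/load API. The overridden
    # values and the temporary directory below are arbitrary examples, not defaults
    # mandated by the project.
    import tempfile

    config = AVSPLLMConfig(encoder_layers=24, num_classes=2004)

    with tempfile.TemporaryDirectory() as tmp_dir:
        config.save_pretrained(tmp_dir)  # writes config.json into tmp_dir
        reloaded = AVSPLLMConfig.from_pretrained(tmp_dir)

    print(reloaded.model_type, reloaded.encoder_layers, reloaded.llm_ckpt_path)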