# llama3.1-typhoon2-audio-8b-instruct / configuration_typhoon2audio.py
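"""Configuration classes for Typhoon2-Audio.

Defines `BEATsConfig` (defaults for the BEATs audio-event encoder) and
`Typhoon2AudioConfig`, which bundles the Whisper encoder, BEATs encoder,
speech Q-Former, CTC speech-unit decoder, and unit HiFi-GAN vocoder settings
around the Llama 3.1 based Typhoon2 LLM.
"""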
from transformers import PretrainedConfig, WhisperConfig


class BEATsConfig(PretrainedConfig):
    def __init__(self, cfg=None):
        # Default values follow the BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt checkpoint.
        self.input_patch_size: int = 16  # patch size of patch embedding
        self.embed_dim: int = 512  # patch embedding dimension
        self.conv_bias: bool = False  # include bias in conv encoder
        self.encoder_layers: int = 12  # num encoder layers in the transformer
        self.encoder_embed_dim: int = 768  # encoder embedding dimension
        self.encoder_ffn_embed_dim: int = 3072  # encoder FFN embedding dimension
        self.encoder_attention_heads: int = 12  # num encoder attention heads
        self.activation_fn: str = "gelu"  # activation function to use
        self.layer_wise_gradient_decay_ratio: float = 0.6  # ratio for layer-wise gradient decay
        self.layer_norm_first: bool = False  # apply layernorm first in the transformer
        self.deep_norm: bool = True  # apply deep_norm first in the transformer

        # dropouts
        self.dropout: float = 0.0  # dropout probability for the transformer
        self.attention_dropout: float = 0.0  # dropout probability for attention weights
        self.activation_dropout: float = 0.0  # dropout probability after activation in FFN
        self.encoder_layerdrop: float = 0.05  # probability of dropping a transformer layer
        self.dropout_input: float = 0.0  # dropout to apply to the input (after feature extraction)

        # positional embeddings
        self.conv_pos: int = 128  # number of filters for convolutional positional embeddings
        self.conv_pos_groups: int = 16  # number of groups for convolutional positional embedding

        # relative position embedding
        self.relative_position_embedding: bool = True  # apply relative position embedding
        self.num_buckets: int = 320  # number of buckets for relative position embedding
        self.max_distance: int = 800  # maximum distance for relative position embedding
        self.gru_rel_pos: bool = True  # apply gated relative position embedding

        # label predictor
        self.finetuned_model: bool = True  # whether the model is a fine-tuned model
        self.predictor_dropout: float = 0.0  # dropout probability for the predictor
        self.predictor_class: int = 527  # target class number for the predictor

        if cfg is not None:
            self.update(cfg)

    def update(self, cfg: dict):
        self.__dict__.update(cfg)
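
# Usage sketch: `cfg` accepts a plain dict whose keys override the defaults above,
# e.g. BEATsConfig({"encoder_layers": 24, "encoder_embed_dim": 1024})
# (the override values here are illustrative, not taken from a released checkpoint).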


class Typhoon2AudioConfig(PretrainedConfig):
    model_type = "typhoon2audio"

    def __init__(self, **kwargs):
        # LLM -- Llama3
        self.llama_base_model = "scb10x/llama3.1-typhoon2-8b-instruct"
        # Whisper
        self.whisper_extractor_feature_size = 128
        self.whisper = WhisperConfig(
            activation_dropout=0.0,
            activation_function="gelu",
            apply_spec_augment=True,
            attention_dropout=0.0,
            begin_suppress_tokens=[220, 50257],
            bos_token_id=50257,
            d_model=1280,
            decoder_attention_heads=20,
            decoder_ffn_dim=5120,
            decoder_layerdrop=0.0,
            decoder_layers=32,
            decoder_start_token_id=50258,
            dropout=0.0,
            encoder_attention_heads=20,
            encoder_ffn_dim=5120,
            encoder_layerdrop=0.0,
            encoder_layers=32,
            eos_token_id=50257,
            init_std=0.02,
            mask_feature_length=64,
            mask_feature_min_masks=0,
            mask_feature_prob=0.1,
            mask_time_length=10,
            mask_time_min_masks=2,
            mask_time_prob=0.1,
            max_length=448,
            max_source_positions=1500,
            max_target_positions=448,
            median_filter_width=7,
            num_hidden_layers=32,
            num_mel_bins=128,
            pad_token_id=50256,
            scale_embedding=False,
            use_weighted_layer_sum=False,
            vocab_size=51866,
        )
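        # Note: these hyperparameters (d_model=1280, 32 encoder/decoder layers,
        # 128 mel bins, 51866-token vocabulary) correspond to Whisper large-v3,
        # with SpecAugment-style time/feature masking enabled.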
        # BEATs
        self.beats = BEATsConfig()

        # Speech QFormer
        self.speech_qformer_token_num = 1
        self.speech_qformer_layer = 2
        self.second_per_frame = 0.333333
        self.second_stride = 0.333333
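        # Interpretation (not stated explicitly in this file): the Q-Former emits
        # speech_qformer_token_num query tokens per second_per_frame-second window,
        # sliding by second_stride, i.e. roughly one audio token per 1/3 s.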
        # SpeechDecoder CTC
        self.pretraining_tp = 1
        self.ctc_decoder_config = '(4,4096,32,11008)'
        self.ctc_upsample_factor = 25
        self.ctc_loss_weight = 1.0
        self.unit_vocab_size = 1000
        self.speech_decoder_ignore_index = -100
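        # Note: speech_decoder_ignore_index=-100 matches PyTorch's default
        # cross-entropy ignore_index; unit_vocab_size is the number of discrete
        # speech units the decoder predicts, and it matches the vocoder's
        # num_embeddings below.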
        self.attention_bias = False
        self.attention_dropout = 0.0
        self.bos_token_id = 128000
        self.eos_token_id = 128009
        self.head_dim = 128
        self.hidden_act = "silu"
        self.hidden_size = 4096
        self.intermediate_size = 14336
        self.max_position_embeddings = 131072
        self.mlp_bias = False
        self.num_attention_heads = 32
        self.num_hidden_layers = 32
        self.num_key_value_heads = 8
        self.rms_norm_eps = 1e-05
        self.rope_scaling = {
            "factor": 8.0,
            "high_freq_factor": 4.0,
            "low_freq_factor": 1.0,
            "original_max_position_embeddings": 8192,
            "rope_type": "llama3",
        }
        self.rope_theta = 500000.0
        self.vocab_size = 128256
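        # Note: the attention/MLP hyperparameters above (4096 hidden size, 32 heads,
        # 8 KV heads, 14336 intermediate size, 128256-token vocabulary, llama3 rope
        # scaling) mirror the Llama 3.1 8B architecture of the base model.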
        # Unit Vocoder (HiFiGAN)
        self.vocoder_config = {
            'resblock': 1,
            'upsample_rates': [5, 4, 4, 2, 2],
            'upsample_kernel_sizes': [11, 8, 8, 4, 4],
            'upsample_initial_channel': 512,
            'resblock_kernel_sizes': [3, 7, 11],
            'resblock_dilation_sizes': [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
            'num_embeddings': 1000,
            'embedding_dim': 512,
            'model_in_dim': 512,
            'segment_size': 8960,
            'code_hop_size': 320,
            'num_mels': 80,
            'num_freq': 1025,
            'n_fft': 1024,
            'hop_size': 256,
            'win_size': 1024,
            'sampling_rate': 16000,
            'dur_prediction_weight': 1.0,
            'dur_predictor_params': {
                'encoder_embed_dim': 512,
                'var_pred_hidden_dim': 512,
                'var_pred_kernel_size': 3,
                'var_pred_dropout': 0.5,
            },
        }
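        # Note: the upsample_rates multiply out to 5*4*4*2*2 = 320 samples per unit,
        # which matches code_hop_size at the 16 kHz sampling_rate; dur_predictor_params
        # configure the duration predictor used for unit-to-waveform synthesis.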
        super().__init__(**kwargs)
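

# Usage sketch. The Hub id below is assumed from the repository name and is not
# defined in this file; loading remote configs requires trust_remote_code=True.
#
#   from transformers import AutoConfig
#   config = Typhoon2AudioConfig()  # default settings
#   config = AutoConfig.from_pretrained(
#       "scb10x/llama3.1-typhoon2-audio-8b-instruct", trust_remote_code=True
#   )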