Pittawat Taveekitworachai
chore: remove unused configurations
847baab
{
"architectures": [
"Typhoon2Audio2AudioForConditionalGeneration"
],
"attention_bias": false,
"attention_dropout": 0.0,
"auto_map": {
"AutoConfig": "configuration_typhoon2audio.Typhoon2AudioConfig",
"AutoModel": "modeling_typhoon2audio.Typhoon2Audio2AudioForConditionalGeneration"
},
"beats": {
"model_type": ""
},
"ctc_decoder_config": "(4,4096,32,11008)",
"ctc_loss_weight": 1.0,
"ctc_upsample_factor": 25,
"head_dim": 128,
"hidden_act": "silu",
"hidden_size": 4096,
"intermediate_size": 14336,
"llama_base_model": "scb10x/llama3.1-typhoon2-8b-instruct",
"max_position_embeddings": 131072,
"mlp_bias": false,
"model_type": "typhoon2audio",
"num_attention_heads": 32,
"num_hidden_layers": 32,
"num_key_value_heads": 8,
"pretraining_tp": 1,
"rms_norm_eps": 1e-05,
"rope_scaling": {
"factor": 8.0,
"high_freq_factor": 4.0,
"low_freq_factor": 1.0,
"original_max_position_embeddings": 8192,
"rope_type": "llama3"
},
"rope_theta": 500000.0,
"second_per_frame": 0.333333,
"second_stride": 0.333333,
"speech_decoder_ignore_index": -100,
"speech_qformer_layer": 2,
"speech_qformer_token_num": 1,
"torch_dtype": "float16",
"transformers_version": "4.45.0",
"unit_vocab_size": 1000,
"vocab_size": 128256,
"vocoder_config": {
"code_hop_size": 320,
"dur_prediction_weight": 1.0,
"dur_predictor_params": {
"encoder_embed_dim": 512,
"var_pred_dropout": 0.5,
"var_pred_hidden_dim": 512,
"var_pred_kernel_size": 3
},
"embedding_dim": 512,
"hop_size": 256,
"model_in_dim": 512,
"n_fft": 1024,
"num_embeddings": 1000,
"num_freq": 1025,
"num_mels": 80,
"resblock": 1,
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11
],
"sampling_rate": 16000,
"segment_size": 8960,
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
11,
8,
8,
4,
4
],
"upsample_rates": [
5,
4,
4,
2,
2
],
"win_size": 1024
},
"whisper": {
"apply_spec_augment": true,
"begin_suppress_tokens": [
220,
50257
],
"bos_token_id": 50257,
"d_model": 1280,
"decoder_attention_heads": 20,
"decoder_ffn_dim": 5120,
"decoder_layers": 32,
"decoder_start_token_id": 50258,
"encoder_attention_heads": 20,
"encoder_ffn_dim": 5120,
"encoder_layers": 32,
"eos_token_id": 50257,
"mask_feature_length": 64,
"mask_feature_prob": 0.1,
"mask_time_prob": 0.1,
"max_length": 448,
"model_type": "whisper",
"num_hidden_layers": 32,
"num_mel_bins": 128,
"vocab_size": 51866
},
"whisper_extractor_feature_size": 128
}