scb10x
/

llama3.1-typhoon2-audio-8b-instruct

Text Generation

feature-extraction

Model card Files Files and versions Community

llama3.1-typhoon2-audio-8b-instruct / config.json

Pittawat Taveekitworachai

chore: remove unused configurations

847baab about 1 month ago

history blame contribute delete

2.92 kB

	{
	"architectures": [
	"Typhoon2Audio2AudioForConditionalGeneration"
	],
	"attention_bias": false,
	"attention_dropout": 0.0,
	"auto_map": {
	"AutoConfig": "configuration_typhoon2audio.Typhoon2AudioConfig",
	"AutoModel": "modeling_typhoon2audio.Typhoon2Audio2AudioForConditionalGeneration"
	},
	"beats": {
	"model_type": ""
	},
	"ctc_decoder_config": "(4,4096,32,11008)",
	"ctc_loss_weight": 1.0,
	"ctc_upsample_factor": 25,
	"head_dim": 128,
	"hidden_act": "silu",
	"hidden_size": 4096,
	"intermediate_size": 14336,
	"llama_base_model": "scb10x/llama3.1-typhoon2-8b-instruct",
	"max_position_embeddings": 131072,
	"mlp_bias": false,
	"model_type": "typhoon2audio",
	"num_attention_heads": 32,
	"num_hidden_layers": 32,
	"num_key_value_heads": 8,
	"pretraining_tp": 1,
	"rms_norm_eps": 1e-05,
	"rope_scaling": {
	"factor": 8.0,
	"high_freq_factor": 4.0,
	"low_freq_factor": 1.0,
	"original_max_position_embeddings": 8192,
	"rope_type": "llama3"
	},
	"rope_theta": 500000.0,
	"second_per_frame": 0.333333,
	"second_stride": 0.333333,
	"speech_decoder_ignore_index": -100,
	"speech_qformer_layer": 2,
	"speech_qformer_token_num": 1,
	"torch_dtype": "float16",
	"transformers_version": "4.45.0",
	"unit_vocab_size": 1000,
	"vocab_size": 128256,
	"vocoder_config": {
	"code_hop_size": 320,
	"dur_prediction_weight": 1.0,
	"dur_predictor_params": {
	"encoder_embed_dim": 512,
	"var_pred_dropout": 0.5,
	"var_pred_hidden_dim": 512,
	"var_pred_kernel_size": 3
	},
	"embedding_dim": 512,
	"hop_size": 256,
	"model_in_dim": 512,
	"n_fft": 1024,
	"num_embeddings": 1000,
	"num_freq": 1025,
	"num_mels": 80,
	"resblock": 1,
	"resblock_dilation_sizes": [
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	]
	],
	"resblock_kernel_sizes": [
	3,
	7,
	11
	],
	"sampling_rate": 16000,
	"segment_size": 8960,
	"upsample_initial_channel": 512,
	"upsample_kernel_sizes": [
	11,
	8,
	8,
	4,
	4
	],
	"upsample_rates": [
	5,
	4,
	4,
	2,
	2
	],
	"win_size": 1024
	},
	"whisper": {
	"apply_spec_augment": true,
	"begin_suppress_tokens": [
	220,
	50257
	],
	"bos_token_id": 50257,
	"d_model": 1280,
	"decoder_attention_heads": 20,
	"decoder_ffn_dim": 5120,
	"decoder_layers": 32,
	"decoder_start_token_id": 50258,
	"encoder_attention_heads": 20,
	"encoder_ffn_dim": 5120,
	"encoder_layers": 32,
	"eos_token_id": 50257,
	"mask_feature_length": 64,
	"mask_feature_prob": 0.1,
	"mask_time_prob": 0.1,
	"max_length": 448,
	"model_type": "whisper",
	"num_hidden_layers": 32,
	"num_mel_bins": 128,
	"vocab_size": 51866
	},
	"whisper_extractor_feature_size": 128
	}