# Page header captured together with this file (kept verbatim for provenance;
# it is not part of the configuration itself):
#   nanotron / minicpm-nanotron
#   thomwolf HF staff
#   update
#   54ba632 10 months ago
#   1.18 kB
#
# NOTE(review): the captured text had every line at the same tab-indented
# level, so the nesting below is reconstructed. Keys are alphabetically sorted
# at each level, which fixes most parent/child boundaries; `optimizer` and
# `tokens` are placed at top level on the assumption that this follows the
# nanotron config layout -- confirm against the original file.
---
checkpoints: null
data: null

# Run identification (project / run name), seed and starting step.
general:
  benchmark_csv_path: null
  consumed_train_samples: null
  ignore_sanity_checks: false
  project: openbmb
  run: MiniCPM-2B-dpo-bf16
  seed: 42
  step: 0

lighteval: null
logging: null

model:
  ddp_bucket_cap_mb: 25
  dtype: bfloat16
  init_method:
    std: 0.025
  make_vocab_size_divisible_by: 1
  # Per-model hyperparameters; presumably mirrors the MiniCPM-2B checkpoint
  # named under `tokenizer.tokenizer_name_or_path` -- verify before editing.
  model_config:
    attn_pdrop: 0.0
    bos_token_id: 1
    dim_model_base: 256
    eos_token_id: 2
    hidden_act: silu
    hidden_size: 2304
    initializer_range: 0.1
    intermediate_size: 5760
    max_position_embeddings: 2048
    num_attention_heads: 36
    num_hidden_layers: 40
    num_key_value_heads: 36
    pad_token_id: null
    pretraining_tp: 1
    rms_norm_eps: 1.0e-05
    rope_theta: 10000.0
    scale_depth: 1.4
    scale_emb: 12
    tie_word_embeddings: true
    use_cache: true
    vocab_size: 122753

optimizer: null

# All three degrees (dp / pp / tp) are 1 in this file.
parallelism:
  dp: 1
  pp: 1
  pp_engine: 1f1b
  recompute_granularity: SELECTIVE
  tp: 1
  tp_linear_async_communication: true
  tp_mode: REDUCE_SCATTER

profiler: null

tokenizer:
  tokenizer_max_length: null
  tokenizer_name_or_path: openbmb/MiniCPM-2B-dpo-bf16
  tokenizer_revision: null

tokens: null