nvidia
/

Nemotron-4-340B-Instruct

NeMo

Model card Files and versions Community

okuchaiev committed on Jun 13

Commit

365b81f

•

1 Parent(s): e3d6f77

Add files using large-upload tool

Browse files

Files changed (1) hide show

model_config.yaml +261 -0

model_config.yaml ADDED Viewed

	@@ -0,0 +1,261 @@

+mcore_gpt: true  # first of the flat top-level model settings: parallel sizes, dimensions, normalization, attention
+micro_batch_size: 1
+global_batch_size: 256  # data.train_ds and data.validation_ds each set their own global_batch_size of 128
+tensor_model_parallel_size: 8
+pipeline_model_parallel_size: 4  # with tensor_model_parallel_size 8 this gives 8 x 4 = 32; presumably devices per model replica - TODO confirm
+virtual_pipeline_model_parallel_size: null
+encoder_seq_length: 4096  # same value as max_position_embeddings, data.seq_length and the *_ds max_seq_length
+max_position_embeddings: 4096
+num_layers: 96
+hidden_size: 18432
+ffn_hidden_size: 73728  # 4 x hidden_size (4 x 18432 = 73728)
+num_attention_heads: 96  # hidden_size / num_attention_heads = 18432 / 96 = 192
+init_method_std: 0.0063
+use_scaled_init_method: true
+hidden_dropout: 0.0  # all three dropout rates (hidden, attention, ffn) are zero
+attention_dropout: 0.0
+ffn_dropout: 0.0
+kv_channels: null  # NOTE(review): presumably derived from hidden_size / num_attention_heads when null - confirm
+apply_query_key_layer_scaling: true
+normalization: layernorm1p
+layernorm_epsilon: 1.0e-05
+do_layer_norm_weight_decay: false
+make_vocab_size_divisible_by: 128
+pre_process: true
+post_process: true
+persist_layer_norm: true
+bias: false  # bias_activation_fusion and bias_dropout_add_fusion are also false further down
+activation: squared-relu
+headscale: false
+transformer_block_type: pre_ln
+openai_gelu: false
+normalize_attention_scores: true
+position_embedding_type: rope
+rotary_percentage: 0.5  # NOTE(review): presumably the fraction of each head's dimensions that receive rotary embedding - confirm
+attention_type: multihead  # NOTE(review): num_query_groups below is 8 while num_attention_heads is 96 - confirm how the two interact
+share_embeddings_and_output_weights: false
+num_query_groups: 8  # num_attention_heads / num_query_groups = 96 / 8 = 12
+tokenizer:  # tokenizer settings; both model entries below use the nemo: prefix
+  library: sentencepiece
+  type: null
+  model: nemo:8223bf8eaa194eb8920af568bb52e2d0_megatron_2.model
+  vocab_file: null  # vocab_file and merge_file are both unset
+  merge_file: null
+  tokenizer_model: nemo:eb5528fdec5c4083affa2c97958eeef7_megatron_2.model  # NOTE(review): hash prefix differs from `model` above (same _megatron_2.model suffix) - confirm both refer to the same tokenizer
+  sentencepiece_legacy: false
+native_amp_init_scale: 4294967296  # 2^32; first of the top-level AMP, fusion, checkpointing and fp8 switches
+native_amp_growth_interval: 1000
+hysteresis: 2
+fp32_residual_connection: false
+fp16_lm_cross_entropy: false
+megatron_amp_O2: true  # NOTE(review): precision further down is bf16-mixed - confirm which of the amp/fp16 keys here still apply
+grad_allreduce_chunk_size_mb: 125
+grad_div_ar_fusion: true
+gradient_accumulation_fusion: false
+bias_activation_fusion: false  # top-level `bias` is false as well
+bias_dropout_add_fusion: false
+masked_softmax_fusion: true
+seed: 1234
+resume_from_checkpoint: null
+use_cpu_initialization: false
+onnx_safe: false
+apex_transformer_log_level: 30
+gradient_as_bucket_view: false
+sync_batch_comm: false
+activations_checkpoint_granularity: null  # granularity and method are both null
+activations_checkpoint_method: null
+activations_checkpoint_num_layers: 1  # NOTE(review): presumably unused while granularity/method above are null - confirm
+num_micro_batches_with_partial_activation_checkpoints: null
+activations_checkpoint_layers_per_pipeline: null
+sequence_parallel: false
+transformer_engine: false  # NOTE(review): mcore_gpt is true at the top of the file - confirm this combination is intended
+fp8: false  # fp8 is off; the fp8_* keys below are presumably ignored in that case - confirm
+fp8_e4m3: false
+fp8_hybrid: false
+fp8_margin: 0
+fp8_interval: 1
+fp8_amax_history_len: 1
+fp8_amax_compute_algo: most_recent
+reduce_amax: true
+use_emha: false
+optim:  # optimizer and learning-rate schedule settings
+  name: distributed_fused_adam
+  lr: 3.002e-07  # only 0.002e-07 above sched.min_lr (3.0e-07) below
+  weight_decay: 0.1
+  betas:  # two-element list: 0.9 and 0.98
+  - 0.9
+  - 0.98
+  sched:
+    name: CosineAnnealing
+    warmup_steps: 10
+    constant_steps: 400
+    min_lr: 3.0e-07  # lr above is 3.002e-07, so the schedule's range between lr and min_lr is very narrow
+  bucket_cap_mb: 200
+  overlap_grad_sync: false
+  contiguous_grad_buffer: true
+precision: bf16-mixed  # top-level key, not part of optim
+data:  # dataset locations, chat-template tokens and per-split loader settings
+  chat: true
+  chat_prompt_tokens:  # end_of_turn and end_of_name each hold exactly one newline: the blank line inside the quotes is the value
+    system_turn_start: <extra_id_0>
+    turn_start: <extra_id_1>
+    label_start: <extra_id_2>
+    end_of_turn: '
+
+      '
+    end_of_name: '
+
+      '
+  sample: true
+  num_workers: 2
+  dataloader_type: single
+  train_ds:  # in prompt_template below, each blank line inside the quoted scalar is one line break of the template
+    file_path: /dataset/daring-anteater_commercial.shuf.removelong.jsonl  # under /dataset/, whereas data_prefix at the end of this section points under /datasets/
+    global_batch_size: 128  # top-level global_batch_size is 256
+    micro_batch_size: 1
+    shuffle: true
+    memmap_workers: null
+    max_seq_length: 4096
+    min_seq_length: 1
+    drop_last: true
+    concat_sampling_probabilities: null
+    label_key: output
+    add_eos: false
+    add_sep: false
+    add_bos: false
+    truncation_field: input
+    index_mapping_dir: /indexmap_dir
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+    hf_dataset: true
+    truncation_method: right
+  validation_ds:  # prompt_template below uses the same text as train_ds; blank lines inside the quotes are its line breaks
+    file_path: /dataset/daring-anteater_commercial.shuf.removelong.jsonl  # same file as train_ds.file_path
+    names: null
+    global_batch_size: 128  # top-level global_batch_size is 256
+    micro_batch_size: 1
+    shuffle: false
+    memmap_workers: null
+    max_seq_length: 4096
+    min_seq_length: 1
+    drop_last: false
+    label_key: output
+    add_eos: false
+    add_sep: false
+    add_bos: false
+    write_predictions_to_file: false
+    output_file_path_prefix: null
+    truncation_field: input
+    index_mapping_dir: /indexmap_dir
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+    tokens_to_generate: 32
+    hf_dataset: true
+    truncation_method: right
+    metric:
+      name: loss
+      average: null
+      num_classes: null
+  test_ds:  # only prompt_template is set for this split; blank lines inside the quotes are its line breaks
+    prompt_template: '<extra_id_0>System
+
+      {system message}
+
+      <extra_id_1>User
+
+      {turn 1 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 1 assistant label}
+
+      {turn 1 assistant message}
+
+      <extra_id_1>User
+
+      {turn 2 user message}
+
+      <extra_id_1>Assistant
+
+      <extra_id_2>{turn 2 assistant label}
+
+      {turn 2 assistant message}
+
+      <extra_id_1>'
+  data_impl: jsonl
+  splits_string: null
+  seq_length: 4096  # same value as top-level encoder_seq_length
+  skip_warmup: true
+  reset_position_ids: false
+  reset_attention_mask: false
+  eod_mask_loss: false
+  index_mapping_dir: /indexmap_dir
+  data_prefix:  # train, validation and test all list the same single file
+    train:
+    - /datasets/v30_benign-walrus_clip153600.jsonl
+    validation:
+    - /datasets/v30_benign-walrus_clip153600.jsonl
+    test:
+    - /datasets/v30_benign-walrus_clip153600.jsonl
+answer_only_loss: true  # first of the closing top-level keys: restore path, dpo settings, target class, version
+restore_from_path: /models/340B_100p_CT_100B  # absolute path; presumably only meaningful in the original training environment - confirm before reuse
+save_nemo_on_validation_end: true
+use_flash_attention: null
+pipeline_model_parallel_split_rank: 0
+dpo:  # preference-optimization settings
+  log_prob_forward_micro_batch_size: 2  # top-level micro_batch_size is 1
+  ref_policy_kl_penalty: 0.3
+  average_log_probs: false
+  sft_loss_coeff: 1.0e-05
+  optimize_ref_policy_kl_penalty: false
+  preference_loss: reward_rev_dpo
+  gt_reward_scale: 1.0
+apply_rope_fusion: false  # position_embedding_type is rope near the top of the file
+target: nemo.collections.nlp.models.language_modeling.megatron_gpt_model.MegatronGPTModel  # dotted path ending in the class name MegatronGPTModel
+nemo_version: 1.22.0  # two dots, so YAML reads this as a string rather than a float