name: gemma-7b-sql-nemo
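# PyTorch Lightning trainer: single node, 8 GPUs, bf16; the SFT loop runs for 1 epoch
# with validation and checkpointing every 1000 steps.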
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
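# Experiment manager: logs and .nemo checkpoints go to models/gemma-7b-sql-nemo;
# keeps the 5 best checkpoints ranked by validation_loss.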
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    filename: megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}
    model_parallel_size: 4
    save_best_model: false
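# Model: restores the pretrained .nemo checkpoint with tensor parallelism 4 (no pipeline
# parallelism) and an 8192-token sequence length; loss is computed on answer tokens only.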
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: true
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
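  # PEFT is disabled (peft_scheme: none), so this run is full-parameter SFT; the
  # lora_tuning block below is ignored unless peft_scheme is changed (e.g. to lora).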
  peft:
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
      - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
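  # Data: non-chat (prompt/completion) format; both splits read nsql.jsonl with the
  # '{input} {output}' template, truncating the input field from the right when a
  # sample exceeds 8192 tokens.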
  data:
    chat: false
    chat_prompt_tokens:
      system_turn_start: "\0"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\n"
      end_of_name: "\n"
    sample: false
    num_workers: 0
    dataloader_type: single
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
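    # Validation reads the same nsql.jsonl file as training (capped at 40 batches by
    # trainer.sft.limit_val_batches above).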
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
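  # Optimizer: distributed fused Adam at lr 5e-6 with a cosine-annealing schedule
  # (10 warmup steps, min_lr floor of 9e-7).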
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16
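# Note: a config of this shape is typically consumed by NeMo's SFT tuning script via
# Hydra. The command below is an illustrative sketch only; the script path, launcher,
# and config directory are assumptions, not taken from this file:
#   torchrun --nproc_per_node=8 examples/nlp/language_modeling/tuning/megatron_gpt_sft.py \
#     --config-path=/path/to/config/dir --config-name=gemma-7b-sql-nemo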