# Experiment name; reused below for exp_manager.name and the log dir.
name: gemma-7b-sql-nemo
# Trainer settings: single node, 8 GPUs, bf16 precision.
# logger/enable_checkpointing are false here; exp_manager owns those concerns.
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1  # -1 = no hard step limit; bounded by max_epochs
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
# Experiment manager: output locations, resume behaviour, checkpoint policy.
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss  # keep the k checkpoints with lowest validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    # Quoted: the value embeds '{field:.3f}'-style format placeholders.
    filename: 'megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}'
    model_parallel_size: 4  # matches model.tensor_model_parallel_size below
    save_best_model: false
# Model / SFT configuration restored from a pretrained .nemo checkpoint.
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo  # base checkpoint to fine-tune
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  # Activation checkpointing fully disabled (all selectors null).
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
  answer_only_loss: true  # compute loss on the answer (label) tokens only
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  peft:
    # peft_scheme is none — presumably full-parameter SFT; the lora_tuning
    # block below would then be unused. NOTE(review): confirm with consumer.
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
        - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
  data:
    chat: false
    # Control tokens for chat-formatted data (chat is false here).
    # Escapes written canonically: \x00 NUL, \x11 DC1, \x12 DC2, \x0A newline.
    chat_prompt_tokens:
      system_turn_start: "\x00"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\x0A"
      end_of_name: "\x0A"
    sample: false
    num_workers: 0
    dataloader_type: single
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      # Quoted: a leading '{' would otherwise parse as a YAML flow mapping.
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
    # NOTE(review): train and validation read the same file (nsql.jsonl) —
    # validation loss will not measure generalization; confirm this is intended.
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
      - 0.9
      - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16