gemma-7b-sql / nemo / hparams.yaml
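# NeMo hyperparameters for an SFT run that fine-tunes a pretrained Gemma 7B
# .nemo checkpoint (/workspace/models/pytorch-7b-pt.nemo) on a text-to-SQL
# dataset (nsql.jsonl).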
name: gemma-7b-sql-nemo
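# PyTorch Lightning trainer: 1 node x 8 GPUs, bf16. The nested sft block
# drives the SFT loop: validate and checkpoint every 1000 steps, cap
# validation at 40 batches, clip gradients at 1.0.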
trainer:
  num_nodes: 1
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_time: null
  max_epochs: 1
  max_steps: -1
  sft:
    max_epochs: 1
    max_steps: -1
    val_check_interval: 1000
    save_interval: 1000
    limit_val_batches: 40
    gradient_clip_val: 1.0
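# Experiment manager: logs and checkpoints go to explicit_log_dir, the run
# resumes automatically if a checkpoint already exists, and the 5 best
# checkpoints by validation_loss are kept. W&B logging is disabled.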
exp_manager:
  explicit_log_dir: models/gemma-7b-sql-nemo
  exp_dir: null
  name: gemma-7b-sql-nemo
  create_wandb_logger: false
  wandb_logger_kwargs:
    project: null
    name: null
  resume_if_exists: true
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: true
  checkpoint_callback_params:
    monitor: validation_loss
    save_top_k: 5
    mode: min
    save_nemo_on_train_end: true
    filename: megatron_gpt_sft--{validation_loss:.3f}-{step}-{consumed_samples}-{epoch}
    model_parallel_size: 4
    save_best_model: false
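# Base model and parallelism. With tensor parallel 4 and pipeline parallel 1
# on 8 GPUs, the data-parallel size is 8 / (4 x 1) = 2.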
model:
  seed: 1234
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 1
  restore_from_path: /workspace/models/pytorch-7b-pt.nemo
  resume_from_checkpoint: null
  save_nemo_on_validation_end: true
  sync_batch_comm: false
  megatron_amp_O2: true
  encoder_seq_length: 8192
  sequence_parallel: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  activations_checkpoint_layers_per_pipeline: null
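  # answer_only_loss masks the prompt so the loss is computed only on the
  # completion (label_key) tokens.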
  answer_only_loss: true
  gradient_as_bucket_view: false
  seq_len_interpolation_factor: null
  use_flash_attention: null
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
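  # peft_scheme: none selects full-parameter fine-tuning; the lora_tuning
  # block below is unused defaults that only apply under a LoRA scheme.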
  peft:
    peft_scheme: none
    restore_from_path: null
    lora_tuning:
      target_modules:
      - attention_qkv
      adapter_dim: 32
      adapter_dropout: 0.0
      column_init_method: xavier
      row_init_method: zero
      layer_selection: null
      weight_tying: false
      position_embedding_strategy: null
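  # Dataset handling. chat: false selects plain prompt/completion formatting,
  # so the chat_prompt_tokens special tokens are not used in this run.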
  data:
    chat: false
    chat_prompt_tokens:
      system_turn_start: "\x00"
      turn_start: "\x11"
      label_start: "\x12"
      end_of_turn: "\x0A"
      end_of_name: "\x0A"
    sample: false
    num_workers: 0
    dataloader_type: single
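    # Each JSONL record is rendered via prompt_template as '{input} {output}'.
    # Effective batching: global_batch_size / (micro_batch_size x data_parallel)
    # = 128 / (1 x 2) = 64 gradient-accumulation microbatches per update.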
    train_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: true
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
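    # Validation reuses the same file as training; trainer.sft.limit_val_batches
    # caps each evaluation at 40 global batches.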
    validation_ds:
      file_path: nsql.jsonl
      global_batch_size: 128
      micro_batch_size: 1
      shuffle: false
      memmap_workers: null
      max_seq_length: 8192
      min_seq_length: 1
      drop_last: true
      label_key: output
      add_eos: true
      add_sep: false
      add_bos: false
      truncation_field: input
      index_mapping_dir: null
      prompt_template: '{input} {output}'
      hf_dataset: false
      truncation_method: right
      output_original_text: true
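  # Distributed fused Adam at a conservative SFT learning rate; cosine
  # annealing from 5.0e-06 down to 9.0e-07 after a 10-step warmup.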
  optim:
    name: distributed_fused_adam
    lr: 5.0e-06
    weight_decay: 0.01
    betas:
    - 0.9
    - 0.98
    sched:
      name: CosineAnnealing
      warmup_steps: 10
      constant_steps: 1000
      min_lr: 9.0e-07
  bias_activation_fusion: true
  precision: bf16