# Used for multi-gpu
local_rank = -1
# Batch size per GPU for training
per_device_train_batch_size = 4
# Batch size per GPU for evaluation
per_device_eval_batch_size = 4
# Number of update steps to accumulate gradients for
gradient_accumulation_steps = 1
# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4
# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3
# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001
# Alpha parameter for LoRA scaling
lora_alpha = 16
# Dropout probability for LoRA layers
lora_dropout = 0.1
# LoRA attention dimension (rank)
lora_r = 64
# Maximum sequence length to use (None falls back to the trainer default)
max_seq_length = None
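# A minimal sketch of how the LoRA hyperparameters above would typically be
# wired into a peft LoraConfig (assumes the peft library is installed; the
# bias and task_type values shown here are conventional choices, not taken
# from this file).
from peft import LoraConfig

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)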
# The model that you want to train from the Hugging Face hub
model_name = "guardrail/llama-2-7b-guanaco-instruct-sharded"
# Fine-tuned model name
new_model = "llama-2-7b-custom-accountant"
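# A minimal sketch of loading the tokenizer for the base model named above
# (assumes transformers is installed; padding choices are common defaults
# for Llama-2 fine-tuning, not settings taken from this file).
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"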
# The instruction dataset to use
# dataset_name = "databricks/databricks-dolly-15k"
# Activate 4-bit precision base model loading
use_4bit = True
# Activate nested quantization for 4-bit base models
use_nested_quant = False
# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"
# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"
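# A minimal sketch of how the 4-bit flags above typically map to a
# bitsandbytes quantization config (assumes transformers with bitsandbytes
# installed; only the variables defined above are used).
import torch
from transformers import BitsAndBytesConfig

compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)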
# Number of training epochs
num_train_epochs = 2
# Enable fp16 training (set bf16 to True instead on an A100)
fp16 = False
# Enable bf16 training
bf16 = False
# Pack multiple short examples into the same input sequence to improve efficiency
packing = False
# Enable gradient checkpointing
gradient_checkpointing = True
# Optimizer to use, original is paged_adamw_32bit
optim = "paged_adamw_32bit"
# Learning rate schedule (cosine here; a constant schedule is a common alternative)
lr_scheduler_type = "cosine"
# Number of optimizer update steps (-1 lets num_train_epochs control training length)
max_steps = -1
# Fraction of steps to do a warmup for
warmup_ratio = 0.03
# Group sequences into batches with the same length (saves memory and speeds up training considerably)
group_by_length = True
# Save a checkpoint every X update steps
save_steps = 10
# Log every X update steps
logging_steps = 1
# The output directory where the model predictions and checkpoints will be written
output_dir = "../model_files/"
# Load the entire model on GPU 0
device_map = {"": 0}
# Visualize training
report_to = "tensorboard"
# Tensorboard logs
tb_log_dir = "../logs/"
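# A minimal sketch of how the remaining settings are typically consumed:
# load the quantized base model, build TrainingArguments, and hand everything
# to trl's SFTTrainer (assumes trl and transformers are installed; the
# training dataset is not loaded in this file, so the trainer call is left
# commented out and `dataset` and its "text" column are hypothetical).
from transformers import AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map,
)
model.config.use_cache = False

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to=report_to,
    logging_dir=tb_log_dir,
)

# trainer = SFTTrainer(
#     model=model,
#     train_dataset=dataset,          # hypothetical: load your dataset first
#     peft_config=peft_config,
#     dataset_text_field="text",      # assumes a "text" column in the dataset
#     max_seq_length=max_seq_length,
#     tokenizer=tokenizer,
#     args=training_arguments,
#     packing=packing,
# )
# trainer.train()
# trainer.model.save_pretrained(new_model)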