# File size: 2,826 Bytes
# 48c7a89 7c9f7d5 48c7a89
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig
# Load the "train" split of the dataset identified by this repo-style id
dataset = load_dataset(path="philschmid/dolly-15k-oai-style", split="train")
# Hugging Face ids: base model weights and the (separate) tokenizer repo
model_id = "google/gemma-7b"
tokenizer_id = "philschmid/gemma-tokenizer-chatml"
# Instantiate the causal LM from `model_id` in bfloat16, requesting the
# Flash Attention 2 implementation and automatic device placement.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
# The tokenizer is loaded from `tokenizer_id`, not from `model_id`.
tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
tokenizer.padding_side = "right"  # to prevent warnings
# LoRA adapter configuration (values chosen with reference to the QLoRA paper
# and Sebastian Raschka's experiments)
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=16,
    lora_alpha=8,
    lora_dropout=0.05,
    bias="none",
)
# Training hyperparameters passed to the trainer.
# NOTE: the scheduler is "constant_with_warmup" rather than plain "constant":
# the plain constant schedule takes no warmup steps, so `warmup_ratio` would
# otherwise be silently ignored.
args = TrainingArguments(
    output_dir="gemma-7b-dolly-chatml",  # directory to save and repository id
    num_train_epochs=3,  # number of training epochs
    per_device_train_batch_size=8,  # batch size per device during training
    gradient_checkpointing=True,  # use gradient checkpointing to save memory
    optim="adamw_torch_fused",  # use fused adamw optimizer
    logging_steps=10,  # log every 10 steps
    save_strategy="epoch",  # save checkpoint every epoch
    bf16=True,  # use bfloat16 precision
    tf32=True,  # use tf32 precision
    ### peft specific arguments ###
    learning_rate=2e-4,  # learning rate, based on QLoRA paper
    max_grad_norm=0.3,  # max gradient norm based on QLoRA paper
    warmup_ratio=0.03,  # warmup ratio based on QLoRA paper
    lr_scheduler_type="constant_with_warmup",  # constant LR once warmup has finished
    report_to="tensorboard",  # report metrics to tensorboard
    push_to_hub=True,  # push model to hub
)
# Upper bound on sequence length, used both for the model inputs and when
# packing dataset samples together.
max_seq_length = 1512
# Supervised fine-tuning trainer wired up with the model, tokenizer, dataset,
# training arguments and LoRA config defined earlier in this script.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=args,
    train_dataset=dataset,
    ### peft specific arguments ###
    peft_config=peft_config,
    packing=True,
    max_seq_length=max_seq_length,
    dataset_kwargs=dict(
        add_special_tokens=False,  # <bos> and <eos> should be part of the dataset.
        append_concat_token=False,  # make sure to not add additional tokens when packing
    ),
)
# Start training; per the training arguments (push_to_hub=True, output_dir),
# the model is saved to the hub and to the output directory.
trainer.train()
# Explicitly save the final model once training has finished.
trainer.save_model()