GPT2-finetune / finetune.py
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForCausalLM
# Load dataset from text file
data_path = "./data.txt"
dataset = load_dataset("text", data_files={"train": data_path})
# Load tokenizer
model_name = "gpt2" # Use the base GPT-2 model
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the pad_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Tokenize dataset with labels for CLM
def tokenize_function(examples):
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # Use input_ids as labels for CLM; note that padded positions also contribute
    # to the loss unless they are masked out (e.g. set to -100)
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens
tokenized_dataset = dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")
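# Optional sanity check (an addition, not part of the uploaded script): inspect one
# tokenized example to confirm that input_ids and labels line up for causal LM training.
example = tokenized_dataset[0]
print({name: tensor.shape for name, tensor in example.items()})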
# Training arguments
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",      # Directory to save model checkpoints
    overwrite_output_dir=True,          # Overwrite the directory if it already exists
    num_train_epochs=3,                 # Number of training epochs
    per_device_train_batch_size=2,      # Batch size per device
    save_steps=500,                     # Save a checkpoint every 500 steps
    save_total_limit=2,                 # Keep at most two checkpoints
    logging_dir="./logs",               # Directory for logs
    logging_steps=100,                  # Log every 100 steps
    eval_strategy="no",                 # No evaluation loop (no validation split is provided)
    learning_rate=5e-5,                 # Learning rate
    warmup_steps=500,                   # Warmup steps
    weight_decay=0.01,                  # Weight decay
    fp16=torch.cuda.is_available(),     # Use mixed precision when a GPU is available
)
# Load pre-trained GPT-2 model
model = AutoModelForCausalLM.from_pretrained(model_name)
# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
# Fine-tune the model
trainer.train()
# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")
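# Optional follow-up (a minimal sketch, not part of the uploaded script): reload the
# fine-tuned checkpoint and generate a short continuation to verify it saved correctly.
# The prompt string below is an arbitrary placeholder.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./gpt2_finetuned",
    tokenizer="./gpt2_finetuned",
    device=0 if torch.cuda.is_available() else -1,
)
print(generator("Once upon a time", max_new_tokens=50)[0]["generated_text"])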