import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Load the training corpus: a single plain-text file, read line by line into a "text" column.
data_path = "./data.txt"
dataset = load_dataset("text", data_files={"train": data_path})

# Load the GPT-2 tokenizer. GPT-2 has no pad token, so reuse the EOS token for padding.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    # Tokenize each line, padding/truncating to a fixed length of 512 tokens.
    tokens = tokenizer(examples["text"], truncation=True, padding="max_length", max_length=512)
    # For causal language modeling the labels are the input ids; the model shifts them internally.
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

# Tokenize the whole training split and expose the columns as PyTorch tensors for the Trainer.
tokenized_dataset = dataset["train"].map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_dataset.set_format("torch")

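# Alternative (a sketch, not wired in below): skip the fixed-length padding and the manual
# labels copy, and let a collator pad each batch and build the labels dynamically, e.g.:
#
#     from transformers import DataCollatorForLanguageModeling
#     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
#     # then pass data_collator=data_collator to Trainer(...)
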
# Training hyperparameters: 3 epochs, a small per-device batch, periodic checkpoints,
# and mixed precision whenever a CUDA GPU is available.
training_args = TrainingArguments(
    output_dir="./gpt2_finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=100,
    eval_strategy="no",
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
)

# Load the pretrained GPT-2 weights to fine-tune.
model = AutoModelForCausalLM.from_pretrained(model_name)

# Wire the model, training arguments, and tokenized dataset into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Run fine-tuning.
trainer.train()

# Save the fine-tuned model and tokenizer so they can be reloaded from disk later.
model.save_pretrained("./gpt2_finetuned")
tokenizer.save_pretrained("./gpt2_finetuned")
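
# Optional sanity check (a minimal sketch): reload the saved checkpoint and generate a short
# continuation to confirm it loads and produces text. The prompt string below is a placeholder,
# not something taken from the training data.
from transformers import pipeline

generator = pipeline(
    "text-generation",
    model="./gpt2_finetuned",
    tokenizer="./gpt2_finetuned",
    device=0 if torch.cuda.is_available() else -1,
)
print(generator("Once upon a time", max_new_tokens=50)[0]["generated_text"])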