import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from google.colab import files
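
# Train on CPU; switch to "cuda" if a GPU runtime is available.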
device = torch.device("cpu")
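
# Load the pretrained GPT-2 model and its tokenizer.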
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
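
# GPT-2 has no pad token by default; add one, then resize the embedding
# matrix so the new [PAD] token gets an embedding row.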
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

model = model.to(device)
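
# Upload training files through the Colab file picker.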
uploaded_files = files.upload()

training_data = []
for file_name, content in uploaded_files.items():
    try:
        document = content.decode("utf-8")
    except UnicodeDecodeError:
        # Fall back to latin-1 for files that are not valid UTF-8.
        document = content.decode("latin-1")
    training_data.append(document)
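
# Tokenize each document as a separate row; joining the corpus into one
# truncated string would keep only its first 256 tokens.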
tokenized_data = tokenizer(training_data, truncation=True, padding=True, max_length=256, return_tensors="pt")

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
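
# Fine-tune with gradient accumulation: gradients from accumulation_steps
# micro-batches are summed before each optimizer step.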
model.train()
accumulation_steps = 4
batch_size = 4

num_examples = tokenized_data['input_ids'].size(0)
num_batches = (num_examples + batch_size - 1) // batch_size

for epoch in range(3):
    optimizer.zero_grad()
    for step, i in enumerate(range(0, num_examples, batch_size)):
        input_ids_batch = tokenized_data['input_ids'][i:i + batch_size].to(device)
        attention_mask_batch = tokenized_data['attention_mask'][i:i + batch_size].to(device)
        # Set pad positions to -100 so the loss ignores them; with labels
        # supplied, the model shifts them internally and returns the LM loss.
        labels_batch = input_ids_batch.masked_fill(attention_mask_batch == 0, -100)
        outputs = model(input_ids=input_ids_batch, attention_mask=attention_mask_batch, labels=labels_batch)
        # Scale so the accumulated gradient averages over the micro-batches.
        loss = outputs.loss / accumulation_steps
        loss.backward()
        # Step and reset only at accumulation boundaries.
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
        print(f"Epoch: {epoch+1}, Batch: {step+1}/{num_batches}, Loss: {outputs.loss.item()}")
    # Flush any gradients left over from a partial accumulation window.
    if (step + 1) % accumulation_steps != 0:
        optimizer.step()
        optimizer.zero_grad()
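
# Simple chat loop with the fine-tuned model; an empty input exits.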
model.eval()
while True:
    user_input = input("User: ")
    if not user_input:
        break
    input_ids = tokenizer.encode(user_input, return_tensors='pt').to(device)
    output = model.generate(input_ids=input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.pad_token_id)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    print("ChatBot:", response)