As I mention in the code below, after many days of trying to use the whole dataset and having my code crash after long hours of waiting, I decided to work with a sample.

```python
# I'll start by installing (see the requirements file) and importing everything I need
import torch
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from google.colab import drive

drive.mount('/content/drive')

# Reading my dataset
enron_data = pd.read_csv('/content/drive/MyDrive/Mestrado/emails.csv')

# I tried to use the whole dataset several times, but due to memory problems I went with a sample of 10k emails
sample_size = 10000
sample_enron_data = enron_data.sample(sample_size)
sample_enron_data.to_csv("sample_enron_dataset.csv", index=False)

# Now that I have a sample of my dataset saved locally, I'll inspect it to make sure it's all good
sample_enron_data.head()
len(sample_enron_data)

# I'll tokenize each email message separately. Concatenating everything into one
# string and truncating to 512 tokens would keep only the first ~512 tokens of the
# whole corpus, leaving a single training example.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
input_ids = tokenizer(
    sample_enron_data['message'].tolist(),
    return_tensors='pt',
    max_length=512,
    truncation=True,
    padding=True,
)['input_ids']
```

```python
from torch.optim import AdamW  # transformers' own AdamW is deprecated
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Now I'll define a custom dataset
class EmailDataset(Dataset):
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx]

dataset = EmailDataset(input_ids)

# I'll define the GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
# The tokenizer gained a new [PAD] token, so the embedding matrix must grow to
# match, otherwise training fails with an index error
model.resize_token_embeddings(len(tokenizer))

# Since my earlier attempts crashed, following some tutorials I define the optimizer
# and a linear learning-rate schedule explicitly
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)
num_epochs = 3

optimizer = AdamW(model.parameters(), lr=5e-5)
# The schedule needs the total number of optimizer steps (batches * epochs),
# not the number of examples
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * num_epochs,
)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Now I'll train it
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    steps = 0
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        batch = batch.to(device)
        # Mask padding positions so they don't contribute to the loss
        labels = batch.clone()
        labels[labels == tokenizer.pad_token_id] = -100
        outputs = model(input_ids=batch, labels=labels)
        loss = outputs.loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()
        epoch_loss += loss.item()
        steps += 1
    print(f"Epoch {epoch + 1} - Average Loss: {epoch_loss / steps}")

# And then I'll save the fine-tuned model, along with the tokenizer so the added
# [PAD] token stays in sync with the resized embeddings
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")
```

### PART 3: Create a Gradio Interface that answers questions related to the case

Now, having fine-tuned the model, I proceed to creating the Gradio interface.

```python
# In order to build the Gradio interface, first I need to install and import it
!pip install gradio
import gradio as gr

# First I'll load the fine-tuned model and its tokenizer
model_fine_tuned = GPT2LMHeadModel.from_pretrained("./fine_tuned_model")
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_model")

# Then I'll create the function to generate the response
def generate_response(question):
    input_ids = tokenizer.encode(question, return_tensors="pt")
    output = model_fine_tuned.generate(
        input_ids,
        max_length=200,
        num_return_sequences=1,
        do_sample=True,   # temperature only has an effect when sampling is enabled
        temperature=0.7,
        pad_token_id=tokenizer.pad_token_id,
    )
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response
```
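Before wiring this function into Gradio, it's worth a quick sanity check on its own; the question string below is just an illustrative example.

```python
# Quick smoke test of the generation function before building the UI
print(generate_response("What happened with Enron's accounting?"))
```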
```python
# Finally, I'll create the Gradio interface
gr.Interface(
    fn=generate_response,
    inputs="textbox",
    outputs="textbox",
    title="Ask Enron Dataset",
    description="Enter a question about the case",
).launch()
```

I experimented with the chatbot: it starts answering well, but in many situations it then repeats the same sentence over and over. I couldn't fully fix this; it would probably be helped by text preprocessing, since the raw Enron messages still contain headers, quoted replies, and signatures.
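A lighter-weight mitigation that doesn't require retraining is to constrain the decoder itself. The sketch below uses Hugging Face's standard `generate` arguments (`no_repeat_ngram_size`, `repetition_penalty`, `top_p`); the specific values are untested guesses, not tuned settings.

```python
# A variant of generate_response that discourages repetition during decoding:
# no_repeat_ngram_size forbids any 3-gram from appearing twice in the output,
# and repetition_penalty down-weights tokens that were already generated.
def generate_response_no_loops(question):
    input_ids = tokenizer.encode(question, return_tensors="pt")
    output = model_fine_tuned.generate(
        input_ids,
        max_length=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,                 # nucleus sampling
        no_repeat_ngram_size=3,    # block repeated 3-grams
        repetition_penalty=1.2,    # penalize already-seen tokens
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
```

Swapping this function into the `gr.Interface` call above is enough to try it in the chatbot.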