import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import Dataset as HFDataset
import pandas as pd
import os
# Ensure the /model/ directory exists
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)
# Load datasets from the Arrow files
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')
# Convert datasets to pandas DataFrame
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()
# Remove question marks at the end of each query
train_df['content'] = train_df['content'].str.rstrip('?')
val_df['content'] = val_df['content'].str.rstrip('?')
test_df['content'] = test_df['content'].str.rstrip('?')
# Binarize ratings: scores >= 0.5 become 1 (positive), everything below becomes 0
train_df['rating'] = train_df['rating'].apply(lambda x: int(x >= 0.5))
val_df['rating'] = val_df['rating'].apply(lambda x: int(x >= 0.5))
test_df['rating'] = test_df['rating'].apply(lambda x: int(x >= 0.5))
# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
# Custom Dataset class for PyTorch
class QueryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Ensure label is an integer
        encoding = self.tokenizer(     # encode_plus is deprecated; call the tokenizer directly
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',      # Pad every sequence to the same length
            truncation=True,           # Truncate longer sequences
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }
# Prepare datasets
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)
# DataLoaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)
# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
# Optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)  # transformers' AdamW was removed; use the torch implementation
criterion = nn.CrossEntropyLoss()
# Training loop
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')
    # Validation step at the end of each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)
    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')
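# Final pass over the held-out test set (added sketch: test_loader is built
# above but never consumed; this mirrors the validation loop's plain binary
# accuracy and assumes nothing else about the data).
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)
print(f'Test Accuracy: {test_correct / test_total:.4f}')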
# Save the fine-tuned model and tokenizer to the model/ directory.
# save_pretrained also writes config.json, which already carries num_labels=2
# from the from_pretrained call above, so re-saving a fresh AlbertConfig here
# would only clobber it with base-model defaults.
model.save_pretrained(model_dir, safe_serialization=True)  # weights in safetensors format
tokenizer.save_pretrained(model_dir)
print(f"Model and all required files saved to {model_dir}")