import os

import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import AlbertForSequenceClassification, AlbertTokenizer
from datasets import Dataset as HFDataset

# Ensure the model/ output directory exists
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)

# Load the train/validation/test splits from their Arrow files
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')

# Convert each split to a pandas DataFrame for preprocessing
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()

# Strip trailing question marks from each query
train_df['content'] = train_df['content'].str.rstrip('?')
val_df['content'] = val_df['content'].str.rstrip('?')
test_df['content'] = test_df['content'].str.rstrip('?')

# Binarize the ratings: scores >= 0.5 become 1, everything else 0
train_df['rating'] = train_df['rating'].apply(lambda x: int(x >= 0.5))
val_df['rating'] = val_df['rating'].apply(lambda x: int(x >= 0.5))
test_df['rating'] = test_df['rating'].apply(lambda x: int(x >= 0.5))

# Initialize the ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')


# Custom PyTorch Dataset that tokenizes queries on the fly
class QueryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Ensure the label is a plain int
        # Calling the tokenizer directly replaces the deprecated encode_plus
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',   # Pad every sequence to a consistent length
            truncation=True,        # Truncate longer sequences
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long),
        }


# Wrap each split in the PyTorch dataset
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)

# DataLoaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Load ALBERT with a fresh 2-way classification head
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss function (torch.optim.AdamW; the AdamW in transformers is deprecated)
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    # Validation pass at the end of each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')

# Save the fine-tuned model and tokenizer to the model/ directory
model.save_pretrained(model_dir, safe_serialization=True)  # Weights in safetensors format
tokenizer.save_pretrained(model_dir)

# save_pretrained already wrote a config.json that includes num_labels=2;
# re-exporting a fresh base config here would clobber the fine-tuned
# settings, so save the model's own config instead.
model.config.save_pretrained(model_dir)

print(f'Model and all required files saved to {model_dir}')
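
# The script builds test_loader but never uses it. A minimal sketch of a
# final held-out test pass, mirroring the validation loop above (this block
# is an addition, not part of the original script):
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        preds = torch.argmax(model(input_ids, attention_mask=attention_mask).logits, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

print(f'Test Accuracy: {correct / total:.4f}')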
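
# Optional sanity check (assumed usage, not in the original): reload the
# saved artifacts from model_dir and classify one query. The sample string
# is purely illustrative; the trailing '?' is omitted to match preprocessing.
reloaded_tokenizer = AlbertTokenizer.from_pretrained(model_dir)
reloaded_model = AlbertForSequenceClassification.from_pretrained(model_dir).to(device)
reloaded_model.eval()

sample = reloaded_tokenizer(
    'is this a question',
    max_length=32,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
).to(device)
with torch.no_grad():
    logits = reloaded_model(**sample).logits
print('Predicted label for sample query:', int(torch.argmax(logits, dim=1)))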