import torch
import torch.nn as nn
from torch.optim import AdamW  # transformers' AdamW is deprecated and removed in recent versions
from torch.utils.data import DataLoader, Dataset
from transformers import AlbertTokenizer, AlbertForSequenceClassification
from datasets import Dataset as HFDataset
import pandas as pd
import os

# Ensure the /model/ directory exists
model_dir = 'model'
os.makedirs(model_dir, exist_ok=True)

# Load datasets from the Arrow files
train_dataset = HFDataset.from_file('train/data-00000-of-00001.arrow')
val_dataset = HFDataset.from_file('validation/data-00000-of-00001.arrow')
test_dataset = HFDataset.from_file('test/data-00000-of-00001.arrow')

# Convert datasets to pandas DataFrame
train_df = train_dataset.to_pandas()
val_df = val_dataset.to_pandas()
test_df = test_dataset.to_pandas()

# Remove question marks at the end of each query
train_df['content'] = train_df['content'].str.rstrip('?')
val_df['content'] = val_df['content'].str.rstrip('?')
test_df['content'] = test_df['content'].str.rstrip('?')

# Convert labels to integers (0 or 1)
train_df['rating'] = train_df['rating'].apply(lambda x: int(x >= 0.5))
val_df['rating'] = val_df['rating'].apply(lambda x: int(x >= 0.5))
test_df['rating'] = test_df['rating'].apply(lambda x: int(x >= 0.5))

# Initialize ALBERT tokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')

# Custom Dataset class for PyTorch
class QueryDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=32):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])  # Ensure label is an integer
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',  # Ensure consistent length
            truncation=True,       # Truncate longer sequences
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

# Prepare datasets
train_dataset = QueryDataset(train_df['content'].values, train_df['rating'].values, tokenizer)
val_dataset = QueryDataset(val_df['content'].values, val_df['rating'].values, tokenizer)
test_dataset = QueryDataset(test_df['content'].values, test_df['rating'].values, tokenizer)

# DataLoaders
batch_size = 128
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Load ALBERT model
model = AlbertForSequenceClassification.from_pretrained('albert-base-v2', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Optimizer and loss function
optimizer = AdamW(model.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
epochs = 4
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}, Loss: {avg_loss:.4f}')

    # Validation step at the end of each epoch
    model.eval()
    correct_predictions = 0
    total_predictions = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

    accuracy = correct_predictions / total_predictions
    print(f'Validation Accuracy after Epoch {epoch + 1}: {accuracy:.4f}')
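
# The test split is loaded and wrapped in a DataLoader above but never used.
# A minimal final test-set pass, mirroring the validation loop (assuming plain
# accuracy is the metric of interest here as well):
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_correct += (preds == labels).sum().item()
        test_total += labels.size(0)

print(f'Test Accuracy: {test_correct / test_total:.4f}')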

# Save the model, tokenizer, and config to /model/ directory
model.save_pretrained(model_dir, safe_serialization=True)  # Save model weights in safetensors format
tokenizer.save_pretrained(model_dir)

# Note: model.save_pretrained() above already writes a config.json that records
# num_labels=2 and the classification head, so re-saving a fresh base config here
# would overwrite it with one that lacks the fine-tuned classifier details.

print(f"Model and all required files saved to {model_dir}")