Help needed with training a BERT classification model
#1 opened by somvanova
Hello Vansh,
Could you please help me understand why this training strategy fails to produce any acceptable results? I am learning BERT-based classification methods, and it would be really helpful if you could share some insights on model improvement strategies. Thanks a lot!
import copy
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report, f1_score
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import torch.optim as optim
class LinearWarmupScheduler:
"""
    This class implements the usual learning-rate schedule: linear warmup over the first
    `warmup_steps`, followed by linear decay of the learning rate on each `scheduler.step()` call.
"""
def __init__(self, optimizer, warmup_steps, training_steps):
self.optimizer = optimizer
self.warmup_steps = warmup_steps
self.training_steps = training_steps
def __call__(self, current_step: int):
return self.warmup(current_step)
def warmup(self, current_step: int):
# Phase 1: linear warmup
if current_step < self.warmup_steps:
return float(current_step) / float(self.warmup_steps)
# Phase 2: linear decay
else:
return max(
0.0,
float(self.training_steps - current_step)
/ float(max(1, self.training_steps - self.warmup_steps))
)
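# Quick sanity check of the schedule shape (the warmup/total step counts here are
# illustrative only): the multiplier rises linearly from 0 to 1 over the warmup steps,
# then decays linearly back to 0 by the final training step.
_demo_schedule = LinearWarmupScheduler(optimizer=None, warmup_steps=100, training_steps=1000)
assert _demo_schedule(0) == 0.0 and _demo_schedule(100) == 1.0 and _demo_schedule(1000) == 0.0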
class HeadClassification(nn.Module):
def __init__(self, input_dim, output_dim):
super(HeadClassification, self).__init__()
self.linear = nn.Linear(input_dim, output_dim)
def forward(self, x):
return self.linear(x)
class RobertaForClassification(nn.Module):
def __init__(self, roberta_model, input_dim, output_dim):
super(RobertaForClassification, self).__init__()
self.roberta = roberta_model
self.classifier = HeadClassification(input_dim, output_dim)
def forward(self, input_ids, attention_mask, **kwargs):
        # The backbone returns an output object exposing last_hidden_state, pooler_output, etc.
outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
pooled_output = outputs.pooler_output # shape [batch_size, hidden_size]
logits = self.classifier(pooled_output)
return logits
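# Note: when roberta-base is loaded via AutoModel, the pooler layer is typically reported
# as newly initialized, so pooler_output starts from untrained weights. A common alternative
# (shown commented, as a suggestion rather than part of the original forward pass) is to
# classify from the <s>/[CLS] token's last hidden state instead:
# pooled_output = outputs.last_hidden_state[:, 0]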
class MyDataset(Dataset):
def __init__(self, texts, labels, tokenizer):
self.texts = texts
self.labels = labels
self.tokenizer = tokenizer
def __len__(self):
return len(self.texts)
def __getitem__(self, idx):
return self.texts[idx], self.labels[idx]
class DataCollator:
def __init__(self, tokenizer):
self.tokenizer = tokenizer
def __call__(self, inputs):
texts = [elem[0] for elem in inputs]
labels = [elem[1] for elem in inputs]
encodings = self.tokenizer.batch_encode_plus(
texts,
return_tensors="pt",
padding="longest",
truncation=True
)
return encodings, torch.tensor(labels)
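# Illustrative example of what the collator yields for a two-item batch (commented out;
# the texts and shapes below are made up for demonstration):
# encodings, labels = DataCollator(tokenizer)([("first quote", 0), ("second quote", 3)])
# encodings["input_ids"].shape -> torch.Size([2, longest_len]); labels -> tensor([0, 3])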
# Example label mapping
LABEL_MAPPING = {
"0_not_relevant": 0,
"1_not_happening": 1,
"2_not_human": 2,
"3_not_bad": 3,
"4_solutions_harmful_unnecessary": 4,
"5_science_unreliable": 5,
"6_proponents_biased": 6,
"7_fossil_fuels_needed": 7
}
# Read CSV (replace "frugal_train.csv" with your actual data file)
df = pd.read_csv("frugal_train.csv", encoding="latin1")
# Example split: 70% train, 15% validation, 15% test
df_shuffled = df.sample(frac=1.0, random_state=42).reset_index(drop=True)
train_end = int(0.7 * len(df_shuffled))
val_end = int(0.85 * len(df_shuffled))
data_train = df_shuffled.iloc[:train_end].copy()
data_val = df_shuffled.iloc[train_end:val_end].copy()
data_test = df_shuffled.iloc[val_end:].copy()
# Map labels from strings to numeric
data_train["label"] = data_train["label"].map(LABEL_MAPPING)
data_val["label"] = data_val["label"].map(LABEL_MAPPING)
data_test["label"] = data_test["label"].map(LABEL_MAPPING)
train_texts, train_labels = data_train["quote"].tolist(), data_train["label"].tolist()
val_texts, val_labels = data_val["quote"].tolist(), data_val["label"].tolist()
test_texts, test_labels = data_test["quote"].tolist(), data_test["label"].tolist()
tokenizer = AutoTokenizer.from_pretrained("roberta-base")
roberta_model = AutoModel.from_pretrained("roberta-base")
classification_model = RobertaForClassification(
roberta_model,
input_dim=768,
output_dim=8
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classification_model.to(device)
# -----------------------------------------------------
train_dataset = MyDataset(train_texts, train_labels, tokenizer)
val_dataset = MyDataset(val_texts, val_labels, tokenizer)
test_dataset = MyDataset(test_texts, test_labels, tokenizer)
collator = DataCollator(tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collator)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collator)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, collate_fn=collator)
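# Optional one-batch smoke test (commented out; purely illustrative):
# _enc, _lab = next(iter(train_loader))
# print(_enc["input_ids"].shape, _lab.shape)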
num_epochs = 40
learning_rate = 5e-5
criterion = nn.CrossEntropyLoss()
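# If the eight classes are imbalanced, weighted cross-entropy can help; a hedged sketch,
# kept commented so the original setup is unchanged:
# import numpy as np
# _counts = np.bincount(train_labels, minlength=8)
# _weights = torch.tensor(len(train_labels) / (8 * _counts), dtype=torch.float, device=device)
# criterion = nn.CrossEntropyLoss(weight=_weights)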
optimizer = optim.Adam(classification_model.parameters(), lr=learning_rate)
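# Note: torch.optim.AdamW (imported above) with a small weight_decay is the more common
# optimizer for fine-tuning transformer encoders; a possible swap, left commented:
# optimizer = AdamW(classification_model.parameters(), lr=learning_rate, weight_decay=0.01)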
# Warmup + linear decay scheduler
training_steps = len(train_loader) * num_epochs
linearwarmup_func = LinearWarmupScheduler(
optimizer,
warmup_steps=int(0.1 * training_steps),
training_steps=training_steps
)
scheduler = torch.optim.lr_scheduler.LambdaLR(
optimizer,
lr_lambda=linearwarmup_func
)
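# For reference, transformers ships an equivalent warmup-plus-linear-decay schedule;
# a commented alternative to the custom scheduler above:
# from transformers import get_linear_schedule_with_warmup
# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=int(0.1 * training_steps),
#     num_training_steps=training_steps
# )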
best_val_loss = float('inf')
epochs_without_improvement = 0
patience = 8 # early stopping patience
best_model = None
for epoch in range(num_epochs):
# --- Training phase ---
classification_model.train()
train_loss = 0.0
for batch_idx, (inputs, labels) in enumerate(train_loader):
# Move to device
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = labels.to(device)
# Forward pass
outputs = classification_model(**inputs)
loss = criterion(outputs, labels)
# Backprop + optimization
optimizer.zero_grad()
loss.backward()
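        # (Optional) gradient clipping is commonly used when fine-tuning transformers;
        # uncommenting this line is a suggestion, not part of the original setup:
        # torch.nn.utils.clip_grad_norm_(classification_model.parameters(), max_norm=1.0)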
optimizer.step()
scheduler.step() # update learning rate
train_loss += loss.item()
train_loss /= len(train_loader)
current_lr = optimizer.param_groups[0]["lr"]
print(f"Epoch {epoch+1}/{num_epochs}, LR: {current_lr:.6f}, Train Loss: {train_loss:.4f}")
# --- Validation phase ---
classification_model.eval()
val_loss = 0.0
y_pred, y_true = [], []
with torch.no_grad():
for inputs, labels in val_loader:
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = labels.to(device)
outputs = classification_model(**inputs)
loss = criterion(outputs, labels)
val_loss += loss.item()
preds = torch.argmax(outputs, dim=1)
y_pred.extend(preds.cpu().numpy())
y_true.extend(labels.cpu().numpy())
val_loss /= len(val_loader)
f1_macro = f1_score(y_true, y_pred, average="macro")
print(f"Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, F1 (Macro): {f1_macro:.4f}")
# Check if current model is the best
if val_loss < best_val_loss:
best_val_loss = val_loss
best_model = copy.deepcopy(classification_model)
epochs_without_improvement = 0
print(" [*] Best model updated.")
else:
epochs_without_improvement += 1
# Early stopping
if epochs_without_improvement >= patience:
print(f"Early stopping at epoch {epoch+1}. No improvement for {patience} epochs.")
break
classification_model = best_model # load best model
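# Persisting the best weights to disk is often useful (commented; the file name is illustrative):
# torch.save(best_model.state_dict(), "best_roberta_classifier.pt")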
classification_model.eval()
test_loss = 0.0
y_pred_test, y_true_test = [], []
with torch.no_grad():
for inputs, labels in test_loader:
inputs = {k: v.to(device) for k, v in inputs.items()}
labels = labels.to(device)
outputs = classification_model(**inputs)
loss = criterion(outputs, labels)
test_loss += loss.item()
preds = torch.argmax(outputs, dim=1)
y_pred_test.extend(preds.cpu().numpy())
y_true_test.extend(labels.cpu().numpy())
test_loss /= len(test_loader)
f1_macro_test = f1_score(y_true_test, y_pred_test, average="macro")
acc_test = accuracy_score(y_true_test, y_pred_test)
print(f"\nFinal Test Loss: {test_loss:.4f}")
print(f"Test F1 (Macro): {f1_macro_test:.4f}")
print(f"Test Accuracy: {acc_test:.4f}")
print("\nClassification Report on Test Set:")
print(classification_report(y_true_test, y_pred_test))