# File metadata (extraction artifact, kept as a comment so the file parses):
# size 4,960 bytes, revision de55574
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, TrainerCallback
from transformers import DataCollatorWithPadding
from datasets import load_metric, Dataset
import torch
import wandb
# --- Tweakable hyperparameters and paths ---
model_name = 'albert-base-v2'  # Hugging Face Hub checkpoint to fine-tune
num_labels = 7 # Number of sentiment labels
output_dir = './albert_sentiment_model'  # checkpoints and final model go here
data_file = 'data.csv'  # two unnamed columns: text, label
wandb_entity = 'dejan'  # W&B account/team that owns the run
batch_size = 8
num_train_epochs = 30
learning_rate = 5e-5

# Initialize wandb (Trainer also reports here via report_to="wandb" below)
wandb.init(entity=wandb_entity, project="sentiment_classification")

# Load and preprocess the dataset
df = pd.read_csv(data_file, header=None, names=['text', 'label'])
# Remove leading instructions and prompts (assuming we know the prompt structure).
# NOTE(review): assumes the generation prompt ends with this exact sentence;
# rows that do not contain it pass through unchanged (split()[-1] is then the
# whole string) — confirm against the data-generation script.
df['text'] = df['text'].apply(lambda x: x.split('Write nothing but the article text. Do not include the sentiment in the text of the article.')[-1].strip())
# Display the cleaned data
print(df.head())

# 80/20 train/validation split with a fixed seed for reproducibility.
train_texts, val_texts, train_labels, val_labels = train_test_split(
df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Tokenize once up front; padding=True pads each split to its longest sequence
# (the DataCollatorWithPadding below re-pads per batch anyway).
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
def _to_dataset(encodings, labels):
    # Wrap tokenized encodings plus targets as an HF Dataset: keep only the
    # tensors the model consumes ('input_ids', 'attention_mask') and 'labels'.
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    })

train_dataset = _to_dataset(train_encodings, train_labels)
val_dataset = _to_dataset(val_encodings, val_labels)

# Dynamic padding: pad each batch to its own longest sequence at collate time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# Define metrics.
# NOTE(review): datasets.load_metric is deprecated in newer `datasets`
# releases in favour of the `evaluate` library — confirm the pinned version.
accuracy_metric = load_metric("accuracy")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")
f1_metric = load_metric("f1")
def compute_metrics(eval_pred):
    """Compute weighted accuracy/precision/recall/F1 for a Trainer evaluation.

    `eval_pred` is the (logits, labels) pair the Trainer supplies. The scores
    are logged to the active wandb run under "eval_*" keys and also returned
    so the Trainer records them in its own metrics dict.
    """
    logits, labels = eval_pred
    preds = torch.tensor(logits).argmax(dim=-1)

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=preds, references=labels)["accuracy"],
        "precision": precision_metric.compute(
            predictions=preds, references=labels, average='weighted')["precision"],
        "recall": recall_metric.compute(
            predictions=preds, references=labels, average='weighted')["recall"],
        "f1": f1_metric.compute(
            predictions=preds, references=labels, average='weighted')["f1"],
    }

    # Mirror the scores into wandb with the "eval_" prefix the Trainer uses.
    wandb.log({f"eval_{name}": value for name, value in scores.items()})
    return scores
# Training arguments
# Training arguments: evaluate and checkpoint every 500 steps, and restore the
# checkpoint with the lowest eval_loss when training finishes.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    warmup_steps=500,  # linear warmup before the linear decay schedule below
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # NOTE(review): `evaluation_strategy` was renamed `eval_strategy` in
    # transformers >= 4.46 — confirm the pinned version still accepts it.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # "loss" metric => lower is better by default
    learning_rate=learning_rate,
    report_to="wandb",  # Trainer streams train/eval metrics to the active run
    lr_scheduler_type="linear",
    logging_strategy="steps",
)
# Early stopping callback
# Early stopping callback
class EarlyStoppingCallback(TrainerCallback):
    """Stop training after `patience` consecutive evaluations without improvement.

    The monitored metric is `args.metric_for_best_model`; the comparison
    direction follows `args.greater_is_better` (falsy => lower is better,
    which matches the "eval_loss" default used in this script).
    """

    def __init__(self, patience=2):
        self.patience = patience          # evaluations to tolerate without improvement
        self.best_metric = None           # best metric value seen so far
        self.best_model_checkpoint = None # global_step at which it was seen
        self.epochs_no_improve = 0        # consecutive non-improving evaluations

    def on_evaluate(self, args, state, control, **kwargs):
        # Read the metric name from the `args` the Trainer passes in, not from
        # the module-level `training_args` global the original relied on — the
        # callback now works with whatever TrainingArguments it is used with.
        metric_name = args.metric_for_best_model
        metrics = kwargs.get('metrics', {})
        if metric_name not in metrics:
            # Nothing to compare on this evaluation (e.g. metric not computed);
            # the original would have raised KeyError here.
            return

        eval_metric = metrics[metric_name]
        # Respect the configured direction; `greater_is_better` may be None for
        # loss-like metrics, in which case bool() gives "lower is better".
        if bool(args.greater_is_better):
            improved = self.best_metric is None or eval_metric > self.best_metric
        else:
            improved = self.best_metric is None or eval_metric < self.best_metric

        if improved:
            self.best_metric = eval_metric
            self.best_model_checkpoint = state.global_step
            self.epochs_no_improve = 0
        else:
            self.epochs_no_improve += 1
            if self.epochs_no_improve >= self.patience:
                print(f"Stopping early after {self.epochs_no_improve} evaluations with no improvement.")
                control.should_training_stop = True
# Trainer
# Trainer: fresh classification head (num_labels outputs) on the pretrained
# ALBERT encoder, with the custom early-stopping callback defined above.
trainer = Trainer(
    model=AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(patience=2)]
)
# Train, then save the final model (the best checkpoint, since
# load_best_model_at_end=True restores it before this point).
trainer.train()
trainer.save_model(output_dir)
# Finalize wandb
wandb.finish()
print(f"Training completed. Model saved to {output_dir}")
|