import random
from dataclasses import dataclass

from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
)
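
# process() assigns neutral ("NU") examples to a random class, so labels vary
# from run to run. Seeding pins that down; the specific seed value is an
# arbitrary choice added here for reproducibility.
random.seed(42)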


def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Tokenize a batch and collapse the five-way polarity labels to binary."""
    # Strong/weak positive ("SP"/"WP") -> 1, strong/weak negative ("WN"/"SN") -> 0.
    # Neutral ("NU") gets a random class; anything else passes through unchanged.
    new_labels = []
    for label in batch["Polarity"]:
        if label in ["SP", "WP"]:
            new_labels.append(1)
        elif label in ["WN", "SN"]:
            new_labels.append(0)
        elif label == "NU":
            new_labels.append(random.choice([1, 0]))
        else:
            new_labels.append(label)
    inputs = tokenizer(batch["Text"], truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = new_labels
    return batch
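
# Illustration (toy values, not from the dataset): a batch like
# {"Text": ["some sentence"], "Polarity": ["SP"]} comes back with
# "labels" == [1] plus "input_ids" and "attention_mask" from the tokenizer.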


def compute_metrics(eval_pred):
    """Report accuracy, precision, recall, and F1 on the positive class."""
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
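
# Sanity check (illustrative): logits [[0.1, 0.9], [0.8, 0.2]] with labels
# [1, 0] yield predictions [1, 0], so all four metrics come out as 1.0.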


def pipeline(args):
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Tokenize and relabel, then carve a held-out test split off the train set.
    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={"tokenizer": tokenizer})
    dataset = dataset["train"].train_test_split(test_size=args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
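    # DataCollatorWithPadding pads each batch to its longest sequence at
    # collation time, which is why process() tokenizes with truncation only.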

    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            learning_rate=args.learning_rate,
            per_device_train_batch_size=args.batch_size,
            per_device_eval_batch_size=args.batch_size,
            num_train_epochs=args.epochs,
            weight_decay=0.01,
            # Evaluate and checkpoint every 500 steps so best-model selection
            # (by F1) and early stopping have regular measurement points.
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            report_to="none",
            save_steps=500,
            eval_steps=500,
            save_total_limit=1,
            logging_steps=500,
            fp16=args.fp16,
            greater_is_better=True,
            metric_for_best_model="f1",
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        # Stop if F1 fails to improve for five consecutive evaluations.
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()
    print(trainer.evaluate())

    # Push the fine-tuned model and tokenizer to the Hub repo. Note that
    # Trainer.push_to_hub takes a commit message as its first positional
    # argument, not a repo id, so push both objects directly. Assumes you are
    # already authenticated, e.g. via `huggingface-cli login`.
    trainer.model.push_to_hub(args.hub_location)
    tokenizer.push_to_hub(args.hub_location)


@dataclass
class Arguments:
    model_name: str = "csebuetnlp/banglabert"
    dataset_name: str = "SayedShaun/sentigold"
    split_ratio: float = 0.1
    batch_size: int = 128
    epochs: int = 40
    learning_rate: float = 1e-5
    fp16: bool = True  # assumes a CUDA GPU; set False when training on CPU
    hub_location: str = "SayedShaun/bangla-classifier-binary"


if __name__ == "__main__":
    args = Arguments()
    pipeline(args)