import random
from dataclasses import dataclass

from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EarlyStoppingCallback,
    PreTrainedTokenizer,
    Trainer,
    TrainingArguments,
)


def process(batch: dict, tokenizer: PreTrainedTokenizer) -> dict:
    """Tokenize a batch and collapse five-way polarity labels to binary."""
    # SP and WP (strong/weak positive) -> 1; WN and SN (weak/strong negative) -> 0.
    # NU (neutral) examples are assigned a random class.
    new_labels = []
    for label in batch["Polarity"]:
        if label in ("SP", "WP"):
            new_labels.append(1)
        elif label in ("WN", "SN"):
            new_labels.append(0)
        elif label == "NU":
            new_labels.append(random.choice([0, 1]))
        else:
            raise ValueError(f"Unexpected polarity label: {label!r}")

    inputs = tokenizer(batch["Text"], truncation=True)
    batch["input_ids"] = inputs["input_ids"]
    batch["attention_mask"] = inputs["attention_mask"]
    batch["labels"] = new_labels
    return batch


def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, and F1 for binary classification."""
    logits, labels = eval_pred
    predictions = logits.argmax(-1)
    accuracy = accuracy_score(labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary"
    )
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }


def pipeline(args):
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=2
    )
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    # Tokenize and relabel, then carve a held-out split off the train set.
    dataset = load_dataset(args.dataset_name)
    dataset = dataset.map(process, batched=True, fn_kwargs={"tokenizer": tokenizer})
    dataset = dataset["train"].train_test_split(test_size=args.split_ratio)
    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    # Pad dynamically per batch instead of padding to a fixed length at map time.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    trainer = Trainer(
        model=model,
        args=TrainingArguments(
            output_dir="./results",
            learning_rate=args.learning_rate,
            per_device_train_batch_size=args.batch_size,
            per_device_eval_batch_size=args.batch_size,
            num_train_epochs=args.epochs,
            weight_decay=0.01,
            eval_strategy="steps",
            save_strategy="steps",
            load_best_model_at_end=True,
            report_to="none",
            save_steps=500,
            eval_steps=500,
            save_total_limit=1,
            logging_steps=500,
            fp16=args.fp16,
            greater_is_better=True,
            metric_for_best_model="f1",
        ),
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        processing_class=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
    )

    trainer.train()
    metrics = trainer.evaluate()
    print(metrics)
    trainer.predict(test_dataset)  # predictions on the held-out split (return value unused)

    # Push to Hub. Trainer.push_to_hub takes a commit message as its first
    # positional argument, not a repo id, so push the (best) model and the
    # tokenizer directly instead.
    trainer.model.push_to_hub(args.hub_location)
    tokenizer.push_to_hub(args.hub_location)


@dataclass
class Arguments:
    model_name: str = "csebuetnlp/banglabert"
    dataset_name: str = "SayedShaun/sentigold"
    split_ratio: float = 0.1
    batch_size: int = 128
    epochs: int = 40
    learning_rate: float = 1e-5
    fp16: bool = True
    hub_location: str = "SayedShaun/bangla-classifier-binary"


if __name__ == "__main__":
    random.seed(42)  # make the random NU -> {0, 1} assignment reproducible
    args = Arguments()
    pipeline(args)
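
# A minimal follow-up sketch (not part of the training run): loading the pushed
# checkpoint back for inference with transformers.pipeline. The repo id assumes
# Arguments.hub_location above; the input string is a placeholder.
#
#   from transformers import pipeline as hf_pipeline
#   classifier = hf_pipeline(
#       "text-classification", model="SayedShaun/bangla-classifier-binary"
#   )
#   print(classifier("..."))  # -> [{"label": ..., "score": ...}]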