In [5]:
import warnings

# Install a catch-all "ignore" filter: from this cell on, every warning
# category is silenced for the rest of the kernel session.
warnings.filterwarnings(action="ignore")

In [16]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch

# Load dataset
# NOTE(review): this is an absolute, machine-specific path; edit DATA_PATH
# to point at the CSV when running on another machine.
DATA_PATH = 'C:/Users/Administrator/Downloads/ds_2300_Sheet1.csv'
data = pd.read_csv(DATA_PATH)

# Remove 'id' column (it is not used as a model input)
data = data.drop(columns=['id'])

# The tokenization step reads the 'text' column, so fail here with a clear
# message rather than deep inside Dataset.map if the CSV lacks it.
if 'text' not in data.columns:
    raise KeyError(
        f"Expected a 'text' column in {DATA_PATH!r}; found columns: {list(data.columns)}"
    )

# Rows without text cannot be tokenized; drop them and rebuild a clean
# 0..n-1 index so the converted dataset carries no leftover index column.
data = data.dropna(subset=['text']).reset_index(drop=True)

# Add a dummy label column. It is deliberately a FLOAT: the model below is
# created with num_labels=1, i.e. a single-output head that is trained against
# float targets. (The previous comments described this column as an integer
# type while the code cast it to float.)
# NOTE(review): every row gets the same target (0.0), so training on this
# column only teaches the model to output a constant. Replace with real
# labels before drawing conclusions from the loss values.
data['label'] = 0.0

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(data)

# Load the pre-trained uncased multilingual BERT tokenizer and model
model_name = 'bert-base-multilingual-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
# num_labels=1 -> one output per example; adjust together with the label
# column's dtype/values if this is turned into a multi-class task.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Tokenization function
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a 'text' key holding the raw strings to encode
            (the batch passed in by ``Dataset.map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for the batch; every sequence is
        padded to, and truncated at, 512 tokens.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=512,  # Adjust max_length if needed
    )
    return encoded

# Tokenize the dataset (batched=True hands tokenize_function many rows at once)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Split the dataset: hold out 10% for evaluation.
# A fixed seed makes the shuffle reproducible, so Restart & Run All yields the
# same train/eval membership and comparable validation losses across runs.
SPLIT_SEED = 42
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=SPLIT_SEED)
train_dataset = split_datasets['train']
eval_dataset = split_datasets['test']

# Sanity check: the split must neither lose nor duplicate rows.
assert len(train_dataset) + len(eval_dataset) == len(tokenized_datasets)

# Expose the model inputs and the label column in PyTorch ('torch') format.
def format_dataset(dataset, columns=('input_ids', 'attention_mask', 'label')):
    """Return ``dataset`` switched to 'torch' format for the given columns.

    This only selects the output format and the columns to expose. It does
    NOT request any dtype cast, so labels are not forced to Long tensors
    here; their dtype follows the stored column (the notebook creates
    'label' as float).

    Args:
        dataset: Object providing ``with_format(type, columns=...)``, e.g. a
            Hugging Face ``Dataset``.
        columns: Names of the columns to expose. Defaults to the inputs and
            target used by this notebook.

    Returns:
        The result of ``dataset.with_format('torch', columns=[...])``.
    """
    return dataset.with_format('torch', columns=list(columns))

# Run both splits through format_dataset, training split first.
train_dataset, eval_dataset = (
    format_dataset(split) for split in (train_dataset, eval_dataset)
)

# Training configuration: 5 epochs at lr 2e-5 with weight decay 0.01,
# batches of 8 per device, and evaluation_strategy='epoch'.
# Outputs written by the Trainer go under ./results.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy='epoch',
)

# Bundle the model, its configuration, both data splits and the tokenizer
# into a single Trainer object used by the following cells.
trainer_inputs = dict(
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(model=model, args=training_args, **trainer_inputs)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/2312 [00:00<?, ? examples/s]

In [None]:
# Train the model
# Runs training on the `trainer` built in the previous cell. The call is the
# last expression of the cell, so whatever train() returns is shown as the
# cell's output.
trainer.train()


Epoch,Training Loss,Validation Loss


In [None]:
# Evaluate on the held-out split and print the returned metrics
eval_results = trainer.evaluate()
print("Evaluation Results:", eval_results)

# Save the model and the tokenizer into the same directory so both can be
# loaded again from one location.
save_dir = './fine-tuned-bert-urdu'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)