In [None]:
# Importing necessary libraries
from datasets import load_dataset, ClassLabel
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

# Load the Appliances reviews and reshape them for classification in one chain:
# drop the columns that are not used downstream, then expose the rating as a
# 6-class `label` column.
# NOTE(review): trust_remote_code=True lets the dataset repository's own loading
# script run locally -- confirm the source is trusted.
# NOTE(review): presumably ratings are 1-5 stars, which would leave class 0
# unused -- TODO confirm.
dataset = (
    load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        "raw_review_Appliances",
        split="full",
        trust_remote_code=True,
    )
    .remove_columns([
        'title', 'images', 'asin', 'parent_asin',
        'user_id', 'timestamp', 'helpful_vote', 'verified_purchase',
    ])
    .rename_column('rating', 'label')
    .cast_column('label', ClassLabel(num_classes=6))
)

# Tokenizer belonging to the 'roberta-base' checkpoint
tokenizer = AutoTokenizer.from_pretrained('roberta-base')


def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    The tokenizer is called with truncation enabled and padding set to
    'max_length', using a max_length of 128.

    Args:
        examples: Mapping with a 'text' entry holding the texts to tokenize.

    Returns:
        The result of the tokenizer call for those texts.
    """
    texts = examples['text']
    return tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding='max_length',
    )

# Tokenize the whole dataset in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Shuffle with a fixed seed so that the rows later selected for training and
# evaluation are the same on every Restart & Run All (an unseeded shuffle made
# the split, and therefore the reported metrics, change from run to run).
tokenized_datasets = tokenized_datasets.shuffle(seed=42)
print(tokenized_datasets)

# Load the pre-trained RoBERTa checkpoint for sequence classification with 6 labels
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=6,
)

# Training configuration: 10 epochs, batches of 16 per device for both training
# and evaluation, and the evaluation strategy set to 'epoch'.
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Schedule
    num_train_epochs=10,
    evaluation_strategy='epoch',
    # Batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Sizes of the train / eval subsets taken from the shuffled, tokenized dataset
TRAIN_SIZE = 1000
EVAL_SIZE = 1000

# Create the trainer on two adjacent, non-overlapping slices.
# The evaluation slice starts right after the training slice; the previous
# range(1001, 2001) silently skipped row 1000.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets.select(range(TRAIN_SIZE)),
    eval_dataset=tokenized_datasets.select(range(TRAIN_SIZE, TRAIN_SIZE + EVAL_SIZE)),
)

# Pick the GPU when CUDA is available, otherwise fall back to the CPU,
# then move the model onto that device.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
# Fine-tune the model with the trainer built in the previous cell.
# compute_metrics is only attached to the trainer in a later cell, so the
# evaluations that happen during this run do not include those custom metrics.
trainer.train()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the true labels) and
            ``predictions`` (per-class scores); the index of the highest score
            along the last axis is used as the predicted label.

    Returns:
        dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 yields the same scores as the default setting (an
    # ill-defined per-class score counts as 0) but without raising an
    # UndefinedMetricWarning when some class is never predicted or never
    # occurs in the evaluation slice.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# The Trainer was created without a compute_metrics callback, so attach it
# here, before the final evaluation is run.
trainer.compute_metrics = compute_metrics

# Run the evaluation (no dataset is passed to evaluate()) and print the result
eval_result = trainer.evaluate()
print(eval_result)

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory
save_dir = 'roberta-rating'
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)