{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fine-tune RoBERTa to predict Amazon review star ratings (1-5) from review text.\n",
    "from datasets import load_dataset, ClassLabel\n",
    "from transformers import AutoTokenizer\n",
    "from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
    "import torch\n",
    "\n",
    "# Load the Appliances review split and keep only the review text and the rating.\n",
    "dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", \"raw_review_Appliances\", trust_remote_code=True, split=\"full\")\n",
    "dataset = dataset.remove_columns(['title', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])\n",
    "dataset = dataset.rename_column('rating', 'label')\n",
    "# Ratings are 1-5 stars; num_classes=6 deliberately leaves class 0 unused so that\n",
    "# the class id equals the star value. (Assumes the 'rating' column holds whole\n",
    "# numbers castable to integer class ids -- TODO confirm against the dataset schema.)\n",
    "dataset = dataset.cast_column('label', ClassLabel(num_classes=6))\n",
    "\n",
    "# Load pre-trained tokenizer matching the model checkpoint\n",
    "tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
    "\n",
    "# Tokenize review text, padding/truncating to a fixed length of 128 tokens.\n",
    "def tokenize_function(examples):\n",
    "    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n",
    "\n",
    "# Apply tokenization in batches\n",
    "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
    "# Seed the shuffle so the train/eval subsets selected below are reproducible.\n",
    "tokenized_datasets = tokenized_datasets.shuffle(seed=42)\n",
    "print(tokenized_datasets)\n",
    "\n",
    "# Load pre-trained RoBERTa model for sequence classification (6 labels: class 0 unused, classes 1-5 = star ratings)\n",
    "model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)\n",
    "\n",
    "# Define training arguments; evaluation runs once per epoch.\n",
    "training_args = TrainingArguments(\n",
    "    output_dir='./results',\n",
    "    num_train_epochs=10,\n",
    "    per_device_train_batch_size=16,\n",
    "    per_device_eval_batch_size=16,\n",
    "    evaluation_strategy='epoch',\n",
    "    logging_dir='./logs',\n",
    ")\n",
    "\n",
    "# Create trainer instance.\n",
    "# Train on the first 1000 shuffled examples, evaluate on the next 1000.\n",
    "# range(1000, 2000) makes the eval set contiguous with the train set\n",
    "# (the previous range(1001, 2001) skipped example 1000).\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=tokenized_datasets.select(range(1000)),\n",
    "    eval_dataset=tokenized_datasets.select(range(1000, 2000)),\n",
    ")\n",
    "\n",
    "# NOTE: Trainer moves the model to the right device on its own; this explicit\n",
    "# placement is kept so the device choice is visible in the notebook output.\n",
    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
    "model.to(device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run fine-tuning (10 epochs over the 1000-example training subset).\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
    "\n",
    "def compute_metrics(pred):\n",
    "    \"\"\"Compute accuracy and weighted precision/recall/F1 from a Trainer EvalPrediction.\n",
    "\n",
    "    Args:\n",
    "        pred: object with `label_ids` (true labels) and `predictions` (logits).\n",
    "\n",
    "    Returns:\n",
    "        dict with 'accuracy', 'f1', 'precision', and 'recall'.\n",
    "    \"\"\"\n",
    "    labels = pred.label_ids\n",
    "    # argmax over the last axis turns per-class logits into predicted class ids\n",
    "    preds = pred.predictions.argmax(-1)\n",
    "    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')\n",
    "    acc = accuracy_score(labels, preds)\n",
    "    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}\n",
    "\n",
    "# Attach the metrics function after construction; the per-epoch evaluations\n",
    "# during trainer.train() above therefore reported loss only.\n",
    "trainer.compute_metrics = compute_metrics\n",
    "\n",
    "# Evaluate the fine-tuned model on the held-out 1000-example eval subset\n",
    "eval_result = trainer.evaluate()\n",
    "print(eval_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the fine-tuned model and tokenizer together so the directory can be\n",
    "# reloaded later with AutoModelForSequenceClassification/AutoTokenizer.from_pretrained('roberta-rating').\n",
    "trainer.save_model('roberta-rating')\n",
    "tokenizer.save_pretrained('roberta-rating')"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "SolutionsInPR",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}