RemotelyBest_Development_of_AI_Applications

Runtime error

App Files Files Community

Jan90 committed on Oct 23, 2024

Commit

a9cce51

verified ·

1 Parent(s): 147e673

Upload 5 files

Browse files

Files changed (5) hide show

Other Codes/Final revision code.ipynb +250 -0
Other Codes/First revision code.ipynb +99 -0
Other Codes/roberta-rating.ipynb +54 -0
Other Codes/roberta-summarization.ipynb +191 -0
Other Codes/trainer.ipynb +116 -0

Other Codes/Final revision code.ipynb ADDED Viewed

	@@ -0,0 +1,250 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "* Running on local URL:  http://127.0.0.1:7870\n",
+      "* Running on public URL: https://a94e18f722148a0463.gradio.live\n",
+      "\n",
+      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"https://a94e18f722148a0463.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 14,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
+    "import torch\n",
+    "import gradio as gr\n",
+    "from openpyxl import load_workbook\n",
+    "from numpy import mean\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Amber/stone Soft theme applied to the whole Gradio UI\n",
+    "theme = gr.themes.Soft(primary_hue=\"amber\", secondary_hue=\"amber\", neutral_hue=\"stone\")\n",
+    "\n",
+    "# Identifiers handed to from_pretrained for the three models\n",
+    "SUMMARY_CHECKPOINT = \"suriya7/bart-finetuned-text-summarization\"\n",
+    "KEYWORD_CHECKPOINT = \"transformer3/H2-keywordextractor\"\n",
+    "RATING_CHECKPOINT = 'roberta-rating'\n",
+    "\n",
+    "# Summarisation tokenizer and seq2seq model\n",
+    "tokenizer = AutoTokenizer.from_pretrained(SUMMARY_CHECKPOINT)\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARY_CHECKPOINT)\n",
+    "\n",
+    "# Keyword-extraction tokenizer and seq2seq model\n",
+    "tokenizer_keywords = AutoTokenizer.from_pretrained(KEYWORD_CHECKPOINT)\n",
+    "model_keywords = AutoModelForSeq2SeqLM.from_pretrained(KEYWORD_CHECKPOINT)\n",
+    "\n",
+    "# Rating classifier, placed on the GPU when torch reports one\n",
+    "use_cuda = torch.cuda.is_available()\n",
+    "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
+    "new_model = AutoModelForSequenceClassification.from_pretrained(RATING_CHECKPOINT)\n",
+    "new_tokenizer = AutoTokenizer.from_pretrained(RATING_CHECKPOINT)\n",
+    "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
+    "\n",
+    "# Display text for each integer rating: 1 -> '1/5', ..., 5 -> '5/5'\n",
+    "label_mapping = {stars: f\"{stars}/5\" for stars in range(1, 6)}\n",
+    "\n",
+    "def filter_xl(file, keywords):\n",
+    "    \"\"\"Load the active sheet of an Excel workbook and filter its rows by keyword.\n",
+    "\n",
+    "    Args:\n",
+    "        file: Workbook path (or file object) handed to ``load_workbook``.\n",
+    "        keywords: Comma-separated search terms; empty or None disables filtering.\n",
+    "\n",
+    "    Returns:\n",
+    "        DataFrame whose column names are the sheet's first row. With keywords,\n",
+    "        only rows in which every keyword occurs in at least one cell are kept\n",
+    "        (case-insensitive, literal substring match).\n",
+    "    \"\"\"\n",
+    "    workbook = load_workbook(filename=file)\n",
+    "    rows = workbook.active.values\n",
+    "    header = next(rows)  # the first sheet row supplies the column names\n",
+    "    df = pd.DataFrame(rows, columns=header)\n",
+    "\n",
+    "    if keywords:\n",
+    "        cells_as_text = df.astype(str)\n",
+    "        for keyword in keywords.split(','):\n",
+    "            needle = keyword.strip()\n",
+    "            if not needle:\n",
+    "                # \"a,,b\" or a trailing comma must not count as a search term\n",
+    "                continue\n",
+    "            # regex=False: the user's text is matched literally, so terms such as\n",
+    "            # \"c++\" or \"(\" neither raise nor act as patterns.\n",
+    "            hits = cells_as_text.apply(\n",
+    "                lambda column: column.str.contains(needle, case=False, regex=False)\n",
+    "            ).any(axis=1)\n",
+    "            df = df[hits]\n",
+    "            cells_as_text = cells_as_text[hits]\n",
+    "\n",
+    "    return df\n",
+    "\n",
+    "# Function to calculate overall rating from filtered data\n",
+    "def calculate_rating(filtered_df):\n",
+    "    reviews = filtered_df.to_numpy().flatten()\n",
+    "    ratings = []\n",
+    "    for review in reviews:\n",
+    "        if pd.notna(review):\n",
+    "            rating = int(classifier(review)[0]['label'].split('_')[1])\n",
+    "            ratings.append(rating)\n",
+    "    \n",
+    "    return round(mean(ratings), 2), ratings\n",
+    "\n",
+    "# Function to calculate results including summary, keywords, and sentiment\n",
+    "def calculate_results(file, keywords):\n",
+    "    filtered_df = filter_xl(file, keywords)\n",
+    "    overall_rating, ratings = calculate_rating(filtered_df)\n",
+    "    \n",
+    "    # Summarize and extract keywords from the filtered reviews\n",
+    "    text = \" \".join(filtered_df.to_numpy().flatten())\n",
+    "    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
+    "    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "    summary = summary.replace(\"I\", \"They\").replace(\"my\", \"their\").replace(\"me\", \"them\")\n",
+    "\n",
+    "    inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
+    "    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "\n",
+    "    # Determine overall sentiment\n",
+    "    sentiments = []\n",
+    "    for review in filtered_df.to_numpy().flatten():\n",
+    "        if pd.notna(review):\n",
+    "            sentiment = classifier(review)[0]['label']\n",
+    "            sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
+    "            sentiments.append(sentiment_label)\n",
+    "    \n",
+    "    overall_sentiment = \"Positive\" if sentiments.count(\"Positive\") > sentiments.count(\"Negative\") else \"Negative\" if sentiments.count(\"Negative\") > sentiments.count(\"Positive\") else \"Neutral\"\n",
+    "\n",
+    "    return overall_rating, summary, keywords, overall_sentiment, ratings, sentiments\n",
+    "\n",
+    "# Function to analyze a single review\n",
+    "def analyze_review(review):\n",
+    "    if not review.strip():\n",
+    "        return \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\"\n",
+    "    \n",
+    "    # Calculate rating\n",
+    "    rating = int(classifier(review)[0]['label'].split('_')[1])\n",
+    "    \n",
+    "    # Summarize review\n",
+    "    inputs = tokenizer([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
+    "    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "    summary = summary.replace(\"I\", \"he/she\").replace(\"my\", \"his/her\").replace(\"me\", \"him/her\")\n",
+    "\n",
+    "    # Extract keywords\n",
+    "    inputs_keywords = tokenizer_keywords([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
+    "    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "\n",
+    "    # Determine sentiment\n",
+    "    sentiment = classifier(review)[0]['label']\n",
+    "    sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
+    "\n",
+    "    return rating, summary, keywords, sentiment_label\n",
+    "\n",
+    "# Function to count rows in the filtered DataFrame\n",
+    "def count_rows(filtered_df):\n",
+    "    return len(filtered_df)\n",
+    "\n",
+    "def plot_ratings(ratings):\n",
+    "    \"\"\"Save a histogram of the ratings and return the image's file name.\n",
+    "\n",
+    "    Args:\n",
+    "        ratings: Sequence of integer ratings; one bar is drawn per value 1-5.\n",
+    "\n",
+    "    Returns:\n",
+    "        'ratings_distribution.png', the file the figure was saved to.\n",
+    "    \"\"\"\n",
+    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
+    "    try:\n",
+    "        ax.hist(ratings, bins=range(1, 7), edgecolor='black', align='left')\n",
+    "        ax.set_xlabel('Rating')\n",
+    "        ax.set_ylabel('Frequency')\n",
+    "        ax.set_title('Distribution of Ratings')\n",
+    "        ax.set_xticks(range(1, 6))\n",
+    "        ax.grid(True)\n",
+    "        fig.savefig('ratings_distribution.png')\n",
+    "    finally:\n",
+    "        # Every click creates a figure; pyplot keeps it alive until it is closed\n",
+    "        plt.close(fig)\n",
+    "    return 'ratings_distribution.png'\n",
+    "\n",
+    "def plot_sentiments(sentiments):\n",
+    "    \"\"\"Save a bar chart of the sentiment counts and return the image's file name.\n",
+    "\n",
+    "    Args:\n",
+    "        sentiments: Sequence of 'Positive' / 'Negative' / 'Neutral' labels.\n",
+    "\n",
+    "    Returns:\n",
+    "        'sentiments_distribution.png', the file the figure was saved to.\n",
+    "    \"\"\"\n",
+    "    # Fixed label -> colour pairing. value_counts() orders by frequency, so a bare\n",
+    "    # colour list would paint whichever label is most common green.\n",
+    "    palette = {\"Positive\": \"green\", \"Negative\": \"red\", \"Neutral\": \"blue\"}\n",
+    "    sentiment_counts = (\n",
+    "        pd.Series(sentiments, dtype=\"object\")\n",
+    "        .value_counts()\n",
+    "        .reindex(list(palette), fill_value=0)\n",
+    "    )\n",
+    "\n",
+    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
+    "    try:\n",
+    "        sentiment_counts.plot(kind='bar', color=[palette[label] for label in sentiment_counts.index], ax=ax)\n",
+    "        ax.set_xlabel('Sentiment')\n",
+    "        ax.set_ylabel('Frequency')\n",
+    "        ax.set_title('Distribution of Sentiments')\n",
+    "        ax.grid(True)\n",
+    "        fig.savefig('sentiments_distribution.png')\n",
+    "    finally:\n",
+    "        # Every click creates a figure; pyplot keeps it alive until it is closed\n",
+    "        plt.close(fig)\n",
+    "    return 'sentiments_distribution.png'\n",
+    "\n",
+    "# Event handlers: each runs the analysis once per click and returns exactly\n",
+    "# as many values as the event has output components.\n",
+    "def _show_filtered(file, keywords):\n",
+    "    \"\"\"Return the filtered sheet and its row count.\"\"\"\n",
+    "    filtered = filter_xl(file, keywords)\n",
+    "    return filtered, count_rows(filtered)\n",
+    "\n",
+    "def _show_results(file, keywords):\n",
+    "    \"\"\"Return overall rating, summary, keywords and overall sentiment.\"\"\"\n",
+    "    return calculate_results(file, keywords)[:4]\n",
+    "\n",
+    "def _show_results_with_graphs(file, keywords):\n",
+    "    \"\"\"Return the four text results followed by the two plot image paths.\"\"\"\n",
+    "    *text_results, ratings, sentiments = calculate_results(file, keywords)\n",
+    "    return (*text_results, plot_ratings(ratings), plot_sentiments(sentiments))\n",
+    "\n",
+    "# Gradio interface\n",
+    "with gr.Blocks(theme=theme) as demo:\n",
+    "    gr.Markdown(\"<h1 style='text-align: center;'>Feedback and Auditing Survey AI Analyzer</h1><br>\")\n",
+    "    with gr.Tabs():\n",
+    "        with gr.TabItem(\"Upload and Filter\"):\n",
+    "            with gr.Row():\n",
+    "                with gr.Column(scale=1):\n",
+    "                    excel_file = gr.File(label=\"Upload Excel File\")\n",
+    "                    #excel_file = gr.File(label=\"Upload Excel File\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"])\n",
+    "                    keywords_input = gr.Textbox(label=\"Filter by Keywords (comma-separated)\")\n",
+    "                    display_button = gr.Button(\"Display and Filter Excel Data\")\n",
+    "                    clear_button_upload = gr.Button(\"Clear\")\n",
+    "                    row_count = gr.Textbox(label=\"Number of Rows\", interactive=False)\n",
+    "                with gr.Column(scale=3):\n",
+    "                    filtered_data = gr.Dataframe(label=\"Filtered Excel Contents\")\n",
+    "        \n",
+    "        with gr.TabItem(\"Calculate Results\"):\n",
+    "            with gr.Row():\n",
+    "                with gr.Column():\n",
+    "                    overall_rating = gr.Textbox(label=\"Overall Rating\")\n",
+    "                    summary = gr.Textbox(label=\"Summary\")\n",
+    "                    keywords_output = gr.Textbox(label=\"Keywords\")\n",
+    "                    overall_sentiment = gr.Textbox(label=\"Overall Sentiment\")\n",
+    "                    calculate_button = gr.Button(\"Calculate Results\")\n",
+    "                with gr.Column():\n",
+    "                    ratings_graph = gr.Image(label=\"Ratings Distribution\")\n",
+    "                    sentiments_graph = gr.Image(label=\"Sentiments Distribution\")\n",
+    "                    calculate_graph_button = gr.Button(\"Calculate Graph Results\")\n",
+    "        \n",
+    "        with gr.TabItem(\"Testing Area / Write a Review\"):\n",
+    "            with gr.Row():\n",
+    "                with gr.Column(scale=2):\n",
+    "                    review_input = gr.Textbox(label=\"Write your review here\")\n",
+    "                    analyze_button = gr.Button(\"Analyze Review\")\n",
+    "                    clear_button_review = gr.Button(\"Clear\")\n",
+    "                with gr.Column(scale=2):\n",
+    "                    review_rating = gr.Textbox(label=\"Rating\")\n",
+    "                    review_summary = gr.Textbox(label=\"Summary\")\n",
+    "                    review_keywords = gr.Textbox(label=\"Keywords\")\n",
+    "                    review_sentiment = gr.Textbox(label=\"Sentiment\")\n",
+    "\n",
+    "    display_button.click(_show_filtered, inputs=[excel_file, keywords_input], outputs=[filtered_data, row_count])\n",
+    "    calculate_graph_button.click(_show_results_with_graphs, inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment, ratings_graph, sentiments_graph])\n",
+    "    # Four outputs, four return values (the previous handler also returned a plot path)\n",
+    "    calculate_button.click(_show_results, inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment])\n",
+    "    analyze_button.click(analyze_review, inputs=review_input, outputs=[review_rating, review_summary, review_keywords, review_sentiment])\n",
+    "    clear_button_upload.click(lambda: (\"\"), outputs=[keywords_input])\n",
+    "    clear_button_review.click(lambda: (\"\", \"\", \"\", \"\", \"\"), outputs=[review_input, review_rating, review_summary, review_keywords, review_sentiment])\n",
+    "\n",
+    "demo.launch(share=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SolutionsInPR",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Other Codes/First revision code.ipynb ADDED Viewed

	@@ -0,0 +1,99 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load model directly\n",
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
+    "import torch\n",
+    "import gradio as gr\n",
+    "from openpyxl import load_workbook\n",
+    "from numpy import mean\n",
+    "\n",
+    "# Identifiers handed to from_pretrained for the three models\n",
+    "SUMMARY_CHECKPOINT = \"suriya7/bart-finetuned-text-summarization\"\n",
+    "KEYWORD_CHECKPOINT = \"transformer3/H2-keywordextractor\"\n",
+    "RATING_CHECKPOINT = 'roberta-rating'\n",
+    "\n",
+    "# Summarisation tokenizer and seq2seq model\n",
+    "tokenizer = AutoTokenizer.from_pretrained(SUMMARY_CHECKPOINT)\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARY_CHECKPOINT)\n",
+    "\n",
+    "# Keyword-extraction tokenizer and seq2seq model\n",
+    "tokenizer_keywords = AutoTokenizer.from_pretrained(KEYWORD_CHECKPOINT)\n",
+    "model_keywords = AutoModelForSeq2SeqLM.from_pretrained(KEYWORD_CHECKPOINT)\n",
+    "\n",
+    "# Fine-tuned rating model and tokenizer, run on the GPU when torch reports one\n",
+    "use_cuda = torch.cuda.is_available()\n",
+    "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
+    "new_model = AutoModelForSequenceClassification.from_pretrained(RATING_CHECKPOINT)\n",
+    "new_tokenizer = AutoTokenizer.from_pretrained(RATING_CHECKPOINT)\n",
+    "\n",
+    "# Text-classification pipeline around the fine-tuned model\n",
+    "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
+    "\n",
+    "# Display text for each integer rating: 1 -> '1/5', ..., 5 -> '5/5'\n",
+    "label_mapping = {stars: f\"{stars}/5\" for stars in range(1, 6)}\n",
+    "\n",
+    "def parse_xl(file_path):\n",
+    "    \"\"\"Collect every non-empty cell value from all worksheets of a workbook.\n",
+    "\n",
+    "    Args:\n",
+    "        file_path: Workbook path (or file object) handed to ``load_workbook``.\n",
+    "\n",
+    "    Returns:\n",
+    "        List of cell values in worksheet order, row by row, left to right.\n",
+    "    \"\"\"\n",
+    "    workbook = load_workbook(filename=file_path)\n",
+    "    # values_only=True yields the plain cell values instead of Cell objects;\n",
+    "    # `is not None` is the identity test for empty cells.\n",
+    "    return [\n",
+    "        value\n",
+    "        for sheet in workbook.worksheets\n",
+    "        for row in sheet.iter_rows(values_only=True)\n",
+    "        for value in row\n",
+    "        if value is not None\n",
+    "    ]\n",
+    "\n",
+    "def evaluate(file):\n",
+    "    \"\"\"Rate, summarise and keyword-tag every review in an Excel workbook.\n",
+    "\n",
+    "    Args:\n",
+    "        file: Workbook passed on to ``parse_xl``; each non-empty cell is one review.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(average_rating, summary, keywords)``; the average is rounded to\n",
+    "        two decimals and is NaN when the workbook holds no reviews.\n",
+    "    \"\"\"\n",
+    "    # str(): cells may hold numbers or dates, which the pipeline and join reject\n",
+    "    reviews = [str(value) for value in parse_xl(file)]\n",
+    "\n",
+    "    # One batched call; truncation=True keeps over-long reviews within the\n",
+    "    # tokenizer's maximum input length.\n",
+    "    predictions = classifier(reviews, truncation=True) if reviews else []\n",
+    "    ratings = [int(prediction['label'].split('_')[1]) for prediction in predictions]\n",
+    "    average_rating = round(mean(ratings), 2) if ratings else float(\"nan\")\n",
+    "\n",
+    "    text = \" \".join(reviews)\n",
+    "\n",
+    "    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=50, max_length=1000)\n",
+    "    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "\n",
+    "    inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
+    "    summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
+    "    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "\n",
+    "    return average_rating, summary, keywords\n",
+    "\n",
+    "# Input: one Excel upload; outputs: three read-out text boxes\n",
+    "reviews_upload = gr.File(label=\"Reviews\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"])\n",
+    "result_boxes = [gr.Textbox(label=box_label) for box_label in (\"Rating\", \"Summary\", \"Keywords\")]\n",
+    "\n",
+    "iface = gr.Interface(\n",
+    "    fn=evaluate,\n",
+    "    inputs=reviews_upload,\n",
+    "    outputs=result_boxes,\n",
+    "    title='Summarize Reviews',\n",
+    "    description=\"Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each reviews is in its own cell.\"\n",
+    ")\n",
+    "\n",
+    "# share=True additionally requests a public gradio.live link\n",
+    "iface.launch(share=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SolutionsInPR",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Other Codes/roberta-rating.ipynb ADDED Viewed

	@@ -0,0 +1,54 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline\n",
+    "import torch\n",
+    "import gradio as gr\n",
+    "\n",
+    "# Run the classifier on the GPU when torch reports one\n",
+    "use_cuda = torch.cuda.is_available()\n",
+    "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
+    "\n",
+    "# Fine-tuned rating model and its tokenizer\n",
+    "RATING_CHECKPOINT = 'roberta-rating'\n",
+    "new_model = AutoModelForSequenceClassification.from_pretrained(RATING_CHECKPOINT)\n",
+    "new_tokenizer = AutoTokenizer.from_pretrained(RATING_CHECKPOINT)\n",
+    "\n",
+    "# Text-classification pipeline around the fine-tuned model\n",
+    "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
+    "\n",
+    "# Display text for the integer parsed from a LABEL_<n> prediction: 1 -> '1/5', ..., 5 -> '5/5'\n",
+    "label_mapping = {stars: f\"{stars}/5\" for stars in range(1, 6)}\n",
+    "\n",
+    "def evaluate(text):\n",
+    "    \"\"\"Predict the rating of a review.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The review text.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple ``(rating_text, score)``: the rating formatted like '4/5.' and the\n",
+    "        score the classifier reports for that label.\n",
+    "    \"\"\"\n",
+    "    # truncation=True keeps an over-long review within the tokenizer's maximum input length\n",
+    "    top = classifier(text, truncation=True)[0]\n",
+    "    label_id = int(top['label'].split('_')[1])\n",
+    "    # .get: an id missing from label_mapping (the model is trained with six\n",
+    "    # labels, the mapping lists five) is still formatted instead of raising KeyError\n",
+    "    return label_mapping.get(label_id, f\"{label_id}/5\") + \".\", top['score']\n",
+    "\n",
+    "# Input: the review text; outputs: predicted rating and its score\n",
+    "review_box = gr.Textbox(label=\"Review\")\n",
+    "result_boxes = [gr.Textbox(label=box_label) for box_label in (\"Evaluation\", \"Score\")]\n",
+    "\n",
+    "iface = gr.Interface(\n",
+    "    fn=evaluate,\n",
+    "    inputs=review_box,\n",
+    "    outputs=result_boxes,\n",
+    "    title='Write a review',\n",
+    "    description=\"Write a product review, and the model will evaluate its numerical rating\"\n",
+    ")\n",
+    "\n",
+    "# share=True additionally requests a public gradio.live link\n",
+    "iface.launch(share=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SolutionsInPR",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Other Codes/roberta-summarization.ipynb ADDED Viewed

	@@ -0,0 +1,191 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Load model directly\n",
+    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
+    "\n",
+    "# Tokenizer and seq2seq model are loaded from the same checkpoint\n",
+    "SUMMARY_CHECKPOINT = \"facebook/bart-large-cnn\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(SUMMARY_CHECKPOINT)\n",
+    "model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARY_CHECKPOINT)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "BartForConditionalGeneration(\n",
+       "  (model): BartModel(\n",
+       "    (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
+       "    (encoder): BartEncoder(\n",
+       "      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
+       "      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
+       "      (layers): ModuleList(\n",
+       "        (0-11): 12 x BartEncoderLayer(\n",
+       "          (self_attn): BartSdpaAttention(\n",
+       "            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "          )\n",
+       "          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "          (activation_fn): GELUActivation()\n",
+       "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+       "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+       "          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "    )\n",
+       "    (decoder): BartDecoder(\n",
+       "      (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
+       "      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
+       "      (layers): ModuleList(\n",
+       "        (0-11): 12 x BartDecoderLayer(\n",
+       "          (self_attn): BartSdpaAttention(\n",
+       "            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "          )\n",
+       "          (activation_fn): GELUActivation()\n",
+       "          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "          (encoder_attn): BartSdpaAttention(\n",
+       "            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+       "          )\n",
+       "          (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+       "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+       "          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "        )\n",
+       "      )\n",
+       "      (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
+       "    )\n",
+       "  )\n",
+       "  (lm_head): Linear(in_features=1024, out_features=50264, bias=False)\n",
+       ")"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "# Use the GPU when torch reports one, otherwise stay on the CPU\n",
+    "use_cuda = torch.cuda.is_available()\n",
+    "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on local URL:  http://127.0.0.1:7861\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.44.1, however version 5.0.1 is available, please upgrade. \n",
+      "--------\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Running on public URL: https://1fe44b84e4bdd88e83.gradio.live\n",
+      "\n",
+      "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
+     ]
+    },
+    {
+     "data": {
+      "text/html": [
+       "<div><iframe src=\"https://1fe44b84e4bdd88e83.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
+      ],
+      "text/plain": [
+       "<IPython.core.display.HTML object>"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/plain": []
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "def summarize(text):\n",
+    "    \"\"\"Summarise text with the loaded seq2seq model.\n",
+    "\n",
+    "    Args:\n",
+    "        text: The text to summarise; input beyond 1024 tokens is cut off.\n",
+    "\n",
+    "    Returns:\n",
+    "        The decoded summary (generation is capped at max_length=100).\n",
+    "    \"\"\"\n",
+    "    # truncation=True is what makes max_length take effect on the input.\n",
+    "    # .to(model.device): the model was moved to `device`, so the input tensors\n",
+    "    # must live on the same device or generate() fails when CUDA is used.\n",
+    "    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\").to(model.device)\n",
+    "    summary_ids = model.generate(\n",
+    "        inputs[\"input_ids\"],\n",
+    "        attention_mask=inputs[\"attention_mask\"],\n",
+    "        num_beams=2,\n",
+    "        min_length=0,\n",
+    "        max_length=100,\n",
+    "    )\n",
+    "    return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
+    "\n",
+    "import gradio as gr\n",
+    "\n",
+    "# Input: the text to condense; output: a single summary box\n",
+    "source_box = gr.Textbox(label=\"Text to summarize\")\n",
+    "summary_boxes = [gr.Textbox(label=\"Summary\")]\n",
+    "\n",
+    "iface = gr.Interface(fn=summarize, inputs=source_box, outputs=summary_boxes, title='Summarize text')\n",
+    "\n",
+    "# share=True additionally requests a public gradio.live link\n",
+    "iface.launch(share=True)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SolutionsInPR",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Other Codes/trainer.ipynb ADDED Viewed

	@@ -0,0 +1,116 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Importing necessary libraries\n",
+    "from datasets import load_dataset, ClassLabel\n",
+    "from transformers import AutoTokenizer\n",
+    "from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
+    "import torch\n",
+    "\n",
+    "# Load dataset and keep only the review text and its rating (renamed to 'label')\n",
+    "dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", \"raw_review_Appliances\", trust_remote_code=True, split=\"full\")\n",
+    "dataset = dataset.remove_columns(['title', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])\n",
+    "dataset = dataset.rename_column('rating', 'label')\n",
+    "dataset = dataset.cast_column('label', ClassLabel(num_classes=6))\n",
+    "\n",
+    "# Sample sizes and shuffle seed (a fixed seed makes the sample reproducible)\n",
+    "TRAIN_SIZE = 1000\n",
+    "EVAL_SIZE = 1000\n",
+    "SEED = 42\n",
+    "\n",
+    "# Load pre-trained tokenizer\n",
+    "tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
+    "\n",
+    "# Define tokenization function\n",
+    "def tokenize_function(examples):\n",
+    "    \"\"\"Tokenize a batch of review texts, padded and truncated to 128 tokens.\"\"\"\n",
+    "    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n",
+    "\n",
+    "# Shuffle and cut the sample BEFORE tokenizing, so only the rows that are\n",
+    "# actually used for training and evaluation get tokenized.\n",
+    "sample = dataset.shuffle(seed=SEED).select(range(TRAIN_SIZE + EVAL_SIZE))\n",
+    "tokenized_datasets = sample.map(tokenize_function, batched=True)\n",
+    "print(tokenized_datasets)\n",
+    "\n",
+    "# Load pre-trained RoBERTa model for sequence classification\n",
+    "model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)\n",
+    "\n",
+    "# Define training arguments\n",
+    "# NOTE(review): newer transformers releases rename `evaluation_strategy` to\n",
+    "# `eval_strategy` - confirm against the installed version.\n",
+    "training_args = TrainingArguments(\n",
+    "    output_dir='./results',\n",
+    "    num_train_epochs=10,\n",
+    "    per_device_train_batch_size=16,\n",
+    "    per_device_eval_batch_size=16,\n",
+    "    evaluation_strategy='epoch',\n",
+    "    logging_dir='./logs',\n",
+    ")\n",
+    "\n",
+    "# Create trainer instance; the evaluation rows start right after the training\n",
+    "# rows, so the two sets are disjoint and no row in between is skipped.\n",
+    "trainer = Trainer(\n",
+    "    model=model,\n",
+    "    args=training_args,\n",
+    "    train_dataset=tokenized_datasets.select(range(TRAIN_SIZE)),\n",
+    "    eval_dataset=tokenized_datasets.select(range(TRAIN_SIZE, TRAIN_SIZE + EVAL_SIZE)),\n",
+    ")\n",
+    "\n",
+    "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
+    "model.to(device)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "trainer.train()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
+    "\n",
+    "def compute_metrics(pred):\n",
+    "    \"\"\"Compute accuracy and weighted precision / recall / F1 for a prediction object.\n",
+    "\n",
+    "    Args:\n",
+    "        pred: Object exposing ``label_ids`` (true labels) and ``predictions``\n",
+    "            (per-class scores; the argmax over the last axis is the predicted class).\n",
+    "\n",
+    "    Returns:\n",
+    "        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.\n",
+    "    \"\"\"\n",
+    "    y_true = pred.label_ids\n",
+    "    y_pred = pred.predictions.argmax(-1)\n",
+    "    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')\n",
+    "    return {\n",
+    "        'accuracy': accuracy_score(y_true, y_pred),\n",
+    "        'f1': weighted[2],\n",
+    "        'precision': weighted[0],\n",
+    "        'recall': weighted[1],\n",
+    "    }\n",
+    "\n",
+    "# Update trainer to include custom metrics\n",
+    "trainer.compute_metrics = compute_metrics\n",
+    "\n",
+    "# Evaluate the model\n",
+    "eval_result = trainer.evaluate()\n",
+    "print(eval_result)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Save the fine-tuned model and tokenizer\n",
+    "trainer.save_model('roberta-rating')\n",
+    "tokenizer.save_pretrained('roberta-rating')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "SolutionsInPR",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}