|
{ |
|
"cells": [ |
|
{ |
|
"cell_type": "code", |
|
"execution_count": null, |
|
"metadata": {}, |
|
"outputs": [], |
|
"source": [ |
|
"# Load model directly\n", |
|
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n", |
|
"import torch\n", |
|
"import gradio as gr\n", |
|
"from openpyxl import load_workbook\n", |
|
"from numpy import mean\n", |
|
"\n", |
|
"# Checkpoint identifiers for the three models used in this cell\n", |
|
"SUMMARIZER_CHECKPOINT = \"suriya7/bart-finetuned-text-summarization\"\n", |
|
"KEYWORD_CHECKPOINT = \"transformer3/H2-keywordextractor\"\n", |
|
"# NOTE(review): presumably a local directory holding the fine-tuned classifier -- confirm\n", |
|
"RATING_CHECKPOINT = 'roberta-rating'\n", |
|
"\n", |
|
"# Seq2seq model used by evaluate() to summarize the concatenated reviews\n", |
|
"tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_CHECKPOINT)\n", |
|
"model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_CHECKPOINT)\n", |
|
"\n", |
|
"# Seq2seq model used by evaluate() to generate keywords from the same text\n", |
|
"tokenizer_keywords = AutoTokenizer.from_pretrained(KEYWORD_CHECKPOINT)\n", |
|
"model_keywords = AutoModelForSeq2SeqLM.from_pretrained(KEYWORD_CHECKPOINT)\n", |
|
"\n", |
|
"# Use CUDA when torch reports it as available, otherwise fall back to the CPU\n", |
|
"if torch.cuda.is_available():\n", |
|
"    device = torch.device(\"cuda\")\n", |
|
"else:\n", |
|
"    device = torch.device(\"cpu\")\n", |
|
"\n", |
|
"# Sequence-classification model and tokenizer that predict a rating per review\n", |
|
"new_model = AutoModelForSequenceClassification.from_pretrained(RATING_CHECKPOINT)\n", |
|
"new_tokenizer = AutoTokenizer.from_pretrained(RATING_CHECKPOINT)\n", |
|
"\n", |
|
"# Classification pipeline; only this pipeline is given the selected device\n", |
|
"classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n", |
|
"\n", |
|
"# Display strings for ratings 1..5 (not referenced elsewhere in this cell)\n", |
|
"label_mapping = {rating: f'{rating}/5' for rating in range(1, 6)}\n", |
|
"\n", |
|
"def parse_xl(file_path):\n", |
|
"    \"\"\"Collect every non-empty cell value from an Excel workbook.\n", |
|
"\n", |
|
"    Sheets are visited in workbook order; each sheet is read row by row and\n", |
|
"    each row cell by cell.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        file_path: Workbook location handed to openpyxl's load_workbook.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        A flat list of raw cell values, with empty (None) cells skipped.\n", |
|
"    \"\"\"\n", |
|
"    workbook = load_workbook(filename=file_path)\n", |
|
"    # values_only=True yields plain values instead of Cell objects\n", |
|
"    return [\n", |
|
"        value\n", |
|
"        for sheet in workbook.worksheets\n", |
|
"        for row in sheet.iter_rows(values_only=True)\n", |
|
"        for value in row\n", |
|
"        if value is not None\n", |
|
"    ]\n", |
|
"\n", |
|
"def evaluate(file):\n", |
|
"    \"\"\"Rate, summarize and extract keywords from a workbook of reviews.\n", |
|
"\n", |
|
"    Args:\n", |
|
"        file: Uploaded Excel file from the gr.File input; passed unchanged to parse_xl.\n", |
|
"\n", |
|
"    Returns:\n", |
|
"        Tuple of (mean predicted rating rounded to 2 decimals, summary text, keyword text).\n", |
|
"\n", |
|
"    Raises:\n", |
|
"        gr.Error: If the workbook contains no non-blank review text.\n", |
|
"    \"\"\"\n", |
|
"    # Cells may hold numbers or dates; coerce everything to text and drop blanks\n", |
|
"    reviews = [str(value).strip() for value in parse_xl(file)]\n", |
|
"    reviews = [review for review in reviews if review]\n", |
|
"    if not reviews:\n", |
|
"        # Without this guard mean([]) is NaN and the summarizer runs on an empty string\n", |
|
"        raise gr.Error(\"No reviews found in the uploaded file.\")\n", |
|
"\n", |
|
"    ratings = []\n", |
|
"    for review in reviews:\n", |
|
"        # truncation=True keeps long reviews within the classifier's maximum input length\n", |
|
"        prediction = classifier(review, truncation=True)[0]\n", |
|
"        # Assumes labels look like 'LABEL_<n>' -- TODO confirm against the roberta-rating config\n", |
|
"        ratings.append(int(prediction['label'].split('_')[1]))\n", |
|
"\n", |
|
"    # Both generators read the reviews as one space-separated document\n", |
|
"    text = \" \".join(reviews)\n", |
|
"\n", |
|
"    inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n", |
|
"    summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=50, max_length=1000)\n", |
|
"    summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n", |
|
"\n", |
|
"    inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n", |
|
"    summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n", |
|
"    keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n", |
|
"\n", |
|
"    return round(mean(ratings), 2), summary, keywords\n", |
|
"\n", |
|
"# Single upload widget restricted to Excel workbook/template extensions\n", |
|
"reviews_upload = gr.File(label=\"Reviews\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"])\n", |
|
"# One textbox per value returned by evaluate, in the same order\n", |
|
"result_boxes = [gr.Textbox(label=caption) for caption in (\"Rating\", \"Summary\", \"Keywords\")]\n", |
|
"\n", |
|
"iface = gr.Interface(\n", |
|
"    fn=evaluate,\n", |
|
"    inputs=reviews_upload,\n", |
|
"    outputs=result_boxes,\n", |
|
"    title='Summarize Reviews',\n", |
|
"    description=\"Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each reviews is in its own cell.\",\n", |
|
")\n", |
|
"\n", |
|
"# share=True additionally requests a public Gradio share link for the app\n", |
|
"iface.launch(share=True)" |
|
] |
|
} |
|
], |
|
"metadata": { |
|
"kernelspec": { |
|
"display_name": "SolutionsInPR", |
|
"language": "python", |
|
"name": "python3" |
|
}, |
|
"language_info": { |
|
"codemirror_mode": { |
|
"name": "ipython", |
|
"version": 3 |
|
}, |
|
"file_extension": ".py", |
|
"mimetype": "text/x-python", |
|
"name": "python", |
|
"nbconvert_exporter": "python", |
|
"pygments_lexer": "ipython3", |
|
"version": "3.12.3" |
|
} |
|
}, |
|
"nbformat": 4, |
|
"nbformat_minor": 2 |
|
} |
|
|