Upload 5 files
Browse files
Other Codes/Final revision code.ipynb
ADDED
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 14,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stdout",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"* Running on local URL: http://127.0.0.1:7870\n",
|
13 |
+
"* Running on public URL: https://a94e18f722148a0463.gradio.live\n",
|
14 |
+
"\n",
|
15 |
+
"This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"data": {
|
20 |
+
"text/html": [
|
21 |
+
"<div><iframe src=\"https://a94e18f722148a0463.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
22 |
+
],
|
23 |
+
"text/plain": [
|
24 |
+
"<IPython.core.display.HTML object>"
|
25 |
+
]
|
26 |
+
},
|
27 |
+
"metadata": {},
|
28 |
+
"output_type": "display_data"
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"data": {
|
32 |
+
"text/plain": []
|
33 |
+
},
|
34 |
+
"execution_count": 14,
|
35 |
+
"metadata": {},
|
36 |
+
"output_type": "execute_result"
|
37 |
+
}
|
38 |
+
],
|
39 |
+
"source": [
|
40 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
|
41 |
+
"import torch\n",
|
42 |
+
"import gradio as gr\n",
|
43 |
+
"from openpyxl import load_workbook\n",
|
44 |
+
"from numpy import mean\n",
|
45 |
+
"import pandas as pd\n",
|
46 |
+
"import matplotlib.pyplot as plt\n",
|
47 |
+
"\n",
|
48 |
+
"theme = gr.themes.Soft(\n",
|
49 |
+
" primary_hue=\"amber\",\n",
|
50 |
+
" secondary_hue=\"amber\",\n",
|
51 |
+
" neutral_hue=\"stone\",\n",
|
52 |
+
")\n",
|
53 |
+
"\n",
|
54 |
+
"# Load tokenizers and models\n",
|
55 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
|
56 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
|
57 |
+
"\n",
|
58 |
+
"tokenizer_keywords = AutoTokenizer.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
|
59 |
+
"model_keywords = AutoModelForSeq2SeqLM.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
|
60 |
+
"\n",
|
61 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
62 |
+
"new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
|
63 |
+
"new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
|
64 |
+
"\n",
|
65 |
+
"classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
|
66 |
+
"\n",
|
67 |
+
"label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
|
68 |
+
"\n",
|
69 |
+
"# Load the active sheet of the Excel workbook into a DataFrame and keep only rows that contain every comma-separated keyword (case-insensitive)\n",
|
70 |
+
"def filter_xl(file, keywords):\n",
|
71 |
+
" # Load the workbook and convert it to a DataFrame\n",
|
72 |
+
" workbook = load_workbook(filename=file)\n",
|
73 |
+
" sheet = workbook.active\n",
|
74 |
+
" data = sheet.values\n",
|
75 |
+
" columns = next(data)[0:]\n",
|
76 |
+
" df = pd.DataFrame(data, columns=columns)\n",
|
77 |
+
" \n",
|
78 |
+
" if keywords:\n",
|
79 |
+
" keyword_list = keywords.split(',')\n",
|
80 |
+
" for keyword in keyword_list:\n",
|
81 |
+
" df = df[df.apply(lambda row: row.astype(str).str.contains(keyword.strip(), case=False).any(), axis=1)]\n",
|
82 |
+
" \n",
|
83 |
+
" return df\n",
|
84 |
+
"\n",
|
85 |
+
"# Function to calculate overall rating from filtered data\n",
|
86 |
+
"def calculate_rating(filtered_df):\n",
|
87 |
+
" reviews = filtered_df.to_numpy().flatten()\n",
|
88 |
+
" ratings = []\n",
|
89 |
+
" for review in reviews:\n",
|
90 |
+
" if pd.notna(review):\n",
|
91 |
+
" rating = int(classifier(review)[0]['label'].split('_')[1])\n",
|
92 |
+
" ratings.append(rating)\n",
|
93 |
+
" \n",
|
94 |
+
" return round(mean(ratings), 2), ratings\n",
|
95 |
+
"\n",
|
96 |
+
"# Function to calculate results including summary, keywords, and sentiment\n",
|
97 |
+
"def calculate_results(file, keywords):\n",
|
98 |
+
" filtered_df = filter_xl(file, keywords)\n",
|
99 |
+
" overall_rating, ratings = calculate_rating(filtered_df)\n",
|
100 |
+
" \n",
|
101 |
+
" # Summarize and extract keywords from the filtered reviews\n",
|
102 |
+
" text = \" \".join(filtered_df.to_numpy().flatten())\n",
|
103 |
+
" inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
104 |
+
" summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
|
105 |
+
" summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
106 |
+
" summary = summary.replace(\"I\", \"They\").replace(\"my\", \"their\").replace(\"me\", \"them\")\n",
|
107 |
+
"\n",
|
108 |
+
" inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
109 |
+
" summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
|
110 |
+
" keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
111 |
+
"\n",
|
112 |
+
" # Determine overall sentiment\n",
|
113 |
+
" sentiments = []\n",
|
114 |
+
" for review in filtered_df.to_numpy().flatten():\n",
|
115 |
+
" if pd.notna(review):\n",
|
116 |
+
" sentiment = classifier(review)[0]['label']\n",
|
117 |
+
" sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
|
118 |
+
" sentiments.append(sentiment_label)\n",
|
119 |
+
" \n",
|
120 |
+
" overall_sentiment = \"Positive\" if sentiments.count(\"Positive\") > sentiments.count(\"Negative\") else \"Negative\" if sentiments.count(\"Negative\") > sentiments.count(\"Positive\") else \"Neutral\"\n",
|
121 |
+
"\n",
|
122 |
+
" return overall_rating, summary, keywords, overall_sentiment, ratings, sentiments\n",
|
123 |
+
"\n",
|
124 |
+
"# Function to analyze a single review\n",
|
125 |
+
"def analyze_review(review):\n",
|
126 |
+
" if not review.strip():\n",
|
127 |
+
" return \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\"\n",
|
128 |
+
" \n",
|
129 |
+
" # Calculate rating\n",
|
130 |
+
" rating = int(classifier(review)[0]['label'].split('_')[1])\n",
|
131 |
+
" \n",
|
132 |
+
" # Summarize review\n",
|
133 |
+
" inputs = tokenizer([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
134 |
+
" summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
|
135 |
+
" summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
136 |
+
" summary = summary.replace(\"I\", \"he/she\").replace(\"my\", \"his/her\").replace(\"me\", \"him/her\")\n",
|
137 |
+
"\n",
|
138 |
+
" # Extract keywords\n",
|
139 |
+
" inputs_keywords = tokenizer_keywords([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
140 |
+
" summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
|
141 |
+
" keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
142 |
+
"\n",
|
143 |
+
" # Determine sentiment\n",
|
144 |
+
" sentiment = classifier(review)[0]['label']\n",
|
145 |
+
" sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
|
146 |
+
"\n",
|
147 |
+
" return rating, summary, keywords, sentiment_label\n",
|
148 |
+
"\n",
|
149 |
+
"# Function to count rows in the filtered DataFrame\n",
|
150 |
+
"def count_rows(filtered_df):\n",
|
151 |
+
" return len(filtered_df)\n",
|
152 |
+
"\n",
|
153 |
+
"# Function to plot ratings\n",
|
154 |
+
"def plot_ratings(ratings):\n",
|
155 |
+
" plt.figure(figsize=(10, 5))\n",
|
156 |
+
" plt.hist(ratings, bins=range(1, 7), edgecolor='black', align='left')\n",
|
157 |
+
" plt.xlabel('Rating')\n",
|
158 |
+
" plt.ylabel('Frequency')\n",
|
159 |
+
" plt.title('Distribution of Ratings')\n",
|
160 |
+
" plt.xticks(range(1, 6))\n",
|
161 |
+
" plt.grid(True)\n",
|
162 |
+
" plt.savefig('ratings_distribution.png')\n",
|
163 |
+
" return 'ratings_distribution.png'\n",
|
164 |
+
"\n",
|
165 |
+
"# Function to plot sentiments\n",
|
166 |
+
"def plot_sentiments(sentiments):\n",
|
167 |
+
" sentiment_counts = pd.Series(sentiments).value_counts()\n",
|
168 |
+
" plt.figure(figsize=(10, 5))\n",
|
169 |
+
" sentiment_counts.plot(kind='bar', color=['green', 'red', 'blue'])\n",
|
170 |
+
" plt.xlabel('Sentiment')\n",
|
171 |
+
" plt.ylabel('Frequency')\n",
|
172 |
+
" plt.title('Distribution of Sentiments')\n",
|
173 |
+
" plt.grid(True)\n",
|
174 |
+
" plt.savefig('sentiments_distribution.png')\n",
|
175 |
+
" return 'sentiments_distribution.png'\n",
|
176 |
+
"\n",
|
177 |
+
"# Gradio interface\n",
|
178 |
+
"with gr.Blocks(theme=theme) as demo:\n",
|
179 |
+
" gr.Markdown(\"<h1 style='text-align: center;'>Feedback and Auditing Survey AI Analyzer</h1><br>\")\n",
|
180 |
+
" with gr.Tabs():\n",
|
181 |
+
" with gr.TabItem(\"Upload and Filter\"):\n",
|
182 |
+
" with gr.Row():\n",
|
183 |
+
" with gr.Column(scale=1):\n",
|
184 |
+
" excel_file = gr.File(label=\"Upload Excel File\")\n",
|
185 |
+
" #excel_file = gr.File(label=\"Upload Excel File\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"])\n",
|
186 |
+
" keywords_input = gr.Textbox(label=\"Filter by Keywords (comma-separated)\")\n",
|
187 |
+
" display_button = gr.Button(\"Display and Filter Excel Data\")\n",
|
188 |
+
" clear_button_upload = gr.Button(\"Clear\")\n",
|
189 |
+
" row_count = gr.Textbox(label=\"Number of Rows\", interactive=False)\n",
|
190 |
+
" with gr.Column(scale=3):\n",
|
191 |
+
" filtered_data = gr.Dataframe(label=\"Filtered Excel Contents\")\n",
|
192 |
+
" \n",
|
193 |
+
" with gr.TabItem(\"Calculate Results\"):\n",
|
194 |
+
" with gr.Row():\n",
|
195 |
+
" with gr.Column():\n",
|
196 |
+
" overall_rating = gr.Textbox(label=\"Overall Rating\")\n",
|
197 |
+
" summary = gr.Textbox(label=\"Summary\")\n",
|
198 |
+
" keywords_output = gr.Textbox(label=\"Keywords\")\n",
|
199 |
+
" overall_sentiment = gr.Textbox(label=\"Overall Sentiment\")\n",
|
200 |
+
" calculate_button = gr.Button(\"Calculate Results\")\n",
|
201 |
+
" with gr.Column():\n",
|
202 |
+
" ratings_graph = gr.Image(label=\"Ratings Distribution\")\n",
|
203 |
+
" sentiments_graph = gr.Image(label=\"Sentiments Distribution\")\n",
|
204 |
+
" calculate_graph_button = gr.Button(\"Calculate Graph Results\")\n",
|
205 |
+
" \n",
|
206 |
+
" with gr.TabItem(\"Testing Area / Write a Review\"):\n",
|
207 |
+
" with gr.Row():\n",
|
208 |
+
" with gr.Column(scale=2):\n",
|
209 |
+
" review_input = gr.Textbox(label=\"Write your review here\")\n",
|
210 |
+
" analyze_button = gr.Button(\"Analyze Review\")\n",
|
211 |
+
" clear_button_review = gr.Button(\"Clear\")\n",
|
212 |
+
" with gr.Column(scale=2):\n",
|
213 |
+
" review_rating = gr.Textbox(label=\"Rating\")\n",
|
214 |
+
" review_summary = gr.Textbox(label=\"Summary\")\n",
|
215 |
+
" review_keywords = gr.Textbox(label=\"Keywords\")\n",
|
216 |
+
" review_sentiment = gr.Textbox(label=\"Sentiment\")\n",
|
217 |
+
"\n",
|
218 |
+
" display_button.click(lambda file, keywords: (filter_xl(file, keywords), count_rows(filter_xl(file, keywords))), inputs=[excel_file, keywords_input], outputs=[filtered_data, row_count])\n",
|
219 |
+
" calculate_graph_button.click(lambda file, keywords: (*calculate_results(file, keywords)[:4], plot_ratings(calculate_results(file, keywords)[4]), plot_sentiments(calculate_results(file, keywords)[5])), inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment, ratings_graph, sentiments_graph])\n",
|
220 |
+
" calculate_button.click(lambda file, keywords: (*calculate_results(file, keywords)[:4], plot_ratings(calculate_results(file, keywords)[4])), inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment])\n",
|
221 |
+
" analyze_button.click(analyze_review, inputs=review_input, outputs=[review_rating, review_summary, review_keywords, review_sentiment])\n",
|
222 |
+
" clear_button_upload.click(lambda: (\"\"), outputs=[keywords_input])\n",
|
223 |
+
" clear_button_review.click(lambda: (\"\", \"\", \"\", \"\", \"\"), outputs=[review_input, review_rating, review_summary, review_keywords, review_sentiment])\n",
|
224 |
+
"\n",
|
225 |
+
"demo.launch(share=True)"
|
226 |
+
]
|
227 |
+
}
|
228 |
+
],
|
229 |
+
"metadata": {
|
230 |
+
"kernelspec": {
|
231 |
+
"display_name": "SolutionsInPR",
|
232 |
+
"language": "python",
|
233 |
+
"name": "python3"
|
234 |
+
},
|
235 |
+
"language_info": {
|
236 |
+
"codemirror_mode": {
|
237 |
+
"name": "ipython",
|
238 |
+
"version": 3
|
239 |
+
},
|
240 |
+
"file_extension": ".py",
|
241 |
+
"mimetype": "text/x-python",
|
242 |
+
"name": "python",
|
243 |
+
"nbconvert_exporter": "python",
|
244 |
+
"pygments_lexer": "ipython3",
|
245 |
+
"version": "3.12.4"
|
246 |
+
}
|
247 |
+
},
|
248 |
+
"nbformat": 4,
|
249 |
+
"nbformat_minor": 2
|
250 |
+
}
|
Other Codes/First revision code.ipynb
ADDED
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"# Load model directly\n",
|
10 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
|
11 |
+
"import torch\n",
|
12 |
+
"import gradio as gr\n",
|
13 |
+
"from openpyxl import load_workbook\n",
|
14 |
+
"from numpy import mean\n",
|
15 |
+
"\n",
|
16 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
|
17 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
|
18 |
+
"\n",
|
19 |
+
"tokenizer_keywords = AutoTokenizer.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
|
20 |
+
"model_keywords = AutoModelForSeq2SeqLM.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
|
21 |
+
"\n",
|
22 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
23 |
+
"# Load the fine-tuned model and tokenizer\n",
|
24 |
+
"new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
|
25 |
+
"new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
|
26 |
+
"\n",
|
27 |
+
"\n",
|
28 |
+
"# Create a classification pipeline\n",
|
29 |
+
"classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
|
30 |
+
"\n",
|
31 |
+
"# Map a numeric rating (1-5) to its display string, e.g. 4 -> '4/5'\n",
|
32 |
+
"label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
|
33 |
+
"\n",
|
34 |
+
"def parse_xl(file_path):\n",
|
35 |
+
" cells = []\n",
|
36 |
+
"\n",
|
37 |
+
" workbook = load_workbook(filename=file_path)\n",
|
38 |
+
" for sheet in workbook.worksheets:\n",
|
39 |
+
" for row in sheet.iter_rows():\n",
|
40 |
+
" for cell in row:\n",
|
41 |
+
" if cell.value != None:\n",
|
42 |
+
" cells.append(cell.value)\n",
|
43 |
+
"\n",
|
44 |
+
" return cells\n",
|
45 |
+
"\n",
|
46 |
+
"def evaluate(file):\n",
|
47 |
+
" reviews = parse_xl(file)\n",
|
48 |
+
" ratings = []\n",
|
49 |
+
" text = \"\"\n",
|
50 |
+
"\n",
|
51 |
+
" for review in reviews:\n",
|
52 |
+
" ratings.append(int(classifier(review)[0]['label'].split('_')[1]))\n",
|
53 |
+
" text += review\n",
|
54 |
+
" text += \" \"\n",
|
55 |
+
" \n",
|
56 |
+
" inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
57 |
+
" summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=50, max_length=1000)\n",
|
58 |
+
" summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
59 |
+
"\n",
|
60 |
+
" inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
|
61 |
+
" summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
|
62 |
+
" keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] \n",
|
63 |
+
"\n",
|
64 |
+
" return round(mean(ratings), 2), summary, keywords\n",
|
65 |
+
"\n",
|
66 |
+
"iface = gr.Interface(\n",
|
67 |
+
" fn=evaluate,\n",
|
68 |
+
" inputs=gr.File(label=\"Reviews\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"]),\n",
|
69 |
+
" outputs=[gr.Textbox(label=\"Rating\"), gr.Textbox(label=\"Summary\"), gr.Textbox(label=\"Keywords\")],\n",
|
70 |
+
" title='Summarize Reviews',\n",
|
71 |
+
" description=\"Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each reviews is in its own cell.\"\n",
|
72 |
+
")\n",
|
73 |
+
"\n",
|
74 |
+
"iface.launch(share=True)"
|
75 |
+
]
|
76 |
+
}
|
77 |
+
],
|
78 |
+
"metadata": {
|
79 |
+
"kernelspec": {
|
80 |
+
"display_name": "SolutionsInPR",
|
81 |
+
"language": "python",
|
82 |
+
"name": "python3"
|
83 |
+
},
|
84 |
+
"language_info": {
|
85 |
+
"codemirror_mode": {
|
86 |
+
"name": "ipython",
|
87 |
+
"version": 3
|
88 |
+
},
|
89 |
+
"file_extension": ".py",
|
90 |
+
"mimetype": "text/x-python",
|
91 |
+
"name": "python",
|
92 |
+
"nbconvert_exporter": "python",
|
93 |
+
"pygments_lexer": "ipython3",
|
94 |
+
"version": "3.12.3"
|
95 |
+
}
|
96 |
+
},
|
97 |
+
"nbformat": 4,
|
98 |
+
"nbformat_minor": 2
|
99 |
+
}
|
Other Codes/roberta-rating.ipynb
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline\n",
|
10 |
+
"import torch\n",
|
11 |
+
"import gradio as gr\n",
|
12 |
+
"\n",
|
13 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
14 |
+
"# Load the fine-tuned model and tokenizer\n",
|
15 |
+
"new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
|
16 |
+
"new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
|
17 |
+
"\n",
|
18 |
+
"\n",
|
19 |
+
"# Create a classification pipeline\n",
|
20 |
+
"classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
|
21 |
+
"\n",
|
22 |
+
"# Map the numeric suffix of the classifier label (LABEL_1 .. LABEL_5) to a rating string such as '4/5'\n",
|
23 |
+
"label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
|
24 |
+
"\n",
|
25 |
+
"def evaluate(text):\n",
|
26 |
+
" result = classifier(text)\n",
|
27 |
+
" return label_mapping[int(result[0]['label'].split('_')[1])] + \".\", result[0]['score']\n",
|
28 |
+
"\n",
|
29 |
+
"iface = gr.Interface(\n",
|
30 |
+
" fn=evaluate,\n",
|
31 |
+
" inputs=gr.Textbox(label=\"Review\"),\n",
|
32 |
+
" outputs=[gr.Textbox(label=\"Evaluation\"), gr.Textbox(label=\"Score\")],\n",
|
33 |
+
" title='Write a review',\n",
|
34 |
+
" description=\"Write a product review, and the model will evaluate its numerical rating\"\n",
|
35 |
+
")\n",
|
36 |
+
"\n",
|
37 |
+
"iface.launch(share=True)"
|
38 |
+
]
|
39 |
+
}
|
40 |
+
],
|
41 |
+
"metadata": {
|
42 |
+
"kernelspec": {
|
43 |
+
"display_name": "SolutionsInPR",
|
44 |
+
"language": "python",
|
45 |
+
"name": "python3"
|
46 |
+
},
|
47 |
+
"language_info": {
|
48 |
+
"name": "python",
|
49 |
+
"version": "3.12.3"
|
50 |
+
}
|
51 |
+
},
|
52 |
+
"nbformat": 4,
|
53 |
+
"nbformat_minor": 2
|
54 |
+
}
|
Other Codes/roberta-summarization.ipynb
ADDED
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [
|
8 |
+
{
|
9 |
+
"name": "stderr",
|
10 |
+
"output_type": "stream",
|
11 |
+
"text": [
|
12 |
+
"c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
|
13 |
+
" warnings.warn(\n"
|
14 |
+
]
|
15 |
+
}
|
16 |
+
],
|
17 |
+
"source": [
|
18 |
+
"# Load model directly\n",
|
19 |
+
"from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
|
20 |
+
"\n",
|
21 |
+
"tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large-cnn\")\n",
|
22 |
+
"model = AutoModelForSeq2SeqLM.from_pretrained(\"facebook/bart-large-cnn\")"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": 2,
|
28 |
+
"metadata": {},
|
29 |
+
"outputs": [
|
30 |
+
{
|
31 |
+
"data": {
|
32 |
+
"text/plain": [
|
33 |
+
"BartForConditionalGeneration(\n",
|
34 |
+
" (model): BartModel(\n",
|
35 |
+
" (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
|
36 |
+
" (encoder): BartEncoder(\n",
|
37 |
+
" (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
|
38 |
+
" (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
|
39 |
+
" (layers): ModuleList(\n",
|
40 |
+
" (0-11): 12 x BartEncoderLayer(\n",
|
41 |
+
" (self_attn): BartSdpaAttention(\n",
|
42 |
+
" (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
43 |
+
" (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
44 |
+
" (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
45 |
+
" (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
46 |
+
" )\n",
|
47 |
+
" (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
48 |
+
" (activation_fn): GELUActivation()\n",
|
49 |
+
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
50 |
+
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
51 |
+
" (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
52 |
+
" )\n",
|
53 |
+
" )\n",
|
54 |
+
" (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
55 |
+
" )\n",
|
56 |
+
" (decoder): BartDecoder(\n",
|
57 |
+
" (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
|
58 |
+
" (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
|
59 |
+
" (layers): ModuleList(\n",
|
60 |
+
" (0-11): 12 x BartDecoderLayer(\n",
|
61 |
+
" (self_attn): BartSdpaAttention(\n",
|
62 |
+
" (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
63 |
+
" (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
64 |
+
" (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
65 |
+
" (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
66 |
+
" )\n",
|
67 |
+
" (activation_fn): GELUActivation()\n",
|
68 |
+
" (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
69 |
+
" (encoder_attn): BartSdpaAttention(\n",
|
70 |
+
" (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
71 |
+
" (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
72 |
+
" (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
73 |
+
" (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
|
74 |
+
" )\n",
|
75 |
+
" (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
76 |
+
" (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
|
77 |
+
" (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
|
78 |
+
" (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
79 |
+
" )\n",
|
80 |
+
" )\n",
|
81 |
+
" (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
|
82 |
+
" )\n",
|
83 |
+
" )\n",
|
84 |
+
" (lm_head): Linear(in_features=1024, out_features=50264, bias=False)\n",
|
85 |
+
")"
|
86 |
+
]
|
87 |
+
},
|
88 |
+
"execution_count": 2,
|
89 |
+
"metadata": {},
|
90 |
+
"output_type": "execute_result"
|
91 |
+
}
|
92 |
+
],
|
93 |
+
"source": [
|
94 |
+
"import torch\n",
|
95 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
96 |
+
"model.to(device)"
|
97 |
+
]
|
98 |
+
},
|
99 |
+
{
|
100 |
+
"cell_type": "code",
|
101 |
+
"execution_count": 3,
|
102 |
+
"metadata": {},
|
103 |
+
"outputs": [
|
104 |
+
{
|
105 |
+
"name": "stdout",
|
106 |
+
"output_type": "stream",
|
107 |
+
"text": [
|
108 |
+
"Running on local URL: http://127.0.0.1:7861\n"
|
109 |
+
]
|
110 |
+
},
|
111 |
+
{
|
112 |
+
"name": "stderr",
|
113 |
+
"output_type": "stream",
|
114 |
+
"text": [
|
115 |
+
"c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.44.1, however version 5.0.1 is available, please upgrade. \n",
|
116 |
+
"--------\n",
|
117 |
+
" warnings.warn(\n"
|
118 |
+
]
|
119 |
+
},
|
120 |
+
{
|
121 |
+
"name": "stdout",
|
122 |
+
"output_type": "stream",
|
123 |
+
"text": [
|
124 |
+
"Running on public URL: https://1fe44b84e4bdd88e83.gradio.live\n",
|
125 |
+
"\n",
|
126 |
+
"This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
|
127 |
+
]
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"data": {
|
131 |
+
"text/html": [
|
132 |
+
"<div><iframe src=\"https://1fe44b84e4bdd88e83.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
|
133 |
+
],
|
134 |
+
"text/plain": [
|
135 |
+
"<IPython.core.display.HTML object>"
|
136 |
+
]
|
137 |
+
},
|
138 |
+
"metadata": {},
|
139 |
+
"output_type": "display_data"
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"data": {
|
143 |
+
"text/plain": []
|
144 |
+
},
|
145 |
+
"execution_count": 3,
|
146 |
+
"metadata": {},
|
147 |
+
"output_type": "execute_result"
|
148 |
+
}
|
149 |
+
],
|
150 |
+
"source": [
|
151 |
+
"\n",
|
152 |
+
"def summarize(text):\n",
|
153 |
+
" inputs = tokenizer([text], max_length=1024, return_tensors=\"pt\")\n",
|
154 |
+
" summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
|
155 |
+
" return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
|
156 |
+
"\n",
|
157 |
+
"import gradio as gr\n",
|
158 |
+
"\n",
|
159 |
+
"iface = gr.Interface(\n",
|
160 |
+
" fn=summarize,\n",
|
161 |
+
" inputs=gr.Textbox(label=\"Text to summarize\"),\n",
|
162 |
+
" outputs=[gr.Textbox(label=\"Summary\")],\n",
|
163 |
+
" title='Summarize text'\n",
|
164 |
+
")\n",
|
165 |
+
"\n",
|
166 |
+
"iface.launch(share=True)"
|
167 |
+
]
|
168 |
+
}
|
169 |
+
],
|
170 |
+
"metadata": {
|
171 |
+
"kernelspec": {
|
172 |
+
"display_name": "SolutionsInPR",
|
173 |
+
"language": "python",
|
174 |
+
"name": "python3"
|
175 |
+
},
|
176 |
+
"language_info": {
|
177 |
+
"codemirror_mode": {
|
178 |
+
"name": "ipython",
|
179 |
+
"version": 3
|
180 |
+
},
|
181 |
+
"file_extension": ".py",
|
182 |
+
"mimetype": "text/x-python",
|
183 |
+
"name": "python",
|
184 |
+
"nbconvert_exporter": "python",
|
185 |
+
"pygments_lexer": "ipython3",
|
186 |
+
"version": "3.12.3"
|
187 |
+
}
|
188 |
+
},
|
189 |
+
"nbformat": 4,
|
190 |
+
"nbformat_minor": 2
|
191 |
+
}
|
Other Codes/trainer.ipynb
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"# Importing necessary libraries\n",
|
10 |
+
"from datasets import load_dataset, ClassLabel\n",
|
11 |
+
"from transformers import AutoTokenizer\n",
|
12 |
+
"from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
|
13 |
+
"import torch\n",
|
14 |
+
"\n",
|
15 |
+
"# Load dataset\n",
|
16 |
+
"dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", \"raw_review_Appliances\", trust_remote_code=True, split=\"full\")\n",
|
17 |
+
"dataset = dataset.remove_columns(['title', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])\n",
|
18 |
+
"dataset = dataset.rename_column('rating', 'label')\n",
|
19 |
+
"dataset = dataset.cast_column('label', ClassLabel(num_classes=6))\n",
|
20 |
+
"\n",
|
21 |
+
"# Load pre-trained tokenizer\n",
|
22 |
+
"tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
|
23 |
+
"\n",
|
24 |
+
"# Define tokenization function\n",
|
25 |
+
"def tokenize_function(examples):\n",
|
26 |
+
" return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n",
|
27 |
+
"\n",
|
28 |
+
"# Apply tokenization\n",
|
29 |
+
"tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
|
30 |
+
"tokenized_datasets = tokenized_datasets.shuffle()\n",
|
31 |
+
"print(tokenized_datasets)\n",
|
32 |
+
"\n",
|
33 |
+
"# Load pre-trained RoBERTa model for sequence classification\n",
|
34 |
+
"model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)\n",
|
35 |
+
"\n",
|
36 |
+
"# Define training arguments\n",
|
37 |
+
"training_args = TrainingArguments(\n",
|
38 |
+
" output_dir='./results',\n",
|
39 |
+
" num_train_epochs=10,\n",
|
40 |
+
" per_device_train_batch_size=16,\n",
|
41 |
+
" per_device_eval_batch_size=16,\n",
|
42 |
+
" evaluation_strategy='epoch',\n",
|
43 |
+
" logging_dir='./logs',\n",
|
44 |
+
")\n",
|
45 |
+
"\n",
|
46 |
+
"# Create trainer instance\n",
|
47 |
+
"trainer = Trainer(\n",
|
48 |
+
" model=model,\n",
|
49 |
+
" args=training_args,\n",
|
50 |
+
" train_dataset=tokenized_datasets.select(range(1000)),\n",
|
51 |
+
" eval_dataset=tokenized_datasets.select(range(1001, 2001)),\n",
|
52 |
+
")\n",
|
53 |
+
"\n",
|
54 |
+
"device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
|
55 |
+
"model.to(device)"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": null,
|
61 |
+
"metadata": {},
|
62 |
+
"outputs": [],
|
63 |
+
"source": [
|
64 |
+
"trainer.train()"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": null,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": [
|
73 |
+
"from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
|
74 |
+
"\n",
|
75 |
+
"# Define function to compute metrics\n",
|
76 |
+
"def compute_metrics(pred):\n",
|
77 |
+
" labels = pred.label_ids\n",
|
78 |
+
" preds = pred.predictions.argmax(-1)\n",
|
79 |
+
" precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')\n",
|
80 |
+
" acc = accuracy_score(labels, preds)\n",
|
81 |
+
" return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}\n",
|
82 |
+
"\n",
|
83 |
+
"# Update trainer to include custom metrics\n",
|
84 |
+
"trainer.compute_metrics = compute_metrics\n",
|
85 |
+
"\n",
|
86 |
+
"# Evaluate the model\n",
|
87 |
+
"eval_result = trainer.evaluate()\n",
|
88 |
+
"print(eval_result)"
|
89 |
+
]
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"cell_type": "code",
|
93 |
+
"execution_count": null,
|
94 |
+
"metadata": {},
|
95 |
+
"outputs": [],
|
96 |
+
"source": [
|
97 |
+
"# Save the fine-tuned model and tokenizer\n",
|
98 |
+
"trainer.save_model('roberta-rating')\n",
|
99 |
+
"tokenizer.save_pretrained('roberta-rating')"
|
100 |
+
]
|
101 |
+
}
|
102 |
+
],
|
103 |
+
"metadata": {
|
104 |
+
"kernelspec": {
|
105 |
+
"display_name": "SolutionsInPR",
|
106 |
+
"language": "python",
|
107 |
+
"name": "python3"
|
108 |
+
},
|
109 |
+
"language_info": {
|
110 |
+
"name": "python",
|
111 |
+
"version": "3.12.3"
|
112 |
+
}
|
113 |
+
},
|
114 |
+
"nbformat": 4,
|
115 |
+
"nbformat_minor": 2
|
116 |
+
}
|