Jan90 committed on
Commit
a9cce51
·
verified ·
1 Parent(s): 147e673

Upload 5 files

Browse files
Other Codes/Final revision code.ipynb ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 14,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stdout",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "* Running on local URL: http://127.0.0.1:7870\n",
13
+ "* Running on public URL: https://a94e18f722148a0463.gradio.live\n",
14
+ "\n",
15
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)\n"
16
+ ]
17
+ },
18
+ {
19
+ "data": {
20
+ "text/html": [
21
+ "<div><iframe src=\"https://a94e18f722148a0463.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
22
+ ],
23
+ "text/plain": [
24
+ "<IPython.core.display.HTML object>"
25
+ ]
26
+ },
27
+ "metadata": {},
28
+ "output_type": "display_data"
29
+ },
30
+ {
31
+ "data": {
32
+ "text/plain": []
33
+ },
34
+ "execution_count": 14,
35
+ "metadata": {},
36
+ "output_type": "execute_result"
37
+ }
38
+ ],
39
+ "source": [
40
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
41
+ "import torch\n",
42
+ "import gradio as gr\n",
43
+ "from openpyxl import load_workbook\n",
44
+ "from numpy import mean\n",
45
+ "import pandas as pd\n",
46
+ "import matplotlib.pyplot as plt\n",
47
+ "\n",
48
+ "theme = gr.themes.Soft(\n",
49
+ " primary_hue=\"amber\",\n",
50
+ " secondary_hue=\"amber\",\n",
51
+ " neutral_hue=\"stone\",\n",
52
+ ")\n",
53
+ "\n",
54
+ "# Load tokenizers and models\n",
55
+ "tokenizer = AutoTokenizer.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
56
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
57
+ "\n",
58
+ "tokenizer_keywords = AutoTokenizer.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
59
+ "model_keywords = AutoModelForSeq2SeqLM.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
60
+ "\n",
61
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
62
+ "new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
63
+ "new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
64
+ "\n",
65
+ "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
66
+ "\n",
67
+ "label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
68
+ "\n",
69
+ "# Load the active sheet of the Excel workbook into a DataFrame and keep only rows that contain every keyword (case-insensitive)\n",
70
+ "def filter_xl(file, keywords):\n",
71
+ " # Load the workbook and convert it to a DataFrame\n",
72
+ " workbook = load_workbook(filename=file)\n",
73
+ " sheet = workbook.active\n",
74
+ " data = sheet.values\n",
75
+ " columns = next(data)[0:]\n",
76
+ " df = pd.DataFrame(data, columns=columns)\n",
77
+ " \n",
78
+ " if keywords:\n",
79
+ " keyword_list = keywords.split(',')\n",
80
+ " for keyword in keyword_list:\n",
81
+ " df = df[df.apply(lambda row: row.astype(str).str.contains(keyword.strip(), case=False).any(), axis=1)]\n",
82
+ " \n",
83
+ " return df\n",
84
+ "\n",
85
+ "# Function to calculate overall rating from filtered data\n",
86
+ "def calculate_rating(filtered_df):\n",
87
+ " reviews = filtered_df.to_numpy().flatten()\n",
88
+ " ratings = []\n",
89
+ " for review in reviews:\n",
90
+ " if pd.notna(review):\n",
91
+ " rating = int(classifier(review)[0]['label'].split('_')[1])\n",
92
+ " ratings.append(rating)\n",
93
+ " \n",
94
+ " return round(mean(ratings), 2), ratings\n",
95
+ "\n",
96
+ "# Function to calculate results including summary, keywords, and sentiment\n",
97
+ "def calculate_results(file, keywords):\n",
98
+ " filtered_df = filter_xl(file, keywords)\n",
99
+ " overall_rating, ratings = calculate_rating(filtered_df)\n",
100
+ " \n",
101
+ " # Summarize and extract keywords from the filtered reviews\n",
102
+ " text = \" \".join(filtered_df.to_numpy().flatten())\n",
103
+ " inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
104
+ " summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
105
+ " summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
106
+ " summary = summary.replace(\"I\", \"They\").replace(\"my\", \"their\").replace(\"me\", \"them\")\n",
107
+ "\n",
108
+ " inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
109
+ " summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
110
+ " keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
111
+ "\n",
112
+ " # Determine overall sentiment\n",
113
+ " sentiments = []\n",
114
+ " for review in filtered_df.to_numpy().flatten():\n",
115
+ " if pd.notna(review):\n",
116
+ " sentiment = classifier(review)[0]['label']\n",
117
+ " sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
118
+ " sentiments.append(sentiment_label)\n",
119
+ " \n",
120
+ " overall_sentiment = \"Positive\" if sentiments.count(\"Positive\") > sentiments.count(\"Negative\") else \"Negative\" if sentiments.count(\"Negative\") > sentiments.count(\"Positive\") else \"Neutral\"\n",
121
+ "\n",
122
+ " return overall_rating, summary, keywords, overall_sentiment, ratings, sentiments\n",
123
+ "\n",
124
+ "# Function to analyze a single review\n",
125
+ "def analyze_review(review):\n",
126
+ " if not review.strip():\n",
127
+ " return \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\", \"Error: No text provided\"\n",
128
+ " \n",
129
+ " # Calculate rating\n",
130
+ " rating = int(classifier(review)[0]['label'].split('_')[1])\n",
131
+ " \n",
132
+ " # Summarize review\n",
133
+ " inputs = tokenizer([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
134
+ " summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=10, max_length=50)\n",
135
+ " summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
136
+ " summary = summary.replace(\"I\", \"he/she\").replace(\"my\", \"his/her\").replace(\"me\", \"him/her\")\n",
137
+ "\n",
138
+ " # Extract keywords\n",
139
+ " inputs_keywords = tokenizer_keywords([review], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
140
+ " summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
141
+ " keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
142
+ "\n",
143
+ " # Determine sentiment\n",
144
+ " sentiment = classifier(review)[0]['label']\n",
145
+ " sentiment_label = \"Positive\" if sentiment == \"LABEL_4\" or sentiment == \"LABEL_5\" else \"Negative\" if sentiment == \"LABEL_1\" or sentiment == \"LABEL_2\" else \"Neutral\"\n",
146
+ "\n",
147
+ " return rating, summary, keywords, sentiment_label\n",
148
+ "\n",
149
+ "# Function to count rows in the filtered DataFrame\n",
150
+ "def count_rows(filtered_df):\n",
151
+ " return len(filtered_df)\n",
152
+ "\n",
153
+ "# Function to plot ratings\n",
154
+ "def plot_ratings(ratings):\n",
155
+ " plt.figure(figsize=(10, 5))\n",
156
+ " plt.hist(ratings, bins=range(1, 7), edgecolor='black', align='left')\n",
157
+ " plt.xlabel('Rating')\n",
158
+ " plt.ylabel('Frequency')\n",
159
+ " plt.title('Distribution of Ratings')\n",
160
+ " plt.xticks(range(1, 6))\n",
161
+ " plt.grid(True)\n",
162
+ " plt.savefig('ratings_distribution.png')\n",
163
+ " return 'ratings_distribution.png'\n",
164
+ "\n",
165
+ "# Function to plot sentiments\n",
166
+ "def plot_sentiments(sentiments):\n",
167
+ " sentiment_counts = pd.Series(sentiments).value_counts()\n",
168
+ " plt.figure(figsize=(10, 5))\n",
169
+ " sentiment_counts.plot(kind='bar', color=['green', 'red', 'blue'])\n",
170
+ " plt.xlabel('Sentiment')\n",
171
+ " plt.ylabel('Frequency')\n",
172
+ " plt.title('Distribution of Sentiments')\n",
173
+ " plt.grid(True)\n",
174
+ " plt.savefig('sentiments_distribution.png')\n",
175
+ " return 'sentiments_distribution.png'\n",
176
+ "\n",
177
+ "# Gradio interface\n",
178
+ "with gr.Blocks(theme=theme) as demo:\n",
179
+ " gr.Markdown(\"<h1 style='text-align: center;'>Feedback and Auditing Survey AI Analyzer</h1><br>\")\n",
180
+ " with gr.Tabs():\n",
181
+ " with gr.TabItem(\"Upload and Filter\"):\n",
182
+ " with gr.Row():\n",
183
+ " with gr.Column(scale=1):\n",
184
+ " excel_file = gr.File(label=\"Upload Excel File\")\n",
185
+ " #excel_file = gr.File(label=\"Upload Excel File\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"])\n",
186
+ " keywords_input = gr.Textbox(label=\"Filter by Keywords (comma-separated)\")\n",
187
+ " display_button = gr.Button(\"Display and Filter Excel Data\")\n",
188
+ " clear_button_upload = gr.Button(\"Clear\")\n",
189
+ " row_count = gr.Textbox(label=\"Number of Rows\", interactive=False)\n",
190
+ " with gr.Column(scale=3):\n",
191
+ " filtered_data = gr.Dataframe(label=\"Filtered Excel Contents\")\n",
192
+ " \n",
193
+ " with gr.TabItem(\"Calculate Results\"):\n",
194
+ " with gr.Row():\n",
195
+ " with gr.Column():\n",
196
+ " overall_rating = gr.Textbox(label=\"Overall Rating\")\n",
197
+ " summary = gr.Textbox(label=\"Summary\")\n",
198
+ " keywords_output = gr.Textbox(label=\"Keywords\")\n",
199
+ " overall_sentiment = gr.Textbox(label=\"Overall Sentiment\")\n",
200
+ " calculate_button = gr.Button(\"Calculate Results\")\n",
201
+ " with gr.Column():\n",
202
+ " ratings_graph = gr.Image(label=\"Ratings Distribution\")\n",
203
+ " sentiments_graph = gr.Image(label=\"Sentiments Distribution\")\n",
204
+ " calculate_graph_button = gr.Button(\"Calculate Graph Results\")\n",
205
+ " \n",
206
+ " with gr.TabItem(\"Testing Area / Write a Review\"):\n",
207
+ " with gr.Row():\n",
208
+ " with gr.Column(scale=2):\n",
209
+ " review_input = gr.Textbox(label=\"Write your review here\")\n",
210
+ " analyze_button = gr.Button(\"Analyze Review\")\n",
211
+ " clear_button_review = gr.Button(\"Clear\")\n",
212
+ " with gr.Column(scale=2):\n",
213
+ " review_rating = gr.Textbox(label=\"Rating\")\n",
214
+ " review_summary = gr.Textbox(label=\"Summary\")\n",
215
+ " review_keywords = gr.Textbox(label=\"Keywords\")\n",
216
+ " review_sentiment = gr.Textbox(label=\"Sentiment\")\n",
217
+ "\n",
218
+ " display_button.click(lambda file, keywords: (filter_xl(file, keywords), count_rows(filter_xl(file, keywords))), inputs=[excel_file, keywords_input], outputs=[filtered_data, row_count])\n",
219
+ " calculate_graph_button.click(lambda file, keywords: (*calculate_results(file, keywords)[:4], plot_ratings(calculate_results(file, keywords)[4]), plot_sentiments(calculate_results(file, keywords)[5])), inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment, ratings_graph, sentiments_graph])\n",
220
+ " calculate_button.click(lambda file, keywords: (*calculate_results(file, keywords)[:4], plot_ratings(calculate_results(file, keywords)[4])), inputs=[excel_file, keywords_input], outputs=[overall_rating, summary, keywords_output, overall_sentiment])\n",
221
+ " analyze_button.click(analyze_review, inputs=review_input, outputs=[review_rating, review_summary, review_keywords, review_sentiment])\n",
222
+ " clear_button_upload.click(lambda: (\"\"), outputs=[keywords_input])\n",
223
+ " clear_button_review.click(lambda: (\"\", \"\", \"\", \"\", \"\"), outputs=[review_input, review_rating, review_summary, review_keywords, review_sentiment])\n",
224
+ "\n",
225
+ "demo.launch(share=True)"
226
+ ]
227
+ }
228
+ ],
229
+ "metadata": {
230
+ "kernelspec": {
231
+ "display_name": "SolutionsInPR",
232
+ "language": "python",
233
+ "name": "python3"
234
+ },
235
+ "language_info": {
236
+ "codemirror_mode": {
237
+ "name": "ipython",
238
+ "version": 3
239
+ },
240
+ "file_extension": ".py",
241
+ "mimetype": "text/x-python",
242
+ "name": "python",
243
+ "nbconvert_exporter": "python",
244
+ "pygments_lexer": "ipython3",
245
+ "version": "3.12.4"
246
+ }
247
+ },
248
+ "nbformat": 4,
249
+ "nbformat_minor": 2
250
+ }
Other Codes/First revision code.ipynb ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Load model directly\n",
10
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, TextClassificationPipeline\n",
11
+ "import torch\n",
12
+ "import gradio as gr\n",
13
+ "from openpyxl import load_workbook\n",
14
+ "from numpy import mean\n",
15
+ "\n",
16
+ "tokenizer = AutoTokenizer.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
17
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"suriya7/bart-finetuned-text-summarization\")\n",
18
+ "\n",
19
+ "tokenizer_keywords = AutoTokenizer.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
20
+ "model_keywords = AutoModelForSeq2SeqLM.from_pretrained(\"transformer3/H2-keywordextractor\")\n",
21
+ "\n",
22
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
23
+ "# Load the fine-tuned model and tokenizer\n",
24
+ "new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
25
+ "new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
26
+ "\n",
27
+ "\n",
28
+ "# Create a classification pipeline\n",
29
+ "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
30
+ "\n",
31
+ "# Map a numeric rating class to its display string (rating out of five)\n",
32
+ "label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
33
+ "\n",
34
+ "def parse_xl(file_path):\n",
35
+ " cells = []\n",
36
+ "\n",
37
+ " workbook = load_workbook(filename=file_path)\n",
38
+ " for sheet in workbook.worksheets:\n",
39
+ " for row in sheet.iter_rows():\n",
40
+ " for cell in row:\n",
41
+ " if cell.value != None:\n",
42
+ " cells.append(cell.value)\n",
43
+ "\n",
44
+ " return cells\n",
45
+ "\n",
46
+ "def evaluate(file):\n",
47
+ " reviews = parse_xl(file)\n",
48
+ " ratings = []\n",
49
+ " text = \"\"\n",
50
+ "\n",
51
+ " for review in reviews:\n",
52
+ " ratings.append(int(classifier(review)[0]['label'].split('_')[1]))\n",
53
+ " text += review\n",
54
+ " text += \" \"\n",
55
+ " \n",
56
+ " inputs = tokenizer([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
57
+ " summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=50, max_length=1000)\n",
58
+ " summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
59
+ "\n",
60
+ " inputs_keywords = tokenizer_keywords([text], max_length=1024, truncation=True, return_tensors=\"pt\")\n",
61
+ " summary_ids_keywords = model_keywords.generate(inputs_keywords[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
62
+ " keywords = tokenizer_keywords.batch_decode(summary_ids_keywords, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] \n",
63
+ "\n",
64
+ " return round(mean(ratings), 2), summary, keywords\n",
65
+ "\n",
66
+ "iface = gr.Interface(\n",
67
+ " fn=evaluate,\n",
68
+ " inputs=gr.File(label=\"Reviews\", file_types=[\".xlsx\", \".xlsm\", \".xltx\", \".xltm\"]),\n",
69
+ " outputs=[gr.Textbox(label=\"Rating\"), gr.Textbox(label=\"Summary\"), gr.Textbox(label=\"Keywords\")],\n",
70
+ " title='Summarize Reviews',\n",
71
+ " description=\"Evaluate and summarize collection of reviews. Reviews are submitted as an Excel file, where each reviews is in its own cell.\"\n",
72
+ ")\n",
73
+ "\n",
74
+ "iface.launch(share=True)"
75
+ ]
76
+ }
77
+ ],
78
+ "metadata": {
79
+ "kernelspec": {
80
+ "display_name": "SolutionsInPR",
81
+ "language": "python",
82
+ "name": "python3"
83
+ },
84
+ "language_info": {
85
+ "codemirror_mode": {
86
+ "name": "ipython",
87
+ "version": 3
88
+ },
89
+ "file_extension": ".py",
90
+ "mimetype": "text/x-python",
91
+ "name": "python",
92
+ "nbconvert_exporter": "python",
93
+ "pygments_lexer": "ipython3",
94
+ "version": "3.12.3"
95
+ }
96
+ },
97
+ "nbformat": 4,
98
+ "nbformat_minor": 2
99
+ }
Other Codes/roberta-rating.ipynb ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TextClassificationPipeline\n",
10
+ "import torch\n",
11
+ "import gradio as gr\n",
12
+ "\n",
13
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
14
+ "# Load the fine-tuned model and tokenizer\n",
15
+ "new_model = AutoModelForSequenceClassification.from_pretrained('roberta-rating')\n",
16
+ "new_tokenizer = AutoTokenizer.from_pretrained('roberta-rating')\n",
17
+ "\n",
18
+ "\n",
19
+ "# Create a classification pipeline\n",
20
+ "classifier = TextClassificationPipeline(model=new_model, tokenizer=new_tokenizer, device=device)\n",
21
+ "\n",
22
+ "# Map the numeric class (the integer after the underscore in the pipeline label) to a rating out of five\n",
23
+ "label_mapping = {1: '1/5', 2: '2/5', 3: '3/5', 4: '4/5', 5: '5/5'}\n",
24
+ "\n",
25
+ "def evaluate(text):\n",
26
+ " result = classifier(text)\n",
27
+ " return label_mapping[int(result[0]['label'].split('_')[1])] + \".\", result[0]['score']\n",
28
+ "\n",
29
+ "iface = gr.Interface(\n",
30
+ " fn=evaluate,\n",
31
+ " inputs=gr.Textbox(label=\"Review\"),\n",
32
+ " outputs=[gr.Textbox(label=\"Evaluation\"), gr.Textbox(label=\"Score\")],\n",
33
+ " title='Write a review',\n",
34
+ " description=\"Write a product review, and the model will evaluate its numerical rating\"\n",
35
+ ")\n",
36
+ "\n",
37
+ "iface.launch(share=True)"
38
+ ]
39
+ }
40
+ ],
41
+ "metadata": {
42
+ "kernelspec": {
43
+ "display_name": "SolutionsInPR",
44
+ "language": "python",
45
+ "name": "python3"
46
+ },
47
+ "language_info": {
48
+ "name": "python",
49
+ "version": "3.12.3"
50
+ }
51
+ },
52
+ "nbformat": 4,
53
+ "nbformat_minor": 2
54
+ }
Other Codes/roberta-summarization.ipynb ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "name": "stderr",
10
+ "output_type": "stream",
11
+ "text": [
12
+ "c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\transformers\\tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
13
+ " warnings.warn(\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "# Load model directly\n",
19
+ "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n",
20
+ "\n",
21
+ "tokenizer = AutoTokenizer.from_pretrained(\"facebook/bart-large-cnn\")\n",
22
+ "model = AutoModelForSeq2SeqLM.from_pretrained(\"facebook/bart-large-cnn\")"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "data": {
32
+ "text/plain": [
33
+ "BartForConditionalGeneration(\n",
34
+ " (model): BartModel(\n",
35
+ " (shared): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
36
+ " (encoder): BartEncoder(\n",
37
+ " (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
38
+ " (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
39
+ " (layers): ModuleList(\n",
40
+ " (0-11): 12 x BartEncoderLayer(\n",
41
+ " (self_attn): BartSdpaAttention(\n",
42
+ " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
43
+ " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
44
+ " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
45
+ " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
46
+ " )\n",
47
+ " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
48
+ " (activation_fn): GELUActivation()\n",
49
+ " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
50
+ " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
51
+ " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
52
+ " )\n",
53
+ " )\n",
54
+ " (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
55
+ " )\n",
56
+ " (decoder): BartDecoder(\n",
57
+ " (embed_tokens): BartScaledWordEmbedding(50264, 1024, padding_idx=1)\n",
58
+ " (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)\n",
59
+ " (layers): ModuleList(\n",
60
+ " (0-11): 12 x BartDecoderLayer(\n",
61
+ " (self_attn): BartSdpaAttention(\n",
62
+ " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
63
+ " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
64
+ " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
65
+ " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
66
+ " )\n",
67
+ " (activation_fn): GELUActivation()\n",
68
+ " (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
69
+ " (encoder_attn): BartSdpaAttention(\n",
70
+ " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
71
+ " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
72
+ " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
73
+ " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
74
+ " )\n",
75
+ " (encoder_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
76
+ " (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
77
+ " (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
78
+ " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
79
+ " )\n",
80
+ " )\n",
81
+ " (layernorm_embedding): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n",
82
+ " )\n",
83
+ " )\n",
84
+ " (lm_head): Linear(in_features=1024, out_features=50264, bias=False)\n",
85
+ ")"
86
+ ]
87
+ },
88
+ "execution_count": 2,
89
+ "metadata": {},
90
+ "output_type": "execute_result"
91
+ }
92
+ ],
93
+ "source": [
94
+ "import torch\n",
95
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
96
+ "model.to(device)"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 3,
102
+ "metadata": {},
103
+ "outputs": [
104
+ {
105
+ "name": "stdout",
106
+ "output_type": "stream",
107
+ "text": [
108
+ "Running on local URL: http://127.0.0.1:7861\n"
109
+ ]
110
+ },
111
+ {
112
+ "name": "stderr",
113
+ "output_type": "stream",
114
+ "text": [
115
+ "c:\\Users\\panuk\\anaconda3\\envs\\SolutionsInPR\\Lib\\site-packages\\gradio\\analytics.py:106: UserWarning: IMPORTANT: You are using gradio version 4.44.1, however version 5.0.1 is available, please upgrade. \n",
116
+ "--------\n",
117
+ " warnings.warn(\n"
118
+ ]
119
+ },
120
+ {
121
+ "name": "stdout",
122
+ "output_type": "stream",
123
+ "text": [
124
+ "Running on public URL: https://1fe44b84e4bdd88e83.gradio.live\n",
125
+ "\n",
126
+ "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
127
+ ]
128
+ },
129
+ {
130
+ "data": {
131
+ "text/html": [
132
+ "<div><iframe src=\"https://1fe44b84e4bdd88e83.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
133
+ ],
134
+ "text/plain": [
135
+ "<IPython.core.display.HTML object>"
136
+ ]
137
+ },
138
+ "metadata": {},
139
+ "output_type": "display_data"
140
+ },
141
+ {
142
+ "data": {
143
+ "text/plain": []
144
+ },
145
+ "execution_count": 3,
146
+ "metadata": {},
147
+ "output_type": "execute_result"
148
+ }
149
+ ],
150
+ "source": [
151
+ "\n",
152
+ "def summarize(text):\n",
153
+ " inputs = tokenizer([text], max_length=1024, return_tensors=\"pt\")\n",
154
+ " summary_ids = model.generate(inputs[\"input_ids\"], num_beams=2, min_length=0, max_length=100)\n",
155
+ " return tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]\n",
156
+ "\n",
157
+ "import gradio as gr\n",
158
+ "\n",
159
+ "iface = gr.Interface(\n",
160
+ " fn=summarize,\n",
161
+ " inputs=gr.Textbox(label=\"Text to summarize\"),\n",
162
+ " outputs=[gr.Textbox(label=\"Summary\")],\n",
163
+ " title='Summarize text'\n",
164
+ ")\n",
165
+ "\n",
166
+ "iface.launch(share=True)"
167
+ ]
168
+ }
169
+ ],
170
+ "metadata": {
171
+ "kernelspec": {
172
+ "display_name": "SolutionsInPR",
173
+ "language": "python",
174
+ "name": "python3"
175
+ },
176
+ "language_info": {
177
+ "codemirror_mode": {
178
+ "name": "ipython",
179
+ "version": 3
180
+ },
181
+ "file_extension": ".py",
182
+ "mimetype": "text/x-python",
183
+ "name": "python",
184
+ "nbconvert_exporter": "python",
185
+ "pygments_lexer": "ipython3",
186
+ "version": "3.12.3"
187
+ }
188
+ },
189
+ "nbformat": 4,
190
+ "nbformat_minor": 2
191
+ }
Other Codes/trainer.ipynb ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# Importing necessary libraries\n",
10
+ "from datasets import load_dataset, ClassLabel\n",
11
+ "from transformers import AutoTokenizer\n",
12
+ "from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments\n",
13
+ "import torch\n",
14
+ "\n",
15
+ "# Load dataset\n",
16
+ "dataset = load_dataset(\"McAuley-Lab/Amazon-Reviews-2023\", \"raw_review_Appliances\", trust_remote_code=True, split=\"full\")\n",
17
+ "dataset = dataset.remove_columns(['title', 'images', 'asin', 'parent_asin', 'user_id', 'timestamp', 'helpful_vote', 'verified_purchase'])\n",
18
+ "dataset = dataset.rename_column('rating', 'label')\n",
19
+ "dataset = dataset.cast_column('label', ClassLabel(num_classes=6))\n",
20
+ "\n",
21
+ "# Load pre-trained tokenizer\n",
22
+ "tokenizer = AutoTokenizer.from_pretrained('roberta-base')\n",
23
+ "\n",
24
+ "# Define tokenization function\n",
25
+ "def tokenize_function(examples):\n",
26
+ " return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)\n",
27
+ "\n",
28
+ "# Apply tokenization\n",
29
+ "tokenized_datasets = dataset.map(tokenize_function, batched=True)\n",
30
+ "tokenized_datasets = tokenized_datasets.shuffle()\n",
31
+ "print(tokenized_datasets)\n",
32
+ "\n",
33
+ "# Load pre-trained RoBERTa model for sequence classification\n",
34
+ "model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=6)\n",
35
+ "\n",
36
+ "# Define training arguments\n",
37
+ "training_args = TrainingArguments(\n",
38
+ " output_dir='./results',\n",
39
+ " num_train_epochs=10,\n",
40
+ " per_device_train_batch_size=16,\n",
41
+ " per_device_eval_batch_size=16,\n",
42
+ " evaluation_strategy='epoch',\n",
43
+ " logging_dir='./logs',\n",
44
+ ")\n",
45
+ "\n",
46
+ "# Create trainer instance\n",
47
+ "trainer = Trainer(\n",
48
+ " model=model,\n",
49
+ " args=training_args,\n",
50
+ " train_dataset=tokenized_datasets.select(range(1000)),\n",
51
+ " eval_dataset=tokenized_datasets.select(range(1001, 2001)),\n",
52
+ ")\n",
53
+ "\n",
54
+ "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n",
55
+ "model.to(device)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "trainer.train()"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": null,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "from sklearn.metrics import accuracy_score, precision_recall_fscore_support\n",
74
+ "\n",
75
+ "# Define function to compute metrics\n",
76
+ "def compute_metrics(pred):\n",
77
+ " labels = pred.label_ids\n",
78
+ " preds = pred.predictions.argmax(-1)\n",
79
+ " precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')\n",
80
+ " acc = accuracy_score(labels, preds)\n",
81
+ " return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}\n",
82
+ "\n",
83
+ "# Update trainer to include custom metrics\n",
84
+ "trainer.compute_metrics = compute_metrics\n",
85
+ "\n",
86
+ "# Evaluate the model\n",
87
+ "eval_result = trainer.evaluate()\n",
88
+ "print(eval_result)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# Save the fine-tuned model and tokenizer\n",
98
+ "trainer.save_model('roberta-rating')\n",
99
+ "tokenizer.save_pretrained('roberta-rating')"
100
+ ]
101
+ }
102
+ ],
103
+ "metadata": {
104
+ "kernelspec": {
105
+ "display_name": "SolutionsInPR",
106
+ "language": "python",
107
+ "name": "python3"
108
+ },
109
+ "language_info": {
110
+ "name": "python",
111
+ "version": "3.12.3"
112
+ }
113
+ },
114
+ "nbformat": 4,
115
+ "nbformat_minor": 2
116
+ }