igorithm committed on
Commit
1b671b4
·
1 Parent(s): 1a450dd

Notebook with English dataset preparation

Browse files

Preparation includes removing unused columns and transforming labels to
main categories.

category_classification/datasets/en/download_common.py ADDED
@@ -0,0 +1 @@
 
 
1
+ ../download_common.py
category_classification/datasets/en/prepare_dataset.ipynb ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "execution": {
8
+ "iopub.execute_input": "2025-04-09T06:13:55.446824Z",
9
+ "iopub.status.busy": "2025-04-09T06:13:55.445794Z",
10
+ "iopub.status.idle": "2025-04-09T06:13:56.137367Z",
11
+ "shell.execute_reply": "2025-04-09T06:13:56.136554Z",
12
+ "shell.execute_reply.started": "2025-04-09T06:13:55.446782Z"
13
+ },
14
+ "tags": []
15
+ },
16
+ "outputs": [],
17
+ "source": [
18
+ "from pathlib import Path\n",
19
+ "\n",
20
+ "from download_common import *"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 2,
26
+ "metadata": {
27
+ "execution": {
28
+ "iopub.execute_input": "2025-04-09T06:13:56.140406Z",
29
+ "iopub.status.busy": "2025-04-09T06:13:56.138861Z",
30
+ "iopub.status.idle": "2025-04-09T06:13:56.182854Z",
31
+ "shell.execute_reply": "2025-04-09T06:13:56.182207Z",
32
+ "shell.execute_reply.started": "2025-04-09T06:13:56.140363Z"
33
+ },
34
+ "tags": []
35
+ },
36
+ "outputs": [],
37
+ "source": [
38
+ "dest_dir = Path(globals()[\"_dh\"][0])\n",
39
+ "json_filename = \"arxiv-metadata-oai-snapshot.json\"\n",
40
+ "dataset = \"Cornell-University/arxiv\"\n",
41
+ "old_label = \"categories\"\n",
42
+ "new_label = \"category\"\n",
43
+ "train_filename = \"arxiv_train.json\"\n",
44
+ "test_filename = \"arxiv_test.json\""
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 3,
50
+ "metadata": {
51
+ "execution": {
52
+ "iopub.execute_input": "2025-04-09T06:13:56.184655Z",
53
+ "iopub.status.busy": "2025-04-09T06:13:56.183825Z",
54
+ "iopub.status.idle": "2025-04-09T06:15:23.665384Z",
55
+ "shell.execute_reply": "2025-04-09T06:15:23.664523Z",
56
+ "shell.execute_reply.started": "2025-04-09T06:13:56.184630Z"
57
+ }
58
+ },
59
+ "outputs": [
60
+ {
61
+ "name": "stdout",
62
+ "output_type": "stream",
63
+ "text": [
64
+ "Dataset already exists, do not download\n",
65
+ "Reading dataset...\n",
66
+ "Dataset read\n"
67
+ ]
68
+ }
69
+ ],
70
+ "source": [
71
+ "df = download_and_read_dataset(dest_dir=dest_dir, dataset=dataset, filename=json_filename)"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": 4,
77
+ "metadata": {
78
+ "execution": {
79
+ "iopub.execute_input": "2025-04-09T06:15:23.667946Z",
80
+ "iopub.status.busy": "2025-04-09T06:15:23.666981Z",
81
+ "iopub.status.idle": "2025-04-09T06:15:23.702581Z",
82
+ "shell.execute_reply": "2025-04-09T06:15:23.701966Z",
83
+ "shell.execute_reply.started": "2025-04-09T06:15:23.667909Z"
84
+ },
85
+ "tags": []
86
+ },
87
+ "outputs": [
88
+ {
89
+ "data": {
90
+ "text/html": [
91
+ "<div>\n",
92
+ "<style scoped>\n",
93
+ " .dataframe tbody tr th:only-of-type {\n",
94
+ " vertical-align: middle;\n",
95
+ " }\n",
96
+ "\n",
97
+ " .dataframe tbody tr th {\n",
98
+ " vertical-align: top;\n",
99
+ " }\n",
100
+ "\n",
101
+ " .dataframe thead th {\n",
102
+ " text-align: right;\n",
103
+ " }\n",
104
+ "</style>\n",
105
+ "<table border=\"1\" class=\"dataframe\">\n",
106
+ " <thead>\n",
107
+ " <tr style=\"text-align: right;\">\n",
108
+ " <th></th>\n",
109
+ " <th>id</th>\n",
110
+ " <th>submitter</th>\n",
111
+ " <th>authors</th>\n",
112
+ " <th>title</th>\n",
113
+ " <th>comments</th>\n",
114
+ " <th>journal-ref</th>\n",
115
+ " <th>doi</th>\n",
116
+ " <th>report-no</th>\n",
117
+ " <th>categories</th>\n",
118
+ " <th>license</th>\n",
119
+ " <th>abstract</th>\n",
120
+ " <th>versions</th>\n",
121
+ " <th>update_date</th>\n",
122
+ " <th>authors_parsed</th>\n",
123
+ " </tr>\n",
124
+ " </thead>\n",
125
+ " <tbody>\n",
126
+ " <tr>\n",
127
+ " <th>0</th>\n",
128
+ " <td>0704.0001</td>\n",
129
+ " <td>Pavel Nadolsky</td>\n",
130
+ " <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
131
+ " <td>Calculation of prompt diphoton production cros...</td>\n",
132
+ " <td>37 pages, 15 figures; published version</td>\n",
133
+ " <td>Phys.Rev.D76:013009,2007</td>\n",
134
+ " <td>10.1103/PhysRevD.76.013009</td>\n",
135
+ " <td>ANL-HEP-PR-07-12</td>\n",
136
+ " <td>hep-ph</td>\n",
137
+ " <td>None</td>\n",
138
+ " <td>A fully differential calculation in perturba...</td>\n",
139
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
140
+ " <td>2008-11-26</td>\n",
141
+ " <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>1</th>\n",
145
+ " <td>0704.0002</td>\n",
146
+ " <td>Louis Theran</td>\n",
147
+ " <td>Ileana Streinu and Louis Theran</td>\n",
148
+ " <td>Sparsity-certifying Graph Decompositions</td>\n",
149
+ " <td>To appear in Graphs and Combinatorics</td>\n",
150
+ " <td>None</td>\n",
151
+ " <td>None</td>\n",
152
+ " <td>None</td>\n",
153
+ " <td>math.CO cs.CG</td>\n",
154
+ " <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
155
+ " <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
156
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
157
+ " <td>2008-12-13</td>\n",
158
+ " <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
159
+ " </tr>\n",
160
+ " <tr>\n",
161
+ " <th>2</th>\n",
162
+ " <td>0704.0003</td>\n",
163
+ " <td>Hongjun Pan</td>\n",
164
+ " <td>Hongjun Pan</td>\n",
165
+ " <td>The evolution of the Earth-Moon system based o...</td>\n",
166
+ " <td>23 pages, 3 figures</td>\n",
167
+ " <td>None</td>\n",
168
+ " <td>None</td>\n",
169
+ " <td>None</td>\n",
170
+ " <td>physics.gen-ph</td>\n",
171
+ " <td>None</td>\n",
172
+ " <td>The evolution of Earth-Moon system is descri...</td>\n",
173
+ " <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
174
+ " <td>2008-01-13</td>\n",
175
+ " <td>[[Pan, Hongjun, ]]</td>\n",
176
+ " </tr>\n",
177
+ " <tr>\n",
178
+ " <th>3</th>\n",
179
+ " <td>0704.0004</td>\n",
180
+ " <td>David Callan</td>\n",
181
+ " <td>David Callan</td>\n",
182
+ " <td>A determinant of Stirling cycle numbers counts...</td>\n",
183
+ " <td>11 pages</td>\n",
184
+ " <td>None</td>\n",
185
+ " <td>None</td>\n",
186
+ " <td>None</td>\n",
187
+ " <td>math.CO</td>\n",
188
+ " <td>None</td>\n",
189
+ " <td>We show that a determinant of Stirling cycle...</td>\n",
190
+ " <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
191
+ " <td>2007-05-23</td>\n",
192
+ " <td>[[Callan, David, ]]</td>\n",
193
+ " </tr>\n",
194
+ " <tr>\n",
195
+ " <th>4</th>\n",
196
+ " <td>0704.0005</td>\n",
197
+ " <td>Alberto Torchinsky</td>\n",
198
+ " <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
199
+ " <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
200
+ " <td>None</td>\n",
201
+ " <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
202
+ " <td>None</td>\n",
203
+ " <td>None</td>\n",
204
+ " <td>math.CA math.FA</td>\n",
205
+ " <td>None</td>\n",
206
+ " <td>In this paper we show how to compute the $\\L...</td>\n",
207
+ " <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
208
+ " <td>2013-10-15</td>\n",
209
+ " <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
210
+ " </tr>\n",
211
+ " </tbody>\n",
212
+ "</table>\n",
213
+ "</div>"
214
+ ],
215
+ "text/plain": [
216
+ " id ... authors_parsed\n",
217
+ "0 0704.0001 ... [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...\n",
218
+ "1 0704.0002 ... [[Streinu, Ileana, ], [Theran, Louis, ]]\n",
219
+ "2 0704.0003 ... [[Pan, Hongjun, ]]\n",
220
+ "3 0704.0004 ... [[Callan, David, ]]\n",
221
+ "4 0704.0005 ... [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]\n",
222
+ "\n",
223
+ "[5 rows x 14 columns]"
224
+ ]
225
+ },
226
+ "execution_count": 4,
227
+ "metadata": {},
228
+ "output_type": "execute_result"
229
+ }
230
+ ],
231
+ "source": [
232
+ "df.head()"
233
+ ]
234
+ },
235
+ {
236
+ "cell_type": "code",
237
+ "execution_count": 5,
238
+ "metadata": {
239
+ "execution": {
240
+ "iopub.execute_input": "2025-04-09T06:15:23.704471Z",
241
+ "iopub.status.busy": "2025-04-09T06:15:23.703504Z",
242
+ "iopub.status.idle": "2025-04-09T06:15:24.013564Z",
243
+ "shell.execute_reply": "2025-04-09T06:15:24.012916Z",
244
+ "shell.execute_reply.started": "2025-04-09T06:15:23.704435Z"
245
+ },
246
+ "tags": []
247
+ },
248
+ "outputs": [
249
+ {
250
+ "name": "stdout",
251
+ "output_type": "stream",
252
+ "text": [
253
+ "Removing unwanted columns...\n",
254
+ "Columns removed...\n"
255
+ ]
256
+ }
257
+ ],
258
+ "source": [
259
+ "df = filter_columns(df=df, columns=[\"title\", \"authors\", \"abstract\", old_label])"
260
+ ]
261
+ },
262
+ {
263
+ "cell_type": "code",
264
+ "execution_count": 6,
265
+ "metadata": {
266
+ "execution": {
267
+ "iopub.execute_input": "2025-04-09T06:15:24.015952Z",
268
+ "iopub.status.busy": "2025-04-09T06:15:24.014301Z",
269
+ "iopub.status.idle": "2025-04-09T06:15:25.912683Z",
270
+ "shell.execute_reply": "2025-04-09T06:15:25.911811Z",
271
+ "shell.execute_reply.started": "2025-04-09T06:15:24.015915Z"
272
+ },
273
+ "tags": []
274
+ },
275
+ "outputs": [],
276
+ "source": [
277
+ "X, y = create_features_labels(df=df, old_label=old_label, new_label=new_label)"
278
+ ]
279
+ },
280
+ {
281
+ "cell_type": "code",
282
+ "execution_count": 7,
283
+ "metadata": {
284
+ "execution": {
285
+ "iopub.execute_input": "2025-04-09T06:15:25.914486Z",
286
+ "iopub.status.busy": "2025-04-09T06:15:25.913573Z",
287
+ "iopub.status.idle": "2025-04-09T06:15:36.005889Z",
288
+ "shell.execute_reply": "2025-04-09T06:15:36.005131Z",
289
+ "shell.execute_reply.started": "2025-04-09T06:15:25.914449Z"
290
+ }
291
+ },
292
+ "outputs": [],
293
+ "source": [
294
+ "X_train, X_test, y_train, y_test = train_test_split(X, y)"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": 8,
300
+ "metadata": {
301
+ "execution": {
302
+ "iopub.execute_input": "2025-04-09T06:15:36.007677Z",
303
+ "iopub.status.busy": "2025-04-09T06:15:36.006731Z",
304
+ "iopub.status.idle": "2025-04-09T06:17:57.585958Z",
305
+ "shell.execute_reply": "2025-04-09T06:17:57.585110Z",
306
+ "shell.execute_reply.started": "2025-04-09T06:15:36.007643Z"
307
+ },
308
+ "tags": []
309
+ },
310
+ "outputs": [],
311
+ "source": [
312
+ "write_dataset(dest_dir=dest_dir, X=X_train, y=y_train, filename=train_filename)\n",
313
+ "write_dataset(dest_dir=dest_dir, X=X_test, y=y_test, filename=test_filename)"
314
+ ]
315
+ }
316
+ ],
317
+ "metadata": {
318
+ "kernelspec": {
319
+ "display_name": "DataSphere Kernel",
320
+ "language": "python",
321
+ "name": "python3"
322
+ },
323
+ "language_info": {
324
+ "codemirror_mode": {
325
+ "name": "ipython",
326
+ "version": 3
327
+ },
328
+ "file_extension": ".py",
329
+ "mimetype": "text/x-python",
330
+ "name": "python",
331
+ "nbconvert_exporter": "python",
332
+ "pygments_lexer": "ipython3",
333
+ "version": "3.10.12"
334
+ }
335
+ },
336
+ "nbformat": 4,
337
+ "nbformat_minor": 4
338
+ }