Spaces:

HibiscusMaximus
/

PaperClassification

Sleeping

App Files Files Community

igorithm committed 23 days ago

Commit

1b671b4

1 Parent(s): 1a450dd

Notebook with English dataset preparation

Browse files

Preparation includes removing unused columns and transforming labels to
main categories.

Files changed (2) hide show

category_classification/datasets/en/download_common.py +1 -0
category_classification/datasets/en/prepare_dataset.ipynb +338 -0

category_classification/datasets/en/download_common.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ ../download_common.py

category_classification/datasets/en/prepare_dataset.ipynb ADDED Viewed

	@@ -0,0 +1,338 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:13:55.446824Z",
+     "iopub.status.busy": "2025-04-09T06:13:55.445794Z",
+     "iopub.status.idle": "2025-04-09T06:13:56.137367Z",
+     "shell.execute_reply": "2025-04-09T06:13:56.136554Z",
+     "shell.execute_reply.started": "2025-04-09T06:13:55.446782Z"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "from download_common import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:13:56.140406Z",
+     "iopub.status.busy": "2025-04-09T06:13:56.138861Z",
+     "iopub.status.idle": "2025-04-09T06:13:56.182854Z",
+     "shell.execute_reply": "2025-04-09T06:13:56.182207Z",
+     "shell.execute_reply.started": "2025-04-09T06:13:56.140363Z"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Resolve paths against the kernel's working directory instead of IPython's\n",
+    "# private `_dh` directory-history variable, which exists only inside IPython.\n",
+    "dest_dir = Path.cwd()\n",
+    "\n",
+    "# Dataset identifier and the JSON snapshot file passed to download_and_read_dataset.\n",
+    "dataset = \"Cornell-University/arxiv\"\n",
+    "json_filename = \"arxiv-metadata-oai-snapshot.json\"\n",
+    "\n",
+    "# Source label column and the name handed to create_features_labels as new_label.\n",
+    "old_label = \"categories\"\n",
+    "new_label = \"category\"\n",
+    "\n",
+    "# Output files for the train and test splits.\n",
+    "train_filename = \"arxiv_train.json\"\n",
+    "test_filename = \"arxiv_test.json\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:13:56.184655Z",
+     "iopub.status.busy": "2025-04-09T06:13:56.183825Z",
+     "iopub.status.idle": "2025-04-09T06:15:23.665384Z",
+     "shell.execute_reply": "2025-04-09T06:15:23.664523Z",
+     "shell.execute_reply.started": "2025-04-09T06:13:56.184630Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset already exists, do not download\n",
+      "Reading dataset...\n",
+      "Dataset read\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Fetch the dataset into dest_dir (skipped when already present) and load the snapshot.\n",
+    "df = download_and_read_dataset(\n",
+    "    dest_dir=dest_dir,\n",
+    "    dataset=dataset,\n",
+    "    filename=json_filename,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:15:23.667946Z",
+     "iopub.status.busy": "2025-04-09T06:15:23.666981Z",
+     "iopub.status.idle": "2025-04-09T06:15:23.702581Z",
+     "shell.execute_reply": "2025-04-09T06:15:23.701966Z",
+     "shell.execute_reply.started": "2025-04-09T06:15:23.667909Z"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>id</th>\n",
+       "      <th>submitter</th>\n",
+       "      <th>authors</th>\n",
+       "      <th>title</th>\n",
+       "      <th>comments</th>\n",
+       "      <th>journal-ref</th>\n",
+       "      <th>doi</th>\n",
+       "      <th>report-no</th>\n",
+       "      <th>categories</th>\n",
+       "      <th>license</th>\n",
+       "      <th>abstract</th>\n",
+       "      <th>versions</th>\n",
+       "      <th>update_date</th>\n",
+       "      <th>authors_parsed</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0704.0001</td>\n",
+       "      <td>Pavel Nadolsky</td>\n",
+       "      <td>C. Bal\\'azs, E. L. Berger, P. M. Nadolsky, C.-...</td>\n",
+       "      <td>Calculation of prompt diphoton production cros...</td>\n",
+       "      <td>37 pages, 15 figures; published version</td>\n",
+       "      <td>Phys.Rev.D76:013009,2007</td>\n",
+       "      <td>10.1103/PhysRevD.76.013009</td>\n",
+       "      <td>ANL-HEP-PR-07-12</td>\n",
+       "      <td>hep-ph</td>\n",
+       "      <td>None</td>\n",
+       "      <td>A fully differential calculation in perturba...</td>\n",
+       "      <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
+       "      <td>2008-11-26</td>\n",
+       "      <td>[[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0704.0002</td>\n",
+       "      <td>Louis Theran</td>\n",
+       "      <td>Ileana Streinu and Louis Theran</td>\n",
+       "      <td>Sparsity-certifying Graph Decompositions</td>\n",
+       "      <td>To appear in Graphs and Combinatorics</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>math.CO cs.CG</td>\n",
+       "      <td>http://arxiv.org/licenses/nonexclusive-distrib...</td>\n",
+       "      <td>We describe a new algorithm, the $(k,\\ell)$-...</td>\n",
+       "      <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
+       "      <td>2008-12-13</td>\n",
+       "      <td>[[Streinu, Ileana, ], [Theran, Louis, ]]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0704.0003</td>\n",
+       "      <td>Hongjun Pan</td>\n",
+       "      <td>Hongjun Pan</td>\n",
+       "      <td>The evolution of the Earth-Moon system based o...</td>\n",
+       "      <td>23 pages, 3 figures</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>physics.gen-ph</td>\n",
+       "      <td>None</td>\n",
+       "      <td>The evolution of Earth-Moon system is descri...</td>\n",
+       "      <td>[{'version': 'v1', 'created': 'Sun, 1 Apr 2007...</td>\n",
+       "      <td>2008-01-13</td>\n",
+       "      <td>[[Pan, Hongjun, ]]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0704.0004</td>\n",
+       "      <td>David Callan</td>\n",
+       "      <td>David Callan</td>\n",
+       "      <td>A determinant of Stirling cycle numbers counts...</td>\n",
+       "      <td>11 pages</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>math.CO</td>\n",
+       "      <td>None</td>\n",
+       "      <td>We show that a determinant of Stirling cycle...</td>\n",
+       "      <td>[{'version': 'v1', 'created': 'Sat, 31 Mar 200...</td>\n",
+       "      <td>2007-05-23</td>\n",
+       "      <td>[[Callan, David, ]]</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0704.0005</td>\n",
+       "      <td>Alberto Torchinsky</td>\n",
+       "      <td>Wael Abu-Shammala and Alberto Torchinsky</td>\n",
+       "      <td>From dyadic $\\Lambda_{\\alpha}$ to $\\Lambda_{\\a...</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Illinois J. Math. 52 (2008) no.2, 681-689</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>math.CA math.FA</td>\n",
+       "      <td>None</td>\n",
+       "      <td>In this paper we show how to compute the $\\L...</td>\n",
+       "      <td>[{'version': 'v1', 'created': 'Mon, 2 Apr 2007...</td>\n",
+       "      <td>2013-10-15</td>\n",
+       "      <td>[[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          id  ...                                     authors_parsed\n",
+       "0  0704.0001  ...  [[Balázs, C., ], [Berger, E. L., ], [Nadolsky,...\n",
+       "1  0704.0002  ...           [[Streinu, Ileana, ], [Theran, Louis, ]]\n",
+       "2  0704.0003  ...                                 [[Pan, Hongjun, ]]\n",
+       "3  0704.0004  ...                                [[Callan, David, ]]\n",
+       "4  0704.0005  ...  [[Abu-Shammala, Wael, ], [Torchinsky, Alberto, ]]\n",
+       "\n",
+       "[5 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:15:23.704471Z",
+     "iopub.status.busy": "2025-04-09T06:15:23.703504Z",
+     "iopub.status.idle": "2025-04-09T06:15:24.013564Z",
+     "shell.execute_reply": "2025-04-09T06:15:24.012916Z",
+     "shell.execute_reply.started": "2025-04-09T06:15:23.704435Z"
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Removing unwanted columns...\n",
+      "Columns removed...\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Remove unwanted columns; the list names the text fields and the raw label column.\n",
+    "df = filter_columns(\n",
+    "    df=df,\n",
+    "    columns=[\"title\", \"authors\", \"abstract\", old_label],\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:15:24.015952Z",
+     "iopub.status.busy": "2025-04-09T06:15:24.014301Z",
+     "iopub.status.idle": "2025-04-09T06:15:25.912683Z",
+     "shell.execute_reply": "2025-04-09T06:15:25.911811Z",
+     "shell.execute_reply.started": "2025-04-09T06:15:24.015915Z"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Build features X and labels y from the filtered frame.\n",
+    "# NOTE(review): presumably old_label is the source column and new_label the derived one - confirm in download_common.\n",
+    "X, y = create_features_labels(\n",
+    "    df=df,\n",
+    "    old_label=old_label,\n",
+    "    new_label=new_label,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:15:25.914486Z",
+     "iopub.status.busy": "2025-04-09T06:15:25.913573Z",
+     "iopub.status.idle": "2025-04-09T06:15:36.005889Z",
+     "shell.execute_reply": "2025-04-09T06:15:36.005131Z",
+     "shell.execute_reply.started": "2025-04-09T06:15:25.914449Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Fixed seed so Restart & Run All reproduces the same train/test split.\n",
+    "# NOTE(review): assumes train_test_split is scikit-learn's, re-exported by download_common - confirm.\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2025-04-09T06:15:36.007677Z",
+     "iopub.status.busy": "2025-04-09T06:15:36.006731Z",
+     "iopub.status.idle": "2025-04-09T06:17:57.585958Z",
+     "shell.execute_reply": "2025-04-09T06:17:57.585110Z",
+     "shell.execute_reply.started": "2025-04-09T06:15:36.007643Z"
+    },
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# Persist the train split first, then the test split, next to the notebook's dest_dir.\n",
+    "for features, labels, out_name in (\n",
+    "    (X_train, y_train, train_filename),\n",
+    "    (X_test, y_test, test_filename),\n",
+    "):\n",
+    "    write_dataset(dest_dir=dest_dir, X=features, y=labels, filename=out_name)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "DataSphere Kernel",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}