{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "import ast\n", "import pandas as pd\n", "import kagglehub\n", "from kagglehub import KaggleDatasetAdapter" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Pin the dataset version so reruns load the same data. kagglehub is expected to\n", "# reuse its local cache when the files are already present, so no\n", "# machine-specific path needs to be hard-coded here.\n", "DATASET_HANDLE = 'neelshah18/arxivdataset/versions/2'\n", "dataset_dir = kagglehub.dataset_download(DATASET_HANDLE)\n", "file_path = os.path.join(dataset_dir, 'arxivData.json')\n", "arxiv_raw = pd.read_json(file_path)\n", "\n", "# 'tag' holds a stringified list of dicts: parse it, emit one row per tag and\n", "# keep only each tag's 'term'. 'text' is the title followed by the summary.\n", "arxiv_df = (\n", "    arxiv_raw\n", "    .assign(\n", "        tag=lambda df: df['tag'].apply(ast.literal_eval),\n", "        text=lambda df: df['title'] + ' ' + df['summary'],\n", "    )\n", "    .explode('tag')\n", "    .reset_index(drop=True)\n", "    .assign(tag=lambda df: df['tag'].apply(lambda entry: entry['term']))\n", "    [['text', 'tag']]\n", ")" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | text | \n", "tag | \n", "
---|---|---|
0 | \n", "Dual Recurrent Attention Units for Visual Ques... | \n", "cs.AI | \n", "
1 | \n", "Dual Recurrent Attention Units for Visual Ques... | \n", "cs.CL | \n", "
2 | \n", "Dual Recurrent Attention Units for Visual Ques... | \n", "cs.CV | \n", "
3 | \n", "Dual Recurrent Attention Units for Visual Ques... | \n", "cs.NE | \n", "
4 | \n", "Dual Recurrent Attention Units for Visual Ques... | \n", "stat.ML | \n", "
5 | \n", "Sequential Short-Text Classification with Recu... | \n", "cs.CL | \n", "
6 | \n", "Sequential Short-Text Classification with Recu... | \n", "cs.AI | \n", "
7 | \n", "Sequential Short-Text Classification with Recu... | \n", "cs.LG | \n", "
8 | \n", "Sequential Short-Text Classification with Recu... | \n", "cs.NE | \n", "
9 | \n", "Sequential Short-Text Classification with Recu... | \n", "stat.ML | \n", "
10 | \n", "Multiresolution Recurrent Neural Networks: An ... | \n", "cs.CL | \n", "
11 | \n", "Multiresolution Recurrent Neural Networks: An ... | \n", "cs.AI | \n", "
12 | \n", "Multiresolution Recurrent Neural Networks: An ... | \n", "cs.LG | \n", "
13 | \n", "Multiresolution Recurrent Neural Networks: An ... | \n", "cs.NE | \n", "
14 | \n", "Multiresolution Recurrent Neural Networks: An ... | \n", "stat.ML | \n", "