{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os; os.chdir('..');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'Hobbies_and_Leisure': 0,\n",
       " 'News': 1,\n",
       " 'Science': 2,\n",
       " 'Autos_and_Vehicles': 3,\n",
       " 'Health': 4,\n",
       " 'Pets_and_Animals': 5,\n",
       " 'Adult': 6,\n",
       " 'Computers_and_Electronics': 7,\n",
       " 'Online Communities': 8,\n",
       " 'Beauty_and_Fitness': 9,\n",
       " 'People_and_Society': 10,\n",
       " 'Business_and_Industrial': 11,\n",
       " 'Reference': 12,\n",
       " 'Shopping': 13,\n",
       " 'Travel_and_Transportation': 14,\n",
       " 'Food_and_Drink': 15,\n",
       " 'Law_and_Government': 16,\n",
       " 'Books_and_Literature': 17,\n",
       " 'Finance': 18,\n",
       " 'Games': 19,\n",
       " 'Home_and_Garden': 20,\n",
       " 'Jobs_and_Education': 21,\n",
       " 'Arts_and_Entertainment': 22,\n",
       " 'Sensitive Subjects': 23,\n",
       " 'Real Estate': 24,\n",
       " 'Internet_and_Telecom': 25,\n",
       " 'Sports': 26}"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_cat_dict= json.load(\n",
    "    open('data/categories_refined.json', 'r')\n",
    ")\n",
    "data_cat_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{0: 'Hobbies_and_Leisure',\n",
       " 1: 'News',\n",
       " 2: 'Science',\n",
       " 3: 'Autos_and_Vehicles',\n",
       " 4: 'Health',\n",
       " 5: 'Pets_and_Animals',\n",
       " 6: 'Adult',\n",
       " 7: 'Computers_and_Electronics',\n",
       " 8: 'Online Communities',\n",
       " 9: 'Beauty_and_Fitness',\n",
       " 10: 'People_and_Society',\n",
       " 11: 'Business_and_Industrial',\n",
       " 12: 'Reference',\n",
       " 13: 'Shopping',\n",
       " 14: 'Travel_and_Transportation',\n",
       " 15: 'Food_and_Drink',\n",
       " 16: 'Law_and_Government',\n",
       " 17: 'Books_and_Literature',\n",
       " 18: 'Finance',\n",
       " 19: 'Games',\n",
       " 20: 'Home_and_Garden',\n",
       " 21: 'Jobs_and_Education',\n",
       " 22: 'Arts_and_Entertainment',\n",
       " 23: 'Sensitive Subjects',\n",
       " 24: 'Real Estate',\n",
       " 25: 'Internet_and_Telecom',\n",
       " 26: 'Sports'}"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_cat_dict_rev= {}\n",
    "for key in data_cat_dict.keys():\n",
    "    data_cat_dict_rev[data_cat_dict[key]] = key\n",
    "    \n",
    "data_cat_dict_rev"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_categories/Hobbies_and_Leisure.csv: True\n",
      "data_categories/News.csv: True\n",
      "data_categories/Science.csv: True\n",
      "data_categories/Autos_and_Vehicles.csv: True\n",
      "data_categories/Health.csv: True\n",
      "data_categories/Pets_and_Animals.csv: True\n",
      "data_categories/Adult.csv: True\n",
      "data_categories/Computers_and_Electronics.csv: True\n",
      "data_categories/Online Communities.csv: True\n",
      "data_categories/Beauty_and_Fitness.csv: True\n",
      "data_categories/People_and_Society.csv: True\n",
      "data_categories/Business_and_Industrial.csv: True\n",
      "data_categories/Reference.csv: True\n",
      "data_categories/Shopping.csv: True\n",
      "data_categories/Travel_and_Transportation.csv: True\n",
      "data_categories/Food_and_Drink.csv: True\n",
      "data_categories/Law_and_Government.csv: True\n",
      "data_categories/Books_and_Literature.csv: True\n",
      "data_categories/Finance.csv: True\n",
      "data_categories/Games.csv: True\n",
      "data_categories/Home_and_Garden.csv: True\n",
      "data_categories/Jobs_and_Education.csv: True\n",
      "data_categories/Arts_and_Entertainment.csv: True\n",
      "data_categories/Sensitive Subjects.csv: True\n",
      "data_categories/Real Estate.csv: True\n",
      "data_categories/Internet_and_Telecom.csv: True\n",
      "data_categories/Sports.csv: True\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "27"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "path_list= []\n",
    "for i in data_cat_dict.keys():\n",
    "    path= os.path.join(\"data_categories\", f'{i}.csv')\n",
    "    print(f\"{path}: {os.path.exists(path)}\")\n",
    "    path_list.append(path)\n",
    "    \n",
    "len(path_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>label</th>\n",
       "      <th>label_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Skincare routine</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Haircare tips</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Makeup tutorials</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fitness workouts</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Anti-aging products</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              category                label  label_id\n",
       "0     Skincare routine  Hobbies_and_Leisure         0\n",
       "1        Haircare tips  Hobbies_and_Leisure         0\n",
       "2     Makeup tutorials  Hobbies_and_Leisure         0\n",
       "3     Fitness workouts  Hobbies_and_Leisure         0\n",
       "4  Anti-aging products  Hobbies_and_Leisure         0"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df= pd.read_csv(path_list[0])\n",
    "df['label']= data_cat_dict_rev[0]\n",
    "df['label_id']= data_cat_dict[data_cat_dict_rev[0]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>label</th>\n",
       "      <th>label_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Skincare routine</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Haircare tips</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Makeup tutorials</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Fitness workouts</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Anti-aging products</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              category                label  label_id\n",
       "0     Skincare routine  Hobbies_and_Leisure         0\n",
       "1        Haircare tips  Hobbies_and_Leisure         0\n",
       "2     Makeup tutorials  Hobbies_and_Leisure         0\n",
       "3     Fitness workouts  Hobbies_and_Leisure         0\n",
       "4  Anti-aging products  Hobbies_and_Leisure         0"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_df= df.copy()\n",
    "concat_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "data_categories/News.csv\n",
      "data_categories/Science.csv\n",
      "data_categories/Autos_and_Vehicles.csv\n",
      "data_categories/Health.csv\n",
      "data_categories/Pets_and_Animals.csv\n",
      "data_categories/Adult.csv\n",
      "data_categories/Computers_and_Electronics.csv\n",
      "data_categories/Online Communities.csv\n",
      "data_categories/Beauty_and_Fitness.csv\n",
      "data_categories/People_and_Society.csv\n",
      "data_categories/Business_and_Industrial.csv\n",
      "data_categories/Reference.csv\n",
      "data_categories/Shopping.csv\n",
      "data_categories/Travel_and_Transportation.csv\n",
      "data_categories/Food_and_Drink.csv\n",
      "data_categories/Law_and_Government.csv\n",
      "data_categories/Books_and_Literature.csv\n",
      "data_categories/Finance.csv\n",
      "data_categories/Games.csv\n",
      "data_categories/Home_and_Garden.csv\n",
      "data_categories/Jobs_and_Education.csv\n",
      "data_categories/Arts_and_Entertainment.csv\n",
      "data_categories/Sensitive Subjects.csv\n",
      "data_categories/Real Estate.csv\n",
      "data_categories/Internet_and_Telecom.csv\n",
      "data_categories/Sports.csv\n"
     ]
    }
   ],
   "source": [
    "for i in range(1, 27):\n",
    "    print(path_list[i])\n",
    "    df_i= pd.read_csv(path_list[i])\n",
    "    df_i['label']= data_cat_dict_rev[i]\n",
    "    df_i['label_id']= data_cat_dict[data_cat_dict_rev[i]]\n",
    "    concat_df= pd.concat([concat_df, df_i])\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>label</th>\n",
       "      <th>label_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1201</th>\n",
       "      <td>Plus-size clothing stores and shops</td>\n",
       "      <td>Shopping</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>853</th>\n",
       "      <td>Citation context extraction techniques</td>\n",
       "      <td>Reference</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1034</th>\n",
       "      <td>Cat ear headphones with aux</td>\n",
       "      <td>Computers_and_Electronics</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>632</th>\n",
       "      <td>promote such behavior</td>\n",
       "      <td>Sensitive Subjects</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>91</th>\n",
       "      <td>Literature review references</td>\n",
       "      <td>Reference</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>168</th>\n",
       "      <td>Freedom of speech cases</td>\n",
       "      <td>Law_and_Government</td>\n",
       "      <td>16</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1111</th>\n",
       "      <td>French country kitchen design inspiration DIY</td>\n",
       "      <td>Home_and_Garden</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>492</th>\n",
       "      <td>Credit score improvement techniques overview</td>\n",
       "      <td>Finance</td>\n",
       "      <td>18</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>657</th>\n",
       "      <td>regulated by laws</td>\n",
       "      <td>Sensitive Subjects</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1037</th>\n",
       "      <td>Health Education for Seniors</td>\n",
       "      <td>Health</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>109</th>\n",
       "      <td>Quantum mechanics experiments</td>\n",
       "      <td>Science</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>538</th>\n",
       "      <td>Healthcare AI applications</td>\n",
       "      <td>Science</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1386</th>\n",
       "      <td>AirPods Pro Case</td>\n",
       "      <td>Computers_and_Electronics</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>844</th>\n",
       "      <td>DIY home electrical repairs</td>\n",
       "      <td>Home_and_Garden</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>439</th>\n",
       "      <td>tube sex</td>\n",
       "      <td>Adult</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>231</th>\n",
       "      <td>Real estate sales tactics</td>\n",
       "      <td>Real Estate</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>610</th>\n",
       "      <td>Home solar panel cleaning and maintenance</td>\n",
       "      <td>Home_and_Garden</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>422</th>\n",
       "      <td>Real estate legal issues</td>\n",
       "      <td>Real Estate</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>222</th>\n",
       "      <td>Film industry news</td>\n",
       "      <td>Arts_and_Entertainment</td>\n",
       "      <td>22</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1077</th>\n",
       "      <td>Cat ear headphones for PS4</td>\n",
       "      <td>Computers_and_Electronics</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           category  \\\n",
       "1201            Plus-size clothing stores and shops   \n",
       "853          Citation context extraction techniques   \n",
       "1034                    Cat ear headphones with aux   \n",
       "632                           promote such behavior   \n",
       "91                     Literature review references   \n",
       "168                         Freedom of speech cases   \n",
       "1111  French country kitchen design inspiration DIY   \n",
       "492    Credit score improvement techniques overview   \n",
       "657                               regulated by laws   \n",
       "1037                   Health Education for Seniors   \n",
       "109                   Quantum mechanics experiments   \n",
       "538                      Healthcare AI applications   \n",
       "1386                               AirPods Pro Case   \n",
       "844                     DIY home electrical repairs   \n",
       "439                                        tube sex   \n",
       "231                       Real estate sales tactics   \n",
       "610       Home solar panel cleaning and maintenance   \n",
       "422                        Real estate legal issues   \n",
       "222                              Film industry news   \n",
       "1077                     Cat ear headphones for PS4   \n",
       "\n",
       "                          label  label_id  \n",
       "1201                   Shopping        13  \n",
       "853                   Reference        12  \n",
       "1034  Computers_and_Electronics         7  \n",
       "632          Sensitive Subjects        23  \n",
       "91                    Reference        12  \n",
       "168          Law_and_Government        16  \n",
       "1111            Home_and_Garden        20  \n",
       "492                     Finance        18  \n",
       "657          Sensitive Subjects        23  \n",
       "1037                     Health         4  \n",
       "109                     Science         2  \n",
       "538                     Science         2  \n",
       "1386  Computers_and_Electronics         7  \n",
       "844             Home_and_Garden        20  \n",
       "439                       Adult         6  \n",
       "231                 Real Estate        24  \n",
       "610             Home_and_Garden        20  \n",
       "422                 Real Estate        24  \n",
       "222      Arts_and_Entertainment        22  \n",
       "1077  Computers_and_Electronics         7  "
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_df.sample(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "label\n",
       "Computers_and_Electronics    1959\n",
       "Shopping                     1912\n",
       "Food_and_Drink               1851\n",
       "Reference                    1453\n",
       "Sports                       1399\n",
       "Online Communities           1396\n",
       "Travel_and_Transportation    1355\n",
       "Internet_and_Telecom         1353\n",
       "Pets_and_Animals             1324\n",
       "Beauty_and_Fitness           1259\n",
       "People_and_Society           1250\n",
       "Law_and_Government           1226\n",
       "Home_and_Garden              1200\n",
       "News                         1199\n",
       "Jobs_and_Education           1188\n",
       "Real Estate                  1166\n",
       "Arts_and_Entertainment       1162\n",
       "Business_and_Industrial      1124\n",
       "Adult                        1100\n",
       "Health                       1098\n",
       "Autos_and_Vehicles           1072\n",
       "Science                      1055\n",
       "Hobbies_and_Leisure          1049\n",
       "Books_and_Literature         1000\n",
       "Finance                      1000\n",
       "Sensitive Subjects            762\n",
       "Games                         700\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "concat_df.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>category</th>\n",
       "      <th>label</th>\n",
       "      <th>label_id</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Internet usage monitoring</td>\n",
       "      <td>Internet_and_Telecom</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Food safety guidelines and regulations</td>\n",
       "      <td>Food_and_Drink</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Internet protocols and edge computing in finance</td>\n",
       "      <td>Internet_and_Telecom</td>\n",
       "      <td>25</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Online grocery shopping</td>\n",
       "      <td>Food_and_Drink</td>\n",
       "      <td>15</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>Writing retreats for poets and novelists</td>\n",
       "      <td>Books_and_Literature</td>\n",
       "      <td>17</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>Unicorn cat ear headphones</td>\n",
       "      <td>Computers_and_Electronics</td>\n",
       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>Reference citation context tagging techniques</td>\n",
       "      <td>Reference</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>Motorcycle riding tips for beginners gear chec...</td>\n",
       "      <td>Autos_and_Vehicles</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>Space agency missions</td>\n",
       "      <td>Science</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>Game streaming self-promotion and growth tactics</td>\n",
       "      <td>Games</td>\n",
       "      <td>19</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>sex videos movies</td>\n",
       "      <td>Adult</td>\n",
       "      <td>6</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>Citation context organization methods</td>\n",
       "      <td>Reference</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>Healthy office snacks</td>\n",
       "      <td>Health</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>Indigenous rights advocacy</td>\n",
       "      <td>People_and_Society</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>News talk shows</td>\n",
       "      <td>News</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>Best facial cleansers</td>\n",
       "      <td>Hobbies_and_Leisure</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>Letter of recommendation</td>\n",
       "      <td>Reference</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>Fossil preservation techniques</td>\n",
       "      <td>Science</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>Marriage equality</td>\n",
       "      <td>People_and_Society</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>eSports Game Esports Player Fan Engagement Ini...</td>\n",
       "      <td>Sports</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             category  \\\n",
       "0                           Internet usage monitoring   \n",
       "1              Food safety guidelines and regulations   \n",
       "2    Internet protocols and edge computing in finance   \n",
       "3                             Online grocery shopping   \n",
       "4            Writing retreats for poets and novelists   \n",
       "5                          Unicorn cat ear headphones   \n",
       "6       Reference citation context tagging techniques   \n",
       "7   Motorcycle riding tips for beginners gear chec...   \n",
       "8                               Space agency missions   \n",
       "9    Game streaming self-promotion and growth tactics   \n",
       "10                                  sex videos movies   \n",
       "11              Citation context organization methods   \n",
       "12                              Healthy office snacks   \n",
       "13                         Indigenous rights advocacy   \n",
       "14                                    News talk shows   \n",
       "15                              Best facial cleansers   \n",
       "16                           Letter of recommendation   \n",
       "17                     Fossil preservation techniques   \n",
       "18                                  Marriage equality   \n",
       "19  eSports Game Esports Player Fan Engagement Ini...   \n",
       "\n",
       "                        label  label_id  \n",
       "0        Internet_and_Telecom        25  \n",
       "1              Food_and_Drink        15  \n",
       "2        Internet_and_Telecom        25  \n",
       "3              Food_and_Drink        15  \n",
       "4        Books_and_Literature        17  \n",
       "5   Computers_and_Electronics         7  \n",
       "6                   Reference        12  \n",
       "7          Autos_and_Vehicles         3  \n",
       "8                     Science         2  \n",
       "9                       Games        19  \n",
       "10                      Adult         6  \n",
       "11                  Reference        12  \n",
       "12                     Health         4  \n",
       "13         People_and_Society        10  \n",
       "14                       News         1  \n",
       "15        Hobbies_and_Leisure         0  \n",
       "16                  Reference        12  \n",
       "17                    Science         2  \n",
       "18         People_and_Society        10  \n",
       "19                     Sports        26  "
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cdf_shffeled= concat_df.sample(frac=1).reset_index(drop=True)\n",
    "cdf_shffeled.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf_shffeled.to_csv(\n",
    "    'data_categories/Final_Category_Data_With_Labels.csv',\n",
    "    index=False\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}