created the dataset for categorical classification

Browse files

Files changed (2) hide show

data_categories/Final_Category_Data_With_Labels.csv +3 -0
research/08_organizing_the_entire_datacategories.ipynb +919 -0

data_categories/Final_Category_Data_With_Labels.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c2ba96d90a437a017a25af64364a58c7e2954ca1519a5fce27d0e55addae8da
+size 1810529

research/08_organizing_the_entire_datacategories.ipynb ADDED Viewed

	@@ -0,0 +1,919 @@

+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "# The notebook lives in research/; move up to the repository root so the relative\n",
+ "# 'data/' and 'data_categories/' paths used below resolve.\n",
+ "# Guarded so that re-running this cell does not climb one directory further.\n",
+ "if not os.path.isdir('data_categories'):\n",
+ "    os.chdir('..')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{'Beauty_and_Fitness': 0,\n",
+ " 'People_and_Society': 1,\n",
+ " 'Travel_and_Transportation': 2,\n",
+ " 'Shopping': 3,\n",
+ " 'Adult': 4,\n",
+ " 'Sports': 5,\n",
+ " 'Science': 6,\n",
+ " 'Food_and_Drink': 7,\n",
+ " 'News': 8,\n",
+ " 'Sensitive Subjects': 9,\n",
+ " 'Autos_and_Vehicles': 10,\n",
+ " 'Law_and_Government': 11,\n",
+ " 'Business_and_Industrial': 12,\n",
+ " 'Health': 13,\n",
+ " 'Real Estate': 14,\n",
+ " 'Books_and_Literature': 15,\n",
+ " 'Computers_and_Electronics': 16,\n",
+ " 'Internet_and_Telecom': 17,\n",
+ " 'Home_and_Garden': 18,\n",
+ " 'Jobs_and_Education': 19,\n",
+ " 'Online Communities': 20,\n",
+ " 'Finance': 21,\n",
+ " 'Arts_and_Entertainment': 22,\n",
+ " 'Games': 23,\n",
+ " 'Hobbies_and_Leisure': 24,\n",
+ " 'Reference': 25,\n",
+ " 'Pets_and_Animals': 26}"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load the category-name -> label-id mapping.\n",
+ "# The context manager guarantees the file handle is closed after parsing.\n",
+ "with open('data/categories_refined.json', 'r') as categories_file:\n",
+ "    data_cat_dict = json.load(categories_file)\n",
+ "\n",
+ "data_cat_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: 'Beauty_and_Fitness',\n",
+ " 1: 'People_and_Society',\n",
+ " 2: 'Travel_and_Transportation',\n",
+ " 3: 'Shopping',\n",
+ " 4: 'Adult',\n",
+ " 5: 'Sports',\n",
+ " 6: 'Science',\n",
+ " 7: 'Food_and_Drink',\n",
+ " 8: 'News',\n",
+ " 9: 'Sensitive Subjects',\n",
+ " 10: 'Autos_and_Vehicles',\n",
+ " 11: 'Law_and_Government',\n",
+ " 12: 'Business_and_Industrial',\n",
+ " 13: 'Health',\n",
+ " 14: 'Real Estate',\n",
+ " 15: 'Books_and_Literature',\n",
+ " 16: 'Computers_and_Electronics',\n",
+ " 17: 'Internet_and_Telecom',\n",
+ " 18: 'Home_and_Garden',\n",
+ " 19: 'Jobs_and_Education',\n",
+ " 20: 'Online Communities',\n",
+ " 21: 'Finance',\n",
+ " 22: 'Arts_and_Entertainment',\n",
+ " 23: 'Games',\n",
+ " 24: 'Hobbies_and_Leisure',\n",
+ " 25: 'Reference',\n",
+ " 26: 'Pets_and_Animals'}"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Invert the mapping so a label id looks up its category name.\n",
+ "data_cat_dict_rev = {label_id: name for name, label_id in data_cat_dict.items()}\n",
+ "\n",
+ "data_cat_dict_rev"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "data_categories/Beauty_and_Fitness.csv: True\n",
+ "data_categories/People_and_Society.csv: True\n",
+ "data_categories/Travel_and_Transportation.csv: True\n",
+ "data_categories/Shopping.csv: True\n",
+ "data_categories/Adult.csv: True\n",
+ "data_categories/Sports.csv: True\n",
+ "data_categories/Science.csv: True\n",
+ "data_categories/Food_and_Drink.csv: True\n",
+ "data_categories/News.csv: True\n",
+ "data_categories/Sensitive Subjects.csv: True\n",
+ "data_categories/Autos_and_Vehicles.csv: True\n",
+ "data_categories/Law_and_Government.csv: True\n",
+ "data_categories/Business_and_Industrial.csv: True\n",
+ "data_categories/Health.csv: True\n",
+ "data_categories/Real Estate.csv: True\n",
+ "data_categories/Books_and_Literature.csv: True\n",
+ "data_categories/Computers_and_Electronics.csv: True\n",
+ "data_categories/Internet_and_Telecom.csv: True\n",
+ "data_categories/Home_and_Garden.csv: True\n",
+ "data_categories/Jobs_and_Education.csv: True\n",
+ "data_categories/Online Communities.csv: True\n",
+ "data_categories/Finance.csv: True\n",
+ "data_categories/Arts_and_Entertainment.csv: True\n",
+ "data_categories/Games.csv: True\n",
+ "data_categories/Hobbies_and_Leisure.csv: True\n",
+ "data_categories/Reference.csv: True\n",
+ "data_categories/Pets_and_Animals.csv: True\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "27"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Build one CSV path per category (in mapping order), then report whether each exists.\n",
+ "path_list = [\n",
+ "    os.path.join(\"data_categories\", f\"{name}.csv\") for name in data_cat_dict\n",
+ "]\n",
+ "for csv_path in path_list:\n",
+ "    print(f\"{csv_path}: {os.path.exists(csv_path)}\")\n",
+ "\n",
+ "len(path_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>category</th>\n",
+ " <th>label</th>\n",
+ " <th>label_id</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Makeup tutorials</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Skin care routines</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Hairstyling tips</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Weight loss programs</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Yoga for beginners</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " category label label_id\n",
+ "0 Makeup tutorials Beauty_and_Fitness 0\n",
+ "1 Skin care routines Beauty_and_Fitness 0\n",
+ "2 Hairstyling tips Beauty_and_Fitness 0\n",
+ "3 Weight loss programs Beauty_and_Fitness 0\n",
+ "4 Yoga for beginners Beauty_and_Fitness 0"
+ ]
+ },
+ "execution_count": 24,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read the first category file and tag every row with that category's name and id.\n",
+ "first_label = data_cat_dict_rev[0]\n",
+ "df = pd.read_csv(path_list[0]).assign(\n",
+ "    label=first_label,\n",
+ "    label_id=data_cat_dict[first_label],\n",
+ ")\n",
+ "df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 25,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>category</th>\n",
+ " <th>label</th>\n",
+ " <th>label_id</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Makeup tutorials</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>Skin care routines</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Hairstyling tips</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Weight loss programs</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Yoga for beginners</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " category label label_id\n",
+ "0 Makeup tutorials Beauty_and_Fitness 0\n",
+ "1 Skin care routines Beauty_and_Fitness 0\n",
+ "2 Hairstyling tips Beauty_and_Fitness 0\n",
+ "3 Weight loss programs Beauty_and_Fitness 0\n",
+ "4 Yoga for beginners Beauty_and_Fitness 0"
+ ]
+ },
+ "execution_count": 25,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Seed the combined frame with an independent copy of the first category's rows.\n",
+ "concat_df = df.copy(deep=True)\n",
+ "concat_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 26,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "data_categories/People_and_Society.csv\n",
+ "data_categories/Travel_and_Transportation.csv\n",
+ "data_categories/Shopping.csv\n",
+ "data_categories/Adult.csv\n",
+ "data_categories/Sports.csv\n",
+ "data_categories/Science.csv\n",
+ "data_categories/Food_and_Drink.csv\n",
+ "data_categories/News.csv\n",
+ "data_categories/Sensitive Subjects.csv\n",
+ "data_categories/Autos_and_Vehicles.csv\n",
+ "data_categories/Law_and_Government.csv\n",
+ "data_categories/Business_and_Industrial.csv\n",
+ "data_categories/Health.csv\n",
+ "data_categories/Real Estate.csv\n",
+ "data_categories/Books_and_Literature.csv\n",
+ "data_categories/Computers_and_Electronics.csv\n",
+ "data_categories/Internet_and_Telecom.csv\n",
+ "data_categories/Home_and_Garden.csv\n",
+ "data_categories/Jobs_and_Education.csv\n",
+ "data_categories/Online Communities.csv\n",
+ "data_categories/Finance.csv\n",
+ "data_categories/Arts_and_Entertainment.csv\n",
+ "data_categories/Games.csv\n",
+ "data_categories/Hobbies_and_Leisure.csv\n",
+ "data_categories/Reference.csv\n",
+ "data_categories/Pets_and_Animals.csv\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Label the remaining category files and concatenate everything in a single call.\n",
+ "# - Pairing names with paths via zip avoids the hard-coded category count and\n",
+ "#   keeps each file matched to its own category name (path_list was built in\n",
+ "#   data_cat_dict key order).\n",
+ "# - Starting from `df` (the first category) instead of growing `concat_df` onto\n",
+ "#   itself keeps this cell idempotent: re-running it no longer duplicates rows.\n",
+ "labelled_frames = [df]\n",
+ "for name, csv_path in list(zip(data_cat_dict, path_list))[1:]:\n",
+ "    print(csv_path)\n",
+ "    df_i = pd.read_csv(csv_path)\n",
+ "    df_i['label'] = name\n",
+ "    df_i['label_id'] = data_cat_dict[name]\n",
+ "    labelled_frames.append(df_i)\n",
+ "\n",
+ "concat_df = pd.concat(labelled_frames)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 27,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>category</th>\n",
+ " <th>label</th>\n",
+ " <th>label_id</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>333</th>\n",
+ " <td>collection</td>\n",
+ " <td>Adult</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1463</th>\n",
+ " <td>Budget-friendly home decor and decoration</td>\n",
+ " <td>Shopping</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>556</th>\n",
+ " <td>Hair coloring ideas</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>723</th>\n",
+ " <td>Makeup for dry skin</td>\n",
+ " <td>Beauty_and_Fitness</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>417</th>\n",
+ " <td>Sports Team Fan Enthusiasm</td>\n",
+ " <td>Sports</td>\n",
+ " <td>5</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1351</th>\n",
+ " <td>Telecommunication industry innovation in healt...</td>\n",
+ " <td>Internet_and_Telecom</td>\n",
+ " <td>17</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>284</th>\n",
+ " <td>XXX gay movies</td>\n",
+ " <td>Adult</td>\n",
+ " <td>4</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1150</th>\n",
+ " <td>Bohemian outdoor garden party decor DIY projec...</td>\n",
+ " <td>Home_and_Garden</td>\n",
+ " <td>18</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>115</th>\n",
+ " <td>Travel destination skiing</td>\n",
+ " <td>Travel_and_Transportation</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>411</th>\n",
+ " <td>Citation context accuracy measurement platforms</td>\n",
+ " <td>Reference</td>\n",
+ " <td>25</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>285</th>\n",
+ " <td>Art techniques and creative process discussions</td>\n",
+ " <td>Online Communities</td>\n",
+ " <td>20</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1251</th>\n",
+ " <td>Food plating techniques for fine dining</td>\n",
+ " <td>Food_and_Drink</td>\n",
+ " <td>7</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>225</th>\n",
+ " <td>Job search for seniors</td>\n",
+ " <td>Jobs_and_Education</td>\n",
+ " <td>19</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>979</th>\n",
+ " <td>Beard care for beard grooming</td>\n",
+ " <td>Hobbies_and_Leisure</td>\n",
+ " <td>24</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>345</th>\n",
+ " <td>Travel destination local hospitality</td>\n",
+ " <td>Travel_and_Transportation</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>29</th>\n",
+ " <td>External hard drive</td>\n",
+ " <td>Computers_and_Electronics</td>\n",
+ " <td>16</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>556</th>\n",
+ " <td>Real estate contract law</td>\n",
+ " <td>Real Estate</td>\n",
+ " <td>14</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>812</th>\n",
+ " <td>Classic literature for historical research and...</td>\n",
+ " <td>Books_and_Literature</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>489</th>\n",
+ " <td>Theater posters for sale</td>\n",
+ " <td>Arts_and_Entertainment</td>\n",
+ " <td>22</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>873</th>\n",
+ " <td>LinkedIn job search for freelancers</td>\n",
+ " <td>Jobs_and_Education</td>\n",
+ " <td>19</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " category \\\n",
+ "333 collection \n",
+ "1463 Budget-friendly home decor and decoration \n",
+ "556 Hair coloring ideas \n",
+ "723 Makeup for dry skin \n",
+ "417 Sports Team Fan Enthusiasm \n",
+ "1351 Telecommunication industry innovation in healt... \n",
+ "284 XXX gay movies \n",
+ "1150 Bohemian outdoor garden party decor DIY projec... \n",
+ "115 Travel destination skiing \n",
+ "411 Citation context accuracy measurement platforms \n",
+ "285 Art techniques and creative process discussions \n",
+ "1251 Food plating techniques for fine dining \n",
+ "225 Job search for seniors \n",
+ "979 Beard care for beard grooming \n",
+ "345 Travel destination local hospitality \n",
+ "29 External hard drive \n",
+ "556 Real estate contract law \n",
+ "812 Classic literature for historical research and... \n",
+ "489 Theater posters for sale \n",
+ "873 LinkedIn job search for freelancers \n",
+ "\n",
+ " label label_id \n",
+ "333 Adult 4 \n",
+ "1463 Shopping 3 \n",
+ "556 Beauty_and_Fitness 0 \n",
+ "723 Beauty_and_Fitness 0 \n",
+ "417 Sports 5 \n",
+ "1351 Internet_and_Telecom 17 \n",
+ "284 Adult 4 \n",
+ "1150 Home_and_Garden 18 \n",
+ "115 Travel_and_Transportation 2 \n",
+ "411 Reference 25 \n",
+ "285 Online Communities 20 \n",
+ "1251 Food_and_Drink 7 \n",
+ "225 Jobs_and_Education 19 \n",
+ "979 Hobbies_and_Leisure 24 \n",
+ "345 Travel_and_Transportation 2 \n",
+ "29 Computers_and_Electronics 16 \n",
+ "556 Real Estate 14 \n",
+ "812 Books_and_Literature 15 \n",
+ "489 Arts_and_Entertainment 22 \n",
+ "873 Jobs_and_Education 19 "
+ ]
+ },
+ "execution_count": 27,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Spot-check 20 randomly drawn rows of the combined frame.\n",
+ "concat_df.sample(n=20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 28,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "label\n",
+ "Shopping 1505\n",
+ "Food_and_Drink 1500\n",
+ "Sports 1399\n",
+ "Travel_and_Transportation 1355\n",
+ "Internet_and_Telecom 1353\n",
+ "Reference 1315\n",
+ "Beauty_and_Fitness 1259\n",
+ "People_and_Society 1250\n",
+ "Pets_and_Animals 1228\n",
+ "Law_and_Government 1226\n",
+ "Home_and_Garden 1200\n",
+ "News 1199\n",
+ "Jobs_and_Education 1188\n",
+ "Arts_and_Entertainment 1162\n",
+ "Business_and_Industrial 1124\n",
+ "Adult 1100\n",
+ "Health 1098\n",
+ "Autos_and_Vehicles 1072\n",
+ "Science 1055\n",
+ "Hobbies_and_Leisure 1049\n",
+ "Computers_and_Electronics 1000\n",
+ "Online Communities 1000\n",
+ "Finance 1000\n",
+ "Books_and_Literature 1000\n",
+ "Real Estate 1000\n",
+ "Games 700\n",
+ "Sensitive Subjects 688\n",
+ "Name: count, dtype: int64"
+ ]
+ },
+ "execution_count": 28,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Number of rows per category label, largest first.\n",
+ "concat_df['label'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>category</th>\n",
+ " <th>label</th>\n",
+ " <th>label_id</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>0</th>\n",
+ " <td>Scientific literature review</td>\n",
+ " <td>Science</td>\n",
+ " <td>6</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1</th>\n",
+ " <td>LGBTQ+ community strategies</td>\n",
+ " <td>People_and_Society</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2</th>\n",
+ " <td>Social services for vulnerable populations</td>\n",
+ " <td>People_and_Society</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>3</th>\n",
+ " <td>Graduate school admissions</td>\n",
+ " <td>Jobs_and_Education</td>\n",
+ " <td>19</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4</th>\n",
+ " <td>Immigrant Health Education</td>\n",
+ " <td>Health</td>\n",
+ " <td>13</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>5</th>\n",
+ " <td>Travel deals for beachfront guesthouses</td>\n",
+ " <td>Travel_and_Transportation</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6</th>\n",
+ " <td>Book subscription boxes</td>\n",
+ " <td>Books_and_Literature</td>\n",
+ " <td>15</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>7</th>\n",
+ " <td>Game streaming community building</td>\n",
+ " <td>Games</td>\n",
+ " <td>23</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8</th>\n",
+ " <td>Retirement healthcare cost planning for health...</td>\n",
+ " <td>Finance</td>\n",
+ " <td>21</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>9</th>\n",
+ " <td>Campaign finance laws effectiveness impact</td>\n",
+ " <td>Law_and_Government</td>\n",
+ " <td>11</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>10</th>\n",
+ " <td>Vintage and antique furniture and decor items</td>\n",
+ " <td>Shopping</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>11</th>\n",
+ " <td>Volunteer opportunities near me</td>\n",
+ " <td>People_and_Society</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>12</th>\n",
+ " <td>Startup success stories</td>\n",
+ " <td>News</td>\n",
+ " <td>8</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>13</th>\n",
+ " <td>Internet connectivity solutions for sports org...</td>\n",
+ " <td>Internet_and_Telecom</td>\n",
+ " <td>17</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>14</th>\n",
+ " <td>Travel destination local experts</td>\n",
+ " <td>Travel_and_Transportation</td>\n",
+ " <td>2</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>15</th>\n",
+ " <td>Industrial revolution history</td>\n",
+ " <td>Business_and_Industrial</td>\n",
+ " <td>12</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>16</th>\n",
+ " <td>Backyard pond filtration systems</td>\n",
+ " <td>Home_and_Garden</td>\n",
+ " <td>18</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>17</th>\n",
+ " <td>Data center solutions providers list</td>\n",
+ " <td>Internet_and_Telecom</td>\n",
+ " <td>17</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>18</th>\n",
+ " <td>Wi-Fi signal optimization for hotels</td>\n",
+ " <td>Internet_and_Telecom</td>\n",
+ " <td>17</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>19</th>\n",
+ " <td>Smart home technology trends</td>\n",
+ " <td>Shopping</td>\n",
+ " <td>3</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " category \\\n",
+ "0 Scientific literature review \n",
+ "1 LGBTQ+ community strategies \n",
+ "2 Social services for vulnerable populations \n",
+ "3 Graduate school admissions \n",
+ "4 Immigrant Health Education \n",
+ "5 Travel deals for beachfront guesthouses \n",
+ "6 Book subscription boxes \n",
+ "7 Game streaming community building \n",
+ "8 Retirement healthcare cost planning for health... \n",
+ "9 Campaign finance laws effectiveness impact \n",
+ "10 Vintage and antique furniture and decor items \n",
+ "11 Volunteer opportunities near me \n",
+ "12 Startup success stories \n",
+ "13 Internet connectivity solutions for sports org... \n",
+ "14 Travel destination local experts \n",
+ "15 Industrial revolution history \n",
+ "16 Backyard pond filtration systems \n",
+ "17 Data center solutions providers list \n",
+ "18 Wi-Fi signal optimization for hotels \n",
+ "19 Smart home technology trends \n",
+ "\n",
+ " label label_id \n",
+ "0 Science 6 \n",
+ "1 People_and_Society 1 \n",
+ "2 People_and_Society 1 \n",
+ "3 Jobs_and_Education 19 \n",
+ "4 Health 13 \n",
+ "5 Travel_and_Transportation 2 \n",
+ "6 Books_and_Literature 15 \n",
+ "7 Games 23 \n",
+ "8 Finance 21 \n",
+ "9 Law_and_Government 11 \n",
+ "10 Shopping 3 \n",
+ "11 People_and_Society 1 \n",
+ "12 News 8 \n",
+ "13 Internet_and_Telecom 17 \n",
+ "14 Travel_and_Transportation 2 \n",
+ "15 Business_and_Industrial 12 \n",
+ "16 Home_and_Garden 18 \n",
+ "17 Internet_and_Telecom 17 \n",
+ "18 Internet_and_Telecom 17 \n",
+ "19 Shopping 3 "
+ ]
+ },
+ "execution_count": 29,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Shuffle all rows and renumber the index from 0.\n",
+ "# The fixed random_state makes the row order reproducible across kernel restarts.\n",
+ "cdf_shffeled = concat_df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
+ "cdf_shffeled.head(20)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Persist the shuffled, labelled dataset.\n",
+ "# NOTE(review): to_csv's default index=True also writes the row index as an unnamed\n",
+ "# first column; confirm downstream readers expect it before changing this.\n",
+ "output_csv = 'data_categories/Final_Category_Data_With_Labels.csv'\n",
+ "cdf_shffeled.to_csv(output_csv)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}