{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Step up to the parent directory exactly once per kernel session. Without the\n",
"# guard, re-running this cell would climb one directory higher each time and\n",
"# break every relative path used by the cells below.\n",
"if '_CWD_MOVED_TO_PARENT' not in globals():\n",
"    os.chdir('..')\n",
"    _CWD_MOVED_TO_PARENT = True"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import json"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'Hobbies_and_Leisure': 0,\n",
" 'News': 1,\n",
" 'Science': 2,\n",
" 'Autos_and_Vehicles': 3,\n",
" 'Health': 4,\n",
" 'Pets_and_Animals': 5,\n",
" 'Adult': 6,\n",
" 'Computers_and_Electronics': 7,\n",
" 'Online Communities': 8,\n",
" 'Beauty_and_Fitness': 9,\n",
" 'People_and_Society': 10,\n",
" 'Business_and_Industrial': 11,\n",
" 'Reference': 12,\n",
" 'Shopping': 13,\n",
" 'Travel_and_Transportation': 14,\n",
" 'Food_and_Drink': 15,\n",
" 'Law_and_Government': 16,\n",
" 'Books_and_Literature': 17,\n",
" 'Finance': 18,\n",
" 'Games': 19,\n",
" 'Home_and_Garden': 20,\n",
" 'Jobs_and_Education': 21,\n",
" 'Arts_and_Entertainment': 22,\n",
" 'Sensitive Subjects': 23,\n",
" 'Real Estate': 24,\n",
" 'Internet_and_Telecom': 25,\n",
" 'Sports': 26}"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the category-name -> label-id mapping. The context manager closes the\n",
"# file handle, which the previous bare open() call left dangling.\n",
"with open('data/categories_refined.json', 'r') as cat_file:\n",
"    data_cat_dict = json.load(cat_file)\n",
"\n",
"data_cat_dict"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{0: 'Hobbies_and_Leisure',\n",
" 1: 'News',\n",
" 2: 'Science',\n",
" 3: 'Autos_and_Vehicles',\n",
" 4: 'Health',\n",
" 5: 'Pets_and_Animals',\n",
" 6: 'Adult',\n",
" 7: 'Computers_and_Electronics',\n",
" 8: 'Online Communities',\n",
" 9: 'Beauty_and_Fitness',\n",
" 10: 'People_and_Society',\n",
" 11: 'Business_and_Industrial',\n",
" 12: 'Reference',\n",
" 13: 'Shopping',\n",
" 14: 'Travel_and_Transportation',\n",
" 15: 'Food_and_Drink',\n",
" 16: 'Law_and_Government',\n",
" 17: 'Books_and_Literature',\n",
" 18: 'Finance',\n",
" 19: 'Games',\n",
" 20: 'Home_and_Garden',\n",
" 21: 'Jobs_and_Education',\n",
" 22: 'Arts_and_Entertainment',\n",
" 23: 'Sensitive Subjects',\n",
" 24: 'Real Estate',\n",
" 25: 'Internet_and_Telecom',\n",
" 26: 'Sports'}"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Invert the mapping so that a label id looks up its category name.\n",
"data_cat_dict_rev = {\n",
"    label_id: name for name, label_id in data_cat_dict.items()\n",
"}\n",
"\n",
"data_cat_dict_rev"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data_categories/Hobbies_and_Leisure.csv: True\n",
"data_categories/News.csv: True\n",
"data_categories/Science.csv: True\n",
"data_categories/Autos_and_Vehicles.csv: True\n",
"data_categories/Health.csv: True\n",
"data_categories/Pets_and_Animals.csv: True\n",
"data_categories/Adult.csv: True\n",
"data_categories/Computers_and_Electronics.csv: True\n",
"data_categories/Online Communities.csv: True\n",
"data_categories/Beauty_and_Fitness.csv: True\n",
"data_categories/People_and_Society.csv: True\n",
"data_categories/Business_and_Industrial.csv: True\n",
"data_categories/Reference.csv: True\n",
"data_categories/Shopping.csv: True\n",
"data_categories/Travel_and_Transportation.csv: True\n",
"data_categories/Food_and_Drink.csv: True\n",
"data_categories/Law_and_Government.csv: True\n",
"data_categories/Books_and_Literature.csv: True\n",
"data_categories/Finance.csv: True\n",
"data_categories/Games.csv: True\n",
"data_categories/Home_and_Garden.csv: True\n",
"data_categories/Jobs_and_Education.csv: True\n",
"data_categories/Arts_and_Entertainment.csv: True\n",
"data_categories/Sensitive Subjects.csv: True\n",
"data_categories/Real Estate.csv: True\n",
"data_categories/Internet_and_Telecom.csv: True\n",
"data_categories/Sports.csv: True\n"
]
},
{
"data": {
"text/plain": [
"27"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One CSV path per category name, in the same order as the mapping's keys.\n",
"path_list = [\n",
"    os.path.join(\"data_categories\", f'{category}.csv')\n",
"    for category in data_cat_dict\n",
"]\n",
"\n",
"# Report whether each expected file is actually present on disk.\n",
"for csv_path in path_list:\n",
"    print(f\"{csv_path}: {os.path.exists(csv_path)}\")\n",
"\n",
"len(path_list)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" category | \n",
" label | \n",
" label_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Skincare routine | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Haircare tips | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Makeup tutorials | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Fitness workouts | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Anti-aging products | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" category label label_id\n",
"0 Skincare routine Hobbies_and_Leisure 0\n",
"1 Haircare tips Hobbies_and_Leisure 0\n",
"2 Makeup tutorials Hobbies_and_Leisure 0\n",
"3 Fitness workouts Hobbies_and_Leisure 0\n",
"4 Anti-aging products Hobbies_and_Leisure 0"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the first category file and tag every row with its label name and id.\n",
"first_label = data_cat_dict_rev[0]\n",
"df = pd.read_csv(path_list[0]).assign(\n",
"    label=first_label,\n",
"    label_id=data_cat_dict[first_label],\n",
")\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" category | \n",
" label | \n",
" label_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Skincare routine | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" Haircare tips | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" Makeup tutorials | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" Fitness workouts | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" Anti-aging products | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" category label label_id\n",
"0 Skincare routine Hobbies_and_Leisure 0\n",
"1 Haircare tips Hobbies_and_Leisure 0\n",
"2 Makeup tutorials Hobbies_and_Leisure 0\n",
"3 Fitness workouts Hobbies_and_Leisure 0\n",
"4 Anti-aging products Hobbies_and_Leisure 0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Start the combined frame from an independent (deep) copy of the first\n",
"# category so later work on it cannot touch `df`.\n",
"concat_df = df.copy(deep=True)\n",
"concat_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"data_categories/News.csv\n",
"data_categories/Science.csv\n",
"data_categories/Autos_and_Vehicles.csv\n",
"data_categories/Health.csv\n",
"data_categories/Pets_and_Animals.csv\n",
"data_categories/Adult.csv\n",
"data_categories/Computers_and_Electronics.csv\n",
"data_categories/Online Communities.csv\n",
"data_categories/Beauty_and_Fitness.csv\n",
"data_categories/People_and_Society.csv\n",
"data_categories/Business_and_Industrial.csv\n",
"data_categories/Reference.csv\n",
"data_categories/Shopping.csv\n",
"data_categories/Travel_and_Transportation.csv\n",
"data_categories/Food_and_Drink.csv\n",
"data_categories/Law_and_Government.csv\n",
"data_categories/Books_and_Literature.csv\n",
"data_categories/Finance.csv\n",
"data_categories/Games.csv\n",
"data_categories/Home_and_Garden.csv\n",
"data_categories/Jobs_and_Education.csv\n",
"data_categories/Arts_and_Entertainment.csv\n",
"data_categories/Sensitive Subjects.csv\n",
"data_categories/Real Estate.csv\n",
"data_categories/Internet_and_Telecom.csv\n",
"data_categories/Sports.csv\n"
]
}
],
"source": [
"# Pair every remaining CSV path with the category it was built from, then stack\n",
"# all frames in a single pd.concat call instead of re-copying the accumulator\n",
"# on each iteration. The loop length follows the mapping rather than a\n",
"# hard-coded 27. Seeding the list with `df` (the first category, loaded above)\n",
"# rather than `concat_df` keeps this cell idempotent when it is re-run.\n",
"frames = [df]\n",
"labelled_paths = list(zip(data_cat_dict.items(), path_list))\n",
"for (label, label_id), path in labelled_paths[1:]:\n",
"    print(path)\n",
"    df_i = pd.read_csv(path)\n",
"    df_i['label'] = label\n",
"    df_i['label_id'] = label_id\n",
"    frames.append(df_i)\n",
"\n",
"# Original per-file row indices are kept, exactly as before.\n",
"concat_df = pd.concat(frames)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" category | \n",
" label | \n",
" label_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 1201 | \n",
" Plus-size clothing stores and shops | \n",
" Shopping | \n",
" 13 | \n",
"
\n",
" \n",
" 853 | \n",
" Citation context extraction techniques | \n",
" Reference | \n",
" 12 | \n",
"
\n",
" \n",
" 1034 | \n",
" Cat ear headphones with aux | \n",
" Computers_and_Electronics | \n",
" 7 | \n",
"
\n",
" \n",
" 632 | \n",
" promote such behavior | \n",
" Sensitive Subjects | \n",
" 23 | \n",
"
\n",
" \n",
" 91 | \n",
" Literature review references | \n",
" Reference | \n",
" 12 | \n",
"
\n",
" \n",
" 168 | \n",
" Freedom of speech cases | \n",
" Law_and_Government | \n",
" 16 | \n",
"
\n",
" \n",
" 1111 | \n",
" French country kitchen design inspiration DIY | \n",
" Home_and_Garden | \n",
" 20 | \n",
"
\n",
" \n",
" 492 | \n",
" Credit score improvement techniques overview | \n",
" Finance | \n",
" 18 | \n",
"
\n",
" \n",
" 657 | \n",
" regulated by laws | \n",
" Sensitive Subjects | \n",
" 23 | \n",
"
\n",
" \n",
" 1037 | \n",
" Health Education for Seniors | \n",
" Health | \n",
" 4 | \n",
"
\n",
" \n",
" 109 | \n",
" Quantum mechanics experiments | \n",
" Science | \n",
" 2 | \n",
"
\n",
" \n",
" 538 | \n",
" Healthcare AI applications | \n",
" Science | \n",
" 2 | \n",
"
\n",
" \n",
" 1386 | \n",
" AirPods Pro Case | \n",
" Computers_and_Electronics | \n",
" 7 | \n",
"
\n",
" \n",
" 844 | \n",
" DIY home electrical repairs | \n",
" Home_and_Garden | \n",
" 20 | \n",
"
\n",
" \n",
" 439 | \n",
" tube sex | \n",
" Adult | \n",
" 6 | \n",
"
\n",
" \n",
" 231 | \n",
" Real estate sales tactics | \n",
" Real Estate | \n",
" 24 | \n",
"
\n",
" \n",
" 610 | \n",
" Home solar panel cleaning and maintenance | \n",
" Home_and_Garden | \n",
" 20 | \n",
"
\n",
" \n",
" 422 | \n",
" Real estate legal issues | \n",
" Real Estate | \n",
" 24 | \n",
"
\n",
" \n",
" 222 | \n",
" Film industry news | \n",
" Arts_and_Entertainment | \n",
" 22 | \n",
"
\n",
" \n",
" 1077 | \n",
" Cat ear headphones for PS4 | \n",
" Computers_and_Electronics | \n",
" 7 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" category \\\n",
"1201 Plus-size clothing stores and shops \n",
"853 Citation context extraction techniques \n",
"1034 Cat ear headphones with aux \n",
"632 promote such behavior \n",
"91 Literature review references \n",
"168 Freedom of speech cases \n",
"1111 French country kitchen design inspiration DIY \n",
"492 Credit score improvement techniques overview \n",
"657 regulated by laws \n",
"1037 Health Education for Seniors \n",
"109 Quantum mechanics experiments \n",
"538 Healthcare AI applications \n",
"1386 AirPods Pro Case \n",
"844 DIY home electrical repairs \n",
"439 tube sex \n",
"231 Real estate sales tactics \n",
"610 Home solar panel cleaning and maintenance \n",
"422 Real estate legal issues \n",
"222 Film industry news \n",
"1077 Cat ear headphones for PS4 \n",
"\n",
" label label_id \n",
"1201 Shopping 13 \n",
"853 Reference 12 \n",
"1034 Computers_and_Electronics 7 \n",
"632 Sensitive Subjects 23 \n",
"91 Reference 12 \n",
"168 Law_and_Government 16 \n",
"1111 Home_and_Garden 20 \n",
"492 Finance 18 \n",
"657 Sensitive Subjects 23 \n",
"1037 Health 4 \n",
"109 Science 2 \n",
"538 Science 2 \n",
"1386 Computers_and_Electronics 7 \n",
"844 Home_and_Garden 20 \n",
"439 Adult 6 \n",
"231 Real Estate 24 \n",
"610 Home_and_Garden 20 \n",
"422 Real Estate 24 \n",
"222 Arts_and_Entertainment 22 \n",
"1077 Computers_and_Electronics 7 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Eyeball 20 randomly drawn rows from the combined frame.\n",
"concat_df.sample(n=20)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"label\n",
"Computers_and_Electronics 1959\n",
"Shopping 1912\n",
"Food_and_Drink 1851\n",
"Reference 1453\n",
"Sports 1399\n",
"Online Communities 1396\n",
"Travel_and_Transportation 1355\n",
"Internet_and_Telecom 1353\n",
"Pets_and_Animals 1324\n",
"Beauty_and_Fitness 1259\n",
"People_and_Society 1250\n",
"Law_and_Government 1226\n",
"Home_and_Garden 1200\n",
"News 1199\n",
"Jobs_and_Education 1188\n",
"Real Estate 1166\n",
"Arts_and_Entertainment 1162\n",
"Business_and_Industrial 1124\n",
"Adult 1100\n",
"Health 1098\n",
"Autos_and_Vehicles 1072\n",
"Science 1055\n",
"Hobbies_and_Leisure 1049\n",
"Books_and_Literature 1000\n",
"Finance 1000\n",
"Sensitive Subjects 762\n",
"Games 700\n",
"Name: count, dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows per category label, most frequent first.\n",
"concat_df['label'].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" category | \n",
" label | \n",
" label_id | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Internet usage monitoring | \n",
" Internet_and_Telecom | \n",
" 25 | \n",
"
\n",
" \n",
" 1 | \n",
" Food safety guidelines and regulations | \n",
" Food_and_Drink | \n",
" 15 | \n",
"
\n",
" \n",
" 2 | \n",
" Internet protocols and edge computing in finance | \n",
" Internet_and_Telecom | \n",
" 25 | \n",
"
\n",
" \n",
" 3 | \n",
" Online grocery shopping | \n",
" Food_and_Drink | \n",
" 15 | \n",
"
\n",
" \n",
" 4 | \n",
" Writing retreats for poets and novelists | \n",
" Books_and_Literature | \n",
" 17 | \n",
"
\n",
" \n",
" 5 | \n",
" Unicorn cat ear headphones | \n",
" Computers_and_Electronics | \n",
" 7 | \n",
"
\n",
" \n",
" 6 | \n",
" Reference citation context tagging techniques | \n",
" Reference | \n",
" 12 | \n",
"
\n",
" \n",
" 7 | \n",
" Motorcycle riding tips for beginners gear chec... | \n",
" Autos_and_Vehicles | \n",
" 3 | \n",
"
\n",
" \n",
" 8 | \n",
" Space agency missions | \n",
" Science | \n",
" 2 | \n",
"
\n",
" \n",
" 9 | \n",
" Game streaming self-promotion and growth tactics | \n",
" Games | \n",
" 19 | \n",
"
\n",
" \n",
" 10 | \n",
" sex videos movies | \n",
" Adult | \n",
" 6 | \n",
"
\n",
" \n",
" 11 | \n",
" Citation context organization methods | \n",
" Reference | \n",
" 12 | \n",
"
\n",
" \n",
" 12 | \n",
" Healthy office snacks | \n",
" Health | \n",
" 4 | \n",
"
\n",
" \n",
" 13 | \n",
" Indigenous rights advocacy | \n",
" People_and_Society | \n",
" 10 | \n",
"
\n",
" \n",
" 14 | \n",
" News talk shows | \n",
" News | \n",
" 1 | \n",
"
\n",
" \n",
" 15 | \n",
" Best facial cleansers | \n",
" Hobbies_and_Leisure | \n",
" 0 | \n",
"
\n",
" \n",
" 16 | \n",
" Letter of recommendation | \n",
" Reference | \n",
" 12 | \n",
"
\n",
" \n",
" 17 | \n",
" Fossil preservation techniques | \n",
" Science | \n",
" 2 | \n",
"
\n",
" \n",
" 18 | \n",
" Marriage equality | \n",
" People_and_Society | \n",
" 10 | \n",
"
\n",
" \n",
" 19 | \n",
" eSports Game Esports Player Fan Engagement Ini... | \n",
" Sports | \n",
" 26 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" category \\\n",
"0 Internet usage monitoring \n",
"1 Food safety guidelines and regulations \n",
"2 Internet protocols and edge computing in finance \n",
"3 Online grocery shopping \n",
"4 Writing retreats for poets and novelists \n",
"5 Unicorn cat ear headphones \n",
"6 Reference citation context tagging techniques \n",
"7 Motorcycle riding tips for beginners gear chec... \n",
"8 Space agency missions \n",
"9 Game streaming self-promotion and growth tactics \n",
"10 sex videos movies \n",
"11 Citation context organization methods \n",
"12 Healthy office snacks \n",
"13 Indigenous rights advocacy \n",
"14 News talk shows \n",
"15 Best facial cleansers \n",
"16 Letter of recommendation \n",
"17 Fossil preservation techniques \n",
"18 Marriage equality \n",
"19 eSports Game Esports Player Fan Engagement Ini... \n",
"\n",
" label label_id \n",
"0 Internet_and_Telecom 25 \n",
"1 Food_and_Drink 15 \n",
"2 Internet_and_Telecom 25 \n",
"3 Food_and_Drink 15 \n",
"4 Books_and_Literature 17 \n",
"5 Computers_and_Electronics 7 \n",
"6 Reference 12 \n",
"7 Autos_and_Vehicles 3 \n",
"8 Science 2 \n",
"9 Games 19 \n",
"10 Adult 6 \n",
"11 Reference 12 \n",
"12 Health 4 \n",
"13 People_and_Society 10 \n",
"14 News 1 \n",
"15 Hobbies_and_Leisure 0 \n",
"16 Reference 12 \n",
"17 Science 2 \n",
"18 People_and_Society 10 \n",
"19 Sports 26 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Shuffle every row and renumber the index from 0. The fixed seed makes the row\n",
"# order, and therefore the exported CSV, reproducible across runs.\n",
"cdf_shffeled = concat_df.sample(frac=1, random_state=42).reset_index(drop=True)\n",
"cdf_shffeled.head(20)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Persist the shuffled, labelled rows without writing the index column.\n",
"final_csv_path = 'data_categories/Final_Category_Data_With_Labels.csv'\n",
"cdf_shffeled.to_csv(final_csv_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 2
}