{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Build the labelled category dataset\n",
    "\n",
    "Reads the category-to-id mapping from `data/categories_refined.json`, loads one CSV per category from `data_categories/`, tags every row with its `label` and `label_id`, shuffles the combined rows with a fixed seed, and writes `data_categories/Final_Category_Data_With_Labels.csv`.\n",
    "\n",
    "The notebook is safe to **Restart Kernel and Run All**: it never changes the working directory and the shuffle is seeded."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Paths are relative to the project root (the directory that contains `data/`).\n",
    "CATEGORY_MAP_RELPATH = Path('data') / 'categories_refined.json'\n",
    "CATEGORY_CSV_DIRNAME = 'data_categories'\n",
    "OUTPUT_FILENAME = 'Final_Category_Data_With_Labels.csv'\n",
    "\n",
    "# Seed for the final shuffle so the saved CSV is reproducible.\n",
    "SEED = 42"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def find_project_root(marker: Path) -> Path:\n",
    "    \"\"\"Return the nearest directory at or above the working directory that contains `marker`.\n",
    "\n",
    "    Replaces `os.chdir('..')`, which moved one level further up every time\n",
    "    the cell was re-run. This lookup has no side effects and gives the same\n",
    "    answer whether the kernel starts in the notebook folder or in the root.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: if no ancestor directory contains `marker`.\n",
    "    \"\"\"\n",
    "    here = Path.cwd().resolve()\n",
    "    for candidate in (here, *here.parents):\n",
    "        if (candidate / marker).is_file():\n",
    "            return candidate\n",
    "    raise FileNotFoundError(f'Could not find {marker} at or above {here}')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "PROJECT_ROOT = find_project_root(CATEGORY_MAP_RELPATH)\n",
    "CATEGORY_MAP_PATH = PROJECT_ROOT / CATEGORY_MAP_RELPATH\n",
    "CATEGORY_CSV_DIR = PROJECT_ROOT / CATEGORY_CSV_DIRNAME\n",
    "OUTPUT_PATH = CATEGORY_CSV_DIR / OUTPUT_FILENAME\n",
    "\n",
    "PROJECT_ROOT"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load the category mapping"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `with` closes the file handle; the bare `open(...)` it replaces never did.\n",
    "with CATEGORY_MAP_PATH.open('r', encoding='utf-8') as fh:\n",
    "    data_cat_dict = json.load(fh)\n",
    "\n",
    "data_cat_dict"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inverse lookup: label_id -> label name.\n",
    "data_cat_dict_rev = {label_id: label for label, label_id in data_cat_dict.items()}\n",
    "assert len(data_cat_dict_rev) == len(data_cat_dict), 'label ids must be unique'\n",
    "\n",
    "data_cat_dict_rev"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Locate the per-category CSV files\n",
    "\n",
    "One file per category, named `<label>.csv`. Fail early with the full list of missing files instead of crashing halfway through loading."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "path_list = [CATEGORY_CSV_DIR / f'{label}.csv' for label in data_cat_dict]\n",
    "\n",
    "missing = [str(path) for path in path_list if not path.is_file()]\n",
    "if missing:\n",
    "    raise FileNotFoundError(f'Missing category CSVs: {missing}')\n",
    "\n",
    "len(path_list)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and label every category"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_labelled_csv(path: Path, label: str, label_id: int) -> pd.DataFrame:\n",
    "    \"\"\"Read one category CSV and attach constant `label` and `label_id` columns.\"\"\"\n",
    "    return pd.read_csv(path).assign(label=label, label_id=label_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each file is paired with its own (label, label_id) from the mapping, so the\n",
    "# labels stay correct even if the ids in the JSON are not 0..n-1 in key order.\n",
    "# A single concat over the list avoids re-copying the frame on every iteration.\n",
    "concat_df = pd.concat(\n",
    "    [\n",
    "        load_labelled_csv(path, label, label_id)\n",
    "        for path, (label, label_id) in zip(path_list, data_cat_dict.items())\n",
    "    ],\n",
    "    ignore_index=True,\n",
    ")\n",
    "\n",
    "concat_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_df.sample(20, random_state=SEED)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "concat_df['label'].value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Shuffle and save"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Seeded shuffle: the same input files always produce the same output file.\n",
    "cdf_shuffled = concat_df.sample(frac=1, random_state=SEED).reset_index(drop=True)\n",
    "assert len(cdf_shuffled) == len(concat_df)\n",
    "\n",
    "cdf_shuffled.head(20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cdf_shuffled.to_csv(OUTPUT_PATH, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}