Ubuntu committed on
Commit
7bd8341
1 Parent(s): fca1cc4

created the dataset for categorical classification

Browse files
data_categories/Final_Category_Data_With_Labels.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c2ba96d90a437a017a25af64364a58c7e2954ca1519a5fce27d0e55addae8da
3
+ size 1810529
research/08_organizing_the_entire_datacategories.ipynb ADDED
@@ -0,0 +1,919 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os; os.chdir('..');"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 2,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import json"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "metadata": {},
25
+ "outputs": [
26
+ {
27
+ "data": {
28
+ "text/plain": [
29
+ "{'Beauty_and_Fitness': 0,\n",
30
+ " 'People_and_Society': 1,\n",
31
+ " 'Travel_and_Transportation': 2,\n",
32
+ " 'Shopping': 3,\n",
33
+ " 'Adult': 4,\n",
34
+ " 'Sports': 5,\n",
35
+ " 'Science': 6,\n",
36
+ " 'Food_and_Drink': 7,\n",
37
+ " 'News': 8,\n",
38
+ " 'Sensitive Subjects': 9,\n",
39
+ " 'Autos_and_Vehicles': 10,\n",
40
+ " 'Law_and_Government': 11,\n",
41
+ " 'Business_and_Industrial': 12,\n",
42
+ " 'Health': 13,\n",
43
+ " 'Real Estate': 14,\n",
44
+ " 'Books_and_Literature': 15,\n",
45
+ " 'Computers_and_Electronics': 16,\n",
46
+ " 'Internet_and_Telecom': 17,\n",
47
+ " 'Home_and_Garden': 18,\n",
48
+ " 'Jobs_and_Education': 19,\n",
49
+ " 'Online Communities': 20,\n",
50
+ " 'Finance': 21,\n",
51
+ " 'Arts_and_Entertainment': 22,\n",
52
+ " 'Games': 23,\n",
53
+ " 'Hobbies_and_Leisure': 24,\n",
54
+ " 'Reference': 25,\n",
55
+ " 'Pets_and_Animals': 26}"
56
+ ]
57
+ },
58
+ "execution_count": 3,
59
+ "metadata": {},
60
+ "output_type": "execute_result"
61
+ }
62
+ ],
63
+ "source": [
64
+ "data_cat_dict= json.load(\n",
65
+ " open('data/categories_refined.json', 'r')\n",
66
+ ")\n",
67
+ "data_cat_dict"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 5,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "data": {
77
+ "text/plain": [
78
+ "{0: 'Beauty_and_Fitness',\n",
79
+ " 1: 'People_and_Society',\n",
80
+ " 2: 'Travel_and_Transportation',\n",
81
+ " 3: 'Shopping',\n",
82
+ " 4: 'Adult',\n",
83
+ " 5: 'Sports',\n",
84
+ " 6: 'Science',\n",
85
+ " 7: 'Food_and_Drink',\n",
86
+ " 8: 'News',\n",
87
+ " 9: 'Sensitive Subjects',\n",
88
+ " 10: 'Autos_and_Vehicles',\n",
89
+ " 11: 'Law_and_Government',\n",
90
+ " 12: 'Business_and_Industrial',\n",
91
+ " 13: 'Health',\n",
92
+ " 14: 'Real Estate',\n",
93
+ " 15: 'Books_and_Literature',\n",
94
+ " 16: 'Computers_and_Electronics',\n",
95
+ " 17: 'Internet_and_Telecom',\n",
96
+ " 18: 'Home_and_Garden',\n",
97
+ " 19: 'Jobs_and_Education',\n",
98
+ " 20: 'Online Communities',\n",
99
+ " 21: 'Finance',\n",
100
+ " 22: 'Arts_and_Entertainment',\n",
101
+ " 23: 'Games',\n",
102
+ " 24: 'Hobbies_and_Leisure',\n",
103
+ " 25: 'Reference',\n",
104
+ " 26: 'Pets_and_Animals'}"
105
+ ]
106
+ },
107
+ "execution_count": 5,
108
+ "metadata": {},
109
+ "output_type": "execute_result"
110
+ }
111
+ ],
112
+ "source": [
113
+ "data_cat_dict_rev= {}\n",
114
+ "for key in data_cat_dict.keys():\n",
115
+ " data_cat_dict_rev[data_cat_dict[key]] = key\n",
116
+ " \n",
117
+ "data_cat_dict_rev"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "code",
122
+ "execution_count": 8,
123
+ "metadata": {},
124
+ "outputs": [
125
+ {
126
+ "name": "stdout",
127
+ "output_type": "stream",
128
+ "text": [
129
+ "data_categories/Beauty_and_Fitness.csv: True\n",
130
+ "data_categories/People_and_Society.csv: True\n",
131
+ "data_categories/Travel_and_Transportation.csv: True\n",
132
+ "data_categories/Shopping.csv: True\n",
133
+ "data_categories/Adult.csv: True\n",
134
+ "data_categories/Sports.csv: True\n",
135
+ "data_categories/Science.csv: True\n",
136
+ "data_categories/Food_and_Drink.csv: True\n",
137
+ "data_categories/News.csv: True\n",
138
+ "data_categories/Sensitive Subjects.csv: True\n",
139
+ "data_categories/Autos_and_Vehicles.csv: True\n",
140
+ "data_categories/Law_and_Government.csv: True\n",
141
+ "data_categories/Business_and_Industrial.csv: True\n",
142
+ "data_categories/Health.csv: True\n",
143
+ "data_categories/Real Estate.csv: True\n",
144
+ "data_categories/Books_and_Literature.csv: True\n",
145
+ "data_categories/Computers_and_Electronics.csv: True\n",
146
+ "data_categories/Internet_and_Telecom.csv: True\n",
147
+ "data_categories/Home_and_Garden.csv: True\n",
148
+ "data_categories/Jobs_and_Education.csv: True\n",
149
+ "data_categories/Online Communities.csv: True\n",
150
+ "data_categories/Finance.csv: True\n",
151
+ "data_categories/Arts_and_Entertainment.csv: True\n",
152
+ "data_categories/Games.csv: True\n",
153
+ "data_categories/Hobbies_and_Leisure.csv: True\n",
154
+ "data_categories/Reference.csv: True\n",
155
+ "data_categories/Pets_and_Animals.csv: True\n"
156
+ ]
157
+ },
158
+ {
159
+ "data": {
160
+ "text/plain": [
161
+ "27"
162
+ ]
163
+ },
164
+ "execution_count": 8,
165
+ "metadata": {},
166
+ "output_type": "execute_result"
167
+ }
168
+ ],
169
+ "source": [
170
+ "path_list= []\n",
171
+ "for i in data_cat_dict.keys():\n",
172
+ " path= os.path.join(\"data_categories\", f'{i}.csv')\n",
173
+ " print(f\"{path}: {os.path.exists(path)}\")\n",
174
+ " path_list.append(path)\n",
175
+ " \n",
176
+ "len(path_list)"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 9,
182
+ "metadata": {},
183
+ "outputs": [],
184
+ "source": [
185
+ "import pandas as pd"
186
+ ]
187
+ },
188
+ {
189
+ "cell_type": "code",
190
+ "execution_count": 24,
191
+ "metadata": {},
192
+ "outputs": [
193
+ {
194
+ "data": {
195
+ "text/html": [
196
+ "<div>\n",
197
+ "<style scoped>\n",
198
+ " .dataframe tbody tr th:only-of-type {\n",
199
+ " vertical-align: middle;\n",
200
+ " }\n",
201
+ "\n",
202
+ " .dataframe tbody tr th {\n",
203
+ " vertical-align: top;\n",
204
+ " }\n",
205
+ "\n",
206
+ " .dataframe thead th {\n",
207
+ " text-align: right;\n",
208
+ " }\n",
209
+ "</style>\n",
210
+ "<table border=\"1\" class=\"dataframe\">\n",
211
+ " <thead>\n",
212
+ " <tr style=\"text-align: right;\">\n",
213
+ " <th></th>\n",
214
+ " <th>category</th>\n",
215
+ " <th>label</th>\n",
216
+ " <th>label_id</th>\n",
217
+ " </tr>\n",
218
+ " </thead>\n",
219
+ " <tbody>\n",
220
+ " <tr>\n",
221
+ " <th>0</th>\n",
222
+ " <td>Makeup tutorials</td>\n",
223
+ " <td>Beauty_and_Fitness</td>\n",
224
+ " <td>0</td>\n",
225
+ " </tr>\n",
226
+ " <tr>\n",
227
+ " <th>1</th>\n",
228
+ " <td>Skin care routines</td>\n",
229
+ " <td>Beauty_and_Fitness</td>\n",
230
+ " <td>0</td>\n",
231
+ " </tr>\n",
232
+ " <tr>\n",
233
+ " <th>2</th>\n",
234
+ " <td>Hairstyling tips</td>\n",
235
+ " <td>Beauty_and_Fitness</td>\n",
236
+ " <td>0</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>3</th>\n",
240
+ " <td>Weight loss programs</td>\n",
241
+ " <td>Beauty_and_Fitness</td>\n",
242
+ " <td>0</td>\n",
243
+ " </tr>\n",
244
+ " <tr>\n",
245
+ " <th>4</th>\n",
246
+ " <td>Yoga for beginners</td>\n",
247
+ " <td>Beauty_and_Fitness</td>\n",
248
+ " <td>0</td>\n",
249
+ " </tr>\n",
250
+ " </tbody>\n",
251
+ "</table>\n",
252
+ "</div>"
253
+ ],
254
+ "text/plain": [
255
+ " category label label_id\n",
256
+ "0 Makeup tutorials Beauty_and_Fitness 0\n",
257
+ "1 Skin care routines Beauty_and_Fitness 0\n",
258
+ "2 Hairstyling tips Beauty_and_Fitness 0\n",
259
+ "3 Weight loss programs Beauty_and_Fitness 0\n",
260
+ "4 Yoga for beginners Beauty_and_Fitness 0"
261
+ ]
262
+ },
263
+ "execution_count": 24,
264
+ "metadata": {},
265
+ "output_type": "execute_result"
266
+ }
267
+ ],
268
+ "source": [
269
+ "df= pd.read_csv(path_list[0])\n",
270
+ "df['label']= data_cat_dict_rev[0]\n",
271
+ "df['label_id']= data_cat_dict[data_cat_dict_rev[0]]\n",
272
+ "df.head()"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 25,
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "data": {
282
+ "text/html": [
283
+ "<div>\n",
284
+ "<style scoped>\n",
285
+ " .dataframe tbody tr th:only-of-type {\n",
286
+ " vertical-align: middle;\n",
287
+ " }\n",
288
+ "\n",
289
+ " .dataframe tbody tr th {\n",
290
+ " vertical-align: top;\n",
291
+ " }\n",
292
+ "\n",
293
+ " .dataframe thead th {\n",
294
+ " text-align: right;\n",
295
+ " }\n",
296
+ "</style>\n",
297
+ "<table border=\"1\" class=\"dataframe\">\n",
298
+ " <thead>\n",
299
+ " <tr style=\"text-align: right;\">\n",
300
+ " <th></th>\n",
301
+ " <th>category</th>\n",
302
+ " <th>label</th>\n",
303
+ " <th>label_id</th>\n",
304
+ " </tr>\n",
305
+ " </thead>\n",
306
+ " <tbody>\n",
307
+ " <tr>\n",
308
+ " <th>0</th>\n",
309
+ " <td>Makeup tutorials</td>\n",
310
+ " <td>Beauty_and_Fitness</td>\n",
311
+ " <td>0</td>\n",
312
+ " </tr>\n",
313
+ " <tr>\n",
314
+ " <th>1</th>\n",
315
+ " <td>Skin care routines</td>\n",
316
+ " <td>Beauty_and_Fitness</td>\n",
317
+ " <td>0</td>\n",
318
+ " </tr>\n",
319
+ " <tr>\n",
320
+ " <th>2</th>\n",
321
+ " <td>Hairstyling tips</td>\n",
322
+ " <td>Beauty_and_Fitness</td>\n",
323
+ " <td>0</td>\n",
324
+ " </tr>\n",
325
+ " <tr>\n",
326
+ " <th>3</th>\n",
327
+ " <td>Weight loss programs</td>\n",
328
+ " <td>Beauty_and_Fitness</td>\n",
329
+ " <td>0</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>4</th>\n",
333
+ " <td>Yoga for beginners</td>\n",
334
+ " <td>Beauty_and_Fitness</td>\n",
335
+ " <td>0</td>\n",
336
+ " </tr>\n",
337
+ " </tbody>\n",
338
+ "</table>\n",
339
+ "</div>"
340
+ ],
341
+ "text/plain": [
342
+ " category label label_id\n",
343
+ "0 Makeup tutorials Beauty_and_Fitness 0\n",
344
+ "1 Skin care routines Beauty_and_Fitness 0\n",
345
+ "2 Hairstyling tips Beauty_and_Fitness 0\n",
346
+ "3 Weight loss programs Beauty_and_Fitness 0\n",
347
+ "4 Yoga for beginners Beauty_and_Fitness 0"
348
+ ]
349
+ },
350
+ "execution_count": 25,
351
+ "metadata": {},
352
+ "output_type": "execute_result"
353
+ }
354
+ ],
355
+ "source": [
356
+ "concat_df= df.copy()\n",
357
+ "concat_df.head()"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 26,
363
+ "metadata": {},
364
+ "outputs": [
365
+ {
366
+ "name": "stdout",
367
+ "output_type": "stream",
368
+ "text": [
369
+ "data_categories/People_and_Society.csv\n",
370
+ "data_categories/Travel_and_Transportation.csv\n",
371
+ "data_categories/Shopping.csv\n",
372
+ "data_categories/Adult.csv\n",
373
+ "data_categories/Sports.csv\n",
374
+ "data_categories/Science.csv\n",
375
+ "data_categories/Food_and_Drink.csv\n",
376
+ "data_categories/News.csv\n",
377
+ "data_categories/Sensitive Subjects.csv\n",
378
+ "data_categories/Autos_and_Vehicles.csv\n",
379
+ "data_categories/Law_and_Government.csv\n",
380
+ "data_categories/Business_and_Industrial.csv\n",
381
+ "data_categories/Health.csv\n",
382
+ "data_categories/Real Estate.csv\n",
383
+ "data_categories/Books_and_Literature.csv\n",
384
+ "data_categories/Computers_and_Electronics.csv\n",
385
+ "data_categories/Internet_and_Telecom.csv\n",
386
+ "data_categories/Home_and_Garden.csv\n",
387
+ "data_categories/Jobs_and_Education.csv\n",
388
+ "data_categories/Online Communities.csv\n",
389
+ "data_categories/Finance.csv\n",
390
+ "data_categories/Arts_and_Entertainment.csv\n",
391
+ "data_categories/Games.csv\n",
392
+ "data_categories/Hobbies_and_Leisure.csv\n",
393
+ "data_categories/Reference.csv\n",
394
+ "data_categories/Pets_and_Animals.csv\n"
395
+ ]
396
+ }
397
+ ],
398
+ "source": [
399
+ "for i in range(1, 27):\n",
400
+ " print(path_list[i])\n",
401
+ " df_i= pd.read_csv(path_list[i])\n",
402
+ " df_i['label']= data_cat_dict_rev[i]\n",
403
+ " df_i['label_id']= data_cat_dict[data_cat_dict_rev[i]]\n",
404
+ " concat_df= pd.concat([concat_df, df_i])\n",
405
+ " "
406
+ ]
407
+ },
408
+ {
409
+ "cell_type": "code",
410
+ "execution_count": 27,
411
+ "metadata": {},
412
+ "outputs": [
413
+ {
414
+ "data": {
415
+ "text/html": [
416
+ "<div>\n",
417
+ "<style scoped>\n",
418
+ " .dataframe tbody tr th:only-of-type {\n",
419
+ " vertical-align: middle;\n",
420
+ " }\n",
421
+ "\n",
422
+ " .dataframe tbody tr th {\n",
423
+ " vertical-align: top;\n",
424
+ " }\n",
425
+ "\n",
426
+ " .dataframe thead th {\n",
427
+ " text-align: right;\n",
428
+ " }\n",
429
+ "</style>\n",
430
+ "<table border=\"1\" class=\"dataframe\">\n",
431
+ " <thead>\n",
432
+ " <tr style=\"text-align: right;\">\n",
433
+ " <th></th>\n",
434
+ " <th>category</th>\n",
435
+ " <th>label</th>\n",
436
+ " <th>label_id</th>\n",
437
+ " </tr>\n",
438
+ " </thead>\n",
439
+ " <tbody>\n",
440
+ " <tr>\n",
441
+ " <th>333</th>\n",
442
+ " <td>collection</td>\n",
443
+ " <td>Adult</td>\n",
444
+ " <td>4</td>\n",
445
+ " </tr>\n",
446
+ " <tr>\n",
447
+ " <th>1463</th>\n",
448
+ " <td>Budget-friendly home decor and decoration</td>\n",
449
+ " <td>Shopping</td>\n",
450
+ " <td>3</td>\n",
451
+ " </tr>\n",
452
+ " <tr>\n",
453
+ " <th>556</th>\n",
454
+ " <td>Hair coloring ideas</td>\n",
455
+ " <td>Beauty_and_Fitness</td>\n",
456
+ " <td>0</td>\n",
457
+ " </tr>\n",
458
+ " <tr>\n",
459
+ " <th>723</th>\n",
460
+ " <td>Makeup for dry skin</td>\n",
461
+ " <td>Beauty_and_Fitness</td>\n",
462
+ " <td>0</td>\n",
463
+ " </tr>\n",
464
+ " <tr>\n",
465
+ " <th>417</th>\n",
466
+ " <td>Sports Team Fan Enthusiasm</td>\n",
467
+ " <td>Sports</td>\n",
468
+ " <td>5</td>\n",
469
+ " </tr>\n",
470
+ " <tr>\n",
471
+ " <th>1351</th>\n",
472
+ " <td>Telecommunication industry innovation in healt...</td>\n",
473
+ " <td>Internet_and_Telecom</td>\n",
474
+ " <td>17</td>\n",
475
+ " </tr>\n",
476
+ " <tr>\n",
477
+ " <th>284</th>\n",
478
+ " <td>XXX gay movies</td>\n",
479
+ " <td>Adult</td>\n",
480
+ " <td>4</td>\n",
481
+ " </tr>\n",
482
+ " <tr>\n",
483
+ " <th>1150</th>\n",
484
+ " <td>Bohemian outdoor garden party decor DIY projec...</td>\n",
485
+ " <td>Home_and_Garden</td>\n",
486
+ " <td>18</td>\n",
487
+ " </tr>\n",
488
+ " <tr>\n",
489
+ " <th>115</th>\n",
490
+ " <td>Travel destination skiing</td>\n",
491
+ " <td>Travel_and_Transportation</td>\n",
492
+ " <td>2</td>\n",
493
+ " </tr>\n",
494
+ " <tr>\n",
495
+ " <th>411</th>\n",
496
+ " <td>Citation context accuracy measurement platforms</td>\n",
497
+ " <td>Reference</td>\n",
498
+ " <td>25</td>\n",
499
+ " </tr>\n",
500
+ " <tr>\n",
501
+ " <th>285</th>\n",
502
+ " <td>Art techniques and creative process discussions</td>\n",
503
+ " <td>Online Communities</td>\n",
504
+ " <td>20</td>\n",
505
+ " </tr>\n",
506
+ " <tr>\n",
507
+ " <th>1251</th>\n",
508
+ " <td>Food plating techniques for fine dining</td>\n",
509
+ " <td>Food_and_Drink</td>\n",
510
+ " <td>7</td>\n",
511
+ " </tr>\n",
512
+ " <tr>\n",
513
+ " <th>225</th>\n",
514
+ " <td>Job search for seniors</td>\n",
515
+ " <td>Jobs_and_Education</td>\n",
516
+ " <td>19</td>\n",
517
+ " </tr>\n",
518
+ " <tr>\n",
519
+ " <th>979</th>\n",
520
+ " <td>Beard care for beard grooming</td>\n",
521
+ " <td>Hobbies_and_Leisure</td>\n",
522
+ " <td>24</td>\n",
523
+ " </tr>\n",
524
+ " <tr>\n",
525
+ " <th>345</th>\n",
526
+ " <td>Travel destination local hospitality</td>\n",
527
+ " <td>Travel_and_Transportation</td>\n",
528
+ " <td>2</td>\n",
529
+ " </tr>\n",
530
+ " <tr>\n",
531
+ " <th>29</th>\n",
532
+ " <td>External hard drive</td>\n",
533
+ " <td>Computers_and_Electronics</td>\n",
534
+ " <td>16</td>\n",
535
+ " </tr>\n",
536
+ " <tr>\n",
537
+ " <th>556</th>\n",
538
+ " <td>Real estate contract law</td>\n",
539
+ " <td>Real Estate</td>\n",
540
+ " <td>14</td>\n",
541
+ " </tr>\n",
542
+ " <tr>\n",
543
+ " <th>812</th>\n",
544
+ " <td>Classic literature for historical research and...</td>\n",
545
+ " <td>Books_and_Literature</td>\n",
546
+ " <td>15</td>\n",
547
+ " </tr>\n",
548
+ " <tr>\n",
549
+ " <th>489</th>\n",
550
+ " <td>Theater posters for sale</td>\n",
551
+ " <td>Arts_and_Entertainment</td>\n",
552
+ " <td>22</td>\n",
553
+ " </tr>\n",
554
+ " <tr>\n",
555
+ " <th>873</th>\n",
556
+ " <td>LinkedIn job search for freelancers</td>\n",
557
+ " <td>Jobs_and_Education</td>\n",
558
+ " <td>19</td>\n",
559
+ " </tr>\n",
560
+ " </tbody>\n",
561
+ "</table>\n",
562
+ "</div>"
563
+ ],
564
+ "text/plain": [
565
+ " category \\\n",
566
+ "333 collection \n",
567
+ "1463 Budget-friendly home decor and decoration \n",
568
+ "556 Hair coloring ideas \n",
569
+ "723 Makeup for dry skin \n",
570
+ "417 Sports Team Fan Enthusiasm \n",
571
+ "1351 Telecommunication industry innovation in healt... \n",
572
+ "284 XXX gay movies \n",
573
+ "1150 Bohemian outdoor garden party decor DIY projec... \n",
574
+ "115 Travel destination skiing \n",
575
+ "411 Citation context accuracy measurement platforms \n",
576
+ "285 Art techniques and creative process discussions \n",
577
+ "1251 Food plating techniques for fine dining \n",
578
+ "225 Job search for seniors \n",
579
+ "979 Beard care for beard grooming \n",
580
+ "345 Travel destination local hospitality \n",
581
+ "29 External hard drive \n",
582
+ "556 Real estate contract law \n",
583
+ "812 Classic literature for historical research and... \n",
584
+ "489 Theater posters for sale \n",
585
+ "873 LinkedIn job search for freelancers \n",
586
+ "\n",
587
+ " label label_id \n",
588
+ "333 Adult 4 \n",
589
+ "1463 Shopping 3 \n",
590
+ "556 Beauty_and_Fitness 0 \n",
591
+ "723 Beauty_and_Fitness 0 \n",
592
+ "417 Sports 5 \n",
593
+ "1351 Internet_and_Telecom 17 \n",
594
+ "284 Adult 4 \n",
595
+ "1150 Home_and_Garden 18 \n",
596
+ "115 Travel_and_Transportation 2 \n",
597
+ "411 Reference 25 \n",
598
+ "285 Online Communities 20 \n",
599
+ "1251 Food_and_Drink 7 \n",
600
+ "225 Jobs_and_Education 19 \n",
601
+ "979 Hobbies_and_Leisure 24 \n",
602
+ "345 Travel_and_Transportation 2 \n",
603
+ "29 Computers_and_Electronics 16 \n",
604
+ "556 Real Estate 14 \n",
605
+ "812 Books_and_Literature 15 \n",
606
+ "489 Arts_and_Entertainment 22 \n",
607
+ "873 Jobs_and_Education 19 "
608
+ ]
609
+ },
610
+ "execution_count": 27,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "concat_df.sample(20)"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 28,
622
+ "metadata": {},
623
+ "outputs": [
624
+ {
625
+ "data": {
626
+ "text/plain": [
627
+ "label\n",
628
+ "Shopping 1505\n",
629
+ "Food_and_Drink 1500\n",
630
+ "Sports 1399\n",
631
+ "Travel_and_Transportation 1355\n",
632
+ "Internet_and_Telecom 1353\n",
633
+ "Reference 1315\n",
634
+ "Beauty_and_Fitness 1259\n",
635
+ "People_and_Society 1250\n",
636
+ "Pets_and_Animals 1228\n",
637
+ "Law_and_Government 1226\n",
638
+ "Home_and_Garden 1200\n",
639
+ "News 1199\n",
640
+ "Jobs_and_Education 1188\n",
641
+ "Arts_and_Entertainment 1162\n",
642
+ "Business_and_Industrial 1124\n",
643
+ "Adult 1100\n",
644
+ "Health 1098\n",
645
+ "Autos_and_Vehicles 1072\n",
646
+ "Science 1055\n",
647
+ "Hobbies_and_Leisure 1049\n",
648
+ "Computers_and_Electronics 1000\n",
649
+ "Online Communities 1000\n",
650
+ "Finance 1000\n",
651
+ "Books_and_Literature 1000\n",
652
+ "Real Estate 1000\n",
653
+ "Games 700\n",
654
+ "Sensitive Subjects 688\n",
655
+ "Name: count, dtype: int64"
656
+ ]
657
+ },
658
+ "execution_count": 28,
659
+ "metadata": {},
660
+ "output_type": "execute_result"
661
+ }
662
+ ],
663
+ "source": [
664
+ "concat_df.label.value_counts()"
665
+ ]
666
+ },
667
+ {
668
+ "cell_type": "code",
669
+ "execution_count": 29,
670
+ "metadata": {},
671
+ "outputs": [
672
+ {
673
+ "data": {
674
+ "text/html": [
675
+ "<div>\n",
676
+ "<style scoped>\n",
677
+ " .dataframe tbody tr th:only-of-type {\n",
678
+ " vertical-align: middle;\n",
679
+ " }\n",
680
+ "\n",
681
+ " .dataframe tbody tr th {\n",
682
+ " vertical-align: top;\n",
683
+ " }\n",
684
+ "\n",
685
+ " .dataframe thead th {\n",
686
+ " text-align: right;\n",
687
+ " }\n",
688
+ "</style>\n",
689
+ "<table border=\"1\" class=\"dataframe\">\n",
690
+ " <thead>\n",
691
+ " <tr style=\"text-align: right;\">\n",
692
+ " <th></th>\n",
693
+ " <th>category</th>\n",
694
+ " <th>label</th>\n",
695
+ " <th>label_id</th>\n",
696
+ " </tr>\n",
697
+ " </thead>\n",
698
+ " <tbody>\n",
699
+ " <tr>\n",
700
+ " <th>0</th>\n",
701
+ " <td>Scientific literature review</td>\n",
702
+ " <td>Science</td>\n",
703
+ " <td>6</td>\n",
704
+ " </tr>\n",
705
+ " <tr>\n",
706
+ " <th>1</th>\n",
707
+ " <td>LGBTQ+ community strategies</td>\n",
708
+ " <td>People_and_Society</td>\n",
709
+ " <td>1</td>\n",
710
+ " </tr>\n",
711
+ " <tr>\n",
712
+ " <th>2</th>\n",
713
+ " <td>Social services for vulnerable populations</td>\n",
714
+ " <td>People_and_Society</td>\n",
715
+ " <td>1</td>\n",
716
+ " </tr>\n",
717
+ " <tr>\n",
718
+ " <th>3</th>\n",
719
+ " <td>Graduate school admissions</td>\n",
720
+ " <td>Jobs_and_Education</td>\n",
721
+ " <td>19</td>\n",
722
+ " </tr>\n",
723
+ " <tr>\n",
724
+ " <th>4</th>\n",
725
+ " <td>Immigrant Health Education</td>\n",
726
+ " <td>Health</td>\n",
727
+ " <td>13</td>\n",
728
+ " </tr>\n",
729
+ " <tr>\n",
730
+ " <th>5</th>\n",
731
+ " <td>Travel deals for beachfront guesthouses</td>\n",
732
+ " <td>Travel_and_Transportation</td>\n",
733
+ " <td>2</td>\n",
734
+ " </tr>\n",
735
+ " <tr>\n",
736
+ " <th>6</th>\n",
737
+ " <td>Book subscription boxes</td>\n",
738
+ " <td>Books_and_Literature</td>\n",
739
+ " <td>15</td>\n",
740
+ " </tr>\n",
741
+ " <tr>\n",
742
+ " <th>7</th>\n",
743
+ " <td>Game streaming community building</td>\n",
744
+ " <td>Games</td>\n",
745
+ " <td>23</td>\n",
746
+ " </tr>\n",
747
+ " <tr>\n",
748
+ " <th>8</th>\n",
749
+ " <td>Retirement healthcare cost planning for health...</td>\n",
750
+ " <td>Finance</td>\n",
751
+ " <td>21</td>\n",
752
+ " </tr>\n",
753
+ " <tr>\n",
754
+ " <th>9</th>\n",
755
+ " <td>Campaign finance laws effectiveness impact</td>\n",
756
+ " <td>Law_and_Government</td>\n",
757
+ " <td>11</td>\n",
758
+ " </tr>\n",
759
+ " <tr>\n",
760
+ " <th>10</th>\n",
761
+ " <td>Vintage and antique furniture and decor items</td>\n",
762
+ " <td>Shopping</td>\n",
763
+ " <td>3</td>\n",
764
+ " </tr>\n",
765
+ " <tr>\n",
766
+ " <th>11</th>\n",
767
+ " <td>Volunteer opportunities near me</td>\n",
768
+ " <td>People_and_Society</td>\n",
769
+ " <td>1</td>\n",
770
+ " </tr>\n",
771
+ " <tr>\n",
772
+ " <th>12</th>\n",
773
+ " <td>Startup success stories</td>\n",
774
+ " <td>News</td>\n",
775
+ " <td>8</td>\n",
776
+ " </tr>\n",
777
+ " <tr>\n",
778
+ " <th>13</th>\n",
779
+ " <td>Internet connectivity solutions for sports org...</td>\n",
780
+ " <td>Internet_and_Telecom</td>\n",
781
+ " <td>17</td>\n",
782
+ " </tr>\n",
783
+ " <tr>\n",
784
+ " <th>14</th>\n",
785
+ " <td>Travel destination local experts</td>\n",
786
+ " <td>Travel_and_Transportation</td>\n",
787
+ " <td>2</td>\n",
788
+ " </tr>\n",
789
+ " <tr>\n",
790
+ " <th>15</th>\n",
791
+ " <td>Industrial revolution history</td>\n",
792
+ " <td>Business_and_Industrial</td>\n",
793
+ " <td>12</td>\n",
794
+ " </tr>\n",
795
+ " <tr>\n",
796
+ " <th>16</th>\n",
797
+ " <td>Backyard pond filtration systems</td>\n",
798
+ " <td>Home_and_Garden</td>\n",
799
+ " <td>18</td>\n",
800
+ " </tr>\n",
801
+ " <tr>\n",
802
+ " <th>17</th>\n",
803
+ " <td>Data center solutions providers list</td>\n",
804
+ " <td>Internet_and_Telecom</td>\n",
805
+ " <td>17</td>\n",
806
+ " </tr>\n",
807
+ " <tr>\n",
808
+ " <th>18</th>\n",
809
+ " <td>Wi-Fi signal optimization for hotels</td>\n",
810
+ " <td>Internet_and_Telecom</td>\n",
811
+ " <td>17</td>\n",
812
+ " </tr>\n",
813
+ " <tr>\n",
814
+ " <th>19</th>\n",
815
+ " <td>Smart home technology trends</td>\n",
816
+ " <td>Shopping</td>\n",
817
+ " <td>3</td>\n",
818
+ " </tr>\n",
819
+ " </tbody>\n",
820
+ "</table>\n",
821
+ "</div>"
822
+ ],
823
+ "text/plain": [
824
+ " category \\\n",
825
+ "0 Scientific literature review \n",
826
+ "1 LGBTQ+ community strategies \n",
827
+ "2 Social services for vulnerable populations \n",
828
+ "3 Graduate school admissions \n",
829
+ "4 Immigrant Health Education \n",
830
+ "5 Travel deals for beachfront guesthouses \n",
831
+ "6 Book subscription boxes \n",
832
+ "7 Game streaming community building \n",
833
+ "8 Retirement healthcare cost planning for health... \n",
834
+ "9 Campaign finance laws effectiveness impact \n",
835
+ "10 Vintage and antique furniture and decor items \n",
836
+ "11 Volunteer opportunities near me \n",
837
+ "12 Startup success stories \n",
838
+ "13 Internet connectivity solutions for sports org... \n",
839
+ "14 Travel destination local experts \n",
840
+ "15 Industrial revolution history \n",
841
+ "16 Backyard pond filtration systems \n",
842
+ "17 Data center solutions providers list \n",
843
+ "18 Wi-Fi signal optimization for hotels \n",
844
+ "19 Smart home technology trends \n",
845
+ "\n",
846
+ " label label_id \n",
847
+ "0 Science 6 \n",
848
+ "1 People_and_Society 1 \n",
849
+ "2 People_and_Society 1 \n",
850
+ "3 Jobs_and_Education 19 \n",
851
+ "4 Health 13 \n",
852
+ "5 Travel_and_Transportation 2 \n",
853
+ "6 Books_and_Literature 15 \n",
854
+ "7 Games 23 \n",
855
+ "8 Finance 21 \n",
856
+ "9 Law_and_Government 11 \n",
857
+ "10 Shopping 3 \n",
858
+ "11 People_and_Society 1 \n",
859
+ "12 News 8 \n",
860
+ "13 Internet_and_Telecom 17 \n",
861
+ "14 Travel_and_Transportation 2 \n",
862
+ "15 Business_and_Industrial 12 \n",
863
+ "16 Home_and_Garden 18 \n",
864
+ "17 Internet_and_Telecom 17 \n",
865
+ "18 Internet_and_Telecom 17 \n",
866
+ "19 Shopping 3 "
867
+ ]
868
+ },
869
+ "execution_count": 29,
870
+ "metadata": {},
871
+ "output_type": "execute_result"
872
+ }
873
+ ],
874
+ "source": [
875
+ "cdf_shffeled= concat_df.sample(frac=1).reset_index(drop=True)\n",
876
+ "cdf_shffeled.head(20)"
877
+ ]
878
+ },
879
+ {
880
+ "cell_type": "code",
881
+ "execution_count": 23,
882
+ "metadata": {},
883
+ "outputs": [],
884
+ "source": [
885
+ "cdf_shffeled.to_csv(\n",
886
+ " 'data_categories/Final_Category_Data_With_Labels.csv'\n",
887
+ ")"
888
+ ]
889
+ },
890
+ {
891
+ "cell_type": "code",
892
+ "execution_count": null,
893
+ "metadata": {},
894
+ "outputs": [],
895
+ "source": []
896
+ }
897
+ ],
898
+ "metadata": {
899
+ "kernelspec": {
900
+ "display_name": "venv",
901
+ "language": "python",
902
+ "name": "python3"
903
+ },
904
+ "language_info": {
905
+ "codemirror_mode": {
906
+ "name": "ipython",
907
+ "version": 3
908
+ },
909
+ "file_extension": ".py",
910
+ "mimetype": "text/x-python",
911
+ "name": "python",
912
+ "nbconvert_exporter": "python",
913
+ "pygments_lexer": "ipython3",
914
+ "version": "3.10.12"
915
+ }
916
+ },
917
+ "nbformat": 4,
918
+ "nbformat_minor": 2
919
+ }