igorithm committed on
Commit
1a450dd
·
1 Parent(s): 7aae4ed

Dataset download utilities

Browse files
category_classification/datasets/download_common.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from kaggle import api as kapi
4
+ import pandas as pd
5
+ from sklearn.model_selection import train_test_split as sk_train_test_split
6
+
7
+
8
def download_dataset(dest_dir, dataset, filename):
    """Fetch one file of a Kaggle dataset into dest_dir.

    Skips the download entirely when the target file already exists
    locally, so repeated calls are cheap.
    """
    target = Path(dest_dir) / filename
    if target.exists():
        print('Dataset already exists, do not download')
        return
    print('Downloading dataset...')
    kapi.dataset_download_file(dataset=dataset, file_name=filename, path=dest_dir, quiet=False)
14
+
15
+
16
# Takes a lot of RAM
def read_dataset(dest_dir, filename) -> pd.DataFrame:
    """Load a line-delimited JSON dataset file into a DataFrame.

    The file is expected at dest_dir/filename, one JSON object per line.
    """
    print('Reading dataset...')
    frame = pd.read_json(Path(dest_dir) / filename, lines=True)
    print('Dataset read')
    return frame
23
+
24
+
25
def download_and_read_dataset(dest_dir, dataset, filename):
    """Ensure the dataset file is present locally, then load it as a DataFrame."""
    download_dataset(dest_dir, dataset, filename)
    return read_dataset(dest_dir, filename)
28
+
29
+
30
def filter_columns(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Project df down to the given list of columns, returning the selection."""
    print("Removing unwanted columns...")
    selected = df[columns]
    print("Columns removed...")
    return selected
35
+
36
+
37
def create_features_labels(df: pd.DataFrame, old_label, new_label):
    """Split df into a features frame and a label Series.

    The label is derived from the old_label column: take the first
    whitespace-separated token, truncated at the first '.' if present
    (e.g. 'cs.LG math.ST' -> 'cs'). The returned Series is renamed to
    new_label; features are df with the old_label column dropped.
    """
    def _primary_category(raw):
        # first token, with any '.'-suffixed subcategory stripped
        return raw.split()[0].partition('.')[0]

    labels = df[old_label].apply(_primary_category).rename(new_label)
    features = df.drop(old_label, axis=1)
    return features, labels
48
+
49
+
50
def train_test_split(X, y, test_size=0.25):
    """Stratified train/test split (thin wrapper over scikit-learn).

    Stratifies on y so class proportions are preserved in both splits.
    """
    split = sk_train_test_split(X, y, test_size=test_size, stratify=y)
    return split
52
+
53
+
54
def write_dataset(dest_dir, X, y, filename):
    """Write features X and labels y as one line-delimited JSON file.

    The features frame and label Series are concatenated column-wise and
    written to dest_dir/filename, one JSON record per line (the same
    format read_dataset expects).

    Bug fix: the original computed Path(dest_dir) but never used it,
    writing to `filename` in the current working directory instead of
    under dest_dir.
    """
    out_path = Path(dest_dir) / filename
    df = pd.concat((X, y), axis=1)
    df.to_json(out_path, orient="records", lines=True)
requirements.txt CHANGED
@@ -65,6 +65,8 @@ jupyterlab==4.3.6
65
  jupyterlab_pygments==0.3.0
66
  jupyterlab_server==2.27.3
67
  jupyterlab_widgets==3.0.13
 
 
68
  MarkupSafe==3.0.2
69
  matplotlib-inline==0.1.7
70
  mistune==3.1.3
@@ -114,6 +116,7 @@ pydeck==0.9.1
114
  Pygments==2.19.1
115
  python-dateutil==2.9.0.post0
116
  python-json-logger==3.3.0
 
117
  pytz==2025.2
118
  PyYAML==6.0.2
119
  pyzmq==26.4.0
@@ -136,6 +139,7 @@ streamlit==1.44.1
136
  sympy==1.13.1
137
  tenacity==9.1.2
138
  terminado==0.18.1
 
139
  threadpoolctl==3.6.0
140
  tinycss2==1.4.0
141
  tokenizers==0.21.1
 
65
  jupyterlab_pygments==0.3.0
66
  jupyterlab_server==2.27.3
67
  jupyterlab_widgets==3.0.13
68
+ kaggle==1.7.4.2
69
+ kagglehub==0.3.11
70
  MarkupSafe==3.0.2
71
  matplotlib-inline==0.1.7
72
  mistune==3.1.3
 
116
  Pygments==2.19.1
117
  python-dateutil==2.9.0.post0
118
  python-json-logger==3.3.0
119
+ python-slugify==8.0.4
120
  pytz==2025.2
121
  PyYAML==6.0.2
122
  pyzmq==26.4.0
 
139
  sympy==1.13.1
140
  tenacity==9.1.2
141
  terminado==0.18.1
142
+ text-unidecode==1.3
143
  threadpoolctl==3.6.0
144
  tinycss2==1.4.0
145
  tokenizers==0.21.1