Spaces:

HibiscusMaximus
/

PaperClassification

Sleeping

App Files Files Community

igorithm committed on Apr 9

Commit

1a450dd

1 Parent(s): 7aae4ed

Dataset download utilities

Browse files

Files changed (2) hide show

category_classification/datasets/download_common.py +57 -0
requirements.txt +4 -0

category_classification/datasets/download_common.py ADDED Viewed

	@@ -0,0 +1,57 @@

+from pathlib import Path
+from kaggle import api as kapi
+import pandas as pd
+from sklearn.model_selection import train_test_split as sk_train_test_split
def download_dataset(dest_dir, dataset, filename):
    """Download one file of a dataset into ``dest_dir`` unless it is already present.

    Args:
        dest_dir: Directory the file is looked up in and downloaded to.
        dataset: Dataset identifier forwarded to the Kaggle API client.
        filename: Name of the file within the dataset; the same name is used
            for the local existence check.

    Returns:
        None. Progress is reported on stdout.
    """
    target = Path(dest_dir).joinpath(filename)
    if not target.exists():
        print('Downloading dataset...')
        kapi.dataset_download_file(
            dataset=dataset,
            file_name=filename,
            path=dest_dir,
            quiet=False,
        )
        return None
    # The file is already on disk, so the download is skipped.
    print('Dataset already exists, do not download')
    return None
# The file is read without chunking, so this takes a lot of RAM.
def read_dataset(dest_dir, filename) -> pd.DataFrame:
    """Load a JSON Lines file located at ``dest_dir / filename`` into a DataFrame.

    Args:
        dest_dir: Directory containing the file.
        filename: Name of the JSON Lines file (one JSON object per line).

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    print('Reading dataset...')
    frame = pd.read_json(Path(dest_dir).joinpath(filename), lines=True)
    print('Dataset read')
    return frame
def download_and_read_dataset(dest_dir, dataset, filename):
    """Make sure the dataset file is present in ``dest_dir``, then load it.

    Args:
        dest_dir: Directory used both as download target and read location.
        dataset: Dataset identifier passed on to ``download_dataset``.
        filename: Name of the file to download and read.

    Returns:
        Whatever ``read_dataset`` returns for the downloaded file.
    """
    download_dataset(dest_dir, dataset, filename)
    loaded = read_dataset(dest_dir, filename)
    return loaded
def filter_columns(df: pd.DataFrame, columns) -> pd.DataFrame:
    """Return the selection ``df[columns]``, reporting progress on stdout.

    Args:
        df: Source frame; it is not modified.
        columns: Column selector handed directly to ``df[...]``.

    Returns:
        The result of indexing ``df`` with ``columns``.
    """
    print("Removing unwanted columns...")
    selected = df[columns]
    print("Columns removed...")
    return selected
def create_features_labels(df: pd.DataFrame, old_label, new_label):
    """Split ``df`` into a feature frame and a coarse-grained label series.

    Each value of the ``old_label`` column is treated as a whitespace-separated
    list of categories. Only the first entry is used, and it is cut at its
    first dot, so ``"cs.LG stat.ML"`` becomes ``"cs"``.

    Args:
        df: Source frame; it is not modified.
        old_label: Name of the column holding the raw category strings.
        new_label: Name given to the returned label series.

    Returns:
        A ``(features, labels)`` tuple: ``df`` without the ``old_label``
        column, and the derived labels named ``new_label``.
    """
    def top_level_category(raw):
        # First listed category only; keep the part before its first dot.
        first = raw.split()[0]
        head, _, _ = first.partition(".")
        return head

    labels = df[old_label].apply(top_level_category).rename(new_label)
    features = df.drop(columns=old_label)
    return features, labels
def train_test_split(X, y, test_size=0.25, random_state=None):
    """Split features and labels into stratified train and test subsets.

    Thin wrapper around scikit-learn's ``train_test_split`` that always
    stratifies on ``y``.

    Args:
        X: Features to split.
        y: Labels to split; also used as the stratification target.
        test_size: Forwarded to scikit-learn as ``test_size``.
        random_state: Seed forwarded to scikit-learn so a split can be
            reproduced. The default ``None`` keeps the previous behaviour
            of a different shuffle on every call.

    Returns:
        The value returned by scikit-learn's ``train_test_split`` for
        ``(X, y)``.
    """
    return sk_train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y,
        random_state=random_state,
    )
def write_dataset(dest_dir, X, y, filename):
    """Write features and labels side by side as a JSON Lines file in ``dest_dir``.

    Args:
        dest_dir: Directory the file is written to; created if it is missing.
        X: Feature frame.
        y: Labels, concatenated to ``X`` column-wise.
        filename: Name of the output file inside ``dest_dir``.
    """
    dest_dir = Path(dest_dir)
    dest_dir.mkdir(parents=True, exist_ok=True)
    df = pd.concat((X, y), axis=1)
    # Write inside dest_dir; previously dest_dir was ignored and the file
    # landed at `filename` relative to the current working directory.
    df.to_json(dest_dir / filename, orient="records", lines=True)

requirements.txt CHANGED Viewed

@@ -65,6 +65,8 @@ jupyterlab==4.3.6
 jupyterlab_pygments==0.3.0
 jupyterlab_server==2.27.3
 jupyterlab_widgets==3.0.13
 MarkupSafe==3.0.2
 matplotlib-inline==0.1.7
 mistune==3.1.3
@@ -114,6 +116,7 @@ pydeck==0.9.1
 Pygments==2.19.1
 python-dateutil==2.9.0.post0
 python-json-logger==3.3.0
 pytz==2025.2
 PyYAML==6.0.2
 pyzmq==26.4.0
@@ -136,6 +139,7 @@ streamlit==1.44.1
 sympy==1.13.1
 tenacity==9.1.2
 terminado==0.18.1
 threadpoolctl==3.6.0
 tinycss2==1.4.0
 tokenizers==0.21.1

 jupyterlab_pygments==0.3.0
 jupyterlab_server==2.27.3
 jupyterlab_widgets==3.0.13
+kaggle==1.7.4.2
+kagglehub==0.3.11
 MarkupSafe==3.0.2
 matplotlib-inline==0.1.7
 mistune==3.1.3
 Pygments==2.19.1
 python-dateutil==2.9.0.post0
 python-json-logger==3.3.0
+python-slugify==8.0.4
 pytz==2025.2
 PyYAML==6.0.2
 pyzmq==26.4.0
 sympy==1.13.1
 tenacity==9.1.2
 terminado==0.18.1
+text-unidecode==1.3
 threadpoolctl==3.6.0
 tinycss2==1.4.0
 tokenizers==0.21.1