alexpantex committed on
Commit
ef09277
·
verified ·
1 Parent(s): f4e126c

Upload scripts/preprocess.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/preprocess.py +37 -0
scripts/preprocess.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
# Make the repository root importable so the `config` and `scripts` packages
# resolve when this file is run directly from inside scripts/.
sys.path.append(sys.path[0].replace('scripts', ''))
from urllib.request import urlretrieve  # NOTE(review): unused in this file — confirm before removing
import pandas as pd

from config.data_paths import RAW_DATA_PATH, PROCESSED_DATA_PATH  # NOTE(review): RAW_DATA_PATH unused here
import re

from scripts.utils import load_config

# Source URL for the prompt corpus, read from the project config with a
# DiffusionDB metadata parquet as the fallback default.
PROMPTS_URL = load_config()['data'].get('prompts_corpus_url', 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet')
13
+
14
def preprocess_text(text: str) -> str:
    """
    Normalize a raw text prompt.

    Collapses every run of whitespace into a single space and removes
    leading/trailing whitespace.

    Args:
        text: Raw text prompt.
    Returns:
        Preprocessed text.
    """
    # Collapse internal whitespace first, then trim the edges — the result
    # is identical to trimming first.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
25
+
26
def clean_corpus():
    """
    Download, clean, and cache the prompt corpus.

    Does nothing if the cleaned parquet already exists under
    PROCESSED_DATA_PATH (to speed up repeated runs). Otherwise draws a
    deterministic 10,000-row sample from the parquet at PROMPTS_URL,
    drops missing prompts, normalizes whitespace via preprocess_text,
    removes duplicates, and writes the result to
    PROCESSED_DATA_PATH/prompt_corpus_clean.parquet.

    Raises:
        ValueError: if the source parquet has no 'prompt' column.
    """
    # Build the output path once instead of twice.
    out_path = os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')
    if os.path.isfile(out_path):  # cached result — skip the expensive download
        return

    # Fixed random_state keeps the sample reproducible across runs.
    df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
    # `assert` is stripped under `python -O`; raise explicitly instead.
    if 'prompt' not in df.columns:
        raise ValueError("Parquet file must contain a 'prompt' column.")
    df = df[df['prompt'].notna()][['prompt']]  # drop missing rows
    df['prompt'] = df['prompt'].apply(preprocess_text)  # preprocess each prompt
    df = df.drop_duplicates()  # drop duplicates

    # Ensure the output directory exists before writing the parquet.
    os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
    df.to_parquet(out_path)