Spaces:
Running
Running
Upload scripts/preprocess.py with huggingface_hub
Browse files- scripts/preprocess.py +37 -0
scripts/preprocess.py
ADDED
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(sys.path[0].replace('scripts', ''))
|
4 |
+
from urllib.request import urlretrieve
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
from config.data_paths import RAW_DATA_PATH, PROCESSED_DATA_PATH
|
8 |
+
import re
|
9 |
+
|
10 |
+
from scripts.utils import load_config
|
11 |
+
|
12 |
+
# URL of the raw prompt corpus. Taken from the 'prompts_corpus_url' key of the
# loaded config's 'data' section; when that key is absent, falls back to the
# DiffusionDB metadata parquet hosted on the Hugging Face Hub.
# NOTE: load_config() is called when this statement executes.
PROMPTS_URL = load_config()['data'].get('prompts_corpus_url', 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet')
|
13 |
+
|
14 |
+
def preprocess_text(text: str) -> str:
    """
    Normalise the whitespace of a raw prompt.

    Leading and trailing whitespace is removed, then every remaining run of
    whitespace characters (spaces, tabs, newlines, ...) is collapsed into a
    single space.

    Args:
        text: Raw text prompt.
    Returns:
        The prompt with trimmed ends and single-space separators.
    """
    whitespace_run = re.compile(r'\s+')
    # Trim first so that no leading/trailing space survives the collapse.
    return whitespace_run.sub(' ', text.strip())
|
25 |
+
|
26 |
+
def clean_corpus():
    """
    Build the cleaned prompt corpus and cache it as a parquet file.

    Reads the parquet at PROMPTS_URL, draws a reproducible sample of at most
    10,000 rows (random_state=123), keeps only the non-missing values of the
    'prompt' column, normalises each prompt with preprocess_text, removes
    duplicate prompts and writes the result to
    'prompt_corpus_clean.parquet' inside PROCESSED_DATA_PATH.

    If that output file already exists the function returns immediately
    without reading the source corpus.

    Raises:
        AssertionError: If the source parquet has no 'prompt' column.
    """
    out_path = os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')
    if os.path.isfile(out_path):  # to speed up the process
        return

    df = pd.read_parquet(PROMPTS_URL)

    # Raised explicitly instead of via `assert` so the check is not stripped
    # under `python -O`; the exception type and message are kept as before.
    if 'prompt' not in df.columns:
        raise AssertionError("Parquet file must contain a 'prompt' column.")

    # Cap the sample size at the corpus size: asking for more rows than exist
    # makes DataFrame.sample raise. For corpora with >= 10,000 rows the
    # selected rows are unchanged.
    df = df.sample(min(10000, len(df)), random_state=123)

    # Single .loc selection plus an explicit copy, so the assignment below
    # operates on an independent frame rather than on a chained-indexing result.
    df = df.loc[df['prompt'].notna(), ['prompt']].copy()  # drop missing rows
    df['prompt'] = df['prompt'].apply(preprocess_text)  # preprocess each prompt
    df = df.drop_duplicates()  # drop duplicates

    # Make sure the target directory exists so the write cannot fail after
    # the download and processing have already been done.
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    df.to_parquet(out_path)
|