alexpantex committed on
Commit
ef09277
·
verified ·
1 Parent(s): f4e126c

Upload scripts/preprocess.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/preprocess.py +37 -0
scripts/preprocess.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
# Make the repository root importable so the `config` and `scripts` packages
# resolve when this file is run directly from inside scripts/.
sys.path.append(sys.path[0].replace('scripts', ''))
from urllib.request import urlretrieve  # NOTE(review): unused in this file — confirm before removing
import pandas as pd

from config.data_paths import RAW_DATA_PATH, PROCESSED_DATA_PATH  # NOTE(review): RAW_DATA_PATH unused here
import re

from scripts.utils import load_config

# Source URL for the prompt corpus, read from the project config with a
# DiffusionDB metadata parquet as the fallback default.
PROMPTS_URL = load_config()['data'].get('prompts_corpus_url', 'https://huggingface.co/datasets/poloclub/diffusiondb/resolve/main/metadata.parquet')
13
+
14
def preprocess_text(text: str) -> str:
    """
    Normalize a raw text prompt.

    Collapses every run of whitespace into a single space and removes
    leading/trailing whitespace.

    Args:
        text: Raw text prompt.
    Returns:
        Preprocessed text.
    """
    # Collapse internal whitespace first, then trim the edges — the result
    # is identical to trimming first.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
25
+
26
def clean_corpus():
    """
    Download, clean, and cache the prompt corpus.

    Does nothing if the cleaned parquet already exists under
    PROCESSED_DATA_PATH (to speed up repeated runs). Otherwise draws a
    deterministic 10,000-row sample from the parquet at PROMPTS_URL,
    drops missing prompts, normalizes whitespace via preprocess_text,
    removes duplicates, and writes the result to
    PROCESSED_DATA_PATH/prompt_corpus_clean.parquet.

    Raises:
        ValueError: if the source parquet has no 'prompt' column.
    """
    # Build the output path once instead of twice.
    out_path = os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')
    if os.path.isfile(out_path):  # cached result — skip the expensive download
        return

    # Fixed random_state keeps the sample reproducible across runs.
    df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
    # `assert` is stripped under `python -O`; raise explicitly instead.
    if 'prompt' not in df.columns:
        raise ValueError("Parquet file must contain a 'prompt' column.")
    df = df[df['prompt'].notna()][['prompt']]  # drop missing rows
    df['prompt'] = df['prompt'].apply(preprocess_text)  # preprocess each prompt
    df = df.drop_duplicates()  # drop duplicates

    # Ensure the output directory exists before writing the parquet.
    os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
    df.to_parquet(out_path)