Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Update appStore/prep_data.py
Browse files- appStore/prep_data.py +41 -0
appStore/prep_data.py
CHANGED
@@ -32,12 +32,53 @@ def process_iati():
|
|
32 |
|
33 |
return projects_df
|
34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def process_giz_worldwide():
|
36 |
"""
|
37 |
this will read the giz_worldwide files and create the chunks
|
38 |
"""
|
39 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
|
40 |
giz_df = giz_df.rename(columns={'content':'project_description'})
|
|
|
|
|
41 |
giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
|
42 |
giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
|
43 |
print("initial df length:",len(giz_df))
|
|
|
32 |
|
33 |
return projects_df
|
34 |
|
35 |
+
|
36 |
+
# def process_giz_worldwide():
|
37 |
+
# """
|
38 |
+
# This will read the new giz_worldwide file and create the chunks.
|
39 |
+
# The following adjustments have been made:
|
40 |
+
# - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
|
41 |
+
# - Renames 'name.en' to 'project_name'
|
42 |
+
# - Uses the 'merged_text' column for creating chunks and computing text size
|
43 |
+
# - Creates an empty 'url' column (since the new dataset has an empty URL)
|
44 |
+
# - Renames 'country' to 'countries'
|
45 |
+
# - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
|
46 |
+
# """
|
47 |
+
# # Read the new JSON file
|
48 |
+
# giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
|
49 |
+
|
50 |
+
# # Rename columns per new dataset requirements
|
51 |
+
# giz_df = giz_df.rename(columns={
|
52 |
+
# 'name.en': 'project_name',
|
53 |
+
# 'country': 'countries',
|
54 |
+
# 'duration.project.start': 'start_year',
|
55 |
+
# 'duration.project.end': 'end_year'
|
56 |
+
# })
|
57 |
+
|
58 |
+
# # Create an empty 'url' column as the new dataset has an empty URL
|
59 |
+
# giz_df['url'] = ''
|
60 |
+
|
61 |
+
# # Create text_size based on merged_text and create chunks from merged_text
|
62 |
+
# giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
|
63 |
+
# giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
|
64 |
+
|
65 |
+
# print("initial df length:", len(giz_df))
|
66 |
+
# giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
|
67 |
+
# print("new df length:", len(giz_df))
|
68 |
+
# print(giz_df.columns)
|
69 |
+
|
70 |
+
# giz_df['source'] = 'GIZ_WORLDWIDE'
|
71 |
+
# return giz_df
|
72 |
+
|
73 |
+
|
74 |
def process_giz_worldwide():
|
75 |
"""
|
76 |
this will read the giz_worldwide files and create the chunks
|
77 |
"""
|
78 |
giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
|
79 |
giz_df = giz_df.rename(columns={'content':'project_description'})
|
80 |
+
# Sample 5 random rows for quick embeddings (seed set for reproducibility)
|
81 |
+
giz_df = giz_df.sample(n=5, random_state=42)
|
82 |
giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
|
83 |
giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
|
84 |
print("initial df length:",len(giz_df))
|