annikwag committed on
Commit
8e1fb27
·
verified ·
1 Parent(s): bb1bd7a

Update appStore/prep_data.py

Browse files
Files changed (1) hide show
  1. appStore/prep_data.py +41 -0
appStore/prep_data.py CHANGED
@@ -32,12 +32,53 @@ def process_iati():
32
 
33
  return projects_df
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  def process_giz_worldwide():
36
  """
37
  this will read the giz_worldwide files and create the chunks
38
  """
39
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
40
  giz_df = giz_df.rename(columns={'content':'project_description'})
 
 
41
  giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
42
  giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
43
  print("initial df length:",len(giz_df))
 
32
 
33
  return projects_df
34
 
35
+
36
+ # def process_giz_worldwide():
37
+ # """
38
+ # This will read the new giz_worldwide file and create the chunks.
39
+ # The following adjustments have been made:
40
+ # - Reads the file 'giz_worldwide_api_download_23_02_2025.json'
41
+ # - Renames 'name.en' to 'project_name'
42
+ # - Uses the 'merged_text' column for creating chunks and computing text size
43
+ # - Creates an empty 'url' column (since the new dataset has an empty URL)
44
+ # - Renames 'country' to 'countries'
45
+ # - Renames 'duration.project.start' to 'start_year' and 'duration.project.end' to 'end_year'
46
+ # """
47
+ # # Read the new JSON file
48
+ # giz_df = pd.read_json(f'{path_to_data}giz_worldwide/giz_worldwide_api_download_23_02_2025.json')
49
+
50
+ # # Rename columns per new dataset requirements
51
+ # giz_df = giz_df.rename(columns={
52
+ # 'name.en': 'project_name',
53
+ # 'country': 'countries',
54
+ # 'duration.project.start': 'start_year',
55
+ # 'duration.project.end': 'end_year'
56
+ # })
57
+
58
+ # # Create an empty 'url' column as the new dataset has an empty URL
59
+ # giz_df['url'] = ''
60
+
61
+ # # Create text_size based on merged_text and create chunks from merged_text
62
+ # giz_df['text_size'] = giz_df['merged_text'].apply(lambda text: len(text.split()) if isinstance(text, str) else 0)
63
+ # giz_df['chunks'] = giz_df['merged_text'].apply(lambda text: create_chunks(text) if isinstance(text, str) else [])
64
+
65
+ # print("initial df length:", len(giz_df))
66
+ # giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
67
+ # print("new df length:", len(giz_df))
68
+ # print(giz_df.columns)
69
+
70
+ # giz_df['source'] = 'GIZ_WORLDWIDE'
71
+ # return giz_df
72
+
73
+
74
  def process_giz_worldwide():
75
  """
76
  this will read the giz_worldwide files and create the chunks
77
  """
78
  giz_df = pd.read_json(f'{path_to_data}giz_worldwide/data_giz_website.json')
79
  giz_df = giz_df.rename(columns={'content':'project_description'})
80
+ # Sample 5 random rows for quick embeddings (seed set for reproducibility)
81
+ giz_df = giz_df.sample(n=5, random_state=42)
82
  giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
83
  giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
84
  print("initial df length:",len(giz_df))