l-tran committed on
Commit
76cab84
·
verified ·
1 Parent(s): 2765093

Change data source

Browse files
Files changed (1) hide show
  1. app.py +37 -27
app.py CHANGED
@@ -24,39 +24,49 @@ def create_chunks(text):
24
  return texts
25
 
26
  def get_chunks():
27
- orgas_df = pd.read_csv("iati_files/project_orgas.csv")
28
- region_df = pd.read_csv("iati_files/project_region.csv")
29
- sector_df = pd.read_csv("iati_files/project_sector.csv")
30
- status_df = pd.read_csv("iati_files/project_status.csv")
31
- texts_df = pd.read_csv("iati_files/project_texts.csv")
32
-
33
- projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
34
- projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
35
- projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
36
- projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
37
- giz_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
38
-
39
- giz_df.drop(columns= ['orga_abbreviation', 'client',
40
- 'orga_full_name', 'country',
41
- 'country_flag', 'crs_5_code', 'crs_3_code',
42
- 'sgd_pred_code'], inplace=True)
43
-
44
- giz_df['text_size'] = giz_df.apply(lambda x: len((x['title_main'] + x['description_main']).split()), axis=1)
45
- giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['title_main'] + x['description_main']),axis=1)
 
 
 
 
46
  giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
47
 
48
 
49
  placeholder= []
50
  for i in range(len(giz_df)):
51
  placeholder.append(Document(page_content= giz_df.loc[i,'chunks'],
52
- metadata={"iati_id": giz_df.loc[i,'iati_id'],
53
- "iati_orga_id":giz_df.loc[i,'iati_orga_id'],
54
- "country_name":str(giz_df.loc[i,'country_name']),
55
- "crs_5_name": giz_df.loc[i,'crs_5_name'],
56
- "crs_3_name": giz_df.loc[i,'crs_3_name'],
57
- "sgd_pred_str":giz_df.loc[i,'sgd_pred_str'],
58
- "status":giz_df.loc[i,'status'],
59
- "title_main":giz_df.loc[i,'title_main'],}))
 
 
 
 
 
 
60
  return placeholder
61
 
62
  def embed_chunks(chunks):
 
24
  return texts
25
 
26
  def get_chunks():
27
+ #orgas_df = pd.read_csv("iati_files/project_orgas.csv")
28
+ #region_df = pd.read_csv("iati_files/project_region.csv")
29
+ #sector_df = pd.read_csv("iati_files/project_sector.csv")
30
+ #status_df = pd.read_csv("iati_files/project_status.csv")
31
+ #texts_df = pd.read_csv("iati_files/project_texts.csv")
32
+
33
+ #projects_df = pd.merge(orgas_df, region_df, on='iati_id', how='inner')
34
+ #projects_df = pd.merge(projects_df, sector_df, on='iati_id', how='inner')
35
+ #projects_df = pd.merge(projects_df, status_df, on='iati_id', how='inner')
36
+ #projects_df = pd.merge(projects_df, texts_df, on='iati_id', how='inner')
37
+ #giz_df = projects_df[projects_df.client.str.contains('bmz')].reset_index(drop=True)
38
+
39
+ #giz_df.drop(columns= ['orga_abbreviation', 'client',
40
+ # 'orga_full_name', 'country',
41
+ # 'country_flag', 'crs_5_code', 'crs_3_code',
42
+ # 'sgd_pred_code'], inplace=True)
43
+
44
+ giz_df = pd.read_json('iati_files/data_giz_website.json')
45
+ giz_df = giz_df.rename(columns={'content':'project_description'})
46
+
47
+
48
+ giz_df['text_size'] = giz_df.apply(lambda x: len((x['project_name'] + x['project_description']).split()), axis=1)
49
+ giz_df['chunks'] = giz_df.apply(lambda x:create_chunks(x['project_name'] + x['project_description']),axis=1)
50
  giz_df = giz_df.explode(column=['chunks'], ignore_index=True)
51
 
52
 
53
  placeholder= []
54
  for i in range(len(giz_df)):
55
  placeholder.append(Document(page_content= giz_df.loc[i,'chunks'],
56
+ metadata={
57
+ "title_main":giz_df.loc[i,'title_main'],
58
+ "country_name":str(giz_df.loc[i,'countries']),
59
+ "client": giz_df_new.loc[i, 'client'],
60
+ "language":giz_df_new.loc[i, 'language'],
61
+ "political_sponsor":giz_df.loc[i, 'poli_trager'],
62
+ "url": giz_df.loc[i, 'url']
63
+ #"iati_id": giz_df.loc[i,'iati_id'],
64
+ #"iati_orga_id":giz_df.loc[i,'iati_orga_id'],
65
+ #"crs_5_name": giz_df.loc[i,'crs_5_name'],
66
+ #"crs_3_name": giz_df.loc[i,'crs_3_name'],
67
+ #"sgd_pred_str":giz_df.loc[i,'sgd_pred_str'],
68
+ #"status":giz_df.loc[i,'status'],
69
+ }))
70
  return placeholder
71
 
72
  def embed_chunks(chunks):