annikwag committed
Commit 5ee7936 · verified · 1 Parent(s): 47177b9

Update app.py

Files changed (1): app.py (+39, -34)
app.py CHANGED
@@ -25,12 +25,12 @@ region_df = load_region_data(region_lookup_path)
 #################### Create the embeddings collection and save ######################
 # the steps below need to be performed only once and then commented out to avoid any unnecessary compute over-run
 ##### First we process and create the chunks for the relevant data source
-chunks = process_giz_worldwide()
+#chunks = process_giz_worldwide()
 ##### Convert to langchain documents
-temp_doc = create_documents(chunks,'chunks')
+#temp_doc = create_documents(chunks,'chunks')
 ##### Embed and store docs; check: if the collection exists then you need to update the collection
 collection_name = "giz_worldwide"
-hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
+#hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
 
 ################### Hybrid Search ######################################################
 client = get_client()
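Note on the hunk above: the one-time ingestion block is toggled by hand (uncomment, run once, comment out again, per the comment at the top of the block). A minimal sketch of an alternative, assuming app.py's existing `process_giz_worldwide`, `create_documents`, and `hybrid_embed_chunks` helpers and a hypothetical `RECREATE_COLLECTION` environment flag:

```python
import os

# Hypothetical guard: run the one-time ingestion only when the flag is set,
# instead of commenting the block in and out between deployments.
if os.getenv("RECREATE_COLLECTION") == "1":
    chunks = process_giz_worldwide()               # process the raw data source into chunks
    temp_doc = create_documents(chunks, 'chunks')  # convert chunks to langchain documents
    hybrid_embed_chunks(
        docs=temp_doc,
        collection_name=collection_name,
        del_if_exists=True,  # drop and rebuild the collection if it already exists
    )
```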
@@ -47,6 +47,7 @@ _, unique_sub_regions = get_regions(region_df)
 def get_country_name_and_region_mapping(_client, collection_name, region_df):
     results = hybrid_search(_client, "", collection_name)
     country_set = set()
+
     for res in results[0] + results[1]:
         countries = res.payload.get('metadata', {}).get('countries', "[]")
         try:
@@ -94,23 +95,23 @@ else:
 with col2:
     country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names)  # Display filtered country names
 
-# Year range slider
-with col3:
-    current_year = datetime.now().year
-    default_start_year = current_year - 5
+# # Year range slider # ToDo add end_year filter again
+# with col3:
+#     current_year = datetime.now().year
+#     default_start_year = current_year - 5
 
-    # 3) The max_value is now the actual max end_year from collection
-    end_year_range = st.slider(
-        "Project End Year",
-        min_value=2010,
-        max_value=max_end_year,
-        value=(default_start_year, max_end_year),
-    )
+#     # 3) The max_value is now the actual max end_year from collection
+#     end_year_range = st.slider(
+#         "Project End Year",
+#         min_value=2010,
+#         max_value=max_end_year,
+#         value=(default_start_year, max_end_year),
+#     )
 
 # Checkbox to control whether to show only exact matches
 show_exact_matches = st.checkbox("Show only exact matches", value=False)
 
-def filter_results(results, country_filter, region_filter, end_year_range):
+def filter_results(results, country_filter, region_filter):  ## , end_year_range ToDo add end_year filter again
     filtered = []
     for r in results:
         metadata = r.payload.get('metadata', {})
@@ -145,7 +146,7 @@ def filter_results(results, country_filter, region_filter, end_year_range):
         if (
             (country_filter == "All/Not allocated" or selected_iso_code in c_list)
             and (region_filter == "All/Not allocated" or countries_in_region)
-            and (end_year_range[0] <= end_year_val <= end_year_range[1])
+            # and (end_year_range[0] <= end_year_val <= end_year_range[1]) # ToDo add end_year filter again
         ):
             filtered.append(r)
     return filtered
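The commented-out condition and the ToDo markers in the hunks above track the temporarily disabled end-year filter. A sketch of a helper that could reinstate it (hypothetical name, assuming `end_year_range` comes back from the slider as a `(start, end)` tuple and `end_year` is stored in metadata as in the surrounding code):

```python
# Hypothetical helper for the ToDo: restore the end-year check.
def in_end_year_range(metadata: dict, end_year_range: tuple) -> bool:
    raw = metadata.get('end_year')
    try:
        end_year_val = int(round(float(raw)))
    except (TypeError, ValueError):
        return False  # treat missing or unparseable years as non-matching
    return end_year_range[0] <= end_year_val <= end_year_range[1]
```

Once the slider is re-enabled, the commented condition becomes `and in_end_year_range(metadata, end_year_range)`.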
@@ -161,20 +162,20 @@ lexical_all = results[1]
 
 # 2) Filter out content < 20 chars (as an intermediate fix for the problem that e.g. super short paragraphs with few chars get a high similarity score)
 semantic_all = [
-    r for r in semantic_all if len(r.payload["page_content"]) >= 20
+    r for r in semantic_all if len(r.payload["page_content"]) >= 5
 ]
 lexical_all = [
-    r for r in lexical_all if len(r.payload["page_content"]) >= 20
+    r for r in lexical_all if len(r.payload["page_content"]) >= 5
 ]
 
 # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
-semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
+semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 
 # 2) Filter the entire sets
-filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
-filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range)
+filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter)  ## , end_year_range ToDo add end_year filter again
+filtered_lexical = filter_results(lexical_all, country_filter, region_filter)  ## , end_year_range ToDo add end_year filter again
 
-filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
+filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)  # ToDo remove duplicates again?
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
 
@@ -197,8 +198,8 @@ if show_exact_matches:
 
     # 3) Now apply your region/country/year filter on that new list
     filtered_lexical = filter_results(
-        lexical_substring_filtered, country_filter, region_filter, end_year_range
-    )
+        lexical_substring_filtered, country_filter, region_filter
+    )  ## , end_year_range ToDo add end_year filter again
 
     # 4) Remove duplicates
     filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
@@ -216,7 +217,7 @@ if show_exact_matches:
         # Snippet logic (80 words)
         full_text = res.payload['page_content']
         words = full_text.split()
-        preview_word_count = 80
+        preview_word_count = 200
         preview_text = " ".join(words[:preview_word_count])
        remainder_text = " ".join(words[preview_word_count:])
         st.write(preview_text + ("..." if remainder_text else ""))
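The hunk above raises the preview from 80 to 200 words but leaves the `(80 words)` comment stale. A hypothetical refactor that names the snippet logic and exposes the remainder on demand (assuming the app's Streamlit import; `render_snippet` is not in the source):

```python
import streamlit as st

def render_snippet(full_text: str, preview_word_count: int = 200) -> None:
    # Split on whitespace once; preview and remainder share the same word list.
    words = full_text.split()
    preview_text = " ".join(words[:preview_word_count])
    remainder_text = " ".join(words[preview_word_count:])
    st.write(preview_text + ("..." if remainder_text else ""))
    if remainder_text:
        # Optional: show the full text instead of silently truncating it.
        with st.expander("Show full text"):
            st.write(full_text)
```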
@@ -232,6 +233,10 @@ if show_exact_matches:
         client_name = metadata.get('client', 'Unknown Client')
         start_year = metadata.get('start_year', None)
         end_year = metadata.get('end_year', None)
+        total_volume = metadata.get('total_volume', "Unknown")
+        total_project = metadata.get('total_project', "Unknown")
+        id = metadata.get('id', "Unknown")
+
 
         try:
             c_list = json.loads(countries.replace("'", '"'))
@@ -255,18 +260,16 @@ if show_exact_matches:
         start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
         end_year_str = f"{int(round(float(end_year)))}" if end_year else "Unknown"
 
-        # Build the final string
         if matched_countries:
-            # We have at least 1 valid country name
             additional_text = (
                 f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
-                f"**{start_year_str}-{end_year_str}**"
+                f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
         else:
-            # No valid countries found
             additional_text = (
-                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
+
 
         st.markdown(additional_text)
         st.divider()
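The new suffix (project ID, budget, volume) is built twice in this hunk and twice more in the list view below, and `id` shadows the Python builtin. A hedged sketch of a shared formatter (hypothetical name; `project_id` renamed to avoid the shadowing):

```python
def format_result_line(matched_countries, client_name, start_year_str,
                       end_year_str, project_id, total_project, total_volume):
    # Suffix shared by both the exact-match and the semantic result views.
    suffix = (
        f"**{start_year_str}-{end_year_str}**, project ID: {project_id}, "
        f"project budget: {total_project}, total volume: {total_volume}"
    )
    if matched_countries:
        return f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, {suffix}"
    return f"Commissioned by **{client_name}**, {suffix}"
```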
@@ -302,6 +305,9 @@ else:
         client_name = metadata.get('client', 'Unknown Client')
         start_year = metadata.get('start_year', None)
         end_year = metadata.get('end_year', None)
+        total_volume = metadata.get('total_volume', "Unknown")
+        total_project = metadata.get('total_project', "Unknown")
+        id = metadata.get('id', "Unknown")
 
         try:
             c_list = json.loads(countries.replace("'", '"'))
@@ -327,16 +333,15 @@ else:
 
         # Build the final string
         if matched_countries:
-            # We have at least 1 valid country name
             additional_text = (
                 f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
-                f"**{start_year_str}-{end_year_str}**"
+                f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
         else:
-            # No valid countries found
             additional_text = (
-                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
+
 
         st.markdown(additional_text)
         st.divider()
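Taken together, the commit loosens three tuning knobs: minimum content length (20 to 5 characters), semantic score threshold (0.4 to 0.0, a no-op for non-negative scores), and the preview cut-off (80 to 200 words). A sketch that lifts them into named constants so the next retune is a one-line edit (constant names are hypothetical):

```python
# Hypothetical tuning constants for the values changed in this commit.
MIN_CONTENT_CHARS = 5            # was 20; drops near-empty paragraphs with inflated scores
SEMANTIC_SCORE_THRESHOLD = 0.0   # was 0.4; 0.0 keeps every non-negative score
PREVIEW_WORD_COUNT = 200         # was 80; words shown before the "..." cut-off

semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= MIN_CONTENT_CHARS]
lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= MIN_CONTENT_CHARS]
semantic_thresholded = [r for r in semantic_all if r.score >= SEMANTIC_SCORE_THRESHOLD]
```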
 