Update app.py

app.py CHANGED
@@ -25,12 +25,12 @@ region_df = load_region_data(region_lookup_path)
 #################### Create the embeddings collection and save ######################
 # the steps below need to be performed only once and then commented out to avoid unnecessary compute over-runs
 ##### First we process and create the chunks for the relevant data source
-chunks = process_giz_worldwide()
+#chunks = process_giz_worldwide()
 ##### Convert to langchain documents
-temp_doc = create_documents(chunks,'chunks')
+#temp_doc = create_documents(chunks,'chunks')
 ##### Embed and store docs; if the collection already exists, you need to update it
 collection_name = "giz_worldwide"
-hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
+#hybrid_embed_chunks(docs=temp_doc, collection_name=collection_name, del_if_exists=True)
 
 ################### Hybrid Search ######################################################
 client = get_client()
@@ -47,6 +47,7 @@ _, unique_sub_regions = get_regions(region_df)
 def get_country_name_and_region_mapping(_client, collection_name, region_df):
     results = hybrid_search(_client, "", collection_name)
     country_set = set()
+
     for res in results[0] + results[1]:
         countries = res.payload.get('metadata', {}).get('countries', "[]")
         try:
@@ -94,23 +95,23 @@ else:
 with col2:
     country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names)  # Display filtered country names
 
-# Year range slider
-with col3:
-    current_year = datetime.now().year
-    default_start_year = current_year - 5
+# # Year range slider # ToDo add end_year filter again
+# with col3:
+#     current_year = datetime.now().year
+#     default_start_year = current_year - 5
 
-    # 3) The max_value is now the actual max end_year from the collection
-    end_year_range = st.slider(
-        "Project End Year",
-        min_value=2010,
-        max_value=max_end_year,
-        value=(default_start_year, max_end_year),
-    )
+#     # 3) The max_value is now the actual max end_year from the collection
+#     end_year_range = st.slider(
+#         "Project End Year",
+#         min_value=2010,
+#         max_value=max_end_year,
+#         value=(default_start_year, max_end_year),
+#     )
 
 # Checkbox to control whether to show only exact matches
 show_exact_matches = st.checkbox("Show only exact matches", value=False)
 
-def filter_results(results, country_filter, region_filter, end_year_range):
+def filter_results(results, country_filter, region_filter): ## , end_year_range ToDo add end_year filter again
     filtered = []
     for r in results:
         metadata = r.payload.get('metadata', {})
@@ -145,7 +146,7 @@ def filter_results(results, country_filter, region_filter, end_year_range):
         if (
             (country_filter == "All/Not allocated" or selected_iso_code in c_list)
             and (region_filter == "All/Not allocated" or countries_in_region)
-            and (end_year_range[0] <= end_year_val <= end_year_range[1])
+            # and (end_year_range[0] <= end_year_val <= end_year_range[1]) # ToDo add end_year filter again
         ):
             filtered.append(r)
     return filtered
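
The end-year filter was removed from the signature, leaving ToDo markers at each call site below. A hypothetical alternative that keeps call sites stable: make the range optional and skip the check when it is None.

    def filter_results(results, country_filter, region_filter, end_year_range=None):
        filtered = []
        for r in results:
            metadata = r.payload.get('metadata', {})
            # ... country/region checks unchanged from the hunk above ...
            if end_year_range is not None:
                end_year_val = float(metadata.get('end_year') or 0)
                # Only drop results outside the selected range when a range is given.
                if not (end_year_range[0] <= end_year_val <= end_year_range[1]):
                    continue
            filtered.append(r)
        return filtered
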
@@ -161,20 +162,20 @@ lexical_all = results[1]
 
 # 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
 semantic_all = [
-    r for r in semantic_all if len(r.payload["page_content"]) >= 20
+    r for r in semantic_all if len(r.payload["page_content"]) >= 5
 ]
 lexical_all = [
-    r for r in lexical_all if len(r.payload["page_content"]) >= 20
+    r for r in lexical_all if len(r.payload["page_content"]) >= 5
 ]
 
 # 2) Apply a threshold to SEMANTIC results (score >= 0.4)
-semantic_thresholded = [r for r in semantic_all if r.score >= 0.4]
+semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 
 # 2) Filter the entire sets
-filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range)
-filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range)
+filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter) ## , end_year_range ToDo add end_year filter again
+filtered_lexical = filter_results(lexical_all, country_filter, region_filter) ## , end_year_range ToDo add end_year filter again
 
-filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
+filtered_semantic_no_dupe = remove_duplicates(filtered_semantic) # ToDo remove duplicates again?
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
 
 
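
Both thresholds now disagree with their comments: the text still says 20 chars and 0.4 while the code uses 5 and 0.0, and a 0.0 score threshold keeps every semantic result. A sketch that hoists the values into named constants so number and explanation cannot drift apart:

    MIN_CONTENT_CHARS = 5           # drop near-empty paragraphs that score deceptively high
    SEMANTIC_SCORE_THRESHOLD = 0.0  # 0.0 disables pruning; raise to e.g. 0.4 to re-enable

    semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= MIN_CONTENT_CHARS]
    lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= MIN_CONTENT_CHARS]
    semantic_thresholded = [r for r in semantic_all if r.score >= SEMANTIC_SCORE_THRESHOLD]
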
@@ -197,8 +198,8 @@ if show_exact_matches:
 
     # 3) Now apply your region/country/year filter on that new list
     filtered_lexical = filter_results(
-        lexical_substring_filtered, country_filter, region_filter, end_year_range
-    )
+        lexical_substring_filtered, country_filter, region_filter
+    ) ## , end_year_range ToDo add end_year filter again
 
     # 4) Remove duplicates
     filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
@@ -216,7 +217,7 @@ if show_exact_matches:
         # Snippet logic (80 words)
         full_text = res.payload['page_content']
         words = full_text.split()
-        preview_word_count = 80
+        preview_word_count = 200
         preview_text = " ".join(words[:preview_word_count])
         remainder_text = " ".join(words[preview_word_count:])
         st.write(preview_text + ("..." if remainder_text else ""))
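
Same pattern here: the comment still promises an 80-word snippet while the value is now 200. An equivalent sketch with the length named once:

    PREVIEW_WORDS = 200  # snippet length in words

    words = res.payload['page_content'].split()
    st.write(" ".join(words[:PREVIEW_WORDS]) + ("..." if len(words) > PREVIEW_WORDS else ""))
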
@@ -232,6 +233,10 @@ if show_exact_matches:
         client_name = metadata.get('client', 'Unknown Client')
         start_year = metadata.get('start_year', None)
         end_year = metadata.get('end_year', None)
+        total_volume = metadata.get('total_volume', "Unknown")
+        total_project = metadata.get('total_project', "Unknown")
+        id = metadata.get('id', "Unknown")
+
 
         try:
             c_list = json.loads(countries.replace("'", '"'))
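
Two small hazards in the added lines: id shadows Python's built-in id(), and the quote-swap parse in the surrounding try block breaks on country names containing apostrophes. A hypothetical safer variant using ast.literal_eval:

    import ast

    project_id = metadata.get('id', "Unknown")  # renamed to avoid shadowing the built-in id()
    try:
        # Parses ['Benin', "Côte d'Ivoire"] correctly; the replace("'", '"') trick would not.
        c_list = ast.literal_eval(metadata.get('countries', "[]"))
    except (ValueError, SyntaxError):
        c_list = []
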
@@ -255,18 +260,16 @@ if show_exact_matches:
         start_year_str = f"{int(round(float(start_year)))}" if start_year else "Unknown"
         end_year_str = f"{int(round(float(end_year)))}" if end_year else "Unknown"
 
-        # Build the final string
         if matched_countries:
-            # We have at least 1 valid country name
             additional_text = (
                 f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
-                f"**{start_year_str}-{end_year_str}**"
+                f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
         else:
-            # No valid countries found
             additional_text = (
-                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
+
 
         st.markdown(additional_text)
         st.divider()
@@ -302,6 +305,9 @@ else:
         client_name = metadata.get('client', 'Unknown Client')
         start_year = metadata.get('start_year', None)
         end_year = metadata.get('end_year', None)
+        total_volume = metadata.get('total_volume', "Unknown")
+        total_project = metadata.get('total_project', "Unknown")
+        id = metadata.get('id', "Unknown")
 
         try:
             c_list = json.loads(countries.replace("'", '"'))
@@ -327,16 +333,15 @@ else:
 
         # Build the final string
         if matched_countries:
-            # We have at least 1 valid country name
             additional_text = (
                 f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, "
-                f"**{start_year_str}-{end_year_str}**"
+                f"**{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
         else:
-            # No valid countries found
             additional_text = (
-                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**"
+                f"Commissioned by **{client_name}**, **{start_year_str}-{end_year_str}**, project ID: {id}, project budget: {total_project}, total volume: {total_volume}"
             )
+
 
         st.markdown(additional_text)
         st.divider()
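
This two-branch string builder is now duplicated verbatim in the exact-match loop above and the semantic loop here. A sketch of a shared helper so the next metadata field only needs adding once:

    def format_result_meta(matched_countries, client_name, start_year_str, end_year_str,
                           project_id, total_project, total_volume):
        # Suffix shared by both branches of the original code.
        suffix = (f"**{start_year_str}-{end_year_str}**, project ID: {project_id}, "
                  f"project budget: {total_project}, total volume: {total_volume}")
        if matched_countries:
            return f"**{', '.join(matched_countries)}**, commissioned by **{client_name}**, {suffix}"
        return f"Commissioned by **{client_name}**, {suffix}"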