Spaces:
Sleeping
Sleeping
Create filter_utils.py
Browse files- appStore/filter_utils.py +116 -0
appStore/filter_utils.py
ADDED
@@ -0,0 +1,116 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import json
|
3 |
+
import streamlit as st
|
4 |
+
from appStore.crs_utils import lookup_crs_value
|
5 |
+
from appStore.search import hybrid_search
|
6 |
+
from appStore.prep_data import remove_duplicates, extract_year
|
7 |
+
|
8 |
+
def parse_budget(value):
    """
    Safely parse a budget value into a float.

    Args:
        value: Anything ``float()`` accepts, e.g. a numeric string or a number.

    Returns:
        float: The parsed value, or 0.0 when ``value`` cannot be converted
        (``None``, an empty or non-numeric string, an unsupported type, or an
        integer too large to be represented as a float).
    """
    try:
        return float(value)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to 0.0. A bare ``except`` would
        # also swallow KeyboardInterrupt and SystemExit.
        return 0.0
|
16 |
+
|
17 |
+
def _end_year_or_zero(year_str):
    """Return the end year as an int, or 0 when it is missing or unparseable."""
    if not year_str:
        return 0
    extracted = extract_year(year_str)
    if extracted == "Unknown":
        return 0
    try:
        return int(extracted)
    except (TypeError, ValueError):
        return 0


def _parse_country_field(country):
    """Turn the raw ``country`` metadata value into a list of raw country codes."""
    if country is None:
        return []
    if isinstance(country, (list, tuple)):
        return list(country)
    text = str(country).strip()
    if not text.startswith("["):
        # A single code stored as a plain string.
        return [text]
    try:
        # The list may be stored with single quotes (Python repr style), which
        # is not valid JSON, so the quotes are normalised before parsing.
        return json.loads(text.replace("'", '"'))
    except json.JSONDecodeError:
        return []


def _crs_label(metadata):
    """Build the "<code>: <description>" label of a result, or "" without a CRS key."""
    raw_key = metadata.get("crs_key")
    if raw_key is None:
        return ""
    # str() first: the key may be stored as a number (e.g. 151.0), in which
    # case calling .strip() on it directly would raise AttributeError.
    key = re.sub(r'\.0$', '', str(raw_key).strip())
    if not key:
        return ""
    description = re.sub(r'\.0$', '', str(lookup_crs_value(key)).strip())
    return f"{key}: {description}"


def filter_results(results, country_filter, region_filter, end_year_range, crs_filter, budget_filter,
                   region_df, iso_code_to_sub_region, clean_country_code_fn, get_country_name_fn):
    """
    Filter search results by country, region, end_year range, CRS code, and budget threshold.

    A result is kept only if it passes every active filter. The string
    "All/Not allocated" disables the country, region or CRS filter it is
    passed for.

    Args:
        results (list): Results from the vector DB; each item must expose a
            ``payload`` dict that may contain a 'metadata' dict.
        country_filter (str): Selected country name to filter by (or "All/Not allocated").
        region_filter (str): Selected region name to filter by (or "All/Not allocated").
        end_year_range (tuple): (min_end_year, max_end_year) from a Streamlit slider.
            Results whose end year is missing or unparseable are not excluded by it.
        crs_filter (str): The chosen "<code>: <description>" CRS label from the
            dropdown (or "All/Not allocated"). Results without a CRS key are
            not excluded by it.
        budget_filter (float): Minimum budget in million EUR.
        region_df (pd.DataFrame): Region lookup DataFrame, passed through to
            ``get_country_name_fn``.
        iso_code_to_sub_region (dict): Mapping from alpha-2 country code to sub-region name.
        clean_country_code_fn (callable): Function that normalizes country code strings.
        get_country_name_fn (callable): Function that returns the country name
            for ``(alpha-2 code, region_df)``.

    Returns:
        list: Filtered list of results that match all criteria, in input order.
    """
    no_filter = "All/Not allocated"
    filter_by_country = country_filter != no_filter
    filter_by_region = region_filter != no_filter
    filter_by_crs = crs_filter != no_filter
    min_budget = budget_filter * 1e6  # slider value is in million EUR

    filtered = []
    for r in results:
        metadata = r.payload.get('metadata', {})

        # CRS filtering: the lookup is only needed when a CRS is selected.
        if filter_by_crs:
            crs_combined = _crs_label(metadata)
            if crs_combined and crs_combined != crs_filter:
                continue

        # Budget filtering
        if parse_budget(metadata.get('total_project', "0")) < min_budget:
            continue

        # End-year filtering: 0 means "unknown", which always passes.
        end_year_val = _end_year_or_zero(metadata.get('end_year'))
        if end_year_val != 0 and not (end_year_range[0] <= end_year_val <= end_year_range[1]):
            continue

        if filter_by_country or filter_by_region:
            # Keep only values that normalise to a two-letter code.
            cleaned = (clean_country_code_fn(code)
                       for code in _parse_country_field(metadata.get('country', "[]")))
            c_list = [code for code in cleaned if len(code) == 2]

            if filter_by_country and not any(
                get_country_name_fn(code, region_df) == country_filter for code in c_list
            ):
                continue

            # Check if any of the codes match the chosen region
            if filter_by_region and not any(
                iso_code_to_sub_region.get(code, "Not allocated") == region_filter
                for code in c_list
            ):
                continue

        filtered.append(r)

    return filtered
|
97 |
+
|
98 |
+
@st.cache_data(show_spinner=False)
def get_crs_options(_client, collection_name):
    """
    Collect the distinct "<CRS code>: <description>" labels found in a collection.

    Runs ``hybrid_search`` with an empty query, concatenates the first two
    elements of what it returns, and builds one label per result that has a
    non-empty ``crs_key`` in its metadata. The description comes from
    ``lookup_crs_value``; a trailing ".0" is removed from both parts.
    Used to populate the 'CRS' dropdown in the app.

    Args:
        _client: Client object handed to ``hybrid_search``. The leading
            underscore tells ``st.cache_data`` not to hash this argument.
        collection_name: Name of the collection to search.

    Returns:
        list: The unique labels, sorted.
    """
    search_output = hybrid_search(_client, "", collection_name)
    hits = search_output[0] + search_output[1]

    labels = set()
    for hit in hits:
        raw_key = hit.payload.get('metadata', {}).get("crs_key", "")
        code = re.sub(r'\.0$', '', str(raw_key).strip())
        if not code:
            # No CRS key on this result: nothing to offer in the dropdown.
            continue
        description = re.sub(r'\.0$', '', str(lookup_crs_value(code)).strip())
        labels.add(f"{code}: {description}")

    return sorted(labels)
|