TEST-GIZ-Project-Search

Running on CPU Upgrade

App Files Files Community

annikwag committed on Mar 3

Commit

367acc4

verified ·

1 Parent(s): 540cd3a

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -67

app.py CHANGED Viewed

@@ -23,20 +23,10 @@ DEDICATED_ENDPOINT = "https://qu2d8m6dmsollhly.us-east-1.aws.endpoints.huggingfa
 WRITE_ACCESS_TOKEN = st.secrets["Llama_3_1"]
 def get_rag_answer(query, top_results):
-    """
-    Constructs a prompt from the query and the page contexts of the top results,
-    truncates the context to avoid exceeding the token limit, then sends it to the
-    dedicated endpoint and returns only the generated answer.
-    """
-    # Combine the context from the top results (adjust the separator as needed)
     context = "\n\n".join([res.payload["page_content"] for res in top_results])
-    # Truncate the context to a maximum number of characters (e.g., 12000 characters)
     max_context_chars = 15000
     if len(context) > max_context_chars:
         context = context[:max_context_chars]
-    # Build the prompt, instructing the model to only output the final answer.
     prompt = (
         "Using the following context, answer the question concisely. "
         "Only output the final answer below, without repeating the context or question.\n\n"
@@ -44,37 +34,29 @@ def get_rag_answer(query, top_results):
         f"Question: {query}\n\n"
         "Answer:"
     )
     headers = {"Authorization": f"Bearer {WRITE_ACCESS_TOKEN}"}
     payload = {
         "inputs": prompt,
-        "parameters": {
-            "max_new_tokens": 150  # Adjust max tokens as needed
-        }
     }
     response = requests.post(DEDICATED_ENDPOINT, headers=headers, json=payload)
     if response.status_code == 200:
         result = response.json()
         answer = result[0]["generated_text"]
-        # If the model returns the full prompt, split and extract only the portion after "Answer:"
         if "Answer:" in answer:
             answer = answer.split("Answer:")[-1].strip()
         return answer
     else:
         return f"Error in generating answer: {response.text}"
-#######
-# Helper function: Format project id (e.g., "201940485" -> "2019.4048.5")
 def format_project_id(pid):
     s = str(pid)
     if len(s) > 5:
         return s[:4] + "." + s[4:-1] + "." + s[-1]
     return s
-# Helper function: Compute title from metadata using name.en (or name.de if empty)
 def compute_title(metadata):
     name_en = metadata.get("name.en", "").strip()
     name_de = metadata.get("name.de", "").strip()
@@ -84,7 +66,7 @@ def compute_title(metadata):
         return f"{base} [{format_project_id(pid)}]"
     return base or "No Title"
-# Helper function: Get CRS filter options from all documents in the collection
 @st.cache_data
 def get_crs_options(_client, collection_name):
     results = hybrid_search(_client, "", collection_name)
@@ -99,8 +81,7 @@ def get_crs_options(_client, collection_name):
             crs_set.add(crs_combined)
     return sorted(crs_set)
-# Update filter_results to also filter by crs_combined.
 def filter_results(results, country_filter, region_filter, end_year_range, crs_filter):
     filtered = []
     for r in results:
@@ -128,30 +109,32 @@ def filter_results(results, country_filter, region_filter, end_year_range, crs_f
         else:
             countries_in_region = c_list
-        # Filter by CRS: compute crs_combined and compare to the selected filter.
         crs_key = metadata.get("crs_key", "").strip()
         crs_value = metadata.get("crs_value", "").strip()
         crs_combined = f"{crs_key}: {crs_value}" if (crs_key or crs_value) else ""
-        if crs_filter != "All/Not allocated" and crs_filter != crs_combined:
-            continue
-        if ((country_filter == "All/Not allocated" or selected_iso_code in c_list)
             and (region_filter == "All/Not allocated" or countries_in_region)
-            and (end_year_range[0] <= end_year_val <= end_year_range[1])):
             filtered.append(r)
     return filtered
-#######
-# get the device to be used eithe gpu or cpu
 device = 'cuda' if cuda.is_available() else 'cpu'
-st.set_page_config(page_title="SEARCH IATI",layout='wide')
 st.title("GIZ Project Database (PROTOTYPE)")
 var = st.text_input("Enter Search Question")
 # Load the region lookup CSV
 region_lookup_path = "docStore/regions_lookup.csv"
 region_df = load_region_data(region_lookup_path)
@@ -196,14 +179,19 @@ def get_country_name_and_region_mapping(_client, collection_name, region_df):
 client = get_client()
 country_name_mapping, iso_code_to_sub_region = get_country_name_and_region_mapping(client, collection_name, region_df)
-unique_country_names = sorted(country_name_mapping.keys())  # List of country names
-# Layout filters in columns: add a new filter for CRS in col4.
 col1, col2, col3, col4 = st.columns([1, 1, 1, 4])
 with col1:
     region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
 with col2:
-    country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names if (filtered_country_names := unique_country_names) else unique_country_names)
 with col3:
     current_year = datetime.now().year
     default_start_year = current_year - 4
@@ -212,46 +200,32 @@ with col4:
     crs_options = ["All/Not allocated"] + get_crs_options(client, collection_name)
     crs_filter = st.selectbox("CRS", crs_options)
-# Checkbox to control whether to show only exact matches
 show_exact_matches = st.checkbox("Show only exact matches", value=False)
-# Run the search
-# 1) Adjust limit so we get more than 15 results
-results = hybrid_search(client, var, collection_name, limit=500)  # e.g., 100 or 200
-# results is a tuple: (semantic_results, lexical_results)
 semantic_all = results[0]
 lexical_all = results[1]
-# 2) Filter out content < 20 chars (as intermediate fix to problem that e.g. super short paragraphs with few chars get high similarity score)
-semantic_all = [
-    r for r in semantic_all if len(r.payload["page_content"]) >= 5
-]
-lexical_all = [
-    r for r in lexical_all if len(r.payload["page_content"]) >= 5
-]
-# 2) Apply a threshold to SEMANTIC results (score >= 0.4)
 semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter)
 filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter)
-filtered_semantic_no_dupe = remove_duplicates(filtered_semantic) # ToDo remove duplicates again?
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
-# Define a helper function to format currency values
 def format_currency(value):
     try:
-        # Convert to float then int for formatting (assumes whole numbers)
         return f"€{int(float(value)):,}"
     except (ValueError, TypeError):
         return value
-# Helper function to highlight query matches (case-insensitive)
 def highlight_query(text, query):
     pattern = re.compile(re.escape(query), re.IGNORECASE)
     return pattern.sub(lambda m: f"**{m.group(0)}**", text)
@@ -275,15 +249,12 @@ if show_exact_matches:
         st.divider()
         for res in top_results:
             metadata = res.payload.get('metadata', {})
-            # Compute new title if not already set
             if "title" not in metadata:
                 metadata["title"] = compute_title(metadata)
-            # Use new title instead of project_name and highlight query if present
             display_title = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
             proj_id = metadata.get('id', 'Unknown')
             st.markdown(f"#### {display_title} [{proj_id}]")
-            # Build snippet with potential highlighting
             objectives = metadata.get("objectives", "")
             desc_de = metadata.get("description.de", "")
             desc_en = metadata.get("description.en", "")
@@ -299,13 +270,11 @@ if show_exact_matches:
                 with st.expander("Show more"):
                     st.write(remainder_text)
-            # Keywords
             full_text = res.payload['page_content']
             top_keywords = extract_top_keywords(full_text, top_n=5)
             if top_keywords:
                 st.markdown(f"_{' · '.join(top_keywords)}_")
-            # Country info
             try:
                 c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
             except json.JSONDecodeError:
@@ -318,7 +287,6 @@ if show_exact_matches:
                         matched_countries.append(resolved_name)
             additional_text = f"Country: **{', '.join(matched_countries) if matched_countries else 'Unknown'}**"
-            # Add contact info if available and not [email protected]
             contact = metadata.get("contact", "").strip()
             if contact and contact.lower() != "[email protected]":
                 additional_text += f" | Contact: **{contact}**"
@@ -380,7 +348,6 @@ else:
                 additional_text += f" | Contact: **{contact}**"
             st.markdown(additional_text)
             st.divider()
     #  for i in results:
     #      st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
     #      st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")

 WRITE_ACCESS_TOKEN = st.secrets["Llama_3_1"]
 def get_rag_answer(query, top_results):
     # Send a question-answering prompt for `query` to DEDICATED_ENDPOINT and
     # return the generated answer text; on a non-200 response, return an
     # error string that includes the response body instead.
     # Join the page_content of every result into one context string.
     context = "\n\n".join([res.payload["page_content"] for res in top_results])
     # Cap the context length with a plain character slice.
     max_context_chars = 15000
     if len(context) > max_context_chars:
         context = context[:max_context_chars]
     # NOTE(review): `context` is not interpolated into the prompt in the lines
     # visible here. This view omits lines between diff hunks, so the context
     # part of the prompt is presumably elided -- confirm against app.py.
     prompt = (
         "Using the following context, answer the question concisely. "
         "Only output the final answer below, without repeating the context or question.\n\n"
         f"Question: {query}\n\n"
         "Answer:"
     )
     headers = {"Authorization": f"Bearer {WRITE_ACCESS_TOKEN}"}
     payload = {
         "inputs": prompt,
+        "parameters": {"max_new_tokens": 150}
     }
     response = requests.post(DEDICATED_ENDPOINT, headers=headers, json=payload)
     if response.status_code == 200:
         # The JSON body is indexed as a list whose first item has "generated_text".
         result = response.json()
         answer = result[0]["generated_text"]
         # If the output contains "Answer:", keep only what follows its last occurrence.
         if "Answer:" in answer:
             answer = answer.split("Answer:")[-1].strip()
         return answer
     else:
         return f"Error in generating answer: {response.text}"
+# Helper: Format project id (e.g., "201940485" -> "2019.4048.5")
 def format_project_id(pid):
     """Return *pid* as text, dotted when it is longer than five characters.

     Long ids are split into the first four characters, the middle run and
     the final character, joined with dots (``"201940485"`` becomes
     ``"2019.4048.5"``). Ids of five characters or fewer are returned as
     their plain string form.
     """
     text = str(pid)
     if len(text) <= 5:
         return text
     head, middle, tail = text[:4], text[4:-1], text[-1]
     return f"{head}.{middle}.{tail}"
+# Helper: Compute title from metadata using name.en (or name.de if empty)
 def compute_title(metadata):
     name_en = metadata.get("name.en", "").strip()
     name_de = metadata.get("name.de", "").strip()
         return f"{base} [{format_project_id(pid)}]"
     return base or "No Title"
+# Helper: Get CRS filter options from all documents
 @st.cache_data
 def get_crs_options(_client, collection_name):
     results = hybrid_search(_client, "", collection_name)
             crs_set.add(crs_combined)
     return sorted(crs_set)
+# Revised filter_results: Allow missing end_year or CRS; enforce CRS only when present.
 def filter_results(results, country_filter, region_filter, end_year_range, crs_filter):
     filtered = []
     for r in results:
         else:
             countries_in_region = c_list
         crs_key = metadata.get("crs_key", "").strip()
         crs_value = metadata.get("crs_value", "").strip()
         crs_combined = f"{crs_key}: {crs_value}" if (crs_key or crs_value) else ""
+        # Only enforce CRS filter if result has a CRS value.
+        if crs_filter != "All/Not allocated" and crs_combined:
+            if crs_filter != crs_combined:
+                continue
+        # Allow projects with no valid end_year to pass (if end_year_val is 0)
+        year_ok = True if end_year_val == 0 else (end_year_range[0] <= end_year_val <= end_year_range[1])
+        if ((country_filter == "All/Not allocated" or (selected_iso_code and selected_iso_code in c_list))
             and (region_filter == "All/Not allocated" or countries_in_region)
+            and year_ok):
             filtered.append(r)
     return filtered
+# Get the device to be used (GPU or CPU)
 device = 'cuda' if cuda.is_available() else 'cpu'
+st.set_page_config(page_title="SEARCH IATI", layout='wide')
 st.title("GIZ Project Database (PROTOTYPE)")
 var = st.text_input("Enter Search Question")
 # Load the region lookup CSV
 region_lookup_path = "docStore/regions_lookup.csv"
 region_df = load_region_data(region_lookup_path)
 client = get_client()
 country_name_mapping, iso_code_to_sub_region = get_country_name_and_region_mapping(client, collection_name, region_df)
+unique_country_names = sorted(country_name_mapping.keys())
+# Layout filters in columns
 col1, col2, col3, col4 = st.columns([1, 1, 1, 4])
 with col1:
     region_filter = st.selectbox("Region", ["All/Not allocated"] + sorted(unique_sub_regions))
+# Compute filtered_country_names based on region_filter:
+if region_filter == "All/Not allocated":
+    filtered_country_names = unique_country_names
+else:
+    filtered_country_names = [name for name, code in country_name_mapping.items() if iso_code_to_sub_region.get(code) == region_filter]
 with col2:
+    country_filter = st.selectbox("Country", ["All/Not allocated"] + filtered_country_names)
 with col3:
     current_year = datetime.now().year
     default_start_year = current_year - 4
     crs_options = ["All/Not allocated"] + get_crs_options(client, collection_name)
     crs_filter = st.selectbox("CRS", crs_options)
+# Checkbox for exact matches
 show_exact_matches = st.checkbox("Show only exact matches", value=False)
+# Run the search
+results = hybrid_search(client, var, collection_name, limit=500)
 semantic_all = results[0]
 lexical_all = results[1]
+semantic_all = [r for r in semantic_all if len(r.payload["page_content"]) >= 5]
+lexical_all = [r for r in lexical_all if len(r.payload["page_content"]) >= 5]
 semantic_thresholded = [r for r in semantic_all if r.score >= 0.0]
 filtered_semantic = filter_results(semantic_thresholded, country_filter, region_filter, end_year_range, crs_filter)
 filtered_lexical = filter_results(lexical_all, country_filter, region_filter, end_year_range, crs_filter)
+filtered_semantic_no_dupe = remove_duplicates(filtered_semantic)
 filtered_lexical_no_dupe = remove_duplicates(filtered_lexical)
 def format_currency(value):
     """Format *value* as a whole-euro amount with thousands separators.

     The value is converted with ``float`` and truncated with ``int``, so
     ``"1234567.89"`` becomes ``"€1,234,567"``.

     Returns:
         The formatted string, or *value* itself unchanged when it cannot be
         converted to a finite number (non-numeric text, ``None``, NaN or
         infinity).
     """
     try:
         return f"€{int(float(value)):,}"
     except (ValueError, TypeError, OverflowError):
         # ValueError: non-numeric text or NaN; TypeError: None and other
         # non-numeric types; OverflowError: int() of an infinite float.
         return value
+# Helper to highlight query matches (case-insensitive)
 def highlight_query(text, query):
     """Wrap every case-insensitive occurrence of *query* in *text* with ``**``.

     The query is matched literally (regex metacharacters are escaped) and
     each match keeps its original casing.

     Returns:
         The text with matches wrapped in ``**``; *text* unchanged when
         *query* is empty.
     """
     # An empty pattern matches at every position and would insert "****"
     # between all characters, so there is nothing to highlight.
     if not query:
         return text
     pattern = re.compile(re.escape(query), re.IGNORECASE)
     return pattern.sub(lambda m: f"**{m.group(0)}**", text)
         st.divider()
         for res in top_results:
             metadata = res.payload.get('metadata', {})
             if "title" not in metadata:
                 metadata["title"] = compute_title(metadata)
             display_title = highlight_query(metadata["title"], var) if var.strip() else metadata["title"]
             proj_id = metadata.get('id', 'Unknown')
             st.markdown(f"#### {display_title} [{proj_id}]")
             objectives = metadata.get("objectives", "")
             desc_de = metadata.get("description.de", "")
             desc_en = metadata.get("description.en", "")
                 with st.expander("Show more"):
                     st.write(remainder_text)
             full_text = res.payload['page_content']
             top_keywords = extract_top_keywords(full_text, top_n=5)
             if top_keywords:
                 st.markdown(f"_{' · '.join(top_keywords)}_")
             try:
                 c_list = json.loads(metadata.get('countries', "[]").replace("'", '"'))
             except json.JSONDecodeError:
                         matched_countries.append(resolved_name)
             additional_text = f"Country: **{', '.join(matched_countries) if matched_countries else 'Unknown'}**"
             contact = metadata.get("contact", "").strip()
             if contact and contact.lower() != "[email protected]":
                 additional_text += f" | Contact: **{contact}**"
                 additional_text += f" | Contact: **{contact}**"
             st.markdown(additional_text)
             st.divider()
     #  for i in results:
     #      st.subheader(str(i.metadata['id'])+":"+str(i.metadata['title_main']))
     #      st.caption(f"Status:{str(i.metadata['status'])}, Country:{str(i.metadata['country_name'])}")