cjber committed on
Commit
963aee4
·
1 Parent(s): 613cc82

update to auto select policies

planning_ai/chains/fix_chain.py CHANGED
@@ -1,13 +1,13 @@
 from langchain_core.prompts import ChatPromptTemplate
 
-from planning_ai.chains.map_chain import SLLM
 from planning_ai.common.utils import Paths
 
-with open(Paths.PROMPTS / "fix_hallucination.txt", "r") as f:
-    map_template = f.read()
 
-map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
-fix_chain = map_prompt | SLLM
 
 if __name__ == "__main__":
     test_document = """
@@ -16,7 +16,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
-
     result = fix_chain.invoke(
         {
             "summary": "This plan is great because they are building a nuclear power plant.",
@@ -24,4 +25,4 @@ if __name__ == "__main__":
             "context": test_document,
         }
     )
-    print(result)
 from langchain_core.prompts import ChatPromptTemplate
 
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.common.utils import Paths
 
+with open(Paths.PROMPTS / "themes.txt", "r") as f:
+    themes_txt = f.read()
 
+with open(Paths.PROMPTS / "fix_hallucination.txt", "r") as f:
+    fix_template = f"{themes_txt}\n\n {f.read()}"
 
 if __name__ == "__main__":
     test_document = """
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
+    test_themes = {"Great Places", "Homes", "Climate Change"}
+    fix_chain = create_dynamic_map_chain(test_themes, fix_template)
     result = fix_chain.invoke(
         {
             "summary": "This plan is great because they are building a nuclear power plant.",
             "context": test_document,
         }
     )
+    __import__("pprint").pprint(dict(result))
planning_ai/chains/map_chain.py CHANGED
@@ -1,96 +1,85 @@
-from enum import Enum
 
-from langchain.output_parsers import RetryOutputParser
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnableLambda
-from pydantic import BaseModel, Field
 
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
-from planning_ai.themes import PolicySelection, Theme
 
-with open(Paths.PROMPTS / "themes.txt", "r") as f:
-    themes_txt = f.read()
 
 with open(Paths.PROMPTS / "map.txt", "r") as f:
-    map_template = f"{themes_txt}\n\n {f.read()}"
 
 
-class Sentiment(Enum):
-    POSITIVE = "positive"
-    NEGATIVE = "negative"
-    NEUTRAL = "neutral"
 
 
-class Place(BaseModel):
-    """Represents a geographical location mentioned in the response with associated sentiment."""
 
-    place: str = Field(
-        ...,
-        description=(
-            "The name of the geographical location mentioned in the response. "
-            "This can be a city, town, region, or any identifiable place."
-        ),
-    )
-    sentiment: Sentiment = Field(
-        ...,
-        description=(
-            "The sentiment associated with the mentioned place, categorized as 'positive', 'negative', or 'neutral'. "
-            "Assess sentiment based on the context in which the place is mentioned, considering both positive and negative connotations."
-        ),
-    )
 
 
-class BriefSummary(BaseModel):
-    """A summary of the response with generated metadata"""
 
-    summary: str = Field(
-        ...,
-        description=(
-            "A concise summary of the response, capturing the main points and overall sentiment. "
-            "The summary should reflect the key arguments and conclusions presented in the response."
-        ),
-    )
-    themes: list[Theme] = Field(
-        ...,
-        description=(
-            "A list of themes associated with the response. Themes are overarching topics or "
-            "categories that the response addresses, such as 'Climate change' or 'Infrastructure'. "
-            "Identify themes based on the content and context of the response."
-        ),
-    )
-    policies: list[PolicySelection] = Field(
-        ...,
-        description=(
-            "A list of policies associated with the response, each accompanied by directly related "
-            "information as bullet points. Bullet points should provide specific details or examples "
-            "that illustrate how the policy is relevant to the response."
-        ),
-    )
-    places: list[Place] = Field(
-        ...,
-        description=(
-            "All places mentioned in the response, with the sentiment categorized as 'positive', 'negative', or 'neutral'. "
-            "A place can be a city, region, or any geographical location. Assess sentiment based on the context "
-            "in which the place is mentioned, considering both positive and negative connotations."
-        ),
     )
-    is_constructive: bool = Field(
-        ...,
-        description=(
-            "A flag indicating whether the response is constructive. A response is considered constructive if it "
-            "provides actionable suggestions or feedback, addresses specific themes or policies, and is presented "
-            "in a coherent and logical manner."
-        ),
     )
 
 
-SLLM = LLM.with_structured_output(BriefSummary, strict=False)
 
-# TODO: Split out the policy stuff from this class. Find policies later based on
-# what themes are already identified (should improve accuracy)
-map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
-map_chain = map_prompt | SLLM
 
 
 if __name__ == "__main__":
@@ -100,6 +89,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
 
-    result = map_chain.invoke({"context": test_document})
     __import__("pprint").pprint(dict(result))
+from enum import Enum, auto
+from typing import Optional, Set, Type
 
+from langchain.schema import BaseCache
 from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, create_model
 
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
+from planning_ai.themes import THEMES_AND_POLICIES
 
+# with open(Paths.PROMPTS / "themes.txt", "r") as f:
+#     themes_txt = f.read()
 
 with open(Paths.PROMPTS / "map.txt", "r") as f:
+    # map_template = f"{themes_txt}\n\n {f.read()}"
+    map_template = f.read()
 
 
+def create_policy_enum(
+    policy_groups: Set[str], name: str = "DynamicPolicyEnum"
+) -> Enum:
+    """
+    Create a dynamic enum for policies based on the given policy groups.
+
+    Args:
+        policy_groups (Set[str]): A set of policy group names.
+        name (str): Name of the enum to be created.
+
+    Returns:
+        Type[Enum]: A dynamically created Enum class for the policies.
+    """
+    return Enum(name, {policy: auto() for policy in policy_groups})
 
 
+def create_brief_summary_model(policy_enum: Enum) -> Type[BaseModel]:
+    """
+    Dynamically create a BriefSummary model using the provided policy enum.
+
+    Args:
+        policy_enum (Type[Enum]): The dynamically created policy enum.
+
+    Returns:
+        Type[BaseModel]: A dynamically generated Pydantic model for BriefSummary.
+    """
+
+    DynamicPolicy = create_model(
+        "DynamicPolicy",
+        policy=(policy_enum, ...),
+        note=(str, ...),
+        __config__={"extra": "forbid"},
     )
+
+    return create_model(
+        "DynamicBriefSummary",
+        summary=(str, ...),
+        policies=(Optional[list[DynamicPolicy]], ...),
+        __module__=__name__,
+        __config__={"extra": "forbid"},
     )
 
 
+def create_dynamic_map_chain(themes, prompt: str):
+    policy_groups = set()
+    for theme in themes:
+        if theme in THEMES_AND_POLICIES:
+            policy_groups.update(THEMES_AND_POLICIES[theme])
+
+    PolicyEnum = create_policy_enum(policy_groups)
+    DynamicBriefSummary = create_brief_summary_model(PolicyEnum)
+
+    SLLM = LLM.with_structured_output(DynamicBriefSummary, strict=True)
+
+    prompt = (
+        f"{prompt}\n\nAvailable Policies:\n\n- "
+        + "\n- ".join(policy_groups)
+        + "\n\nContext:\n\n{context}"
+    )
+    map_prompt = ChatPromptTemplate.from_messages([("system", prompt)])
+    map_chain = map_prompt | SLLM
 
+    return map_chain
 
 
 if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
+    test_themes = {"Great Places", "Homes"}
 
+    dynamic_map_chain = create_dynamic_map_chain(test_themes, prompt=map_template)
+    result = dynamic_map_chain.invoke({"context": test_document, "themes": test_themes})
     __import__("pprint").pprint(dict(result))
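
For reference, the model that create_brief_summary_model assembles at runtime is roughly equivalent to the static declaration below. This is a sketch only: the enum members shown are hypothetical, since the real members come from THEMES_AND_POLICIES for whichever themes were selected, and the runtime model additionally forbids extra fields.

from enum import Enum
from typing import Optional

from pydantic import BaseModel

# Hypothetical policy members for illustration; the real enum is built at runtime.
DynamicPolicyEnum = Enum(
    "DynamicPolicyEnum", {"Affordable housing": 1, "Housing density": 2}
)


class DynamicPolicy(BaseModel):
    policy: DynamicPolicyEnum  # must be one of the policies offered in the prompt
    note: str  # supporting detail drawn from the response


class DynamicBriefSummary(BaseModel):
    summary: str
    policies: Optional[list[DynamicPolicy]]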
planning_ai/chains/policy_chain.py ADDED
@@ -0,0 +1,26 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+from planning_ai.common.utils import Paths
+from planning_ai.llms.llm import LLM
+
+with open(Paths.PROMPTS / "policy.txt", "r") as f:
+    policy_template = f.read()
+
+
+policy_prompt = ChatPromptTemplate([("system", policy_template)])
+policy_chain = policy_prompt | LLM | StrOutputParser()
+
+
+if __name__ == "__main__":
+    test_policy = "Protecting open spaces"
+    test_bullet = "* " + "\n* ".join(
+        [
+            "The response emphasizes the need to preserve greenfield land, which relates to protecting open spaces.",
+            "The response notes that greenspace land should be preserved.",
+            "The response emphasizes the need for creating more parks, which relates to protecting open spaces.",
+        ]
+    )
+
+    result = policy_chain.invoke({"policy": test_policy, "bullet_points": test_bullet})
+    print(result)
planning_ai/chains/prompts/map.txt CHANGED
@@ -1,40 +1,4 @@
-Summarise the following response to a planning application, focusing on the themes and policies proposed by the council. Follow these steps:
-
-1. **Summary:** Provide a concise, neutral summary that captures the key points of the response, particularly in relation to the council's proposed themes.
-2. **Themes:** List the council's themes discussed in the response.
-3. **Policies:** Identify relevant policies associated with the extracted themes.
-4. **Places:** Mention any geographical locations considered by the author.
-5. **Constructiveness:** Indicate whether the response is constructive. A response is constructive if it provides any feedback or commentary on the plan, regardless of its depth or specificity.
-
-**Few-shot examples for reference:**
-
----
-
-**Example 1:**
-
-Response:
-"I am in favour of this new park development as it will provide much-needed green space for families. However, the parking situation needs to be reconsidered."
-
-- **Summary:** The author supports the park development for its benefit to families but expresses concern about parking.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** None
-- **Constructiveness:** True
-
----
-
-**Example 2:**
-
-Response:
-"This development in Cambridge will destroy local wildlife and create traffic chaos. It should not go ahead."
-
-- **Summary:** The author opposes the development due to concerns about wildlife and traffic congestion.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** Cambridge
-- **Constructiveness:** True
-
----
-
-**Now summarise the following response in British English:**
-
-Response:
-{context}
+Read the following response to a planning application: first summarise the response, then identify relevant 'policies' from those given. For each policy, list at least one section of the response that is related. Do **not** invent new policies. You **must** return valid JSON in the format given.
+
+Choose from the following list, by name **only**:
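
For illustration, a response that satisfies this prompt (and the DynamicBriefSummary schema built in map_chain.py) would be shaped roughly as follows; the policy name is a hypothetical example drawn from the list supplied at runtime:

{
    "summary": "The respondent is concerned that Cambourne has already exceeded its projected 3,000 homes.",
    "policies": [
        {
            "policy": "Housing density",
            "note": "Cambourne has grown beyond the projected 3,000 homes."
        }
    ]
}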
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
planning_ai/chains/prompts/ocr.txt CHANGED
@@ -2,7 +2,7 @@ The images provided are from a planning response form filled out by a member of
 
 Please follow these instructions to process the images:
 
-1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
 2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
 3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
 4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
 
 Please follow these instructions to process the images:
 
+1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses. If there is no response, state **nothing**.
 2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
 3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
 4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
planning_ai/chains/prompts/policy.txt ADDED
@@ -0,0 +1,7 @@
+The following is a list of bullet points relating to a particular planning policy. Rewrite the bullet points to focus only on the key action or idea, excluding additional context (like the name of the policy). If multiple bullet points share the same concept, combine them together.
+
+Policy: {policy}
+
+Bullet Points:
+
+{bullet_points}
planning_ai/chains/prompts/reduce.txt CHANGED
@@ -1,41 +1,9 @@
-The following is one or more summaries of responses to a new plan by the South Cambridgeshire Council:
 
 {context}
 
-You are tasked with producing a **detailed and thorough final summary** that consolidates the main themes raised in the responses. **Do not add, infer, or create information.** Only use content directly mentioned in the summaries. Write in British English.
 
-Each provided summary may contain both supporting and opposing key points. Summarise these points, indicating whether they **support** or **oppose** the proposed plan. Each point should be grouped into a relevant header chosen from the 'Aims' associated with the response summary. **Only include aims mentioned in the provided responses.** Omit any aim that is not discussed.
-
-**Guidelines**:
-
-- Provide an **extended, one page, balanced summary** of the key themes at the beginning, capturing the overall sentiment and notable trends.
-- In the 'Key points' sections, **group points by aim only if that aim is explicitly mentioned**.
-- Be sure to include specific, concise points that reflect the underlying concerns or support expressed by respondents.
-- Do **not** include information, assumptions, or summaries of aims that were not explicitly mentioned in the responses.
-
-**Format**:
-
-## Summary
-
-<Provide an extended, comprehensive overview of all the main themes. Mention key concerns, positive feedback, and overall trends.>
-
-## Key points raised in support
-
-For each key point raised in support, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
-
-## Key points raised in opposition
-
-For each key point raised in opposition, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
-
+The following contains summaries of public responses to a new plan proposed by the South Cambridgeshire Council:
 
 {context}
 
+As a representative of the Cambridgeshire Council, your task is to craft a **comprehensive and articulate executive summary**. This summary will serve as the introductory section of a major report, highlighting the key themes and concerns raised in the public responses. Ensure that the summary is clear, concise, and professional, reflecting the tone and standards expected in official council documents. **Do not add, infer, or create information.** Use only the content explicitly mentioned in the summaries. Adhere to British English conventions.
 
+Each time you make a reference to a response document, please add an inline citation which corresponds with the document's numerical ID. For example: 'Concerns regarding the impact of increased housing density on the character of Cambridge were prevalent [1][2][11].'
 
+## Executive Summary
planning_ai/chains/prompts/themes.txt CHANGED
@@ -1,82 +0,0 @@
-The following themes are proposed by the South Cambridgeshire Council with each of their associated policies.
-
-# Climate change
-
-Net zero carbon new buildings
-Water efficiency in new developments
-Designing for a changing climate
-Flooding and integrated water management
-Renewable energy projects and infrastructure
-Reducing waste and supporting the circular economy
-Supporting land-based carbon sequestration
-
-# Biodiversity and green spaces
-
-Biodiversity and geodiversity
-Green infrastructure
-Improving Tree Canopy Cover and the Tree Population
-River corridors
-Protecting open spaces
-Providing and enhancing open spaces
-
-# Wellbeing and social inclusion
-
-Creating healthy new developments
-Community, sports and leisure facilities
-Meanwhile uses during long term redevelopments
-Creating inclusive employment and business opportunities through new developments
-Pollution, health and safety
-
-# Great places
-
-People and place responsive design
-Protection and enhancement of landscape character
-Protection and enhancement of the Cambridge Green Belt
-Achieving high quality development
-Establishing high quality landscape and public realm
-Conservation and enhancement of heritage assets
-Adapting heritage assets to climate change
-Protection of public houses
-
-# Jobs
-
-New employment and development proposals
-Supporting the rural economy
-Protecting the best agricultural land
-Protecting existing business space
-Enabling remote working
-Affordable workspace and creative industries
-Supporting a range of facilities in employment parks
-Retail and centres
-Visitor accommodation, attractions and facilities
-Faculty development and specialist / language schools
-
-# Homes
-
-Affordable housing
-Exception sites for affordable housing
-Housing mix
-Housing density
-Garden land and subdivision of existing plots
-Residential space standards and accessible homes
-Specialist housing and homes for older people
-Self and custom build homes
-Build to rent homes
-Houses in multiple occupation (HMOs)
-Student accommodation
-Dwellings in the countryside
-Residential moorings
-Residential caravan sites
-Gypsy and Traveller and Travelling Showpeople sites
-Community-led housing
-
-# Infrastructure
-
-Sustainable transport and connectivity
-Parking and electric vehicles
-Freight and delivery consolidation
-Safeguarding important infrastructure
-Aviation development
-Energy infrastructure masterplanning
-Infrastructure and delivery
-Digital infrastructure
planning_ai/common/utils.py CHANGED
@@ -11,6 +11,18 @@ pl.Config(
 )
 
 
 class Paths:
     DATA = Path("data")
 
@@ -25,7 +37,14 @@ class Paths:
 
     @classmethod
     def ensure_directories_exist(cls):
-        for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY, cls.SUMMARIES]:
             path.mkdir(parents=True, exist_ok=True)
 
 )
 
 
+def filename_reducer(docs_a, docs_b):
+    if docs_a == []:
+        return docs_b
+    b_dict = {d["filename"]: d for d in docs_b}
+
+    for i, dict_a in enumerate(docs_a):
+        filename = dict_a.get("filename")
+        if filename in b_dict:
+            docs_a[i] = b_dict[filename]
+    return docs_a
+
+
 class Paths:
     DATA = Path("data")
 
 
     @classmethod
     def ensure_directories_exist(cls):
+        for path in [
+            cls.DATA,
+            cls.RAW,
+            cls.STAGING,
+            cls.OUT,
+            cls.SUMMARY,
+            cls.SUMMARIES,
+        ]:
             path.mkdir(parents=True, exist_ok=True)
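
A minimal sketch of how filename_reducer behaves when LangGraph merges state updates; the document dicts are hypothetical and only the filename key matters for matching. Entries from docs_b replace same-named entries in docs_a, and an empty docs_a is replaced wholesale:

docs_a = [
    {"filename": "101.txt", "summary": "draft"},
    {"filename": "102.txt", "summary": "ok"},
]
docs_b = [{"filename": "101.txt", "summary": "fixed"}]

merged = filename_reducer(docs_a, docs_b)
# merged == [{"filename": "101.txt", "summary": "fixed"},
#            {"filename": "102.txt", "summary": "ok"}]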
 
planning_ai/graph.py CHANGED
@@ -7,7 +7,13 @@ from planning_ai.nodes.hallucination_node import (
     map_fix_hallucinations,
     map_hallucinations,
 )
-from planning_ai.nodes.map_node import generate_summary, map_summaries
 from planning_ai.nodes.reduce_node import generate_final_summary
 from planning_ai.states import OverallState
 
@@ -25,13 +31,21 @@ def create_graph():
         StateGraph: The compiled state graph ready for execution.
     """
     graph = StateGraph(OverallState)
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
-    # graph.add_node("generate_final_summary", generate_final_summary)
 
     graph.add_conditional_edges(
-        START,
         map_summaries,
         ["generate_summary"],
     )
@@ -51,7 +65,7 @@ def create_graph():
         ["check_hallucination"],
     )
 
-    # graph.add_edge("check_hallucination", "generate_final_summary")
-    # graph.add_edge("generate_final_summary", END)
 
     return graph.compile()
     map_fix_hallucinations,
     map_hallucinations,
 )
+from planning_ai.nodes.map_node import (
+    add_entities,
+    generate_summary,
+    map_retrieve_themes,
+    map_summaries,
+    retrieve_themes,
+)
 from planning_ai.nodes.reduce_node import generate_final_summary
 from planning_ai.states import OverallState
 
         StateGraph: The compiled state graph ready for execution.
     """
     graph = StateGraph(OverallState)
+    graph.add_node("add_entities", add_entities)
+    graph.add_node("retrieve_themes", retrieve_themes)
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
+    graph.add_node("generate_final_summary", generate_final_summary)
 
+    graph.add_edge(START, "add_entities")
+    graph.add_conditional_edges(
+        "add_entities",
+        map_retrieve_themes,
+        ["retrieve_themes"],
+    )
     graph.add_conditional_edges(
+        "retrieve_themes",
         map_summaries,
         ["generate_summary"],
     )
         ["check_hallucination"],
     )
 
+    graph.add_edge("check_hallucination", "generate_final_summary")
+    graph.add_edge("generate_final_summary", END)
 
     return graph.compile()
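
For orientation, the wiring above gives roughly the following flow; each conditional edge fans out one Send per document (a sketch of the intended topology, not generated output):

# START
#   -> add_entities                (spaCy entities attached to every document)
#      -> retrieve_themes          (one Send per document, via map_retrieve_themes)
#         -> generate_summary      (one Send per document, via map_summaries)
#            -> check_hallucination
#               -> passes: generate_final_summary -> END
#               -> fails:  fix_hallucination -> check_hallucination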
planning_ai/main.py CHANGED
@@ -1,18 +1,28 @@
 import os
 import time
 from collections import Counter
 from pathlib import Path
 
-import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
-from langchain_community.document_loaders import DirectoryLoader, TextLoader
-from langchain_text_splitters import CharacterTextSplitter
-from opencage.geocoder import OpenCageGeocode
 
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
 
 load_dotenv()
 
@@ -64,81 +74,72 @@ def map_locations(places_df: pl.DataFrame):
 
 def build_quarto_doc(doc_title, out):
     final = out["generate_final_summary"]
-    executive_summary = (
-        final["final_summary"].split("## Key points raised in support")[0].strip()
-    )
-    key_points = final["final_summary"].split("## Key points raised in support")[1]
-
-    aims = []
-    for summary in final["summaries_fixed"]:
-        aim = summary["summary"].aims
-        aims.extend(aim)
-
-    value_counts = Counter(aims)
-    total_values = sum(value_counts.values())
-    percentages = {
-        key: {"count": count, "percentage": (count / total_values)}
-        for key, count in value_counts.items()
-    }
-    top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
-        :5
-    ]
-    thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
-    thematic_breakdown += "\n".join(
-        [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
-    )
-
-    places_df = (
-        pl.DataFrame(
-            [
-                place.dict()
-                for summary in final["summaries_fixed"]
-                for place in summary["summary"].places
-            ]
-        )
-        .group_by("place")
-        .agg(
-            pl.col("place").len().alias("Count"),
-            pl.col("sentiment").mean().alias("Mean Sentiment"),
-        )
-        .rename({"place": "Place"})
-    )
-
-    map_locations(places_df)
 
-    places_breakdown = (
-        places_df.sort("Count", descending=True)
-        .head()
-        .to_pandas()
-        .to_markdown(index=False)
-    )
-
-    stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
-    value_counts = Counter(stances)
-    total_values = sum(value_counts.values())
-    percentages = {
-        key: {"count": count, "percentage": (count / total_values)}
-        for key, count in value_counts.items()
-    }
-    stances_top = sorted(
-        percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
-    )
-    stances_breakdown = " | ".join(
-        [
-            f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
-            for item, stance in stances_top
-        ]
-    )
-
-    short_summaries = "\n\n".join(
-        [
-            f"#### **TODO**\n"
-            f"{summary['summary'].summary}\n\n"
-            f"**Stance**: {summary['summary'].stance}\n\n"
-            f"**Constructiveness**: {summary['summary'].rating}\n\n"
-            for summary in final["summaries_fixed"]
-        ]
-    )
 
     quarto_doc = (
         "---\n"
@@ -153,53 +154,63 @@ def build_quarto_doc(doc_title, out):
         "monofontoptions:\n"
        " - Scale=0.55\n"
         "---\n\n"
-        f"{executive_summary}\n\n"
-        f"{stances_breakdown}\n\n"
-        "## Aim Breakdown\n\n"
-        "The aim breakdown identifies which aims are mentioned "
-        "within each response. "
-        "A single response may discuss multiple topics.\n"
-        f"\n\n{thematic_breakdown}\n\n"
-        f"\n\n{places_breakdown}\n\n"
-        f"![Locations mentioned by sentiment](./figs/places.png)\n\n"
-        "## Key points raised in support\n\n"
-        f"{key_points}\n\n"
-        "## Summaries\n"
-        f"{short_summaries}"
     )
 
     with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
         f.write(quarto_doc)
 
 
-def main():
-    loader = DirectoryLoader(
-        path=str(Paths.STAGING / "pdfs"),
-        show_progress=True,
-        use_multithreading=True,
-        loader_cls=TextLoader,
-        recursive=True,
-    )
-    docs = [doc for doc in loader.load() if doc.page_content]
-    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=1000, chunk_overlap=0
     )
-    split_docs = text_splitter.split_documents(docs)
 
     app = create_graph()
 
     step = None
-    for step in app.stream(
-        {
-            "documents": [doc.page_content for doc in split_docs],
-            "filenames": [Path(doc.metadata["source"]) for doc in split_docs],
-        }
-    ):
-        print(list(step.keys()))
 
     if step is None:
         raise ValueError("No steps were processed!")
-
     return step
 
@@ -208,7 +219,9 @@ if __name__ == "__main__":
 
     tic = time.time()
     out = main()
-    # build_quarto_doc(doc_title, out)
 
     toc = time.time()
 
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
+import logging
 import os
+import re
 import time
 from collections import Counter
+from itertools import groupby
 from pathlib import Path
 
+# import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
+from langchain_community.document_loaders import (
+    DirectoryLoader,
+    PolarsDataFrameLoader,
+    TextLoader,
+)
+from langchain_text_splitters import CharacterTextSplitter, markdown
 
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
+from planning_ai.themes import THEMES_AND_POLICIES
+
+# from opencage.geocoder import OpenCageGeocode
+
 
 load_dotenv()
 
 
 def build_quarto_doc(doc_title, out):
     final = out["generate_final_summary"]
 
+    # value_counts = Counter(aims)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
+    #     :5
+    # ]
+    # thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
+    # thematic_breakdown += "\n".join(
+    #     [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
+    # )
+    #
+    # places_df = (
+    #     pl.DataFrame(
+    #         [
+    #             place.dict()
+    #             for summary in final["summaries_fixed"]
+    #             for place in summary["summary"].places
+    #         ]
+    #     )
+    #     .group_by("place")
+    #     .agg(
+    #         pl.col("place").len().alias("Count"),
+    #         pl.col("sentiment").mean().alias("Mean Sentiment"),
+    #     )
+    #     .rename({"place": "Place"})
+    # )
+    #
+    # map_locations(places_df)
+    #
+    # places_breakdown = (
+    #     places_df.sort("Count", descending=True)
+    #     .head()
+    #     .to_pandas()
+    #     .to_markdown(index=False)
+    # )
+    #
+    # stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
+    # value_counts = Counter(stances)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # stances_top = sorted(
+    #     percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
+    # )
+    # stances_breakdown = " | ".join(
+    #     [
+    #         f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
+    #         for item, stance in stances_top
+    #     ]
+    # )
+    #
+    # short_summaries = "\n\n".join(
+    #     [
+    #         f"#### **TODO**\n"
+    #         f"{summary['summary'].summary}\n\n"
+    #         f"**Stance**: {summary['summary'].stance}\n\n"
+    #         f"**Constructiveness**: {summary['summary'].rating}\n\n"
+    #         for summary in final["summaries_fixed"]
+    #     ]
+    # )
 
     quarto_doc = (
         "---\n"
         "monofontoptions:\n"
        " - Scale=0.55\n"
         "---\n\n"
+        f"{final['final_summary']}\n\n"
+        f"{final['policies']}"
+        # f"{executive_summary}\n\n"
+        # f"{stances_breakdown}\n\n"
+        # "## Aim Breakdown\n\n"
+        # "The aim breakdown identifies which aims are mentioned "
+        # "within each response. "
+        # "A single response may discuss multiple topics.\n"
+        # f"\n\n{thematic_breakdown}\n\n"
+        # f"\n\n{places_breakdown}\n\n"
+        # f"![Locations mentioned by sentiment](./figs/places.png)\n\n"
+        # "## Key points raised in support\n\n"
+        # f"{key_points}\n\n"
+        # "## Summaries\n"
+        # f"{short_summaries}"
     )
 
     with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
         f.write(quarto_doc)
 
 
+def read_docs():
+    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
+    df = df.filter(
+        pl.col("representations_document") == "Local Plan Issues and Options Report"
+    ).unique("id")
+    loader = PolarsDataFrameLoader(df, page_content_column="text")
+
+    docs = list(
+        {
+            doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
+            for doc in loader.load()
+            if doc.page_content and len(doc.page_content.split(" ")) > 5
+        }.values()
     )
+    return docs
+
+
+def main():
+    docs = read_docs()
+    n_docs = len(docs)
+
+    logging.warning(f"{n_docs} documents being processed!")
+
+    # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    #     chunk_size=10_240, chunk_overlap=0
+    # )
+    # split_docs = text_splitter.split_documents(docs)
 
     app = create_graph()
 
     step = None
+    for step in app.stream({"documents": docs, "n_docs": n_docs}):
+        print(step.keys())
 
     if step is None:
         raise ValueError("No steps were processed!")
     return step
 
 
 
     tic = time.time()
     out = main()
+    build_quarto_doc(doc_title, out)
+    print(out["generate_final_summary"]["final_summary"])
+
     toc = time.time()
 
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
planning_ai/nodes/hallucination_node.py CHANGED
@@ -1,12 +1,31 @@
-from langgraph.constants import Send
 
-from planning_ai.chains.fix_chain import fix_chain
 from planning_ai.chains.hallucination_chain import (
     HallucinationChecker,
     hallucination_chain,
 )
 from planning_ai.states import DocumentState, OverallState
 
 
 def check_hallucination(state: DocumentState):
     """Checks for hallucinations in the summary of a document.
@@ -23,25 +42,29 @@ def check_hallucination(state: DocumentState):
         dict: A dictionary containing either a list of fixed summaries or hallucinations
         that need to be addressed.
     """
-    if state["iteration"] > 5:
-        state["iteration"] = -99
-        return {"summaries_fixed": [state]}
-
-    response: HallucinationChecker = hallucination_chain.invoke(
-        {"document": state["document"], "summary": state["summary"]}
-    )  # type: ignore
     if response.score == 1:
-        return {"summaries_fixed": [state]}
 
     return {
-        "hallucinations": [
-            {
-                "hallucination": response,
-                "document": state["document"],
-                "filename": state["filename"],
-                "summary": state["summary"],
-                "iteration": state["iteration"] + 1,
-            }
         ]
     }
 
@@ -60,7 +83,7 @@ def map_hallucinations(state: OverallState):
         list: A list of Send objects directing each summary to the check_hallucination
         function.
     """
-    return [Send("check_hallucination", summary) for summary in state["summaries"]]
 
 
 def fix_hallucination(state: DocumentState):
@@ -77,24 +100,24 @@ def fix_hallucination(state: DocumentState):
         dict: A dictionary containing the updated summaries after attempting to fix
         hallucinations.
     """
-    response = fix_chain.invoke(
-        {
-            "context": state["document"],
-            "summary": state["summary"],
-            "explanation": state["hallucination"],
-        }
-    )
-    state["summary"] = response  # type: ignore
-    return {
-        "summaries": [
             {
-                "document": state["document"],
-                "filename": state["filename"],
-                "summary": state["summary"],
-                "iteration": state["iteration"],
             }
-        ]
-    }
 
 
 def map_fix_hallucinations(state: OverallState):
@@ -112,11 +135,11 @@ def map_fix_hallucinations(state: OverallState):
         fix_hallucination function.
     """
     hallucinations = []
-    if "hallucinations" in state:
         hallucinations = [
-            hallucination
-            for hallucination in state["hallucinations"]
-            if hallucination["hallucination"].score != 1
         ]
     return [
         Send("fix_hallucination", hallucination) for hallucination in hallucinations
+import json
+import logging
 
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
+from pydantic import BaseModel
+
+from planning_ai.chains.fix_chain import fix_template
 from planning_ai.chains.hallucination_chain import (
     HallucinationChecker,
     hallucination_chain,
 )
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.states import DocumentState, OverallState
 
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
+ITERATIONS = 2
+
 
 def check_hallucination(state: DocumentState):
     """Checks for hallucinations in the summary of a document.
         dict: A dictionary containing either a list of fixed summaries or hallucinations
         that need to be addressed.
     """
+    logger.warning(f"Checking hallucinations for document {state['filename']}")
+    # Stop trying after 2 iterations
+    if state["iteration"] > ITERATIONS:
+        state["iteration"] = 99
+        state["hallucination"].score = 1
+        return {"documents": [state]}
+
+    try:
+        response = hallucination_chain.invoke(
+            {"document": state["document"], "summary": state["summary"].summary}
+        )
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
     if response.score == 1:
+        return {"documents": [{**state, "hallucination": response}]}
 
     return {
+        "documents": [
+            {**state, "hallucination": response, "iteration": state["iteration"] + 1}
         ]
     }
 
         list: A list of Send objects directing each summary to the check_hallucination
         function.
     """
+    return [Send("check_hallucination", document) for document in state["documents"]]
 
 
 def fix_hallucination(state: DocumentState):
         dict: A dictionary containing the updated summaries after attempting to fix
         hallucinations.
     """
+    logger.warning(f"Fixing hallucinations for document {state['filename']}")
+    fix_chain = create_dynamic_map_chain(state["themes"], fix_template)
+    try:
+        response = fix_chain.invoke(
             {
+                "context": state["document"],
+                "summary": state["summary"].summary,
+                "explanation": state["hallucination"].explanation,
             }
+        )
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+    state["summary"] = response  # type: ignore
+    return {"documents": [state]}
 
 
 def map_fix_hallucinations(state: OverallState):
         fix_hallucination function.
     """
     hallucinations = []
+    if "documents" in state:
         hallucinations = [
+            document
+            for document in state["documents"]
+            if document["hallucination"].score != 1
         ]
     return [
         Send("fix_hallucination", hallucination) for hallucination in hallucinations
planning_ai/nodes/map_node.py CHANGED
@@ -1,16 +1,89 @@
 import json
 from pathlib import Path
 
-from langgraph.constants import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
 
-from planning_ai.chains.map_chain import map_chain
 from planning_ai.common.utils import Paths
 from planning_ai.states import DocumentState, OverallState
 
-anonymizer = AnonymizerEngine()
 analyzer = AnalyzerEngine()
 
 
 def remove_pii(document: str) -> str:
@@ -25,12 +98,14 @@ def remove_pii(document: str) -> str:
     Returns:
         str: The document text with PII anonymized.
     """
     results = analyzer.analyze(
         text=document,
         entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
         language="en",
     )
-    document = anonymizer.anonymize(text=document, analyzer_results=results)
     return document
 
 
@@ -47,43 +122,27 @@ def generate_summary(state: DocumentState) -> dict:
     Returns:
         dict: A dictionary containing the generated summary and updated document state.
     """
-    state["document"] = remove_pii(state["document"])
-    response = map_chain.invoke({"context": state["document"]})
-    summary = response.summary
-    themes = [theme.value for theme in response.themes]
-    policies = [policy.dict() for policy in response.policies]
-
-    out_policies = []
-    for theme in policies:
-        name = theme["theme"].value
-        policy_list = theme["policies"]
-        out_policies.append({"theme": name, "policies": policy_list})
-
-    out_places = []
-    for place in response.places:
-        name = place.place
-        sentiment = place.sentiment.value
-        out_places.append({"place": name, "sentiment": sentiment})
-
-    save_output = {
-        "summary": summary,
-        "themes": themes,
-        "policies": out_policies,
-        "places": out_places,
-    }
-
-    outfile = f"{Path(state["filename"]).stem}_summary.json"
-    with open(Paths.SUMMARIES / outfile, "w") as file:
-        json.dump(save_output, file, indent=4)
-
-    output = {
-        "summary": response,
-        "document": state["document"],
-        "filename": str(state["filename"]),
-        "iteration": 1,
-    }
-
-    return {"summaries": [output]}
 
 
 def map_summaries(state: OverallState) -> list[Send]:
@@ -99,10 +158,5 @@ def map_summaries(state: OverallState) -> list[Send]:
         list: A list of Send objects directing each document to the `generate_summary`
         function.
     """
-    return [
-        Send(
-            "generate_summary",
-            {"document": document, "filename": filename},
-        )
-        for document, filename in zip(state["documents"], state["filenames"])
-    ]
 import json
+import logging
 from pathlib import Path
+from typing import TypedDict
 
+import spacy
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
+from pydantic import BaseModel, ValidationError
 
+from planning_ai.chains.hallucination_chain import HallucinationChecker
+from planning_ai.chains.map_chain import create_dynamic_map_chain, map_template
 from planning_ai.common.utils import Paths
+from planning_ai.retrievers.theme_retriever import grade_chain, theme_retriever
 from planning_ai.states import DocumentState, OverallState
 
+logging.basicConfig(
+    level=logging.WARN, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
 analyzer = AnalyzerEngine()
+anonymizer = AnonymizerEngine()
+
+nlp = spacy.load("en_core_web_lg")
+
+
+def retrieve_themes(state: DocumentState) -> dict:
+    theme_documents = theme_retriever.invoke(input=state["document"].page_content)
+
+    # TODO: add something similar but more efficient?
+    grade_scores = []
+    for doc in theme_documents:
+        try:
+            score = grade_chain.invoke(
+                {
+                    "context": doc.page_content,
+                    "document": state["document"].page_content,
+                }
+            ).binary_score
+        except (OutputParserException, json.JSONDecodeError) as e:
+            logger.error(f"Failed to decode JSON: {e}.\n Setting to 'no'")
+            score = "no"
+        grade_scores.append(score)
+
+    theme_documents = [
+        doc for doc, include in zip(theme_documents, grade_scores) if include == "yes"
+    ]
+
+    # TODO: Add metadata to this as string?
+    theme_documents_text = "\n\n".join([d.page_content for d in theme_documents])
+
+    # state["document"].page_content = (
+    #     f"{state['document'].page_content}\n\n"
+    #     f"Related Information:\n\n{theme_documents_text}"
+    # )
+    state["theme_docs"] = theme_documents
+    state["themes"] = {doc.metadata["theme"] for doc in theme_documents}
+
+    logger.warning(f"Retrieved relevant theme documents for: {state['filename']}")
+    return {"documents": [state]}
+
+
+def map_retrieve_themes(state: OverallState) -> list[Send]:
+    logger.warning("Mapping documents to retrieve themes.")
+    return [Send("retrieve_themes", document) for document in state["documents"]]
+
+
+def add_entities(state: OverallState) -> OverallState:
+    for idx, document in enumerate(
+        nlp.pipe(
+            [doc["document"].page_content for doc in state["documents"]],
+        )
+    ):
+        state["documents"][idx]["entities"] = [
+            {"entity": ent.text, "label": ent.label_} for ent in document.ents
+        ]
+    return state
 
 
 def remove_pii(document: str) -> str:
     Returns:
         str: The document text with PII anonymized.
     """
+    logger.warning("Starting PII removal.")
     results = analyzer.analyze(
         text=document,
         entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
         language="en",
     )
+    document = anonymizer.anonymize(text=document, analyzer_results=results).text
+    logger.warning("PII removal completed.")
     return document
 
 
     Returns:
         dict: A dictionary containing the generated summary and updated document state.
     """
+    logger.warning(f"Generating summary for document: {state['filename']}")
+
+    state["document"].page_content = remove_pii(state["document"].page_content)
+    if not state["themes"]:
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    map_chain = create_dynamic_map_chain(themes=state["themes"], prompt=map_template)
+    try:
+        response = map_chain.invoke({"context": state["document"].page_content})
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    logger.warning(f"Summary generation completed for document: {state['filename']}")
+    return {"documents": [{**state, "summary": response, "iteration": 1}]}
 
 
 def map_summaries(state: OverallState) -> list[Send]:
         list: A list of Send objects directing each document to the `generate_summary`
         function.
     """
+    logger.warning("Mapping documents to generate summaries.")
+    return [Send("generate_summary", document) for document in state["documents"]]
planning_ai/nodes/reduce_node.py CHANGED
@@ -1,5 +1,36 @@
 from planning_ai.chains.reduce_chain import reduce_chain
 from planning_ai.states import OverallState
 
 
 def generate_final_summary(state: OverallState):
@@ -18,17 +49,87 @@ def generate_final_summary(state: OverallState):
         dict: A dictionary containing the final summary, along with the original
         documents, summaries, fixed summaries, and hallucinations.
     """
-    if len(state["documents"]) == len(state["summaries_fixed"]):
         summaries = [
-            str(summary["summary"])
-            for summary in state["summaries_fixed"]
-            if summary["summary"].stance != "NEUTRAL" and summary["summary"].rating >= 5
         ]
-        response = reduce_chain.invoke({"context": summaries})
         return {
-            "final_summary": response,
-            "summaries_fixed": state["summaries_fixed"],
-            "summaries": state["summaries"],
-            "hallucinations": state["hallucinations"],
-            "documents": state["documents"],
         }
+import json
+import logging
+from pathlib import Path
+
+import polars as pl
+
+from planning_ai.chains.policy_chain import policy_chain
 from planning_ai.chains.reduce_chain import reduce_chain
 from planning_ai.states import OverallState
+from planning_ai.themes import THEMES_AND_POLICIES
+
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+# TODO: Reduce down the grouping of policies. I.e. combine points that are closely related and add citations.
+def markdown_bullets(summaries):
+    policies = {"themes": [], "policies": [], "details": []}
+    for summary in summaries:
+        if not summary["summary"].policies:
+            continue
+        for policy in summary["summary"].policies:
+            for theme, p in THEMES_AND_POLICIES.items():
+                if policy.policy.name in p:
+                    policies["themes"].append(theme)
+                    policies["policies"].append(policy.policy.name)
+                    policies["details"].append(policy.note)
+    df = pl.DataFrame(policies)
+
+    grouped = df.group_by(["themes", "policies"]).agg(pl.col("details"))
+    return grouped
 
 
 def generate_final_summary(state: OverallState):
         dict: A dictionary containing the final summary, along with the original
         documents, summaries, fixed summaries, and hallucinations.
     """
+    logger.warning("Generating final summary")
+    final_docs = [doc for doc in state["documents"] if doc["hallucination"].score == 1]
+    logger.warning(f"Number of final docs: {len(final_docs)}")
+
+    if len(final_docs) == state["n_docs"]:
         summaries = [
+            # f"Document ID: [{id}]\n\n{doc["summary"].summary}"
+            doc
+            for id, doc in zip(range(state["n_docs"]), final_docs)
+            if doc["summary"].summary != "INVALID"
+            and doc["themes"] != set()
+            and doc["iteration"] != 99
         ]
+
+        out = []
+        for doc in summaries:
+            summary = doc["summary"].model_dump()
+            if summary["policies"] is not None:
+                policies = [
+                    {"policy": policy["policy"].name, "note": policy["note"]}
+                    for policy in summary["policies"]
+                ]
+            else:
+                policies = []
+            summary = summary["summary"]
+            out.append(
+                {
+                    "document": doc["document"].model_dump()["page_content"],
+                    "filename": doc["filename"],
+                    "entities": doc["entities"],
+                    "theme_docs": [d.model_dump() for d in doc["theme_docs"]],
+                    "themes": list(doc["themes"]),
+                    "summary": summary,
+                    "policies": policies,
+                    "iteration": doc["iteration"],
+                    "hallucination": doc["hallucination"].model_dump(),
+                }
+            )
+
+        for doc in out:
+            filename = Path(str(doc["filename"])).stem
+            with open(f"data/out/summaries/{filename}.json", "w") as f:
+                json.dump(doc, f)
+
+        summaries_text = [s["summary"].summary for s in summaries]
+        final_responses = []
+        batch_size = 50
+        for i in range(0, len(summaries_text), batch_size):
+            logger.warning("Processing batches.")
+            batch = summaries_text[i : i + batch_size]
+            response = reduce_chain.invoke({"context": batch})
+            final_responses.append(response)
+
+        final_response = reduce_chain.invoke({"context": "\n\n".join(final_responses)})
+        pols = markdown_bullets(summaries)
+
+        pol_out = []
+        for _, policy in pols.group_by(["themes", "policies"]):
+            logger.warning("Processing policies.")
+            bullets = "* " + "* \n".join(policy["details"][0])
+            pchain_out = policy_chain.invoke(
+                {"policy": policy["policies"][0], "bullet_points": bullets}
+            )
+            pol_out.append(
+                {
+                    "theme": policy["themes"][0],
+                    "policy": policy["policies"][0],
+                    "points": pchain_out,
+                }
+            )
+
+        themes = ""
+        for theme, policies in pl.DataFrame(pol_out).group_by("theme"):
+            themes += f"# {theme[0]}\n\n"
+            for row in policies.iter_rows(named=True):
+                themes += f"\n## {row['policy']}\n\n"
+                themes += f"{row['points']}\n"
+            themes += "\n"
+
         return {
+            "final_summary": final_response,
+            "documents": final_docs,
+            "policies": themes,
         }
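
To make the aggregation concrete, here is a hypothetical (theme, policy) group produced by markdown_bullets and the markdown it is rendered into by generate_final_summary; all values are invented for illustration:

# One row of the grouped DataFrame returned by markdown_bullets:
#   themes: "Homes", policies: "Affordable housing",
#   details: ["More affordable units are needed", "Concern about scheme viability"]
#
# After policy_chain rewrites the bullet points, the themes string contains:
#
#   # Homes
#
#   ## Affordable housing
#
#   * More affordable units are needed
#   * Concern about scheme viability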
planning_ai/preprocessing/gcpt3.py CHANGED
@@ -1,4 +1,5 @@
 import logging
 from pathlib import Path
 from typing import Any
 
@@ -59,14 +60,13 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
 
 def download_attachments():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
-    existing_files = {int(f.stem) for f in (Paths.RAW / "pdfs").glob("*.pdf")}
     failed_files = set()
-
     failed_file_path = Paths.RAW / "failed_downloads.txt"
     if failed_file_path.exists():
         with open(failed_file_path, "r") as file:
-            failed_files = set(int(l) for l in file.read().splitlines())
 
     for row in tqdm(
         df.drop_nulls(subset="attachments_id")
@@ -74,52 +74,64 @@ def download_attachments():
         .sample(shuffle=True, fraction=1)
         .rows(named=True)
     ):
-        attachment_id = int(row["attachments_id"])
-
-        if attachment_id in existing_files or attachment_id in failed_files:
-            print(f"Skipping {attachment_id} (already exists or previously failed)")
-            continue
         if (
-            row["attachments_url"].endswith(".pdf")
-            and not row["attachments_url"].startswith("https://egov.scambs.gov.uk")
-            and not row["attachments_url"].startswith("http://egov.scambs.gov.uk")
         ):
-            file_path = Paths.RAW / "pdfs" / f"{attachment_id}.pdf"
-            try:
-                response = requests.get(row["attachments_url"], timeout=10)
-                response.raise_for_status()
-
-                with open(file_path, "wb") as f:
-                    f.write(response.content)
-                print(f"Downloaded {attachment_id} to {file_path}")
-
-            except requests.RequestException as e:
-                logging.error(f"RequestException for {attachment_id}: {e}")
-                failed_files.add(attachment_id)
-                with open(failed_file_path, "a") as file:
-                    file.write(f"{attachment_id}\n")
-                print(f"Skipping {attachment_id} due to error: {e}")
-
-            except Exception as e:
-                logging.error(f"Unexpected error for {attachment_id}: {e}")
-                failed_files.add(attachment_id)
-                with open(failed_file_path, "a") as file:
-                    file.write(f"{attachment_id}\n")
-                print(f"Unexpected error for {attachment_id}: {e}")
 
 
 def convert_txt():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
-    # attachment_txt =
 
-    f"{df['text']}\n\nPOSITION: {df['representations_support/object']}"
 
 
 def main() -> None:
     files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
     schema = get_schema()
     process_files(files, schema)
 
 
 if __name__ == "__main__":
1
  import logging
2
+ import textwrap
3
  from pathlib import Path
4
  from typing import Any
5
 
 
60
 
61
  def download_attachments():
62
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
63
+ existing_files = {f.stem for f in (Paths.RAW / "pdfs").glob("*.pdf")}
64
 
 
65
  failed_files = set()
 
66
  failed_file_path = Paths.RAW / "failed_downloads.txt"
67
  if failed_file_path.exists():
68
  with open(failed_file_path, "r") as file:
69
+ failed_files = set(file.read().splitlines())
70
 
71
  for row in tqdm(
72
  df.drop_nulls(subset="attachments_id")
 
74
  .sample(shuffle=True, fraction=1)
75
  .rows(named=True)
76
  ):
 
 
 
 
 
77
  if (
78
+ row["attachments_url"].startswith(
79
+ ("https://egov.scambs.gov.uk", "http://egov.scambs.gov.uk")
80
+ )
81
+ or row["attachments_id"] in existing_files
82
+ or row["attachments_id"] in failed_files
83
  ):
84
+ failed_files.add(row["attachments_id"])
85
+ continue
86
+ file_path = Paths.RAW / "pdfs" / f"{row['attachments_id']}.pdf"
87
+ try:
88
+ response = requests.get(row["attachments_url"], timeout=3)
89
+ response.raise_for_status()
90
+
91
+ with open(file_path, "wb") as f:
92
+ f.write(response.content)
93
+ print(f"Downloaded {row['attachments_url']} to {file_path}")
94
+
95
+ except requests.RequestException as e:
96
+ logging.error(f"RequestException for {row['attachments_url']}: {e}")
97
+ failed_files.add(row["attachments_id"])
98
+ with open(failed_file_path, "a") as file:
99
+ file.write(f"{row['attachments_id']}\n")
100
+ print(f"Skipping {row['attachments_url']} due to error: {e}")
101
+
102
+ except Exception as e:
103
+ logging.error(f"Unexpected error for {row['attachments_url']}: {e}")
105
+ failed_files.add(row["attachments_id"])
106
+ with open(failed_file_path, "a") as file:
107
+ file.write(f"{row['attachments_id']}\n")
108
+ print(f"Unexpected error for {row['attachments_url']}: {e}")
109
 
110
 
111
  def convert_txt():
112
+ # TODO: add pdf content
113
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
114
 
115
+ for response_doc, dfd in df.group_by("representations_document"):
116
+ for row in tqdm(dfd.rows(named=True)):
117
+ text = row["text"]
118
 
119
+ with open(
120
+ Paths.STAGING
121
+ / "txt"
122
+ / f"{response_doc[0]}"
123
+ / f"{row['representations_id']}.txt",
124
+ "w",
125
+ ) as f:
126
+ f.write(text)
127
 
128
 
129
  def main() -> None:
130
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
131
  schema = get_schema()
132
  process_files(files, schema)
133
+ download_attachments()
134
+ convert_txt()
135
 
136
 
137
  if __name__ == "__main__":
planning_ai/preprocessing/process_pdfs.py CHANGED
@@ -2,36 +2,16 @@ import base64
2
  import os
3
  from io import BytesIO
4
 
5
- import cv2
6
- import numpy as np
7
  import requests
8
  from dotenv import load_dotenv
9
  from pdf2image import convert_from_path
 
10
  from tqdm import tqdm
11
 
12
  from planning_ai.common.utils import Paths
13
 
14
  load_dotenv()
15
 
16
- import easyocr
17
- from pdf2image import convert_from_path
18
-
19
- pdf_path = "data/raw/pdfs/25.pdf"
20
- # pdf_path = "../../data/raw/pdfs/26.pdf"
21
- images = convert_from_path(pdf_path)
22
-
23
- reader = easyocr.Reader(lang_list=["en"], gpu=True)
24
-
25
- for i, image in enumerate(images):
26
- results = reader.readtext(np.array(image))
27
- print(f"Page {i+1}:")
28
- confidences = []
29
- for result in results:
30
- confidences.append(result[2])
31
- print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")
32
-
33
- np.array(confidences).mean()
34
-
35
 
36
  def encode_images_to_base64(images):
37
  image_b64 = []
@@ -61,13 +41,28 @@ def send_request_to_api(messages):
61
  return response.json()
62
 
63
64
  def main():
65
  pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
66
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
67
  ocr_prompt = f.read()
68
 
69
  for file in tqdm(pdfs):
70
- if file.stem:
 
 
71
  images = convert_from_path(file)
72
  image_b64 = encode_images_to_base64(images)
73
 
@@ -79,12 +74,15 @@ def main():
79
  ]
80
 
81
  response = send_request_to_api(messages)
 
 
82
  out = response["choices"][0]["message"]["content"]
83
- outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
84
  if outfile.exists():
85
  continue
86
  with open(outfile, "w") as f:
87
  f.write(out)
 
 
88
 
89
 
90
  if __name__ == "__main__":
 
2
  import os
3
  from io import BytesIO
4
 
 
 
5
  import requests
6
  from dotenv import load_dotenv
7
  from pdf2image import convert_from_path
8
+ from PyPDF2 import PdfReader
9
  from tqdm import tqdm
10
 
11
  from planning_ai.common.utils import Paths
12
 
13
  load_dotenv()
14
 
15
 
16
  def encode_images_to_base64(images):
17
  image_b64 = []
 
41
  return response.json()
42
 
43
 
44
+ def extract_text_from_pdf(file_path):
45
+ """Extracts text from a PDF file using PyPDF2."""
46
+ try:
47
+ reader = PdfReader(file_path, strict=True)
48
+ text = []
49
+ for page in reader.pages:
50
+ text.append(page.extract_text() or "")
51
+ return "\n".join(text).strip()
52
+ except Exception as e:
53
+ print(e)
54
+ return None
55
+
56
+
57
  def main():
58
  pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
59
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
60
  ocr_prompt = f.read()
61
 
62
  for file in tqdm(pdfs):
63
+ outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
64
+
65
+ try:
66
  images = convert_from_path(file)
67
  image_b64 = encode_images_to_base64(images)
68
 
 
74
  ]
75
 
76
  response = send_request_to_api(messages)
77
+ if "choices" not in response:
78
+ continue
79
  out = response["choices"][0]["message"]["content"]
 
80
  if outfile.exists():
81
  continue
82
  with open(outfile, "w") as f:
83
  f.write(out)
84
+ except Exception:
85
+ continue
86
 
87
 
88
  if __name__ == "__main__":
planning_ai/report.py ADDED
@@ -0,0 +1,30 @@
1
+ # build_quarto_doc(doc_title, out)
2
+ #
3
+ # d = [
4
+ # i for i in out["generate_final_summary"]["summaries_fixed"] if i["iteration"] == 4
5
+ # ][0]
6
+ # d["document"]
7
+ #
8
+ # h = [
9
+ # i["summary"].summary
10
+ # for i in out["generate_final_summary"]["hallucinations"]
11
+ # if i["document"] == d["document"]
12
+ # ]
13
+ #
14
+ # e = [
15
+ # i["hallucination"].explanation
16
+ # for i in out["generate_final_summary"]["hallucinations"]
17
+ # if i["document"] == d["document"]
18
+ # ]
19
+ #
20
+ # test = {
21
+ # "document": d["document"],
22
+ # "final_summary": d["summary"].summary,
23
+ # "attempts": h,
24
+ # "reasoning": e,
25
+ # }
26
+ #
27
+ # print(f"Document:\n\n{test['document']}\n\n")
28
+ # print(f"Final:\n\n{test['final_summary']}\n\n")
29
+ # print("Attempts: \n\n*", "\n\n* ".join(test["attempts"]), "\n\n")
30
+ # print("Reasoning: \n\n*", "\n\n* ".join(test["reasoning"]), "\n\n")
planning_ai/retrievers/theme_retriever.py ADDED
@@ -0,0 +1,84 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from chromadb import PersistentClient
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain_core.prompts import PromptTemplate
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from pydantic import BaseModel, Field
10
+
11
+ from planning_ai.llms.llm import LLM
12
+
13
+ # See: https://consultations.greatercambridgeplanning.org/greater-cambridge-local-plan-preferred-options/supporting-documents
14
+
15
+ PDFS = {
16
+ "Biodiversity and Green Spaces": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPBiodiversityandGreenSpacesAug21v2Nov21_0.pdf",
17
+ "Climate Change": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPClimateChangeAug21v2Nov21_0.pdf",
18
+ "Great Places": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPGreatPlacesAug21v1Aug21.pdf",
19
+ "Homes": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPHomesAug21v2Nov21.pdf",
20
+ "Infrastructure": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPInfrastructureAug21v2Nov21.pdf",
21
+ "Jobs": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPJobsAug21v2Nov21.pdf",
22
+ # "Strategy topic paper": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPStrategyAug21v3Nov21_0.pdf",
23
+ "Wellbeing and Social Inclusion": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPWellbeingAug21v2Nov21.pdf",
24
+ }
25
+
26
+
27
+ class Grade(BaseModel):
28
+ """Binary score for relevance check."""
29
+
30
+ binary_score: str = Field(description="Relevance score 'yes' or 'no'")
31
+
32
+
33
+ def create_db():
34
+ chroma_dir = Path("./chroma_themesdb")
35
+ if chroma_dir.exists():
36
+ persistent_client = PersistentClient(path="./chroma_themesdb")
37
+ vectorstore = Chroma(
38
+ client=persistent_client,
39
+ collection_name="themes-chroma",
40
+ embedding_function=OpenAIEmbeddings(),
41
+ )
42
+
43
+ else:
44
+ docs = []
45
+ for name, pdf in PDFS.items():
46
+ doc = PyPDFLoader(pdf).load()[5:]
47
+ for d in doc:
48
+ d.metadata["theme"] = name
49
+ docs.extend(doc)
50
+
51
+ logging.warning("Building ChromaDB...")
52
+ vectorstore = Chroma.from_documents(
53
+ documents=docs,
54
+ collection_name="themes-chroma",
55
+ embedding=OpenAIEmbeddings(),
56
+ persist_directory="./chroma_themesdb",
57
+ )
58
+ return vectorstore
59
+
60
+
61
+ grade_template = PromptTemplate(
62
+ template="""You are a grader assessing relevance of a retrieved document to a user question. \n
63
+ Here is the retrieved document: \n\n {context} \n\n
64
+ Here is the original document: {document} \n
65
+ If the retrieved document contains keyword(s) or semantic meaning related to the original, grade it as relevant. \n
66
+ Give a binary 'yes' or 'no' score to indicate whether the retrieved document is relevant to the original."""
67
+ input_variables=["context", "document"],
68
+ )
69
+
70
+
71
+ SLLM = LLM.with_structured_output(Grade, strict=True)
72
+ grade_chain = grade_template | SLLM
73
+
74
+ vectorstore = create_db()
75
+ theme_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
76
+ logging.warning("Finished building ChromaDB...")
77
+
78
+ if __name__ == "__main__":
79
+ test_content = """
80
+ We would certainly support this and would emphasise the importance of trying
81
+ to solve the severance problems created by the M11 and A14.
82
+ """
83
+
84
+ print(len(theme_retriever.invoke(input=test_content)))
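Putting the retriever and grader together, assigning themes to a single response might look roughly like the sketch below, assuming the `theme_retriever` and `grade_chain` objects defined above; the relevance rule is illustrative:

```python
def themes_for_document(document: str) -> set[str]:
    """Collect the themes of retrieved topic-paper chunks the grader marks as relevant."""
    themes: set[str] = set()
    for chunk in theme_retriever.invoke(input=document):
        grade = grade_chain.invoke({"context": chunk.page_content, "document": document})
        if grade.binary_score.lower() == "yes":
            themes.add(chunk.metadata["theme"])
    return themes
```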
planning_ai/states.py CHANGED
@@ -1,9 +1,39 @@
1
  import operator
2
  from pathlib import Path
3
- from typing import Annotated, List, TypedDict
 
 
 
4
 
5
  from planning_ai.chains.hallucination_chain import HallucinationChecker
6
- from planning_ai.chains.map_chain import BriefSummary
7
 
8
 
9
  class OverallState(TypedDict):
@@ -23,35 +53,17 @@ class OverallState(TypedDict):
23
  iterations (list[int]): A list of iteration counts for processing each document.
24
  """
25
 
26
- documents: list[str]
27
-
28
- final_summary: str
29
- summaries: Annotated[list, operator.add]
30
- summaries_fixed: Annotated[list, operator.add]
31
- hallucinations: Annotated[list, operator.add]
32
-
33
- filenames: List[Path]
34
- iterations: list[int]
35
-
36
-
37
- class DocumentState(TypedDict):
38
- """Represents the state of an individual document during processing.
39
-
40
- This class is a TypedDict that encapsulates the state of a single document
41
- during the processing workflow. It includes the document text, summary,
42
- hallucination details, filename, and iteration count.
43
-
44
- Attributes:
45
- document (str): The text of the document.
46
- summary (BriefSummary): The summary of the document.
47
- hallucination (HallucinationChecker): The hallucination details for the document's summary.
48
- filename (Path): The file path of the document.
49
- iteration (int): The current iteration count for processing the document.
50
- """
51
 
52
- document: str
53
- summary: BriefSummary
54
- hallucination: HallucinationChecker
55
 
56
- filename: Path
57
- iteration: int
1
  import operator
2
  from pathlib import Path
3
+ from typing import Annotated, List, Optional, TypedDict
4
+
5
+ from langchain_core.documents import Document
6
+ from pydantic import BaseModel
7
 
8
  from planning_ai.chains.hallucination_chain import HallucinationChecker
9
+ from planning_ai.common.utils import filename_reducer
10
+
11
+
12
+ class DocumentState(TypedDict):
13
+ """Represents the state of an individual document during processing.
14
+
15
+ This class is a TypedDict that encapsulates the state of a single document
16
+ during the processing workflow. It includes the document text, summary,
17
+ hallucination details, filename, and iteration count.
18
+
19
+ Attributes:
20
+ document (str): The text of the document.
21
+ summary (BriefSummary): The summary of the document.
22
+ hallucination (HallucinationChecker): The hallucination details for the document's summary.
23
+ filename (Path): The file path of the document.
24
+ iteration (int): The current iteration count for processing the document.
25
+ """
26
+
27
+ document: Document
28
+ filename: Path
29
+
30
+ entities: list[dict]
31
+ themes: set[str]
32
+ summary: BaseModel
33
+ theme_docs: list[Document]
34
+ hallucination: HallucinationChecker
35
+
36
+ iteration: int
37
 
38
 
39
  class OverallState(TypedDict):
 
53
  iterations (list[int]): A list of iteration counts for processing each document.
54
  """
55
 
56
+ documents: Annotated[list[DocumentState], filename_reducer]
57
58
 
59
+ final_summary: Optional[str]
 
 
60
 
61
+ # documents_related: Annotated[list, operator.add]
62
+ # related_theme_docs: Annotated[list, operator.add]
63
+ #
64
+ # summaries: Annotated[list, operator.add]
65
+ # summaries_fixed: Annotated[list, operator.add]
66
+ # hallucinations: Annotated[list, operator.add]
67
+ #
68
+ # filenames: Annotated[list, operator.add]
69
+ # iterations: list[int]
planning_ai/themes.py CHANGED
@@ -1,134 +1,76 @@
1
- from enum import Enum
2
-
3
- from pydantic import BaseModel, field_validator
4
-
5
-
6
- class Theme(str, Enum):
7
- climate = "Climate change"
8
- biodiversity = "Biodiversity and green spaces"
9
- wellbeing = "Wellbeing and social inclusion"
10
- great_places = "Great places"
11
- jobs = "Jobs"
12
- homes = "Homes"
13
- infrastructure = "Infrastructure"
14
-
15
-
16
- class ClimatePolicies(str, Enum):
17
- CC_NZ = "Net zero carbon new buildings"
18
- CC_WE = "Water efficiency in new developments"
19
- CC_DC = "Designing for a changing climate"
20
- CC_FM = "Flooding and integrated water management"
21
- CC_RE = "Renewable energy projects and infrastructure"
22
- CC_CE = "Reducing waste and supporting the circular economy"
23
- CC_CS = "Supporting land-based carbon sequestration"
24
-
25
-
26
- class BiodiversityPolicies(str, Enum):
27
- BG_BG = "Biodiversity and geodiversity"
28
- BG_GI = "Green infrastructure"
29
- BG_TC = "Improving Tree Canopy Cover and the Tree Population"
30
- BG_RC = "River corridors"
31
- BG_PO = "Protecting open spaces"
32
- BG_EO = "Providing and enhancing open spaces"
33
-
34
-
35
- class WellbeingPolicies(str, Enum):
36
- WS_HD = "Creating healthy new developments"
37
- WS_CF = "Community, sports and leisure facilities"
38
- WS_MU = "Meanwhile uses during long term redevelopments"
39
- WS_IO = "Creating inclusive employment and business opportunities through new developments"
40
- WS_HS = "Pollution, health and safety"
41
-
42
-
43
- class GreatPlacesPolicies(str, Enum):
44
- GP_PP = "People and place responsive design"
45
- GP_LC = "Protection and enhancement of landscape character"
46
- GP_GB = "Protection and enhancement of the Cambridge Green Belt"
47
- GP_QD = "Achieving high quality development"
48
- GP_QP = "Establishing high quality landscape and public realm"
49
- GP_HA = "Conservation and enhancement of heritage assets"
50
- GP_CC = "Adapting heritage assets to climate change"
51
- GP_PH = "Protection of public houses"
52
-
53
-
54
- class JobsPolicies(str, Enum):
55
- J_NE = "New employment and development proposals"
56
- J_RE = "Supporting the rural economy"
57
- J_AL = "Protecting the best agricultural land"
58
- J_PB = "Protecting existing business space"
59
- J_RW = "Enabling remote working"
60
- J_AW = "Affordable workspace and creative industries"
61
- J_EP = "Supporting a range of facilities in employment parks"
62
- J_RC = "Retail and centres"
63
- J_VA = "Visitor accommodation, attractions and facilities"
64
- J_FD = "Faculty development and specialist / language schools"
65
-
66
-
67
- class HomesPolicies(str, Enum):
68
- H_AH = "Affordable housing"
69
- H_ES = "Exception sites for affordable housing"
70
- H_HM = "Housing mix"
71
- H_HD = "Housing density"
72
- H_GL = "Garden land and subdivision of existing plots"
73
- H_SS = "Residential space standards and accessible homes"
74
- H_SH = "Specialist housing and homes for older people"
75
- H_CB = "Self and custom build homes"
76
- H_BR = "Build to rent homes"
77
- H_MO = "Houses in multiple occupation (HMOs)"
78
- H_SA = "Student accommodation"
79
- H_DC = "Dwellings in the countryside"
80
- H_RM = "Residential moorings"
81
- H_RC = "Residential caravan sites"
82
- H_GT = "Gypsy and Traveller and Travelling Showpeople sites"
83
- H_CH = "Community-led housing"
84
-
85
-
86
- class InfrastructurePolicies(str, Enum):
87
- I_ST = "Sustainable transport and connectivity"
88
- I_EV = "Parking and electric vehicles"
89
- I_FD = "Freight and delivery consolidation"
90
- I_SI = "Safeguarding important infrastructure"
91
- I_AD = "Aviation development"
92
- I_EI = "Energy infrastructure masterplanning"
93
- I_ID = "Infrastructure and delivery"
94
- I_DI = "Digital infrastructure"
95
-
96
-
97
- THEME_TO_POLICY_GROUP = {
98
- Theme.climate: ClimatePolicies,
99
- Theme.biodiversity: BiodiversityPolicies,
100
- Theme.wellbeing: WellbeingPolicies,
101
- Theme.great_places: GreatPlacesPolicies,
102
- Theme.jobs: JobsPolicies,
103
- Theme.homes: HomesPolicies,
104
- Theme.infrastructure: InfrastructurePolicies,
105
  }
106
-
107
-
108
- class PolicyDetail(BaseModel):
109
- policy: str
110
- details: list[str]
111
-
112
-
113
- class PolicySelection(BaseModel):
114
- theme: Theme
115
- policies: list[PolicyDetail]
116
-
117
- @field_validator("policies", mode="before")
118
- @classmethod
119
- def validate_policies(cls, policies, info):
120
- """Ensure policies match the selected theme."""
121
- if not isinstance(policies, list):
122
- raise ValueError("Policies must be provided as a list.")
123
-
124
- theme = info.data.get("theme")
125
- if not theme:
126
- raise ValueError("Theme must be provided before validating policies.")
127
-
128
- allowed_policies = [p.value for p in THEME_TO_POLICY_GROUP[theme]]
129
- for policy in policies:
130
- if policy["policy"] not in allowed_policies:
131
- raise ValueError(
132
- f"Policy '{policy['policy']}' is not valid for theme '{theme.value}'."
133
- )
134
- return policies
 
1
+ THEMES_AND_POLICIES = {
2
+ "Climate Change": [
3
+ "Net zero carbon new buildings",
4
+ "Water efficiency in new developments",
5
+ "Designing for a changing climate",
6
+ "Flooding and integrated water management",
7
+ "Renewable energy projects and infrastructure",
8
+ "Reducing waste and supporting the circular economy",
9
+ "Supporting land-based carbon sequestration",
10
+ ],
11
+ "Biodiversity and Green Spaces": [
12
+ "Biodiversity and geodiversity",
13
+ "Green infrastructure",
14
+ "Improving Tree Canopy Cover and the Tree Population",
15
+ "River corridors",
16
+ "Protecting open spaces",
17
+ "Providing and enhancing open spaces",
18
+ ],
19
+ "Wellbeing and Social Inclusion": [
20
+ "Creating healthy new developments",
21
+ "Community, sports and leisure facilities",
22
+ "Meanwhile uses during long term redevelopments",
23
+ "Creating inclusive employment and business opportunities through new developments",
24
+ "Pollution, health and safety",
25
+ ],
26
+ "Great Places": [
27
+ "People and place responsive design",
28
+ "Protection and enhancement of landscape character",
29
+ "Protection and enhancement of the Cambridge Green Belt",
30
+ "Achieving high quality development",
31
+ "Establishing high quality landscape and public realm",
32
+ "Conservation and enhancement of heritage assets",
33
+ "Adapting heritage assets to climate change",
34
+ "Protection of public houses",
35
+ ],
36
+ "Jobs": [
37
+ "New employment and development proposals",
38
+ "Supporting the rural economy",
39
+ "Protecting the best agricultural land",
40
+ "Protecting existing business space",
41
+ "Enabling remote working",
42
+ "Affordable workspace and creative industries",
43
+ "Supporting a range of facilities in employment parks",
44
+ "Retail and centres",
45
+ "Visitor accommodation, attractions and facilities",
46
+ "Faculty development and specialist / language schools",
47
+ ],
48
+ "Homes": [
49
+ "Affordable housing",
50
+ "Exception sites for affordable housing",
51
+ "Housing mix",
52
+ "Housing density",
53
+ "Garden land and subdivision of existing plots",
54
+ "Residential space standards and accessible homes",
55
+ "Specialist housing and homes for older people",
56
+ "Self and custom build homes",
57
+ "Build to rent homes",
58
+ "Houses in multiple occupation (HMOs)",
59
+ "Student accommodation",
60
+ "Dwellings in the countryside",
61
+ "Residential moorings",
62
+ "Residential caravan sites",
63
+ "Gypsy and Traveller and Travelling Showpeople sites",
64
+ "Community-led housing",
65
+ ],
66
+ "Infrastructure": [
67
+ "Sustainable transport and connectivity",
68
+ "Parking and electric vehicles",
69
+ "Freight and delivery consolidation",
70
+ "Safeguarding important infrastructure",
71
+ "Aviation development",
72
+ "Energy infrastructure masterplanning",
73
+ "Infrastructure and delivery",
74
+ "Digital infrastructure",
75
+ ],
 
76
  }
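Because policies are now selected automatically, `THEMES_AND_POLICIES` can also serve as a validity check on model output. A minimal sketch; the function name is illustrative and not part of this commit:

```python
def invalid_policies(theme: str, policies: list[str]) -> list[str]:
    """Return the selected policies that do not belong to the given theme."""
    allowed = set(THEMES_AND_POLICIES.get(theme, []))
    return [p for p in policies if p not in allowed]

# Example:
# invalid_policies("Homes", ["Affordable housing", "Digital infrastructure"])
# -> ["Digital infrastructure"]
```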