update to auto select policies
- planning_ai/chains/fix_chain.py +8 -7
- planning_ai/chains/map_chain.py +64 -73
- planning_ai/chains/policy_chain.py +26 -0
- planning_ai/chains/prompts/map.txt +2 -38
- planning_ai/chains/prompts/ocr.txt +1 -1
- planning_ai/chains/prompts/policy.txt +7 -0
- planning_ai/chains/prompts/reduce.txt +4 -36
- planning_ai/chains/prompts/themes.txt +0 -82
- planning_ai/common/utils.py +20 -1
- planning_ai/graph.py +19 -5
- planning_ai/main.py +125 -112
- planning_ai/nodes/hallucination_node.py +62 -39
- planning_ai/nodes/map_node.py +102 -48
- planning_ai/nodes/reduce_node.py +111 -10
- planning_ai/preprocessing/gcpt3.py +47 -35
- planning_ai/preprocessing/process_pdfs.py +21 -23
- planning_ai/report.py +30 -0
- planning_ai/retrievers/theme_retriever.py +84 -0
- planning_ai/states.py +44 -32
- planning_ai/themes.py +75 -133
planning_ai/chains/fix_chain.py (CHANGED)

@@ -1,13 +1,13 @@
 from langchain_core.prompts import ChatPromptTemplate
 
-from planning_ai.chains.map_chain import …
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.common.utils import Paths
 
-with open(Paths.PROMPTS / "…
-…
+with open(Paths.PROMPTS / "themes.txt", "r") as f:
+    themes_txt = f.read()
 
+with open(Paths.PROMPTS / "fix_hallucination.txt", "r") as f:
+    fix_template = f"{themes_txt}\n\n {f.read()}"
 
 if __name__ == "__main__":
     test_document = """
@@ -16,7 +16,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
-    …
+    test_themes = {"Great Places", "Homes", "Climate Change"}
+    fix_chain = create_dynamic_map_chain(test_themes, fix_template)
     result = fix_chain.invoke(
         {
             "summary": "This plan is great because they are building a nuclear power plant.",
@@ -24,4 +25,4 @@ if __name__ == "__main__":
             "context": test_document,
         }
     )
-    …
+    __import__("pprint").pprint(dict(result))
planning_ai/chains/map_chain.py (CHANGED)

@@ -1,96 +1,85 @@
-from enum import Enum
+from enum import Enum, auto
+from typing import Optional, Set, Type
 
-from langchain.…
+from langchain.schema import BaseCache
 from langchain_core.prompts import ChatPromptTemplate
-from …
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, create_model
 
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
-from planning_ai.themes import …
+from planning_ai.themes import THEMES_AND_POLICIES
 
-with open(Paths.PROMPTS / "themes.txt", "r") as f:
-    …
+# with open(Paths.PROMPTS / "themes.txt", "r") as f:
+#     themes_txt = f.read()
 
 with open(Paths.PROMPTS / "map.txt", "r") as f:
-    map_template = f"{themes_txt}\n\n {f.read()}"
+    # map_template = f"{themes_txt}\n\n {f.read()}"
+    map_template = f.read()
 
 
-…
-    place: str = Field(
-        ...,
-        description=(
-            "The name of the geographical location mentioned in the response. "
-            "This can be a city, town, region, or any identifiable place."
-        ),
-    )
-    sentiment: Sentiment = Field(
-        ...,
-        description=(
-            "The sentiment associated with the mentioned place, categorized as 'positive', 'negative', or 'neutral'. "
-            "Assess sentiment based on the context in which the place is mentioned, considering both positive and negative connotations."
-        ),
-    )
-…
-        description=(
-            "A list of themes associated with the response. Themes are overarching topics or "
-            "categories that the response addresses, such as 'Climate change' or 'Infrastructure'. "
-            "Identify themes based on the content and context of the response."
-        ),
-    )
-    policies: list[PolicySelection] = Field(
-        ...,
-        description=(
-            "A list of policies associated with the response, each accompanied by directly related "
-            "information as bullet points. Bullet points should provide specific details or examples "
-            "that illustrate how the policy is relevant to the response."
-        ),
-    )
-    places: list[Place] = Field(
-        ...,
-        description=(
-            "All places mentioned in the response, with the sentiment categorized as 'positive', 'negative', or 'neutral'. "
-            "A place can be a city, region, or any geographical location. Assess sentiment based on the context "
-            "in which the place is mentioned, considering both positive and negative connotations."
-        ),
-    )
-…
-# what themes are already identified (should improve accuracy)
-map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
-map_chain = map_prompt | SLLM
+def create_policy_enum(
+    policy_groups: Set[str], name: str = "DynamicPolicyEnum"
+) -> Enum:
+    """
+    Create a dynamic enum for policies based on the given policy groups.
+
+    Args:
+        policy_groups (Set[str]): A set of policy group names.
+        name (str): Name of the enum to be created.
+
+    Returns:
+        Type[Enum]: A dynamically created Enum class for the policies.
+    """
+    return Enum(name, {policy: auto() for policy in policy_groups})
+
+
+def create_brief_summary_model(policy_enum: Enum) -> Type[BaseModel]:
+    """
+    Dynamically create a BriefSummary model using the provided policy enum.
+
+    Args:
+        policy_enum (Type[Enum]): The dynamically created policy enum.
+
+    Returns:
+        Type[BaseModel]: A dynamically generated Pydantic model for BriefSummary.
+    """
+
+    DynamicPolicy = create_model(
+        "DynamicPolicy",
+        policy=(policy_enum, ...),
+        note=(str, ...),
+        __config__={"extra": "forbid"},
+    )
+
+    return create_model(
+        "DynamicBriefSummary",
+        summary=(str, ...),
+        policies=(Optional[list[DynamicPolicy]], ...),
+        __module__=__name__,
+        __config__={"extra": "forbid"},
+    )
+
+
+def create_dynamic_map_chain(themes, prompt: str):
+    policy_groups = set()
+    for theme in themes:
+        if theme in THEMES_AND_POLICIES:
+            policy_groups.update(THEMES_AND_POLICIES[theme])
+
+    PolicyEnum = create_policy_enum(policy_groups)
+    DynamicBriefSummary = create_brief_summary_model(PolicyEnum)
+
+    SLLM = LLM.with_structured_output(DynamicBriefSummary, strict=True)
+
+    prompt = (
+        f"{prompt}\n\nAvailable Policies:\n\n- "
+        + "\n- ".join(policy_groups)
+        + "\n\nContext:\n\n{context}"
+    )
+    map_prompt = ChatPromptTemplate.from_messages([("system", prompt)])
+    map_chain = map_prompt | SLLM
+
+    return map_chain
 
 
 if __name__ == "__main__":
@@ -100,6 +89,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
-    …
+    test_themes = {"Great Places", "Homes"}
 
+    dynamic_map_chain = create_dynamic_map_chain(test_themes, prompt=map_template)
+    result = dynamic_map_chain.invoke({"context": test_document, "themes": test_themes})
     __import__("pprint").pprint(dict(result))
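The factory above builds a fresh schema per document: an Enum of the permitted policy names, a `DynamicPolicy` wrapper, and a `DynamicBriefSummary` container that is bound to the LLM as structured output. The model-building half can be exercised without any LLM. A minimal sketch, with illustrative policy names standing in for the `THEMES_AND_POLICIES` lookup:

```python
from enum import Enum, auto
from typing import Optional

from pydantic import create_model

# Illustrative policy names; in the repository these come from THEMES_AND_POLICIES.
policy_groups = {"Affordable housing", "Green infrastructure"}

# Build the Enum at runtime, as create_policy_enum does.
PolicyEnum = Enum("DynamicPolicyEnum", {p: auto() for p in policy_groups})

# Build the nested models at runtime, mirroring create_brief_summary_model.
DynamicPolicy = create_model(
    "DynamicPolicy",
    policy=(PolicyEnum, ...),
    note=(str, ...),
)
DynamicBriefSummary = create_model(
    "DynamicBriefSummary",
    summary=(str, ...),
    policies=(Optional[list[DynamicPolicy]], ...),
)

# Validate a payload against the generated schema.
parsed = DynamicBriefSummary(
    summary="Supports more affordable homes near the green corridor.",
    policies=[
        DynamicPolicy(
            policy=PolicyEnum["Affordable housing"],
            note="Requests a higher affordable quota.",
        )
    ],
)
print(parsed.policies[0].policy.name)  # -> "Affordable housing"
```

Because the Enum is rebuilt per request, the structured-output schema only ever admits policies that belong to the retrieved themes, so an invented policy name becomes a validation error rather than a prompt-compliance problem.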
planning_ai/chains/policy_chain.py (ADDED)

@@ -0,0 +1,26 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+from planning_ai.common.utils import Paths
+from planning_ai.llms.llm import LLM
+
+with open(Paths.PROMPTS / "policy.txt", "r") as f:
+    policy_template = f.read()
+
+
+policy_prompt = ChatPromptTemplate([("system", policy_template)])
+policy_chain = policy_prompt | LLM | StrOutputParser()
+
+
+if __name__ == "__main__":
+    test_policy = "Protecting open spaces"
+    test_bullet = "* " + "\n* ".join(
+        [
+            "The response emphasizes the need to preserve greenfield land, which relates to protecting open spaces.",
+            "The response notes that greenspace land should be preserved."
+            "The response emphasizes the need for creating more parks, which relates to protecting open spaces.",
+        ]
+    )
+
+    result = policy_chain.invoke({"policy": test_policy, "bullet_points": test_bullet})
+    print(result)
planning_ai/chains/prompts/map.txt (CHANGED)

@@ -1,40 +1,4 @@
-…
+Read the following response to a planning application, first summarise the response, then identify relevant 'policies' from any given. For each policy, list at least one section of the response that is related. Do **not** invent new policies. You **must** return valid JSON in the format given.
 
-…
-2. **Themes:** List the council's themes discussed in the response.
-3. **Policies:** Identify relevant policies associated with the extracted themes.
-4. **Places:** Mention any geographical locations considered by the author.
-5. **Constructiveness:** Indicate whether the response is constructive. A response is constructive if it provides any feedback or commentary on the plan, regardless of its depth or specificity.
+Choose from the following list given, by name **only**:
 
-**Few-shot examples for reference:**
-
----
-
-**Example 1:**
-
-Response:
-"I am in favour of this new park development as it will provide much-needed green space for families. However, the parking situation needs to be reconsidered."
-
-- **Summary:** The author supports the park development for its benefit to families but expresses concern about parking.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** None
-- **Constructiveness:** True
-
----
-
-**Example 2:**
-
-Response:
-"This development in Cambridge will destroy local wildlife and create traffic chaos. It should not go ahead."
-
-- **Summary:** The author opposes the development due to concerns about wildlife and traffic congestion.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** Cambridge
-- **Constructiveness:** True
-
----
-
-**Now summarise the following response in British English:**
-
-Response:
-{context}
planning_ai/chains/prompts/ocr.txt (CHANGED)

@@ -2,7 +2,7 @@ The images provided are from a planning response form filled out by a member of
 
 Please follow these instructions to process the images:
 
-1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
+1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses. If there is no response, state **nothing**.
 2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
 3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
 4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
planning_ai/chains/prompts/policy.txt (ADDED)

@@ -0,0 +1,7 @@
+The following is a list of bullet points relating to a particular planning policy. Rewrite the bullet points to focus only on the key action or idea, excluding additional context (like the name of the policy). If multiple bullet points share the same concept, combine them together.
+
+Policy: {policy}
+
+Bullet Points:
+
+{bullet_points}
planning_ai/chains/prompts/reduce.txt (CHANGED)

@@ -1,41 +1,9 @@
-The following …
+The following contains summaries of public responses to a new plan proposed by the South Cambridgeshire Council:
 
 {context}
 
-…
+As a representative of the Cambridgeshire Council, your task is to craft a **comprehensive and articulate executive summary**. This summary will serve as the introductory section of a major report, highlighting the key themes and concerns raised in the public responses. Ensure that the summary is clear, concise, and professional, reflecting the tone and standards expected in official council documents. **Do not add, infer, or create information.** Use only the content explicitly mentioned in the summaries. Adhere to British English conventions.
 
-Each …
-
-**Guidelines**:
-
-- Provide an **extended, one page, balanced summary** of the key themes at the beginning, capturing the overall sentiment and notable trends.
-- In the 'Key points' sections, **group points by aim only if that aim is explicitly mentioned**.
-- Be sure to include specific, concise points that reflect the underlying concerns or support expressed by respondents.
-- Do **not** include information, assumptions, or summaries of aims that were not explicitly mentioned in the responses.
-
-**Format**:
-
-## Summary
-
-<Provide an extended, comprehensive overview of all the main themes. Mention key concerns, positive feedback, and overall trends.>
-
-## Key points raised in support
-
-For each key point raised in support, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
-
-## Key points raised in opposition
-
-For each key point raised in opposition, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
+Each time you make a reference to a response document, please add an inline citation which corresponds with the documents numerical ID. For example 'Concerns regarding the impact of increased housing density on the character of Cambridge were prevalent [1][2][11].'.
+
+## Executive Summary
planning_ai/chains/prompts/themes.txt (CHANGED)

@@ -1,82 +0,0 @@
-The following themes are proposed by the South Cambridgeshire Council with each of their associated policies.
-
-# Climate change
-
-Net zero carbon new buildings
-Water efficiency in new developments
-Designing for a changing climate
-Flooding and integrated water management
-Renewable energy projects and infrastructure
-Reducing waste and supporting the circular economy
-Supporting land-based carbon sequestration
-
-# Biodiversity and green spaces
-
-Biodiversity and geodiversity
-Green infrastructure
-Improving Tree Canopy Cover and the Tree Population
-River corridors
-Protecting open spaces
-Providing and enhancing open spaces
-
-# Wellbeing and social inclusion
-
-Creating healthy new developments
-Community, sports and leisure facilities
-Meanwhile uses during long term redevelopments
-Creating inclusive employment and business opportunities through new developments
-Pollution, health and safety
-
-# Great places
-
-People and place responsive design
-Protection and enhancement of landscape character
-Protection and enhancement of the Cambridge Green Belt
-Achieving high quality development
-Establishing high quality landscape and public realm
-Conservation and enhancement of heritage assets
-Adapting heritage assets to climate change
-Protection of public houses
-
-# Jobs
-
-New employment and development proposals
-Supporting the rural economy
-Protecting the best agricultural land
-Protecting existing business space
-Enabling remote working
-Affordable workspace and creative industries
-Supporting a range of facilities in employment parks
-Retail and centres
-Visitor accommodation, attractions and facilities
-Faculty development and specialist / language schools
-
-# Homes
-
-Affordable housing
-Exception sites for affordable housing
-Housing mix
-Housing density
-Garden land and subdivision of existing plots
-Residential space standards and accessible homes
-Specialist housing and homes for older people
-Self and custom build homes
-Build to rent homes
-Houses in multiple occupation (HMOs)
-Student accommodation
-Dwellings in the countryside
-Residential moorings
-Residential caravan sites
-Gypsy and Traveller and Travelling Showpeople sites
-Community-led housing
-
-# Infrastructure
-
-Sustainable transport and connectivity
-Parking and electric vehicles
-Freight and delivery consolidation
-Safeguarding important infrastructure
-Aviation development
-Energy infrastructure masterplanning
-Infrastructure and delivery
-Digital infrastructure
planning_ai/common/utils.py (CHANGED)

@@ -11,6 +11,18 @@ pl.Config(
 )
 
 
+def filename_reducer(docs_a, docs_b):
+    if docs_a == []:
+        return docs_b
+    b_dict = {d["filename"]: d for d in docs_b}
+
+    for i, dict_a in enumerate(docs_a):
+        filename = dict_a.get("filename")
+        if filename in b_dict:
+            docs_a[i] = b_dict[filename]
+    return docs_a
+
+
 class Paths:
     DATA = Path("data")
 
@@ -25,7 +37,14 @@ class Paths:
 
     @classmethod
     def ensure_directories_exist(cls):
-        for path in […]:
+        for path in [
+            cls.DATA,
+            cls.RAW,
+            cls.STAGING,
+            cls.OUT,
+            cls.SUMMARY,
+            cls.SUMMARIES,
+        ]:
             path.mkdir(parents=True, exist_ok=True)
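`filename_reducer` merges two lists of per-document states by `filename`: entries in `docs_b` overwrite same-named entries in `docs_a`, an empty `docs_a` is replaced wholesale, and `docs_b` entries with no match are ignored. In LangGraph this style of function is typically attached to a state key with `Annotated` so that parallel branches fan back in deterministically. A sketch under that assumption (the state shape here is illustrative, not the repository's):

```python
from typing import Annotated, TypedDict

def filename_reducer(docs_a, docs_b):
    # Empty left side: take the new list wholesale.
    if docs_a == []:
        return docs_b
    # Later writes override earlier entries that share a filename.
    b_dict = {d["filename"]: d for d in docs_b}
    for i, dict_a in enumerate(docs_a):
        if dict_a.get("filename") in b_dict:
            docs_a[i] = b_dict[dict_a["filename"]]
    return docs_a

class OverallState(TypedDict):
    # Each parallel node returns {"documents": [...]}; the framework calls
    # the reducer to fold those partial lists into one.
    documents: Annotated[list[dict], filename_reducer]

merged = filename_reducer(
    [{"filename": "a.txt", "iteration": 1}, {"filename": "b.txt", "iteration": 1}],
    [{"filename": "b.txt", "iteration": 2}],
)
print(merged)  # the b.txt entry is replaced by its iteration-2 version
```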
planning_ai/graph.py (CHANGED)

@@ -7,7 +7,13 @@ from planning_ai.nodes.hallucination_node import (
     map_fix_hallucinations,
     map_hallucinations,
 )
-from planning_ai.nodes.map_node import …
+from planning_ai.nodes.map_node import (
+    add_entities,
+    generate_summary,
+    map_retrieve_themes,
+    map_summaries,
+    retrieve_themes,
+)
 from planning_ai.nodes.reduce_node import generate_final_summary
 from planning_ai.states import OverallState
 
@@ -25,13 +31,21 @@ def create_graph():
         StateGraph: The compiled state graph ready for execution.
     """
     graph = StateGraph(OverallState)
+    graph.add_node("add_entities", add_entities)
+    graph.add_node("retrieve_themes", retrieve_themes)
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
-    …
+    graph.add_node("generate_final_summary", generate_final_summary)
 
+    graph.add_edge(START, "add_entities")
+    graph.add_conditional_edges(
+        "add_entities",
+        map_retrieve_themes,
+        ["retrieve_themes"],
+    )
     graph.add_conditional_edges(
-        …
+        "retrieve_themes",
         map_summaries,
         ["generate_summary"],
     )
@@ -51,7 +65,7 @@ def create_graph():
         ["check_hallucination"],
     )
 
-    …
-    …
+    graph.add_edge("check_hallucination", "generate_final_summary")
+    graph.add_edge("generate_final_summary", END)
 
     return graph.compile()
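The conditional edges routed through `map_retrieve_themes` and `map_summaries` use LangGraph's `Send` fan-out: the routing function returns one `Send` per document, so the target node runs once per document in parallel and the results fold back in via the state reducers. A self-contained sketch of the pattern, with toy node names rather than the repository's:

```python
import operator
from typing import Annotated, TypedDict

from langgraph.graph import END, START, StateGraph
from langgraph.types import Send

class State(TypedDict):
    items: list[str]
    results: Annotated[list[str], operator.add]  # fan-in by concatenation

def fan_out(state: State):
    # One Send per item: each becomes an independent "work" invocation.
    return [Send("work", {"item": it}) for it in state["items"]]

def work(state: dict):
    return {"results": [state["item"].upper()]}

graph = StateGraph(State)
graph.add_node("work", work)
graph.add_conditional_edges(START, fan_out, ["work"])
graph.add_edge("work", END)
app = graph.compile()
print(app.invoke({"items": ["a", "b"], "results": []}))
# -> {'items': ['a', 'b'], 'results': ['A', 'B']}
```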
planning_ai/main.py (CHANGED)

@@ -1,18 +1,28 @@
+import logging
 import os
+import re
 import time
 from collections import Counter
+from itertools import groupby
 from pathlib import Path
 
-import geopandas as gpd
+# import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
-from langchain_community.document_loaders import …
-…
+from langchain_community.document_loaders import (
+    DirectoryLoader,
+    PolarsDataFrameLoader,
+    TextLoader,
+)
+from langchain_text_splitters import CharacterTextSplitter, markdown
 
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
+from planning_ai.themes import THEMES_AND_POLICIES
+
+# from opencage.geocoder import OpenCageGeocode
+
 
 load_dotenv()
 
@@ -64,81 +74,72 @@ def map_locations(places_df: pl.DataFrame):
 
 def build_quarto_doc(doc_title, out):
     final = out["generate_final_summary"]
-    executive_summary = (
-        final["final_summary"].split("## Key points raised in support")[0].strip()
-    )
-    key_points = final["final_summary"].split("## Key points raised in support")[1]
-
-    aims = []
-    for summary in final["summaries_fixed"]:
-        aim = summary["summary"].aims
-        aims.extend(aim)
-
-    value_counts = Counter(aims)
-    total_values = sum(value_counts.values())
-    percentages = {
-        key: {"count": count, "percentage": (count / total_values)}
-        for key, count in value_counts.items()
-    }
-    top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
-        :5
-    ]
-    thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
-    thematic_breakdown += "\n".join(
-        [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
-    )
-
-    places_df = (
-        pl.DataFrame(
-            [
-                place.dict()
-                for summary in final["summaries_fixed"]
-                for place in summary["summary"].places
-            ]
-        )
-        .group_by("place")
-        .agg(
-            pl.col("place").len().alias("Count"),
-            pl.col("sentiment").mean().alias("Mean Sentiment"),
-        )
-        .rename({"place": "Place"})
-    )
-
-    map_locations(places_df)
-    …
+    # value_counts = Counter(aims)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
+    #     :5
+    # ]
+    # thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
+    # thematic_breakdown += "\n".join(
+    #     [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
+    # )
+    #
+    # places_df = (
+    #     pl.DataFrame(
+    #         [
+    #             place.dict()
+    #             for summary in final["summaries_fixed"]
+    #             for place in summary["summary"].places
+    #         ]
+    #     )
+    #     .group_by("place")
+    #     .agg(
+    #         pl.col("place").len().alias("Count"),
+    #         pl.col("sentiment").mean().alias("Mean Sentiment"),
+    #     )
+    #     .rename({"place": "Place"})
+    # )
+    #
+    # map_locations(places_df)
+    #
+    # places_breakdown = (
+    #     places_df.sort("Count", descending=True)
+    #     .head()
+    #     .to_pandas()
+    #     .to_markdown(index=False)
+    # )
+    #
+    # stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
+    # value_counts = Counter(stances)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # stances_top = sorted(
+    #     percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
+    # )
+    # stances_breakdown = " | ".join(
+    #     [
+    #         f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
+    #         for item, stance in stances_top
+    #     ]
+    # )
+    #
+    # short_summaries = "\n\n".join(
+    #     [
+    #         f"#### **TODO**\n"
+    #         f"{summary['summary'].summary}\n\n"
+    #         f"**Stance**: {summary['summary'].stance}\n\n"
+    #         f"**Constructiveness**: {summary['summary'].rating}\n\n"
+    #         for summary in final["summaries_fixed"]
+    #     ]
+    # )
 
     quarto_doc = (
         "---\n"
@@ -153,53 +154,63 @@ def build_quarto_doc(doc_title, out):
         "monofontoptions:\n"
         "  - Scale=0.55\n"
         "---\n\n"
-        f"{…}\n\n"
-        f"{…}"
-        …
+        f"{final['final_summary']}\n\n"
+        f"{final['policies']}"
+        # f"{executive_summary}\n\n"
+        # f"{stances_breakdown}\n\n"
+        # "## Aim Breakdown\n\n"
+        # "The aim breakdown identifies which aims are mentioned "
+        # "within each response. "
+        # "A single response may discuss multiple topics.\n"
+        # f"\n\n{thematic_breakdown}\n\n"
+        # f"\n\n{places_breakdown}\n\n"
+        # f"\n\n"
+        # "## Key points raised in support\n\n"
+        # f"{key_points}\n\n"
+        # "## Summaries\n"
+        # f"{short_summaries}"
     )
 
     with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
         f.write(quarto_doc)
 
 
-def …
-    …
+def read_docs():
+    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
+    df = df.filter(
+        pl.col("representations_document") == "Local Plan Issues and Options Report"
+    ).unique("id")
+    loader = PolarsDataFrameLoader(df, page_content_column="text")
+
+    docs = list(
+        {
+            doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
+            for doc in loader.load()
+            if doc.page_content and len(doc.page_content.split(" ")) > 5
+        }.values()
     )
+    return docs
+
+
+def main():
+    docs = read_docs()
+    n_docs = len(docs)
+
+    logging.warning(f"{n_docs} documents being processed!")
+
+    # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    #     chunk_size=10_240, chunk_overlap=0
+    # )
+    # split_docs = text_splitter.split_documents(docs)
 
     app = create_graph()
 
     step = None
-    for step in app.stream(
-        {
-            "documents": [doc.page_content for doc in split_docs],
-            "filenames": [Path(doc.metadata["source"]) for doc in split_docs],
-        }
-    ):
-        print(list(step.keys()))
+    for step in app.stream({"documents": docs, "n_docs": n_docs}):
+        print(step.keys())
 
     if step is None:
         raise ValueError("No steps were processed!")
     return step
 
 
@@ -208,7 +219,9 @@ if __name__ == "__main__":
 
     tic = time.time()
     out = main()
-    …
+    build_quarto_doc(doc_title, out)
+    print(out["generate_final_summary"]["final_summary"])
+
    toc = time.time()
 
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
planning_ai/nodes/hallucination_node.py (CHANGED)

@@ -1,12 +1,31 @@
-…
+import json
+import logging
 
-from …
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
+from pydantic import BaseModel
+
+from planning_ai.chains.fix_chain import fix_template
 from planning_ai.chains.hallucination_chain import (
     HallucinationChecker,
     hallucination_chain,
 )
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.states import DocumentState, OverallState
 
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
+ITERATIONS = 2
+
 
 def check_hallucination(state: DocumentState):
     """Checks for hallucinations in the summary of a document.
@@ -23,25 +42,29 @@ def check_hallucination(state: DocumentState):
         dict: A dictionary containing either a list of fixed summaries or hallucinations
             that need to be addressed.
     """
-    …
-    …
+    logger.warning(f"Checking hallucinations for document {state['filename']}")
+    # Stop trying after 2 iterations
+    if state["iteration"] > ITERATIONS:
+        state["iteration"] = 99
+        state["hallucination"].score = 1
+        return {"documents": [state]}
+
+    try:
+        response = hallucination_chain.invoke(
+            {"document": state["document"], "summary": state["summary"].summary}
+        )
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
     if response.score == 1:
-        return {"…
+        return {"documents": [{**state, "hallucination": response}]}
 
     return {
-        "…
-            {
-                "hallucination": response,
-                "document": state["document"],
-                "filename": state["filename"],
-                "summary": state["summary"],
-                "iteration": state["iteration"] + 1,
-            }
+        "documents": [
+            {**state, "hallucination": response, "iteration": state["iteration"] + 1}
         ]
     }
 
@@ -60,7 +83,7 @@ def map_hallucinations(state: OverallState):
         list: A list of Send objects directing each summary to the check_hallucination
             function.
     """
-    return [Send("check_hallucination", …
+    return [Send("check_hallucination", document) for document in state["documents"]]
 
 
 def fix_hallucination(state: DocumentState):
@@ -77,24 +100,24 @@ def fix_hallucination(state: DocumentState):
         dict: A dictionary containing the updated summaries after attempting to fix
             hallucinations.
     """
-    …
-    …
+    logger.warning(f"Fixing hallucinations for document {state['filename']}")
+    fix_chain = create_dynamic_map_chain(state["themes"], fix_template)
+    try:
+        response = fix_chain.invoke(
             {
-                "explanation": state["hallucination"],
+                "context": state["document"],
+                "summary": state["summary"].summary,
+                "explanation": state["hallucination"].explanation,
             }
         )
-    state["summary"] = response  # type: ignore
-    return {
-        "summaries": [
-            {
-                …
-                "iteration": state["iteration"],
-            }
-        …
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+    state["summary"] = response  # type: ignore
+    return {"documents": [state]}
 
 
 def map_fix_hallucinations(state: OverallState):
@@ -112,11 +135,11 @@ def map_fix_hallucinations(state: OverallState):
         fix_hallucination function.
     """
     hallucinations = []
-    if "…
+    if "documents" in state:
         hallucinations = [
-            …
-            for …
-            if …
+            document
+            for document in state["documents"]
+            if document["hallucination"].score != 1
         ]
     return [
         Send("fix_hallucination", hallucination) for hallucination in hallucinations
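The retry logic above is worth isolating: a document gets at most `ITERATIONS` correction attempts, after which it is force-accepted (`score = 1`) and branded with the sentinel `iteration = 99` so downstream filtering can discard it. A stripped-down sketch of that control flow, with the LLM checker stubbed out:

```python
ITERATIONS = 2

def check(doc: dict, run_checker) -> dict:
    # Give up after ITERATIONS attempts: mark accepted but flag with 99.
    if doc["iteration"] > ITERATIONS:
        return {**doc, "iteration": 99, "score": 1}
    score = run_checker(doc["summary"])  # 1 = faithful, 0 = hallucinated
    if score == 1:
        return {**doc, "score": 1}
    # Failed check: bump the iteration so the fix pass runs again.
    return {**doc, "score": 0, "iteration": doc["iteration"] + 1}

doc = {"summary": "…", "iteration": 1}
print(check(doc, run_checker=lambda s: 0))  # fails -> iteration bumped to 2
```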
planning_ai/nodes/map_node.py (CHANGED)

@@ -1,16 +1,89 @@
 import json
+import logging
 from pathlib import Path
+from typing import TypedDict
 
-…
+import spacy
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
+from pydantic import BaseModel, ValidationError
 
-from planning_ai.chains.…
+from planning_ai.chains.hallucination_chain import HallucinationChecker
+from planning_ai.chains.map_chain import create_dynamic_map_chain, map_template
 from planning_ai.common.utils import Paths
+from planning_ai.retrievers.theme_retriever import grade_chain, theme_retriever
 from planning_ai.states import DocumentState, OverallState
 
-…
+logging.basicConfig(
+    level=logging.WARN, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
 analyzer = AnalyzerEngine()
+anonymizer = AnonymizerEngine()
+
+nlp = spacy.load("en_core_web_lg")
+
+
+def retrieve_themes(state: DocumentState) -> dict:
+    theme_documents = theme_retriever.invoke(input=state["document"].page_content)
+
+    # TODO: add something similar but more efficient?
+    grade_scores = []
+    for doc in theme_documents:
+        try:
+            score = grade_chain.invoke(
+                {
+                    "context": doc.page_content,
+                    "document": state["document"].page_content,
+                }
+            ).binary_score
+        except (OutputParserException, json.JSONDecodeError) as e:
+            logger.error(f"Failed to decode JSON: {e}.\n Setting to 'no'")
+            score = "no"
+        grade_scores.append(score)
+
+    theme_documents = [
+        doc for doc, include in zip(theme_documents, grade_scores) if include == "yes"
+    ]
+
+    # TODO: Add metadata to this as string?
+    theme_documents_text = "\n\n".join([d.page_content for d in theme_documents])
+
+    # state["document"].page_content = (
+    #     f"{state['document'].page_content}\n\n"
+    #     f"Related Information:\n\n{theme_documents_text}"
+    # )
+    state["theme_docs"] = theme_documents
+    state["themes"] = {doc.metadata["theme"] for doc in theme_documents}
+
+    logger.warning(f"Retrieved relevant theme documents for: {state['filename']}")
+    return {"documents": [state]}
+
+
+def map_retrieve_themes(state: OverallState) -> list[Send]:
+    logger.warning("Mapping documents to retrieve themes.")
+    return [Send("retrieve_themes", document) for document in state["documents"]]
+
+
+def add_entities(state: OverallState) -> OverallState:
+    for idx, document in enumerate(
+        nlp.pipe(
+            [doc["document"].page_content for doc in state["documents"]],
+        )
+    ):
+        state["documents"][idx]["entities"] = [
+            {"entity": ent.text, "label": ent.label_} for ent in document.ents
+        ]
+    return state
 
 
 def remove_pii(document: str) -> str:
@@ -25,12 +98,14 @@ def remove_pii(document: str) -> str:
     Returns:
         str: The document text with PII anonymized.
     """
+    logger.warning("Starting PII removal.")
     results = analyzer.analyze(
         text=document,
         entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
         language="en",
     )
-    document = anonymizer.anonymize(text=document, analyzer_results=results)
+    document = anonymizer.anonymize(text=document, analyzer_results=results).text
+    logger.warning("PII removal completed.")
     return document
 
 
@@ -47,43 +122,27 @@ def generate_summary(state: DocumentState) -> dict:
     Returns:
         dict: A dictionary containing the generated summary and updated document state.
     """
-    …
-            "policies": out_policies,
-            "places": out_places,
-        }
-
-    outfile = f"{Path(state["filename"]).stem}_summary.json"
-    with open(Paths.SUMMARIES / outfile, "w") as file:
-        json.dump(save_output, file, indent=4)
-
-    output = {
-        "summary": response,
-        "document": state["document"],
-        "filename": str(state["filename"]),
-        "iteration": 1,
-    }
-
-    return {"summaries": [output]}
+    logger.warning(f"Generating summary for document: {state['filename']}")
+
+    state["document"].page_content = remove_pii(state["document"].page_content)
+    if not state["themes"]:
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    map_chain = create_dynamic_map_chain(themes=state["themes"], prompt=map_template)
+    try:
+        response = map_chain.invoke({"context": state["document"].page_content})
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    logger.warning(f"Summary generation completed for document: {state['filename']}")
+    return {"documents": [{**state, "summary": response, "iteration": 1}]}
 
 
 def map_summaries(state: OverallState) -> list[Send]:
@@ -99,10 +158,5 @@ def map_summaries(state: OverallState) -> list[Send]:
     list: A list of Send objects directing each document to the `generate_summary`
         function.
     """
-    …
-        …
-            "generate_summary",
-            {"document": document, "filename": filename},
-        )
-        for document, filename in zip(state["documents"], state["filenames"])
-    ]
+    logger.warning("Mapping documents to generate summaries.")
+    return [Send("generate_summary", document) for document in state["documents"]]
planning_ai/nodes/reduce_node.py (CHANGED)

@@ -1,5 +1,36 @@
+import json
+import logging
+from pathlib import Path
+
+import polars as pl
+
+from planning_ai.chains.policy_chain import policy_chain
 from planning_ai.chains.reduce_chain import reduce_chain
 from planning_ai.states import OverallState
+from planning_ai.themes import THEMES_AND_POLICIES
+
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+# TODO: Reduce down the grouping of policies. I.e. combine points that are closely related and add citations.
+def markdown_bullets(summaries):
+    policies = {"themes": [], "policies": [], "details": []}
+    for summary in summaries:
+        if not summary["summary"].policies:
+            continue
+        for policy in summary["summary"].policies:
+            for theme, p in THEMES_AND_POLICIES.items():
+                if policy.policy.name in p:
+                    policies["themes"].append(theme)
+                    policies["policies"].append(policy.policy.name)
+                    policies["details"].append(policy.note)
+    df = pl.DataFrame(policies)
+
+    grouped = df.group_by(["themes", "policies"]).agg(pl.col("details"))
+    return grouped
 
 
 def generate_final_summary(state: OverallState):
@@ -18,17 +49,87 @@ def generate_final_summary(state: OverallState):
         dict: A dictionary containing the final summary, along with the original
             documents, summaries, fixed summaries, and hallucinations.
     """
-    …
-    summaries = [
-        …
-    ]
-    …
+    logger.warning("Generating final summary")
+    final_docs = [doc for doc in state["documents"] if doc["hallucination"].score == 1]
+    logger.warning(f"Number of final docs: {len(final_docs)}")
+
+    if len(final_docs) == state["n_docs"]:
+        summaries = [
+            # f"Document ID: [{id}]\n\n{doc["summary"].summary}"
+            doc
+            for id, doc in zip(range(state["n_docs"]), final_docs)
+            if doc["summary"].summary != "INVALID"
+            and doc["themes"] != set()
+            and doc["iteration"] != 99
+        ]
+
+        out = []
+        for doc in summaries:
+            summary = doc["summary"].model_dump()
+            if summary["policies"] is not None:
+                policies = [
+                    {"policy": policy["policy"].name, "note": policy["note"]}
+                    for policy in summary["policies"]
+                ]
+            else:
+                policies = []
+            summary = summary["summary"]
+            out.append(
+                {
+                    "document": doc["document"].model_dump()["page_content"],
+                    "filename": doc["filename"],
+                    "entities": doc["entities"],
+                    "theme_docs": [d.model_dump() for d in doc["theme_docs"]],
+                    "themes": list(doc["themes"]),
+                    "summary": summary,
+                    "policies": policies,
+                    "iteration": doc["iteration"],
+                    "hallucination": doc["hallucination"].model_dump(),
+                }
+            )
+
+        for doc in out:
+            filename = Path(str(doc["filename"])).stem
+            with open(f"data/out/summaries/{filename}.json", "w") as f:
+                json.dump(doc, f)
+
+        summaries_text = [s["summary"].summary for s in summaries]
+        final_responses = []
+        batch_size = 50
+        for i in range(0, len(summaries_text), batch_size):
+            logger.warning("Processing batches.")
+            batch = summaries_text[i : i + batch_size]
+            response = reduce_chain.invoke({"context": batch})
+            final_responses.append(response)
+
+        final_response = reduce_chain.invoke({"context": "\n\n".join(final_responses)})
+        pols = markdown_bullets(summaries)
+
+        pol_out = []
+        for _, policy in pols.group_by(["themes", "policies"]):
+            logger.warning("Processing policies.")
+            bullets = "* " + "* \n".join(policy["details"][0])
+            pchain_out = policy_chain.invoke(
+                {"policy": policy["policies"][0], "bullet_points": bullets}
+            )
+            pol_out.append(
+                {
+                    "theme": policy["themes"][0],
+                    "policy": policy["policies"][0],
+                    "points": pchain_out,
+                }
+            )
+
+        themes = ""
+        for theme, policies in pl.DataFrame(pol_out).group_by("theme"):
+            themes += f"# {theme[0]}\n\n"
+            for row in policies.iter_rows(named=True):
+                themes += f"\n## {row['policy']}\n\n"
+                themes += f"{row['points']}\n"
+            themes += "\n"
+
         return {
-            "final_summary": …
-            "hallucinations": state["hallucinations"],
-            "documents": state["documents"],
+            "final_summary": final_response,
+            "documents": final_docs,
+            "policies": themes,
         }
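`generate_final_summary` reduces hierarchically: it summarizes the document summaries in batches of 50, then summarizes the batch outputs, keeping each `reduce_chain` call within a manageable context size. The shape of that two-level reduce, with the chain call stubbed as a plain function:

```python
def hierarchical_reduce(texts, combine, batch_size=50):
    """Collapse many texts into one by reducing fixed-size batches first.

    `combine` stands in for reduce_chain.invoke; here it just joins text.
    """
    # First level: reduce each fixed-size batch to one partial summary.
    partials = [
        combine(texts[i : i + batch_size]) for i in range(0, len(texts), batch_size)
    ]
    # Second level: reduce the partial summaries to the final one.
    return combine(partials)

summaries = [f"summary {n}" for n in range(120)]  # 120 texts -> 3 partials -> 1
final = hierarchical_reduce(summaries, combine=lambda batch: " | ".join(batch))
print(final[:40])
```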
planning_ai/preprocessing/gcpt3.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import textwrap
 from pathlib import Path
 from typing import Any
 
@@ -59,14 +60,13 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
 
 def download_attachments():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
-
-    existing_files = {int(f.stem) for f in (Paths.RAW / "pdfs").glob("*.pdf")}
+    existing_files = {f.stem for f in (Paths.RAW / "pdfs").glob("*.pdf")}
+
     failed_files = set()
-
     failed_file_path = Paths.RAW / "failed_downloads.txt"
     if failed_file_path.exists():
         with open(failed_file_path, "r") as file:
-            failed_files = set(
+            failed_files = set(l for l in file.read().splitlines())
 
     for row in tqdm(
         df.drop_nulls(subset="attachments_id")
@@ -74,52 +74,64 @@ def download_attachments():
         .sample(shuffle=True, fraction=1)
         .rows(named=True)
     ):
-        attachment_id = int(row["attachments_id"])
-
-        if attachment_id in existing_files or attachment_id in failed_files:
-            print(f"Skipping {attachment_id} (already exists or previously failed)")
-            continue
         if (
-            row["attachments_url"].
+            row["attachments_url"].startswith(
+                ("https://egov.scambs.gov.uk", "http://egov.scambs.gov.uk")
+            )
+            or row["attachments_id"] in existing_files
+            or row["attachments_id"] in failed_files
         ):
+            failed_files.add(row["attachments_id"])
+            continue
+        file_path = Paths.RAW / "pdfs" / f"{row['attachments_id']}.pdf"
+        try:
+            response = requests.get(row["attachments_url"], timeout=3)
+            response.raise_for_status()
+
+            with open(file_path, "wb") as f:
+                f.write(response.content)
+            print(f"Downloaded {row['attachments_url']} to {file_path}")
+
+        except requests.RequestException as e:
+            logging.error(f"RequestException for {row['attachments_url']}: {e}")
+            failed_files.add(row["attachments_id"])
+            with open(failed_file_path, "a") as file:
+                file.write(f"{row['attachments_id']}\n")
+            print(f"Skipping {row['attachments_url']} due to error: {e}")
+
+        except Exception as e:
+            logging.error(f"Unexpected error for {row['attachments_url']}: {e}")
+            row["attachments_url"]
+            failed_files.add(row["attachments_id"])
+            with open(failed_file_path, "a") as file:
+                file.write(f"{row['attachments_id']}\n")
+            print(f"Unexpected error for {row['attachments_url']}: {e}")
 
 
 def convert_txt():
+    # TODO: add pdf content
    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
+    for response_doc, dfd in df.group_by("representations_document"):
+        for row in tqdm(dfd.rows(named=True)):
+            text = f"{row["text"]}"
+
+            with open(
+                Paths.STAGING
+                / "txt"
+                / f"{response_doc}"
+                / f"{row['representations_id']}.txt",
+                "w",
+            ) as f:
+                f.write(text)
 
 
 def main() -> None:
     files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
     schema = get_schema()
     process_files(files, schema)
+    download_attachments()
+    convert_txt()
 
 
 if __name__ == "__main__":
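The reworked loop records failed attachment IDs in failed_downloads.txt so later runs skip them instead of re-requesting dead URLs. A condensed sketch of that skip-and-record pattern (the paths and the `fetch` helper are illustrative, not from the repo):

    from pathlib import Path

    import requests

    failed_path = Path("failed_downloads.txt")
    failed = set(failed_path.read_text().splitlines()) if failed_path.exists() else set()
    existing = {p.stem for p in Path("pdfs").glob("*.pdf")}

    def fetch(attachment_id: str, url: str) -> None:
        # Skip anything already downloaded or known to fail.
        if attachment_id in existing or attachment_id in failed:
            return
        try:
            response = requests.get(url, timeout=3)
            response.raise_for_status()
            (Path("pdfs") / f"{attachment_id}.pdf").write_bytes(response.content)
        except requests.RequestException:
            # Remember the failure so the next run does not retry it.
            failed.add(attachment_id)
            with failed_path.open("a") as f:
                f.write(f"{attachment_id}\n")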
planning_ai/preprocessing/process_pdfs.py
CHANGED
@@ -2,36 +2,16 @@ import base64
 import os
 from io import BytesIO
 
-import cv2
-import numpy as np
 import requests
 from dotenv import load_dotenv
 from pdf2image import convert_from_path
+from PyPDF2 import PdfReader
 from tqdm import tqdm
 
 from planning_ai.common.utils import Paths
 
 load_dotenv()
 
-import easyocr
-from pdf2image import convert_from_path
-
-pdf_path = "data/raw/pdfs/25.pdf"
-# pdf_path = "../../data/raw/pdfs/26.pdf"
-images = convert_from_path(pdf_path)
-
-reader = easyocr.Reader(lang_list=["en"], gpu=True)
-
-for i, image in enumerate(images):
-    results = reader.readtext(np.array(image))
-    print(f"Page {i+1}:")
-    confidences = []
-    for result in results:
-        confidences.append(result[2])
-        print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")
-
-np.array(confidences).mean()
-
 
 def encode_images_to_base64(images):
     image_b64 = []
@@ -61,13 +41,28 @@ def send_request_to_api(messages):
     return response.json()
 
 
+def extract_text_from_pdf(file_path):
+    """Extracts text from a PDF file using PyPDF2."""
+    try:
+        reader = PdfReader(file_path, strict=True)
+        text = []
+        for page in reader.pages:
+            text.append(page.extract_text() or "")
+        return "\n".join(text).strip()
+    except Exception as e:
+        print(e)
+        return None
+
+
 def main():
     pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
     with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
         ocr_prompt = f.read()
 
     for file in tqdm(pdfs):
-
+        outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
+
+        try:
             images = convert_from_path(file)
             image_b64 = encode_images_to_base64(images)
 
@@ -79,12 +74,15 @@ def main():
             ]
 
             response = send_request_to_api(messages)
+            if not "choices" in response:
+                continue
             out = response["choices"][0]["message"]["content"]
-        outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
             if outfile.exists():
                 continue
             with open(outfile, "w") as f:
                 f.write(out)
+        except:
+            continue
 
 
 if __name__ == "__main__":
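The new `extract_text_from_pdf` gives the pipeline a way to pull embedded text straight out of a PDF with PyPDF2, which is much cheaper than the image-based OCR route. A hedged usage sketch (the fallback decision below is an assumption; this commit does not wire it into `main`):

    from PyPDF2 import PdfReader

    def extract_text_from_pdf(file_path):
        """Extracts text from a PDF file using PyPDF2."""
        try:
            reader = PdfReader(file_path, strict=True)
            return "\n".join(page.extract_text() or "" for page in reader.pages).strip()
        except Exception as e:
            print(e)
            return None

    text = extract_text_from_pdf("example.pdf")  # illustrative path
    if not text:
        print("No embedded text; fall back to the OCR endpoint.")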
planning_ai/report.py
ADDED
@@ -0,0 +1,30 @@
+# build_quarto_doc(doc_title, out)
+#
+# d = [
+# i for i in out["generate_final_summary"]["summaries_fixed"] if i["iteration"] == 4
+# ][0]
+# d["document"]
+#
+# h = [
+# i["summary"].summary
+# for i in out["generate_final_summary"]["hallucinations"]
+# if i["document"] == d["document"]
+# ]
+#
+# e = [
+# i["hallucination"].explanation
+# for i in out["generate_final_summary"]["hallucinations"]
+# if i["document"] == d["document"]
+# ]
+#
+# test = {
+# "document": d["document"],
+# "final_summary": d["summary"].summary,
+# "attempts": h,
+# "reasoning": e,
+# }
+#
+# print(f"Document:\n\n{test['document']}\n\n")
+# print(f"Final:\n\n{test['final_summary']}\n\n")
+# print("Attempts: \n\n*", "\n\n* ".join(test["attempts"]), "\n\n")
+# print("Reasoning: \n\n*", "\n\n* ".join(test["reasoning"]), "\n\n")
planning_ai/retrievers/theme_retriever.py
ADDED
@@ -0,0 +1,84 @@
+import logging
+from pathlib import Path
+
+from chromadb import PersistentClient
+from langchain_community.document_loaders import PyPDFLoader
+from langchain_community.vectorstores import Chroma
+from langchain_core.prompts import PromptTemplate
+from langchain_openai import OpenAIEmbeddings
+from pydantic import BaseModel, Field
+
+from planning_ai.llms.llm import LLM
+
+# See: https://consultations.greatercambridgeplanning.org/greater-cambridge-local-plan-preferred-options/supporting-documents
+
+PDFS = {
+    "Biodiversity and Green Spaces": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPBiodiversityandGreenSpacesAug21v2Nov21_0.pdf",
+    "Climate Change": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPClimateChangeAug21v2Nov21_0.pdf",
+    "Great Places": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPGreatPlacesAug21v1Aug21.pdf",
+    "Homes": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPHomesAug21v2Nov21.pdf",
+    "Infrastructure": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPInfrastructureAug21v2Nov21.pdf",
+    "Jobs": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPJobsAug21v2Nov21.pdf",
+    # "Strategy topic paper": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPStrategyAug21v3Nov21_0.pdf",
+    "Wellbeing and Social Inclusion": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPWellbeingAug21v2Nov21.pdf",
+}
+
+
+class Grade(BaseModel):
+    """Binary score for relevance check."""
+
+    binary_score: str = Field(description="Relevance score 'yes' or 'no'")
+
+
+def create_db():
+    chroma_dir = Path("./chroma_themesdb")
+    if chroma_dir.exists():
+        persistent_client = PersistentClient(path="./chroma_themesdb")
+        vectorstore = Chroma(
+            client=persistent_client,
+            collection_name="themes-chroma",
+            embedding_function=OpenAIEmbeddings(),
+        )
+
+    else:
+        docs = []
+        for name, pdf in PDFS.items():
+            doc = PyPDFLoader(pdf).load()[5:]
+            for d in doc:
+                d.metadata["theme"] = name
+            docs.extend(doc)
+
+        logging.warning(f"Building ChromaDB...")
+        vectorstore = Chroma.from_documents(
+            documents=docs,
+            collection_name="themes-chroma",
+            embedding=OpenAIEmbeddings(),
+            persist_directory="./chroma_themesdb",
+        )
+    return vectorstore
+
+
+grade_template = PromptTemplate(
+    template="""You are a grader assessing relevance of a retrieved document to a user question. \n
+    Here is the retrieved document: \n\n {context} \n\n
+    Here is the original document: {document} \n
+    If the retrieved document contains keyword(s) or semantic meaning related to the original, grade it as relevant. \n
+    Give a binary score 'yes' or 'no' score to indicate whether the retrieved document is relevant to the original.""",
+    input_variables=["context", "document"],
+)
+
+
+SLLM = LLM.with_structured_output(Grade, strict=True)
+grade_chain = grade_template | SLLM
+
+vectorstore = create_db()
+theme_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
+logging.warning(f"Finished building ChromaDB...")
+
+if __name__ == "__main__":
+    test_content = """
+    We would certainly support this and would emphasise the importance of trying
+    to solve the severance problems created by the M11 and A14.
+    """
+
+    len(theme_retriever.invoke(input=test_content))
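Because every indexed page carries a `theme` metadata key, the retriever's hits can be collapsed into a set of candidate themes for a response. A small usage sketch (assumes the Chroma index has been built and OpenAI credentials are configured):

    from planning_ai.retrievers.theme_retriever import theme_retriever

    docs = theme_retriever.invoke(input="Concerns about flooding near the river corridor.")
    candidate_themes = {d.metadata["theme"] for d in docs}
    print(candidate_themes)  # e.g. {"Climate Change", "Biodiversity and Green Spaces"}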
planning_ai/states.py
CHANGED
@@ -1,9 +1,39 @@
 import operator
 from pathlib import Path
-from typing import Annotated, List, TypedDict
+from typing import Annotated, List, Optional, TypedDict
+
+from langchain_core.documents import Document
+from pydantic import BaseModel
 
 from planning_ai.chains.hallucination_chain import HallucinationChecker
-from planning_ai.
+from planning_ai.common.utils import filename_reducer
+
+
+class DocumentState(TypedDict):
+    """Represents the state of an individual document during processing.
+
+    This class is a TypedDict that encapsulates the state of a single document
+    during the processing workflow. It includes the document text, summary,
+    hallucination details, filename, and iteration count.
+
+    Attributes:
+        document (str): The text of the document.
+        summary (BriefSummary): The summary of the document.
+        hallucination (HallucinationChecker): The hallucination details for the document's summary.
+        filename (Path): The file path of the document.
+        iteration (int): The current iteration count for processing the document.
+    """
+
+    document: Document
+    filename: Path
+
+    entities: list[dict]
+    themes: set[str]
+    summary: BaseModel
+    theme_docs: list[Document]
+    hallucination: HallucinationChecker
+
+    iteration: int
 
 
 class OverallState(TypedDict):
@@ -23,35 +53,17 @@ class OverallState(TypedDict):
         iterations (list[int]): A list of iteration counts for processing each document.
     """
 
-    documents: list[
-    final_summary: str
-    summaries: Annotated[list, operator.add]
-    summaries_fixed: Annotated[list, operator.add]
-    hallucinations: Annotated[list, operator.add]
-
-    filenames: List[Path]
-    iterations: list[int]
-
-
-class DocumentState(TypedDict):
-    """Represents the state of an individual document during processing.
-
-    This class is a TypedDict that encapsulates the state of a single document
-    during the processing workflow. It includes the document text, summary,
-    hallucination details, filename, and iteration count.
-
-    Attributes:
-        document (str): The text of the document.
-        summary (BriefSummary): The summary of the document.
-        hallucination (HallucinationChecker): The hallucination details for the document's summary.
-        filename (Path): The file path of the document.
-        iteration (int): The current iteration count for processing the document.
-    """
-
-    summary: BriefSummary
-    hallucination: HallucinationChecker
+    documents: Annotated[list[DocumentState], filename_reducer]
+    n_docs: int
+
+    final_summary: Optional[str]
+
+    # documents_related: Annotated[list, operator.add]
+    # related_theme_docs: Annotated[list, operator.add]
+    #
+    # summaries: Annotated[list, operator.add]
+    # summaries_fixed: Annotated[list, operator.add]
+    # hallucinations: Annotated[list, operator.add]
+    #
+    # filenames: Annotated[list, operator.add]
+    # iterations: list[int]
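`documents` is now merged with a custom LangGraph reducer, `filename_reducer`, defined in planning_ai/common/utils.py and not shown in this diff. A hedged sketch of what a reducer keyed on filename could look like; the real implementation may differ:

    def filename_reducer(left, right):
        # Merge two lists of DocumentState-like dicts, letting newer entries
        # replace older ones that share a filename. Illustrative only; the
        # actual reducer lives in planning_ai/common/utils.py.
        merged = {str(d["filename"]): d for d in (left or [])}
        for d in right or []:
            merged[str(d["filename"])] = d
        return list(merged.values())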
planning_ai/themes.py
CHANGED
@@ -1,134 +1,76 @@
-    H_BR = "Build to rent homes"
-    H_MO = "Houses in multiple occupation (HMOs)"
-    H_SA = "Student accommodation"
-    H_DC = "Dwellings in the countryside"
-    H_RM = "Residential moorings"
-    H_RC = "Residential caravan sites"
-    H_GT = "Gypsy and Traveller and Travelling Showpeople sites"
-    H_CH = "Community-led housing"
-
-
-class InfrastructurePolicies(str, Enum):
-    I_ST = "Sustainable transport and connectivity"
-    I_EV = "Parking and electric vehicles"
-    I_FD = "Freight and delivery consolidation"
-    I_SI = "Safeguarding important infrastructure"
-    I_AD = "Aviation development"
-    I_EI = "Energy infrastructure masterplanning"
-    I_ID = "Infrastructure and delivery"
-    I_DI = "Digital infrastructure"
-
-
-THEME_TO_POLICY_GROUP = {
-    Theme.climate: ClimatePolicies,
-    Theme.biodiversity: BiodiversityPolicies,
-    Theme.wellbeing: WellbeingPolicies,
-    Theme.great_places: GreatPlacesPolicies,
-    Theme.jobs: JobsPolicies,
-    Theme.homes: HomesPolicies,
-    Theme.infrastructure: InfrastructurePolicies,
-}
-
-
-class PolicyDetail(BaseModel):
-    policy: str
-    details: list[str]
-
-
-class PolicySelection(BaseModel):
-    theme: Theme
-    policies: list[PolicyDetail]
-
-    @field_validator("policies", mode="before")
-    @classmethod
-    def validate_policies(cls, policies, info):
-        """Ensure policies match the selected theme."""
-        if not isinstance(policies, list):
-            raise ValueError("Policies must be provided as a list.")
-
-        theme = info.data.get("theme")
-        if not theme:
-            raise ValueError("Theme must be provided before validating policies.")
-
-        allowed_policies = [p.value for p in THEME_TO_POLICY_GROUP[theme]]
-        for policy in policies:
-            if policy["policy"] not in allowed_policies:
-                raise ValueError(
-                    f"Policy '{policy['policy']}' is not valid for theme '{theme.value}'."
-                )
-        return policies
+THEMES_AND_POLICIES = {
+    "Climate Change": [
+        "Net zero carbon new buildings",
+        "Water efficiency in new developments",
+        "Designing for a changing climate",
+        "Flooding and integrated water management",
+        "Renewable energy projects and infrastructure",
+        "Reducing waste and supporting the circular economy",
+        "Supporting land-based carbon sequestration",
+    ],
+    "Biodiversity and Green Spaces": [
+        "Biodiversity and geodiversity",
+        "Green infrastructure",
+        "Improving Tree Canopy Cover and the Tree Population",
+        "River corridors",
+        "Protecting open spaces",
+        "Providing and enhancing open spaces",
+    ],
+    "Wellbeing and Social Inclusion": [
+        "Creating healthy new developments",
+        "Community, sports and leisure facilities",
+        "Meanwhile uses during long term redevelopments",
+        "Creating inclusive employment and business opportunities through new developments",
+        "Pollution, health and safety",
+    ],
+    "Great Places": [
+        "People and place responsive design",
+        "Protection and enhancement of landscape character",
+        "Protection and enhancement of the Cambridge Green Belt",
+        "Achieving high quality development",
+        "Establishing high quality landscape and public realm",
+        "Conservation and enhancement of heritage assets",
+        "Adapting heritage assets to climate change",
+        "Protection of public houses",
+    ],
+    "Jobs": [
+        "New employment and development proposals",
+        "Supporting the rural economy",
+        "Protecting the best agricultural land",
+        "Protecting existing business space",
+        "Enabling remote working",
+        "Affordable workspace and creative industries",
+        "Supporting a range of facilities in employment parks",
+        "Retail and centres",
+        "Visitor accommodation, attractions and facilities",
+        "Faculty development and specialist / language schools",
+    ],
+    "Homes": [
+        "Affordable housing",
+        "Exception sites for affordable housing",
+        "Housing mix",
+        "Housing density",
+        "Garden land and subdivision of existing plots",
+        "Residential space standards and accessible homes",
+        "Specialist housing and homes for older people",
+        "Self and custom build homes",
+        "Build to rent homes",
+        "Houses in multiple occupation (HMOs)",
+        "Student accommodation",
+        "Dwellings in the countryside",
+        "Residential moorings",
+        "Residential caravan sites",
+        "Gypsy and Traveller and Travelling Showpeople sites",
+        "Community-led housing",
+    ],
+    "Infrastructure": [
+        "Sustainable transport and connectivity",
+        "Parking and electric vehicles",
+        "Freight and delivery consolidation",
+        "Safeguarding important infrastructure",
+        "Aviation development",
+        "Energy infrastructure masterplanning",
+        "Infrastructure and delivery",
+        "Digital infrastructure",
+    ],
 }
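With the Enum and validator machinery replaced by a plain dict, checking that a policy belongs to a theme reduces to a membership test. For example:

    from planning_ai.themes import THEMES_AND_POLICIES

    def is_valid_policy(theme: str, policy: str) -> bool:
        # A policy is valid only if it is listed under the selected theme.
        return policy in THEMES_AND_POLICIES.get(theme, [])

    print(is_valid_policy("Homes", "Affordable housing"))    # True
    print(is_valid_policy("Homes", "Aviation development"))  # False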