cjber committed on
Commit
963aee4
·
1 Parent(s): 613cc82

update to auto select policies

planning_ai/chains/fix_chain.py CHANGED
@@ -1,13 +1,13 @@
 from langchain_core.prompts import ChatPromptTemplate
 
-from planning_ai.chains.map_chain import SLLM
 from planning_ai.common.utils import Paths
 
-with open(Paths.PROMPTS / "fix_hallucination.txt", "r") as f:
-    map_template = f.read()
 
-map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
-fix_chain = map_prompt | SLLM
 
 if __name__ == "__main__":
     test_document = """
@@ -16,7 +16,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
-
     result = fix_chain.invoke(
         {
             "summary": "This plan is great because they are building a nuclear power plant.",
@@ -24,4 +25,4 @@ if __name__ == "__main__":
             "context": test_document,
         }
     )
-    print(result)
 from langchain_core.prompts import ChatPromptTemplate
 
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.common.utils import Paths
 
+with open(Paths.PROMPTS / "themes.txt", "r") as f:
+    themes_txt = f.read()
 
+with open(Paths.PROMPTS / "fix_hallucination.txt", "r") as f:
+    fix_template = f"{themes_txt}\n\n {f.read()}"
 
 if __name__ == "__main__":
     test_document = """
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
+    test_themes = {"Great Places", "Homes", "Climate Change"}
+    fix_chain = create_dynamic_map_chain(test_themes, fix_template)
     result = fix_chain.invoke(
         {
             "summary": "This plan is great because they are building a nuclear power plant.",
             "context": test_document,
         }
     )
+    __import__("pprint").pprint(dict(result))
planning_ai/chains/map_chain.py CHANGED
@@ -1,96 +1,85 @@
-from enum import Enum
 
-from langchain.output_parsers import RetryOutputParser
 from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.runnables import RunnableLambda
-from pydantic import BaseModel, Field
 
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
-from planning_ai.themes import PolicySelection, Theme
 
-with open(Paths.PROMPTS / "themes.txt", "r") as f:
-    themes_txt = f.read()
 
 with open(Paths.PROMPTS / "map.txt", "r") as f:
-    map_template = f"{themes_txt}\n\n {f.read()}"
 
 
-class Sentiment(Enum):
-    POSITIVE = "positive"
-    NEGATIVE = "negative"
-    NEUTRAL = "neutral"
 
 
-class Place(BaseModel):
-    """Represents a geographical location mentioned in the response with associated sentiment."""
 
-    place: str = Field(
-        ...,
-        description=(
-            "The name of the geographical location mentioned in the response. "
-            "This can be a city, town, region, or any identifiable place."
-        ),
-    )
-    sentiment: Sentiment = Field(
-        ...,
-        description=(
-            "The sentiment associated with the mentioned place, categorized as 'positive', 'negative', or 'neutral'. "
-            "Assess sentiment based on the context in which the place is mentioned, considering both positive and negative connotations."
-        ),
-    )
 
 
-class BriefSummary(BaseModel):
-    """A summary of the response with generated metadata"""
 
-    summary: str = Field(
-        ...,
-        description=(
-            "A concise summary of the response, capturing the main points and overall sentiment. "
-            "The summary should reflect the key arguments and conclusions presented in the response."
-        ),
-    )
-    themes: list[Theme] = Field(
-        ...,
-        description=(
-            "A list of themes associated with the response. Themes are overarching topics or "
-            "categories that the response addresses, such as 'Climate change' or 'Infrastructure'. "
-            "Identify themes based on the content and context of the response."
-        ),
-    )
-    policies: list[PolicySelection] = Field(
-        ...,
-        description=(
-            "A list of policies associated with the response, each accompanied by directly related "
-            "information as bullet points. Bullet points should provide specific details or examples "
-            "that illustrate how the policy is relevant to the response."
-        ),
-    )
-    places: list[Place] = Field(
-        ...,
-        description=(
-            "All places mentioned in the response, with the sentiment categorized as 'positive', 'negative', or 'neutral'. "
-            "A place can be a city, region, or any geographical location. Assess sentiment based on the context "
-            "in which the place is mentioned, considering both positive and negative connotations."
-        ),
     )
-    is_constructive: bool = Field(
-        ...,
-        description=(
-            "A flag indicating whether the response is constructive. A response is considered constructive if it "
-            "provides actionable suggestions or feedback, addresses specific themes or policies, and is presented "
-            "in a coherent and logical manner."
-        ),
     )
 
 
-SLLM = LLM.with_structured_output(BriefSummary, strict=False)
 
-# TODO: Split out the policy stuff from this class. Find policies later based on
-# what themes are already identified (should improve accuracy)
-map_prompt = ChatPromptTemplate.from_messages([("system", map_template)])
-map_chain = map_prompt | SLLM
 
 
 if __name__ == "__main__":
@@ -100,6 +89,8 @@ if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
 
-    result = map_chain.invoke({"context": test_document})
     __import__("pprint").pprint(dict(result))
+from enum import Enum, auto
+from typing import Optional, Set, Type
 
+from langchain.schema import BaseCache
 from langchain_core.prompts import ChatPromptTemplate
+from pydantic import BaseModel, create_model
 
 from planning_ai.common.utils import Paths
 from planning_ai.llms.llm import LLM
+from planning_ai.themes import THEMES_AND_POLICIES
 
+# with open(Paths.PROMPTS / "themes.txt", "r") as f:
+#     themes_txt = f.read()
 
 with open(Paths.PROMPTS / "map.txt", "r") as f:
+    # map_template = f"{themes_txt}\n\n {f.read()}"
+    map_template = f.read()
 
 
+def create_policy_enum(
+    policy_groups: Set[str], name: str = "DynamicPolicyEnum"
+) -> Enum:
+    """
+    Create a dynamic enum for policies based on the given policy groups.
+
+    Args:
+        policy_groups (Set[str]): A set of policy group names.
+        name (str): Name of the enum to be created.
+
+    Returns:
+        Type[Enum]: A dynamically created Enum class for the policies.
+    """
+    return Enum(name, {policy: auto() for policy in policy_groups})
 
 
+def create_brief_summary_model(policy_enum: Enum) -> Type[BaseModel]:
+    """
+    Dynamically create a BriefSummary model using the provided policy enum.
+
+    Args:
+        policy_enum (Type[Enum]): The dynamically created policy enum.
+
+    Returns:
+        Type[BaseModel]: A dynamically generated Pydantic model for BriefSummary.
+    """
+
+    DynamicPolicy = create_model(
+        "DynamicPolicy",
+        policy=(policy_enum, ...),
+        note=(str, ...),
+        __config__={"extra": "forbid"},
     )
+
+    return create_model(
+        "DynamicBriefSummary",
+        summary=(str, ...),
+        policies=(Optional[list[DynamicPolicy]], ...),
+        __module__=__name__,
+        __config__={"extra": "forbid"},
     )
 
 
+def create_dynamic_map_chain(themes, prompt: str):
+    policy_groups = set()
+    for theme in themes:
+        if theme in THEMES_AND_POLICIES:
+            policy_groups.update(THEMES_AND_POLICIES[theme])
+
+    PolicyEnum = create_policy_enum(policy_groups)
+    DynamicBriefSummary = create_brief_summary_model(PolicyEnum)
+
+    SLLM = LLM.with_structured_output(DynamicBriefSummary, strict=True)
+
+    prompt = (
+        f"{prompt}\n\nAvailable Policies:\n\n- "
+        + "\n- ".join(policy_groups)
+        + "\n\nContext:\n\n{context}"
+    )
+    map_prompt = ChatPromptTemplate.from_messages([("system", prompt)])
+    map_chain = map_prompt | SLLM
 
+    return map_chain
 
 
 if __name__ == "__main__":
     the major settlement of Cambourne has been created - now over the projected 3,000 homes and
     Papworth Everard has grown beyond recognition. This in itself is a matter of concern.
     """
+    test_themes = {"Great Places", "Homes"}
 
+    dynamic_map_chain = create_dynamic_map_chain(test_themes, prompt=map_template)
+    result = dynamic_map_chain.invoke({"context": test_document, "themes": test_themes})
     __import__("pprint").pprint(dict(result))
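
For reference, the model that create_brief_summary_model assembles at runtime is roughly equivalent to the static declaration below. This is a sketch only: the enum members shown are hypothetical, since the real members come from THEMES_AND_POLICIES for whichever themes were selected, and the runtime model additionally forbids extra fields.

from enum import Enum
from typing import Optional

from pydantic import BaseModel

# Hypothetical policy members for illustration; the real enum is built at runtime.
DynamicPolicyEnum = Enum(
    "DynamicPolicyEnum", {"Affordable housing": 1, "Housing density": 2}
)


class DynamicPolicy(BaseModel):
    policy: DynamicPolicyEnum  # must be one of the policies offered in the prompt
    note: str  # supporting detail drawn from the response


class DynamicBriefSummary(BaseModel):
    summary: str
    policies: Optional[list[DynamicPolicy]]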
planning_ai/chains/policy_chain.py ADDED
@@ -0,0 +1,26 @@
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+
+from planning_ai.common.utils import Paths
+from planning_ai.llms.llm import LLM
+
+with open(Paths.PROMPTS / "policy.txt", "r") as f:
+    policy_template = f.read()
+
+
+policy_prompt = ChatPromptTemplate([("system", policy_template)])
+policy_chain = policy_prompt | LLM | StrOutputParser()
+
+
+if __name__ == "__main__":
+    test_policy = "Protecting open spaces"
+    test_bullet = "* " + "\n* ".join(
+        [
+            "The response emphasizes the need to preserve greenfield land, which relates to protecting open spaces.",
+            "The response notes that greenspace land should be preserved.",
+            "The response emphasizes the need for creating more parks, which relates to protecting open spaces.",
+        ]
+    )
+
+    result = policy_chain.invoke({"policy": test_policy, "bullet_points": test_bullet})
+    print(result)
planning_ai/chains/prompts/map.txt CHANGED
@@ -1,40 +1,4 @@
-Summarise the following response to a planning application, focusing on the themes and policies proposed by the council. Follow these steps:
-
-1. **Summary:** Provide a concise, neutral summary that captures the key points of the response, particularly in relation to the council's proposed themes.
-2. **Themes:** List the council's themes discussed in the response.
-3. **Policies:** Identify relevant policies associated with the extracted themes.
-4. **Places:** Mention any geographical locations considered by the author.
-5. **Constructiveness:** Indicate whether the response is constructive. A response is constructive if it provides any feedback or commentary on the plan, regardless of its depth or specificity.
-
-**Few-shot examples for reference:**
-
----
-
-**Example 1:**
-
-Response:
-"I am in favour of this new park development as it will provide much-needed green space for families. However, the parking situation needs to be reconsidered."
-
-- **Summary:** The author supports the park development for its benefit to families but expresses concern about parking.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** None
-- **Constructiveness:** True
-
----
-
-**Example 2:**
-
-Response:
-"This development in Cambridge will destroy local wildlife and create traffic chaos. It should not go ahead."
-
-- **Summary:** The author opposes the development due to concerns about wildlife and traffic congestion.
-- **Themes:** Biodiversity and green spaces, Infrastructure
-- **Places:** Cambridge
-- **Constructiveness:** True
-
----
-
-**Now summarise the following response in British English:**
-
-Response:
-{context}
+Read the following response to a planning application: first summarise the response, then identify relevant 'policies' from those given. For each policy, list at least one section of the response that is related. Do **not** invent new policies. You **must** return valid JSON in the format given.
+
+Choose from the following list, by name **only**:
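
For illustration, a response that satisfies this prompt (and the DynamicBriefSummary schema built in map_chain.py) would be shaped roughly as follows; the policy name is a hypothetical example drawn from the list supplied at runtime:

{
    "summary": "The respondent is concerned that Cambourne has already exceeded its projected 3,000 homes.",
    "policies": [
        {
            "policy": "Housing density",
            "note": "Cambourne has grown beyond the projected 3,000 homes."
        }
    ]
}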
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
planning_ai/chains/prompts/ocr.txt CHANGED
@@ -2,7 +2,7 @@ The images provided are from a planning response form filled out by a member of
 
 Please follow these instructions to process the images:
 
-1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
 2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
 3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
 4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
 
 Please follow these instructions to process the images:
 
+1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses. If there is no response, state **nothing**.
 2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
 3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
 4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
planning_ai/chains/prompts/policy.txt ADDED
@@ -0,0 +1,7 @@
+The following is a list of bullet points relating to a particular planning policy. Rewrite the bullet points to focus only on the key action or idea, excluding additional context (like the name of the policy). If multiple bullet points share the same concept, combine them together.
+
+Policy: {policy}
+
+Bullet Points:
+
+{bullet_points}
planning_ai/chains/prompts/reduce.txt CHANGED
@@ -1,41 +1,9 @@
-The following is one or more summaries of responses to a new plan by the South Cambridgeshire Council:
 
 {context}
 
-You are tasked with producing a **detailed and thorough final summary** that consolidates the main themes raised in the responses. **Do not add, infer, or create information.** Only use content directly mentioned in the summaries. Write in British English.
 
-Each provided summary may contain both supporting and opposing key points. Summarise these points, indicating whether they **support** or **oppose** the proposed plan. Each point should be grouped into a relevant header chosen from the 'Aims' associated with the response summary. **Only include aims mentioned in the provided responses.** Omit any aim that is not discussed.
-
-**Guidelines**:
-
-- Provide an **extended, one page, balanced summary** of the key themes at the beginning, capturing the overall sentiment and notable trends.
-- In the 'Key points' sections, **group points by aim only if that aim is explicitly mentioned**.
-- Be sure to include specific, concise points that reflect the underlying concerns or support expressed by respondents.
-- Do **not** include information, assumptions, or summaries of aims that were not explicitly mentioned in the responses.
-
-**Format**:
-
-## Summary
-
-<Provide an extended, comprehensive overview of all the main themes. Mention key concerns, positive feedback, and overall trends.>
-
-## Key points raised in support
-
-For each key point raised in support, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
-
-## Key points raised in opposition
-
-For each key point raised in opposition, group them by aim **only if that aim is mentioned in the responses**.
-
-### [Aim name]
-
-- <Key point 1>
-- <Key point 2>
-- ...
-
+The following contains summaries of public responses to a new plan proposed by the South Cambridgeshire Council:
 
 {context}
 
+As a representative of the Cambridgeshire Council, your task is to craft a **comprehensive and articulate executive summary**. This summary will serve as the introductory section of a major report, highlighting the key themes and concerns raised in the public responses. Ensure that the summary is clear, concise, and professional, reflecting the tone and standards expected in official council documents. **Do not add, infer, or create information.** Use only the content explicitly mentioned in the summaries. Adhere to British English conventions.
 
+Each time you make a reference to a response document, please add an inline citation which corresponds with the document's numerical ID. For example: 'Concerns regarding the impact of increased housing density on the character of Cambridge were prevalent [1][2][11].'
 
+## Executive Summary
planning_ai/chains/prompts/themes.txt CHANGED
@@ -1,82 +0,0 @@
-The following themes are proposed by the South Cambridgeshire Council with each of their associated policies.
-
-# Climate change
-
-Net zero carbon new buildings
-Water efficiency in new developments
-Designing for a changing climate
-Flooding and integrated water management
-Renewable energy projects and infrastructure
-Reducing waste and supporting the circular economy
-Supporting land-based carbon sequestration
-
-# Biodiversity and green spaces
-
-Biodiversity and geodiversity
-Green infrastructure
-Improving Tree Canopy Cover and the Tree Population
-River corridors
-Protecting open spaces
-Providing and enhancing open spaces
-
-# Wellbeing and social inclusion
-
-Creating healthy new developments
-Community, sports and leisure facilities
-Meanwhile uses during long term redevelopments
-Creating inclusive employment and business opportunities through new developments
-Pollution, health and safety
-
-# Great places
-
-People and place responsive design
-Protection and enhancement of landscape character
-Protection and enhancement of the Cambridge Green Belt
-Achieving high quality development
-Establishing high quality landscape and public realm
-Conservation and enhancement of heritage assets
-Adapting heritage assets to climate change
-Protection of public houses
-
-# Jobs
-
-New employment and development proposals
-Supporting the rural economy
-Protecting the best agricultural land
-Protecting existing business space
-Enabling remote working
-Affordable workspace and creative industries
-Supporting a range of facilities in employment parks
-Retail and centres
-Visitor accommodation, attractions and facilities
-Faculty development and specialist / language schools
-
-# Homes
-
-Affordable housing
-Exception sites for affordable housing
-Housing mix
-Housing density
-Garden land and subdivision of existing plots
-Residential space standards and accessible homes
-Specialist housing and homes for older people
-Self and custom build homes
-Build to rent homes
-Houses in multiple occupation (HMOs)
-Student accommodation
-Dwellings in the countryside
-Residential moorings
-Residential caravan sites
-Gypsy and Traveller and Travelling Showpeople sites
-Community-led housing
-
-# Infrastructure
-
-Sustainable transport and connectivity
-Parking and electric vehicles
-Freight and delivery consolidation
-Safeguarding important infrastructure
-Aviation development
-Energy infrastructure masterplanning
-Infrastructure and delivery
-Digital infrastructure
planning_ai/common/utils.py CHANGED
@@ -11,6 +11,18 @@ pl.Config(
 )
 
 
 class Paths:
     DATA = Path("data")
 
@@ -25,7 +37,14 @@ class Paths:
 
     @classmethod
     def ensure_directories_exist(cls):
-        for path in [cls.DATA, cls.RAW, cls.STAGING, cls.OUT, cls.SUMMARY, cls.SUMMARIES]:
             path.mkdir(parents=True, exist_ok=True)
 
 )
 
 
+def filename_reducer(docs_a, docs_b):
+    if docs_a == []:
+        return docs_b
+    b_dict = {d["filename"]: d for d in docs_b}
+
+    for i, dict_a in enumerate(docs_a):
+        filename = dict_a.get("filename")
+        if filename in b_dict:
+            docs_a[i] = b_dict[filename]
+    return docs_a
+
+
 class Paths:
     DATA = Path("data")
 
 
     @classmethod
     def ensure_directories_exist(cls):
+        for path in [
+            cls.DATA,
+            cls.RAW,
+            cls.STAGING,
+            cls.OUT,
+            cls.SUMMARY,
+            cls.SUMMARIES,
+        ]:
             path.mkdir(parents=True, exist_ok=True)
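
A minimal sketch of how filename_reducer behaves when LangGraph merges state updates; the document dicts are hypothetical and only the filename key matters for matching. Entries from docs_b replace same-named entries in docs_a, and an empty docs_a is replaced wholesale:

docs_a = [
    {"filename": "101.txt", "summary": "draft"},
    {"filename": "102.txt", "summary": "ok"},
]
docs_b = [{"filename": "101.txt", "summary": "fixed"}]

merged = filename_reducer(docs_a, docs_b)
# merged == [{"filename": "101.txt", "summary": "fixed"},
#            {"filename": "102.txt", "summary": "ok"}]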
 
planning_ai/graph.py CHANGED
@@ -7,7 +7,13 @@ from planning_ai.nodes.hallucination_node import (
     map_fix_hallucinations,
     map_hallucinations,
 )
-from planning_ai.nodes.map_node import generate_summary, map_summaries
 from planning_ai.nodes.reduce_node import generate_final_summary
 from planning_ai.states import OverallState
 
@@ -25,13 +31,21 @@ def create_graph():
         StateGraph: The compiled state graph ready for execution.
     """
     graph = StateGraph(OverallState)
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
-    # graph.add_node("generate_final_summary", generate_final_summary)
 
     graph.add_conditional_edges(
-        START,
         map_summaries,
         ["generate_summary"],
     )
@@ -51,7 +65,7 @@ def create_graph():
         ["check_hallucination"],
     )
 
-    # graph.add_edge("check_hallucination", "generate_final_summary")
-    # graph.add_edge("generate_final_summary", END)
 
     return graph.compile()
     map_fix_hallucinations,
     map_hallucinations,
 )
+from planning_ai.nodes.map_node import (
+    add_entities,
+    generate_summary,
+    map_retrieve_themes,
+    map_summaries,
+    retrieve_themes,
+)
 from planning_ai.nodes.reduce_node import generate_final_summary
 from planning_ai.states import OverallState
 
         StateGraph: The compiled state graph ready for execution.
     """
     graph = StateGraph(OverallState)
+    graph.add_node("add_entities", add_entities)
+    graph.add_node("retrieve_themes", retrieve_themes)
     graph.add_node("generate_summary", generate_summary)
     graph.add_node("check_hallucination", check_hallucination)
     graph.add_node("fix_hallucination", fix_hallucination)
+    graph.add_node("generate_final_summary", generate_final_summary)
 
+    graph.add_edge(START, "add_entities")
+    graph.add_conditional_edges(
+        "add_entities",
+        map_retrieve_themes,
+        ["retrieve_themes"],
+    )
     graph.add_conditional_edges(
+        "retrieve_themes",
         map_summaries,
         ["generate_summary"],
     )
         ["check_hallucination"],
     )
 
+    graph.add_edge("check_hallucination", "generate_final_summary")
+    graph.add_edge("generate_final_summary", END)
 
     return graph.compile()
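
For orientation, the wiring above gives roughly the following flow; each conditional edge fans out one Send per document (a sketch of the intended topology, not generated output):

# START
#   -> add_entities                (spaCy entities attached to every document)
#      -> retrieve_themes          (one Send per document, via map_retrieve_themes)
#         -> generate_summary      (one Send per document, via map_summaries)
#            -> check_hallucination
#               -> passes: generate_final_summary -> END
#               -> fails:  fix_hallucination -> check_hallucination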
planning_ai/main.py CHANGED
@@ -1,18 +1,28 @@
 import os
 import time
 from collections import Counter
 from pathlib import Path
 
-import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
-from langchain_community.document_loaders import DirectoryLoader, TextLoader
-from langchain_text_splitters import CharacterTextSplitter
-from opencage.geocoder import OpenCageGeocode
 
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
 
 load_dotenv()
 
@@ -64,81 +74,72 @@ def map_locations(places_df: pl.DataFrame):
 
 def build_quarto_doc(doc_title, out):
     final = out["generate_final_summary"]
-    executive_summary = (
-        final["final_summary"].split("## Key points raised in support")[0].strip()
-    )
-    key_points = final["final_summary"].split("## Key points raised in support")[1]
-
-    aims = []
-    for summary in final["summaries_fixed"]:
-        aim = summary["summary"].aims
-        aims.extend(aim)
-
-    value_counts = Counter(aims)
-    total_values = sum(value_counts.values())
-    percentages = {
-        key: {"count": count, "percentage": (count / total_values)}
-        for key, count in value_counts.items()
-    }
-    top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
-        :5
-    ]
-    thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
-    thematic_breakdown += "\n".join(
-        [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
-    )
-
-    places_df = (
-        pl.DataFrame(
-            [
-                place.dict()
-                for summary in final["summaries_fixed"]
-                for place in summary["summary"].places
-            ]
-        )
-        .group_by("place")
-        .agg(
-            pl.col("place").len().alias("Count"),
-            pl.col("sentiment").mean().alias("Mean Sentiment"),
-        )
-        .rename({"place": "Place"})
-    )
-
-    map_locations(places_df)
 
-    places_breakdown = (
-        places_df.sort("Count", descending=True)
-        .head()
-        .to_pandas()
-        .to_markdown(index=False)
-    )
-
-    stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
-    value_counts = Counter(stances)
-    total_values = sum(value_counts.values())
-    percentages = {
-        key: {"count": count, "percentage": (count / total_values)}
-        for key, count in value_counts.items()
-    }
-    stances_top = sorted(
-        percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
-    )
-    stances_breakdown = " | ".join(
-        [
-            f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
-            for item, stance in stances_top
-        ]
-    )
-
-    short_summaries = "\n\n".join(
-        [
-            f"#### **TODO**\n"
-            f"{summary['summary'].summary}\n\n"
-            f"**Stance**: {summary['summary'].stance}\n\n"
-            f"**Constructiveness**: {summary['summary'].rating}\n\n"
-            for summary in final["summaries_fixed"]
-        ]
-    )
 
     quarto_doc = (
         "---\n"
@@ -153,53 +154,63 @@ def build_quarto_doc(doc_title, out):
         "monofontoptions:\n"
        " - Scale=0.55\n"
         "---\n\n"
-        f"{executive_summary}\n\n"
-        f"{stances_breakdown}\n\n"
-        "## Aim Breakdown\n\n"
-        "The aim breakdown identifies which aims are mentioned "
-        "within each response. "
-        "A single response may discuss multiple topics.\n"
-        f"\n\n{thematic_breakdown}\n\n"
-        f"\n\n{places_breakdown}\n\n"
-        f"![Locations mentioned by sentiment](./figs/places.png)\n\n"
-        "## Key points raised in support\n\n"
-        f"{key_points}\n\n"
-        "## Summaries\n"
-        f"{short_summaries}"
     )
 
     with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
         f.write(quarto_doc)
 
 
-def main():
-    loader = DirectoryLoader(
-        path=str(Paths.STAGING / "pdfs"),
-        show_progress=True,
-        use_multithreading=True,
-        loader_cls=TextLoader,
-        recursive=True,
-    )
-    docs = [doc for doc in loader.load() if doc.page_content]
-    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
-        chunk_size=1000, chunk_overlap=0
     )
-    split_docs = text_splitter.split_documents(docs)
 
     app = create_graph()
 
     step = None
-    for step in app.stream(
-        {
-            "documents": [doc.page_content for doc in split_docs],
-            "filenames": [Path(doc.metadata["source"]) for doc in split_docs],
-        }
-    ):
-        print(list(step.keys()))
 
     if step is None:
         raise ValueError("No steps were processed!")
-
     return step
 
@@ -208,7 +219,9 @@ if __name__ == "__main__":
 
     tic = time.time()
     out = main()
-    # build_quarto_doc(doc_title, out)
 
     toc = time.time()
 
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
+import logging
 import os
+import re
 import time
 from collections import Counter
+from itertools import groupby
 from pathlib import Path
 
+# import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
+from langchain_community.document_loaders import (
+    DirectoryLoader,
+    PolarsDataFrameLoader,
+    TextLoader,
+)
+from langchain_text_splitters import CharacterTextSplitter, markdown
 
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
+from planning_ai.themes import THEMES_AND_POLICIES
+
+# from opencage.geocoder import OpenCageGeocode
+
 
 load_dotenv()
 
 
 def build_quarto_doc(doc_title, out):
     final = out["generate_final_summary"]
 
+    # value_counts = Counter(aims)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
+    #     :5
+    # ]
+    # thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
+    # thematic_breakdown += "\n".join(
+    #     [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
+    # )
+    #
+    # places_df = (
+    #     pl.DataFrame(
+    #         [
+    #             place.dict()
+    #             for summary in final["summaries_fixed"]
+    #             for place in summary["summary"].places
+    #         ]
+    #     )
+    #     .group_by("place")
+    #     .agg(
+    #         pl.col("place").len().alias("Count"),
+    #         pl.col("sentiment").mean().alias("Mean Sentiment"),
+    #     )
+    #     .rename({"place": "Place"})
+    # )
+    #
+    # map_locations(places_df)
+    #
+    # places_breakdown = (
+    #     places_df.sort("Count", descending=True)
+    #     .head()
+    #     .to_pandas()
+    #     .to_markdown(index=False)
+    # )
+    #
+    # stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
+    # value_counts = Counter(stances)
+    # total_values = sum(value_counts.values())
+    # percentages = {
+    #     key: {"count": count, "percentage": (count / total_values)}
+    #     for key, count in value_counts.items()
+    # }
+    # stances_top = sorted(
+    #     percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
+    # )
+    # stances_breakdown = " | ".join(
+    #     [
+    #         f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
+    #         for item, stance in stances_top
+    #     ]
+    # )
+    #
+    # short_summaries = "\n\n".join(
+    #     [
+    #         f"#### **TODO**\n"
+    #         f"{summary['summary'].summary}\n\n"
+    #         f"**Stance**: {summary['summary'].stance}\n\n"
+    #         f"**Constructiveness**: {summary['summary'].rating}\n\n"
+    #         for summary in final["summaries_fixed"]
+    #     ]
+    # )
 
     quarto_doc = (
         "---\n"
         "monofontoptions:\n"
        " - Scale=0.55\n"
         "---\n\n"
+        f"{final['final_summary']}\n\n"
+        f"{final['policies']}"
+        # f"{executive_summary}\n\n"
+        # f"{stances_breakdown}\n\n"
+        # "## Aim Breakdown\n\n"
+        # "The aim breakdown identifies which aims are mentioned "
+        # "within each response. "
+        # "A single response may discuss multiple topics.\n"
+        # f"\n\n{thematic_breakdown}\n\n"
+        # f"\n\n{places_breakdown}\n\n"
+        # f"![Locations mentioned by sentiment](./figs/places.png)\n\n"
+        # "## Key points raised in support\n\n"
+        # f"{key_points}\n\n"
+        # "## Summaries\n"
+        # f"{short_summaries}"
     )
 
     with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
         f.write(quarto_doc)
 
 
+def read_docs():
+    df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
+    df = df.filter(
+        pl.col("representations_document") == "Local Plan Issues and Options Report"
+    ).unique("id")
+    loader = PolarsDataFrameLoader(df, page_content_column="text")
+
+    docs = list(
+        {
+            doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
+            for doc in loader.load()
+            if doc.page_content and len(doc.page_content.split(" ")) > 5
+        }.values()
     )
+    return docs
+
+
+def main():
+    docs = read_docs()
+    n_docs = len(docs)
+
+    logging.warning(f"{n_docs} documents being processed!")
+
+    # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
+    #     chunk_size=10_240, chunk_overlap=0
+    # )
+    # split_docs = text_splitter.split_documents(docs)
 
     app = create_graph()
 
     step = None
+    for step in app.stream({"documents": docs, "n_docs": n_docs}):
+        print(step.keys())
 
     if step is None:
         raise ValueError("No steps were processed!")
     return step
 
 
 
     tic = time.time()
     out = main()
+    build_quarto_doc(doc_title, out)
+    print(out["generate_final_summary"]["final_summary"])
+
     toc = time.time()
 
    print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
planning_ai/nodes/hallucination_node.py CHANGED
@@ -1,12 +1,31 @@
-from langgraph.constants import Send
 
-from planning_ai.chains.fix_chain import fix_chain
 from planning_ai.chains.hallucination_chain import (
     HallucinationChecker,
     hallucination_chain,
 )
 from planning_ai.states import DocumentState, OverallState
 
 
 def check_hallucination(state: DocumentState):
     """Checks for hallucinations in the summary of a document.
@@ -23,25 +42,29 @@ def check_hallucination(state: DocumentState):
         dict: A dictionary containing either a list of fixed summaries or hallucinations
         that need to be addressed.
     """
-    if state["iteration"] > 5:
-        state["iteration"] = -99
-        return {"summaries_fixed": [state]}
-
-    response: HallucinationChecker = hallucination_chain.invoke(
-        {"document": state["document"], "summary": state["summary"]}
-    )  # type: ignore
     if response.score == 1:
-        return {"summaries_fixed": [state]}
 
     return {
-        "hallucinations": [
-            {
-                "hallucination": response,
-                "document": state["document"],
-                "filename": state["filename"],
-                "summary": state["summary"],
-                "iteration": state["iteration"] + 1,
-            }
         ]
     }
 
@@ -60,7 +83,7 @@ def map_hallucinations(state: OverallState):
         list: A list of Send objects directing each summary to the check_hallucination
         function.
     """
-    return [Send("check_hallucination", summary) for summary in state["summaries"]]
 
 
 def fix_hallucination(state: DocumentState):
@@ -77,24 +100,24 @@ def fix_hallucination(state: DocumentState):
         dict: A dictionary containing the updated summaries after attempting to fix
         hallucinations.
     """
-    response = fix_chain.invoke(
-        {
-            "context": state["document"],
-            "summary": state["summary"],
-            "explanation": state["hallucination"],
-        }
-    )
-    state["summary"] = response  # type: ignore
-    return {
-        "summaries": [
             {
-                "document": state["document"],
-                "filename": state["filename"],
-                "summary": state["summary"],
-                "iteration": state["iteration"],
             }
-        ]
-    }
 
 
 def map_fix_hallucinations(state: OverallState):
@@ -112,11 +135,11 @@ def map_fix_hallucinations(state: OverallState):
         fix_hallucination function.
     """
     hallucinations = []
-    if "hallucinations" in state:
         hallucinations = [
-            hallucination
-            for hallucination in state["hallucinations"]
-            if hallucination["hallucination"].score != 1
         ]
     return [
         Send("fix_hallucination", hallucination) for hallucination in hallucinations
+import json
+import logging
 
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
+from pydantic import BaseModel
+
+from planning_ai.chains.fix_chain import fix_template
 from planning_ai.chains.hallucination_chain import (
     HallucinationChecker,
     hallucination_chain,
 )
+from planning_ai.chains.map_chain import create_dynamic_map_chain
 from planning_ai.states import DocumentState, OverallState
 
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
+ITERATIONS = 2
+
 
 def check_hallucination(state: DocumentState):
     """Checks for hallucinations in the summary of a document.
         dict: A dictionary containing either a list of fixed summaries or hallucinations
         that need to be addressed.
     """
+    logger.warning(f"Checking hallucinations for document {state['filename']}")
+    # Stop trying after 2 iterations
+    if state["iteration"] > ITERATIONS:
+        state["iteration"] = 99
+        state["hallucination"].score = 1
+        return {"documents": [state]}
+
+    try:
+        response = hallucination_chain.invoke(
+            {"document": state["document"], "summary": state["summary"].summary}
+        )
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
     if response.score == 1:
+        return {"documents": [{**state, "hallucination": response}]}
 
     return {
+        "documents": [
+            {**state, "hallucination": response, "iteration": state["iteration"] + 1}
         ]
     }
 
         list: A list of Send objects directing each summary to the check_hallucination
         function.
     """
+    return [Send("check_hallucination", document) for document in state["documents"]]
 
 
 def fix_hallucination(state: DocumentState):
         dict: A dictionary containing the updated summaries after attempting to fix
         hallucinations.
     """
+    logger.warning(f"Fixing hallucinations for document {state['filename']}")
+    fix_chain = create_dynamic_map_chain(state["themes"], fix_template)
+    try:
+        response = fix_chain.invoke(
             {
+                "context": state["document"],
+                "summary": state["summary"].summary,
+                "explanation": state["hallucination"].explanation,
             }
+        )
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+    state["summary"] = response  # type: ignore
+    return {"documents": [state]}
 
 
 def map_fix_hallucinations(state: OverallState):
         fix_hallucination function.
     """
     hallucinations = []
+    if "documents" in state:
         hallucinations = [
+            document
+            for document in state["documents"]
+            if document["hallucination"].score != 1
         ]
     return [
         Send("fix_hallucination", hallucination) for hallucination in hallucinations
planning_ai/nodes/map_node.py CHANGED
@@ -1,16 +1,89 @@
 import json
 from pathlib import Path
 
-from langgraph.constants import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
 
-from planning_ai.chains.map_chain import map_chain
 from planning_ai.common.utils import Paths
 from planning_ai.states import DocumentState, OverallState
 
-anonymizer = AnonymizerEngine()
 analyzer = AnalyzerEngine()
 
 
 def remove_pii(document: str) -> str:
@@ -25,12 +98,14 @@ def remove_pii(document: str) -> str:
     Returns:
         str: The document text with PII anonymized.
     """
     results = analyzer.analyze(
         text=document,
         entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
         language="en",
     )
-    document = anonymizer.anonymize(text=document, analyzer_results=results)
     return document
 
 
@@ -47,43 +122,27 @@ def generate_summary(state: DocumentState) -> dict:
     Returns:
         dict: A dictionary containing the generated summary and updated document state.
     """
-    state["document"] = remove_pii(state["document"])
-    response = map_chain.invoke({"context": state["document"]})
-    summary = response.summary
-    themes = [theme.value for theme in response.themes]
-    policies = [policy.dict() for policy in response.policies]
-
-    out_policies = []
-    for theme in policies:
-        name = theme["theme"].value
-        policy_list = theme["policies"]
-        out_policies.append({"theme": name, "policies": policy_list})
-
-    out_places = []
-    for place in response.places:
-        name = place.place
-        sentiment = place.sentiment.value
-        out_places.append({"place": name, "sentiment": sentiment})
-
-    save_output = {
-        "summary": summary,
-        "themes": themes,
-        "policies": out_policies,
-        "places": out_places,
-    }
-
-    outfile = f"{Path(state["filename"]).stem}_summary.json"
-    with open(Paths.SUMMARIES / outfile, "w") as file:
-        json.dump(save_output, file, indent=4)
-
-    output = {
-        "summary": response,
-        "document": state["document"],
-        "filename": str(state["filename"]),
-        "iteration": 1,
-    }
-
-    return {"summaries": [output]}
 
 
 def map_summaries(state: OverallState) -> list[Send]:
@@ -99,10 +158,5 @@ def map_summaries(state: OverallState) -> list[Send]:
         list: A list of Send objects directing each document to the `generate_summary`
         function.
     """
-    return [
-        Send(
-            "generate_summary",
-            {"document": document, "filename": filename},
-        )
-        for document, filename in zip(state["documents"], state["filenames"])
-    ]
 import json
+import logging
 from pathlib import Path
+from typing import TypedDict
 
+import spacy
+from langchain_core.exceptions import OutputParserException
+from langgraph.types import Send
 from presidio_analyzer import AnalyzerEngine
 from presidio_anonymizer import AnonymizerEngine
+from pydantic import BaseModel, ValidationError
 
+from planning_ai.chains.hallucination_chain import HallucinationChecker
+from planning_ai.chains.map_chain import create_dynamic_map_chain, map_template
 from planning_ai.common.utils import Paths
+from planning_ai.retrievers.theme_retriever import grade_chain, theme_retriever
 from planning_ai.states import DocumentState, OverallState
 
+logging.basicConfig(
+    level=logging.WARN, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+class BasicSummaryBroken(BaseModel):
+    summary: str
+    policies: None
+
+
 analyzer = AnalyzerEngine()
+anonymizer = AnonymizerEngine()
+
+nlp = spacy.load("en_core_web_lg")
+
+
+def retrieve_themes(state: DocumentState) -> dict:
+    theme_documents = theme_retriever.invoke(input=state["document"].page_content)
+
+    # TODO: add something similar but more efficient?
+    grade_scores = []
+    for doc in theme_documents:
+        try:
+            score = grade_chain.invoke(
+                {
+                    "context": doc.page_content,
+                    "document": state["document"].page_content,
+                }
+            ).binary_score
+        except (OutputParserException, json.JSONDecodeError) as e:
+            logger.error(f"Failed to decode JSON: {e}.\n Setting to 'no'")
+            score = "no"
+        grade_scores.append(score)
+
+    theme_documents = [
+        doc for doc, include in zip(theme_documents, grade_scores) if include == "yes"
+    ]
+
+    # TODO: Add metadata to this as string?
+    theme_documents_text = "\n\n".join([d.page_content for d in theme_documents])
+
+    # state["document"].page_content = (
+    #     f"{state['document'].page_content}\n\n"
+    #     f"Related Information:\n\n{theme_documents_text}"
+    # )
+    state["theme_docs"] = theme_documents
+    state["themes"] = {doc.metadata["theme"] for doc in theme_documents}
+
+    logger.warning(f"Retrieved relevant theme documents for: {state['filename']}")
+    return {"documents": [state]}
+
+
+def map_retrieve_themes(state: OverallState) -> list[Send]:
+    logger.warning("Mapping documents to retrieve themes.")
+    return [Send("retrieve_themes", document) for document in state["documents"]]
+
+
+def add_entities(state: OverallState) -> OverallState:
+    for idx, document in enumerate(
+        nlp.pipe(
+            [doc["document"].page_content for doc in state["documents"]],
+        )
+    ):
+        state["documents"][idx]["entities"] = [
+            {"entity": ent.text, "label": ent.label_} for ent in document.ents
+        ]
+    return state
 
 
 def remove_pii(document: str) -> str:
     Returns:
         str: The document text with PII anonymized.
     """
+    logger.warning("Starting PII removal.")
     results = analyzer.analyze(
         text=document,
         entities=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS"],
         language="en",
     )
+    document = anonymizer.anonymize(text=document, analyzer_results=results).text
+    logger.warning("PII removal completed.")
     return document
 
 
     Returns:
         dict: A dictionary containing the generated summary and updated document state.
     """
+    logger.warning(f"Generating summary for document: {state['filename']}")
+
+    state["document"].page_content = remove_pii(state["document"].page_content)
+    if not state["themes"]:
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    map_chain = create_dynamic_map_chain(themes=state["themes"], prompt=map_template)
+    try:
+        response = map_chain.invoke({"context": state["document"].page_content})
+    except (OutputParserException, json.JSONDecodeError) as e:
+        logger.error(f"Failed to decode JSON: {e}.")
+        state["iteration"] = 99
+        state["hallucination"] = HallucinationChecker(score=1, explanation="INVALID")
+        state["summary"] = BasicSummaryBroken(summary="INVALID", policies=None)
+        return {"documents": [state]}
+
+    logger.warning(f"Summary generation completed for document: {state['filename']}")
+    return {"documents": [{**state, "summary": response, "iteration": 1}]}
 
 
 def map_summaries(state: OverallState) -> list[Send]:
         list: A list of Send objects directing each document to the `generate_summary`
         function.
     """
+    logger.warning("Mapping documents to generate summaries.")
+    return [Send("generate_summary", document) for document in state["documents"]]
planning_ai/nodes/reduce_node.py CHANGED
@@ -1,5 +1,36 @@
 from planning_ai.chains.reduce_chain import reduce_chain
 from planning_ai.states import OverallState
 
 
 def generate_final_summary(state: OverallState):
@@ -18,17 +49,87 @@ def generate_final_summary(state: OverallState):
         dict: A dictionary containing the final summary, along with the original
         documents, summaries, fixed summaries, and hallucinations.
     """
-    if len(state["documents"]) == len(state["summaries_fixed"]):
         summaries = [
-            str(summary["summary"])
-            for summary in state["summaries_fixed"]
-            if summary["summary"].stance != "NEUTRAL" and summary["summary"].rating >= 5
         ]
-        response = reduce_chain.invoke({"context": summaries})
         return {
-            "final_summary": response,
-            "summaries_fixed": state["summaries_fixed"],
-            "summaries": state["summaries"],
-            "hallucinations": state["hallucinations"],
-            "documents": state["documents"],
         }
+import json
+import logging
+from pathlib import Path
+
+import polars as pl
+
+from planning_ai.chains.policy_chain import policy_chain
 from planning_ai.chains.reduce_chain import reduce_chain
 from planning_ai.states import OverallState
+from planning_ai.themes import THEMES_AND_POLICIES
+
+logging.basicConfig(
+    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+
+# TODO: Reduce down the grouping of policies. I.e. combine points that are closely related and add citations.
+def markdown_bullets(summaries):
+    policies = {"themes": [], "policies": [], "details": []}
+    for summary in summaries:
+        if not summary["summary"].policies:
+            continue
+        for policy in summary["summary"].policies:
+            for theme, p in THEMES_AND_POLICIES.items():
+                if policy.policy.name in p:
+                    policies["themes"].append(theme)
+                    policies["policies"].append(policy.policy.name)
+                    policies["details"].append(policy.note)
+    df = pl.DataFrame(policies)
+
+    grouped = df.group_by(["themes", "policies"]).agg(pl.col("details"))
+    return grouped
 
 
 def generate_final_summary(state: OverallState):
         dict: A dictionary containing the final summary, along with the original
         documents, summaries, fixed summaries, and hallucinations.
     """
+    logger.warning("Generating final summary")
+    final_docs = [doc for doc in state["documents"] if doc["hallucination"].score == 1]
+    logger.warning(f"Number of final docs: {len(final_docs)}")
+
+    if len(final_docs) == state["n_docs"]:
         summaries = [
+            # f"Document ID: [{id}]\n\n{doc["summary"].summary}"
+            doc
+            for id, doc in zip(range(state["n_docs"]), final_docs)
+            if doc["summary"].summary != "INVALID"
+            and doc["themes"] != set()
+            and doc["iteration"] != 99
         ]
+
+        out = []
+        for doc in summaries:
+            summary = doc["summary"].model_dump()
+            if summary["policies"] is not None:
+                policies = [
+                    {"policy": policy["policy"].name, "note": policy["note"]}
+                    for policy in summary["policies"]
+                ]
+            else:
+                policies = []
+            summary = summary["summary"]
+            out.append(
+                {
+                    "document": doc["document"].model_dump()["page_content"],
+                    "filename": doc["filename"],
+                    "entities": doc["entities"],
+                    "theme_docs": [d.model_dump() for d in doc["theme_docs"]],
+                    "themes": list(doc["themes"]),
+                    "summary": summary,
+                    "policies": policies,
+                    "iteration": doc["iteration"],
+                    "hallucination": doc["hallucination"].model_dump(),
+                }
+            )
+
+        for doc in out:
+            filename = Path(str(doc["filename"])).stem
+            with open(f"data/out/summaries/{filename}.json", "w") as f:
+                json.dump(doc, f)
+
+        summaries_text = [s["summary"].summary for s in summaries]
+        final_responses = []
+        batch_size = 50
+        for i in range(0, len(summaries_text), batch_size):
+            logger.warning("Processing batches.")
+            batch = summaries_text[i : i + batch_size]
+            response = reduce_chain.invoke({"context": batch})
+            final_responses.append(response)
+
+        final_response = reduce_chain.invoke({"context": "\n\n".join(final_responses)})
+        pols = markdown_bullets(summaries)
+
+        pol_out = []
+        for _, policy in pols.group_by(["themes", "policies"]):
+            logger.warning("Processing policies.")
+            bullets = "* " + "* \n".join(policy["details"][0])
+            pchain_out = policy_chain.invoke(
+                {"policy": policy["policies"][0], "bullet_points": bullets}
+            )
+            pol_out.append(
+                {
+                    "theme": policy["themes"][0],
+                    "policy": policy["policies"][0],
+                    "points": pchain_out,
+                }
+            )
+
+        themes = ""
+        for theme, policies in pl.DataFrame(pol_out).group_by("theme"):
+            themes += f"# {theme[0]}\n\n"
+            for row in policies.iter_rows(named=True):
+                themes += f"\n## {row['policy']}\n\n"
+                themes += f"{row['points']}\n"
+            themes += "\n"
+
         return {
+            "final_summary": final_response,
+            "documents": final_docs,
+            "policies": themes,
         }
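
To make the aggregation concrete, here is a hypothetical (theme, policy) group produced by markdown_bullets and the markdown it is rendered into by generate_final_summary; all values are invented for illustration:

# One row of the grouped DataFrame returned by markdown_bullets:
#   themes: "Homes", policies: "Affordable housing",
#   details: ["More affordable units are needed", "Concern about scheme viability"]
#
# After policy_chain rewrites the bullet points, the themes string contains:
#
#   # Homes
#
#   ## Affordable housing
#
#   * More affordable units are needed
#   * Concern about scheme viability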
planning_ai/preprocessing/gcpt3.py CHANGED
@@ -1,4 +1,5 @@
 import logging
 from pathlib import Path
 from typing import Any
 
@@ -59,14 +60,13 @@ def process_files(files: list[Path], schema: dict[str, Any]) -> None:
 
 def download_attachments():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
-    existing_files = {int(f.stem) for f in (Paths.RAW / "pdfs").glob("*.pdf")}
     failed_files = set()
-
     failed_file_path = Paths.RAW / "failed_downloads.txt"
     if failed_file_path.exists():
         with open(failed_file_path, "r") as file:
-            failed_files = set(int(l) for l in file.read().splitlines())
 
     for row in tqdm(
         df.drop_nulls(subset="attachments_id")
@@ -74,52 +74,64 @@ def download_attachments():
         .sample(shuffle=True, fraction=1)
         .rows(named=True)
     ):
-        attachment_id = int(row["attachments_id"])
-
-        if attachment_id in existing_files or attachment_id in failed_files:
-            print(f"Skipping {attachment_id} (already exists or previously failed)")
-            continue
         if (
-            row["attachments_url"].endswith(".pdf")
-            and not row["attachments_url"].startswith("https://egov.scambs.gov.uk")
-            and not row["attachments_url"].startswith("http://egov.scambs.gov.uk")
         ):
-            file_path = Paths.RAW / "pdfs" / f"{attachment_id}.pdf"
-            try:
-                response = requests.get(row["attachments_url"], timeout=10)
-                response.raise_for_status()
-
-                with open(file_path, "wb") as f:
-                    f.write(response.content)
-                print(f"Downloaded {attachment_id} to {file_path}")
-
-            except requests.RequestException as e:
-                logging.error(f"RequestException for {attachment_id}: {e}")
-                failed_files.add(attachment_id)
-                with open(failed_file_path, "a") as file:
-                    file.write(f"{attachment_id}\n")
-                print(f"Skipping {attachment_id} due to error: {e}")
-
-            except Exception as e:
-                logging.error(f"Unexpected error for {attachment_id}: {e}")
-                failed_files.add(attachment_id)
-                with open(failed_file_path, "a") as file:
-                    file.write(f"{attachment_id}\n")
-                print(f"Unexpected error for {attachment_id}: {e}")
 
 
 def convert_txt():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
-    # attachment_txt =
 
-    f"{df['text']}\n\nPOSITION: {df['representations_support/object']}"
 
 
 def main() -> None:
     files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
     schema = get_schema()
     process_files(files, schema)
 
 
 if __name__ == "__main__":
1
  import logging
2
+ import textwrap
3
  from pathlib import Path
4
  from typing import Any
5
 
 
60
 
61
  def download_attachments():
62
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
63
+ existing_files = {f.stem for f in (Paths.RAW / "pdfs").glob("*.pdf")}
64
 
 
65
  failed_files = set()
 
66
  failed_file_path = Paths.RAW / "failed_downloads.txt"
67
  if failed_file_path.exists():
68
  with open(failed_file_path, "r") as file:
69
+ failed_files = set(file.read().splitlines())
70
 
71
  for row in tqdm(
72
  df.drop_nulls(subset="attachments_id")
 
74
  .sample(shuffle=True, fraction=1)
75
  .rows(named=True)
76
  ):
 
 
 
 
 
77
  if (
78
+ row["attachments_url"].startswith(
79
+ ("https://egov.scambs.gov.uk", "http://egov.scambs.gov.uk")
80
+ )
81
+ or row["attachments_id"] in existing_files
82
+ or row["attachments_id"] in failed_files
83
  ):
84
+ failed_files.add(row["attachments_id"])
85
+ continue
86
+ file_path = Paths.RAW / "pdfs" / f"{row['attachments_id']}.pdf"
87
+ try:
88
+ response = requests.get(row["attachments_url"], timeout=3)
89
+ response.raise_for_status()
90
+
91
+ with open(file_path, "wb") as f:
92
+ f.write(response.content)
93
+ print(f"Downloaded {row['attachments_url']} to {file_path}")
94
+
95
+ except requests.RequestException as e:
96
+ logging.error(f"RequestException for {row['attachments_url']}: {e}")
97
+ failed_files.add(row["attachments_id"])
98
+ with open(failed_file_path, "a") as file:
99
+ file.write(f"{row['attachments_id']}\n")
100
+ print(f"Skipping {row['attachments_url']} due to error: {e}")
101
+
102
+ except Exception as e:
103
+ logging.error(f"Unexpected error for {row['attachments_url']}: {e}")
105
+ failed_files.add(row["attachments_id"])
106
+ with open(failed_file_path, "a") as file:
107
+ file.write(f"{row['attachments_id']}\n")
108
+ print(f"Unexpected error for {row['attachments_url']}: {e}")
109
 
110
 
111
  def convert_txt():
112
+ # TODO: add pdf content
113
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
114
 
115
+ for response_doc, dfd in df.group_by("representations_document"):
116
+ for row in tqdm(dfd.rows(named=True)):
117
+ text = row["text"]
118
 
119
+ with open(
120
+ Paths.STAGING
121
+ / "txt"
122
+ / f"{response_doc[0]}"
123
+ / f"{row['representations_id']}.txt",
124
+ "w",
125
+ ) as f:
126
+ f.write(text)
127
 
128
 
129
  def main() -> None:
130
  files = list(Path(Paths.RAW / "gcpt3").glob("*.json"))
131
  schema = get_schema()
132
  process_files(files, schema)
133
+ download_attachments()
134
+ convert_txt()
135
 
136
 
137
  if __name__ == "__main__":
planning_ai/preprocessing/process_pdfs.py CHANGED
@@ -2,36 +2,16 @@ import base64
2
  import os
3
  from io import BytesIO
4
 
5
- import cv2
6
- import numpy as np
7
  import requests
8
  from dotenv import load_dotenv
9
  from pdf2image import convert_from_path
 
10
  from tqdm import tqdm
11
 
12
  from planning_ai.common.utils import Paths
13
 
14
  load_dotenv()
15
 
16
- import easyocr
17
- from pdf2image import convert_from_path
18
-
19
- pdf_path = "data/raw/pdfs/25.pdf"
20
- # pdf_path = "../../data/raw/pdfs/26.pdf"
21
- images = convert_from_path(pdf_path)
22
-
23
- reader = easyocr.Reader(lang_list=["en"], gpu=True)
24
-
25
- for i, image in enumerate(images):
26
- results = reader.readtext(np.array(image))
27
- print(f"Page {i+1}:")
28
- confidences = []
29
- for result in results:
30
- confidences.append(result[2])
31
- print(f"Detected text: {result[1]} (confidence: {result[2]:.2f})")
32
-
33
- np.array(confidences).mean()
34
-
35
 
36
  def encode_images_to_base64(images):
37
  image_b64 = []
@@ -61,13 +41,28 @@ def send_request_to_api(messages):
61
  return response.json()
62
 
63
64
  def main():
65
  pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
66
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
67
  ocr_prompt = f.read()
68
 
69
  for file in tqdm(pdfs):
70
- if file.stem:
 
 
71
  images = convert_from_path(file)
72
  image_b64 = encode_images_to_base64(images)
73
 
@@ -79,12 +74,15 @@ def main():
79
  ]
80
 
81
  response = send_request_to_api(messages)
 
 
82
  out = response["choices"][0]["message"]["content"]
83
- outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
84
  if outfile.exists():
85
  continue
86
  with open(outfile, "w") as f:
87
  f.write(out)
 
 
88
 
89
 
90
  if __name__ == "__main__":
 
2
  import os
3
  from io import BytesIO
4
 
 
 
5
  import requests
6
  from dotenv import load_dotenv
7
  from pdf2image import convert_from_path
8
+ from PyPDF2 import PdfReader
9
  from tqdm import tqdm
10
 
11
  from planning_ai.common.utils import Paths
12
 
13
  load_dotenv()
14
 
15
 
16
  def encode_images_to_base64(images):
17
  image_b64 = []
 
41
  return response.json()
42
 
43
 
44
+ def extract_text_from_pdf(file_path):
45
+ """Extracts text from a PDF file using PyPDF2."""
46
+ try:
47
+ reader = PdfReader(file_path, strict=True)
48
+ text = []
49
+ for page in reader.pages:
50
+ text.append(page.extract_text() or "")
51
+ return "\n".join(text).strip()
52
+ except Exception as e:
53
+ print(e)
54
+ return None
55
+
56
+
57
  def main():
58
  pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
59
  with open("planning_ai/preprocessing/prompts/ocr.txt", "r") as f:
60
  ocr_prompt = f.read()
61
 
62
  for file in tqdm(pdfs):
63
+ outfile = Paths.STAGING / "pdfs" / f"{file.stem}.txt"
64
+
65
+ try:
66
  images = convert_from_path(file)
67
  image_b64 = encode_images_to_base64(images)
68
 
 
74
  ]
75
 
76
  response = send_request_to_api(messages)
77
+ if "choices" not in response:
78
+ continue
79
  out = response["choices"][0]["message"]["content"]
 
80
  if outfile.exists():
81
  continue
82
  with open(outfile, "w") as f:
83
  f.write(out)
84
+ except Exception:
85
+ continue
86
 
87
 
88
  if __name__ == "__main__":
planning_ai/report.py ADDED
@@ -0,0 +1,30 @@
1
+ # build_quarto_doc(doc_title, out)
2
+ #
3
+ # d = [
4
+ # i for i in out["generate_final_summary"]["summaries_fixed"] if i["iteration"] == 4
5
+ # ][0]
6
+ # d["document"]
7
+ #
8
+ # h = [
9
+ # i["summary"].summary
10
+ # for i in out["generate_final_summary"]["hallucinations"]
11
+ # if i["document"] == d["document"]
12
+ # ]
13
+ #
14
+ # e = [
15
+ # i["hallucination"].explanation
16
+ # for i in out["generate_final_summary"]["hallucinations"]
17
+ # if i["document"] == d["document"]
18
+ # ]
19
+ #
20
+ # test = {
21
+ # "document": d["document"],
22
+ # "final_summary": d["summary"].summary,
23
+ # "attempts": h,
24
+ # "reasoning": e,
25
+ # }
26
+ #
27
+ # print(f"Document:\n\n{test['document']}\n\n")
28
+ # print(f"Final:\n\n{test['final_summary']}\n\n")
29
+ # print("Attempts: \n\n*", "\n\n* ".join(test["attempts"]), "\n\n")
30
+ # print("Reasoning: \n\n*", "\n\n* ".join(test["reasoning"]), "\n\n")
planning_ai/retrievers/theme_retriever.py ADDED
@@ -0,0 +1,84 @@
1
+ import logging
2
+ from pathlib import Path
3
+
4
+ from chromadb import PersistentClient
5
+ from langchain_community.document_loaders import PyPDFLoader
6
+ from langchain_community.vectorstores import Chroma
7
+ from langchain_core.prompts import PromptTemplate
8
+ from langchain_openai import OpenAIEmbeddings
9
+ from pydantic import BaseModel, Field
10
+
11
+ from planning_ai.llms.llm import LLM
12
+
13
+ # See: https://consultations.greatercambridgeplanning.org/greater-cambridge-local-plan-preferred-options/supporting-documents
14
+
15
+ PDFS = {
16
+ "Biodiversity and Green Spaces": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPBiodiversityandGreenSpacesAug21v2Nov21_0.pdf",
17
+ "Climate Change": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPClimateChangeAug21v2Nov21_0.pdf",
18
+ "Great Places": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPGreatPlacesAug21v1Aug21.pdf",
19
+ "Homes": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPHomesAug21v2Nov21.pdf",
20
+ "Infrastructure": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPInfrastructureAug21v2Nov21.pdf",
21
+ "Jobs": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPJobsAug21v2Nov21.pdf",
22
+ # "Strategy topic paper": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPStrategyAug21v3Nov21_0.pdf",
23
+ "Wellbeing and Social Inclusion": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPWellbeingAug21v2Nov21.pdf",
24
+ }
25
+
26
+
27
+ class Grade(BaseModel):
28
+ """Binary score for relevance check."""
29
+
30
+ binary_score: str = Field(description="Relevance score 'yes' or 'no'")
31
+
32
+
33
+ def create_db():
34
+ chroma_dir = Path("./chroma_themesdb")
35
+ if chroma_dir.exists():
36
+ persistent_client = PersistentClient(path="./chroma_themesdb")
37
+ vectorstore = Chroma(
38
+ client=persistent_client,
39
+ collection_name="themes-chroma",
40
+ embedding_function=OpenAIEmbeddings(),
41
+ )
42
+
43
+ else:
44
+ docs = []
45
+ for name, pdf in PDFS.items():
46
+ doc = PyPDFLoader(pdf).load()[5:]
47
+ for d in doc:
48
+ d.metadata["theme"] = name
49
+ docs.extend(doc)
50
+
51
+ logging.warning("Building ChromaDB...")
52
+ vectorstore = Chroma.from_documents(
53
+ documents=docs,
54
+ collection_name="themes-chroma",
55
+ embedding=OpenAIEmbeddings(),
56
+ persist_directory="./chroma_themesdb",
57
+ )
58
+ return vectorstore
59
+
60
+
61
+ grade_template = PromptTemplate(
62
+ template="""You are a grader assessing relevance of a retrieved document to a user question. \n
63
+ Here is the retrieved document: \n\n {context} \n\n
64
+ Here is the original document: {document} \n
65
+ If the retrieved document contains keyword(s) or semantic meaning related to the original, grade it as relevant. \n
66
+ Give a binary 'yes' or 'no' score to indicate whether the retrieved document is relevant to the original."""
67
+ input_variables=["context", "document"],
68
+ )
69
+
70
+
71
+ SLLM = LLM.with_structured_output(Grade, strict=True)
72
+ grade_chain = grade_template | SLLM
73
+
74
+ vectorstore = create_db()
75
+ theme_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
76
+ logging.warning("Finished building ChromaDB...")
77
+
78
+ if __name__ == "__main__":
79
+ test_content = """
80
+ We would certainly support this and would emphasise the importance of trying
81
+ to solve the severance problems created by the M11 and A14.
82
+ """
83
+
84
+ print(len(theme_retriever.invoke(input=test_content)))
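Putting the retriever and grader together, assigning themes to a single response might look roughly like the sketch below, assuming the `theme_retriever` and `grade_chain` objects defined above; the relevance rule is illustrative:

```python
def themes_for_document(document: str) -> set[str]:
    """Collect the themes of retrieved topic-paper chunks the grader marks as relevant."""
    themes: set[str] = set()
    for chunk in theme_retriever.invoke(input=document):
        grade = grade_chain.invoke({"context": chunk.page_content, "document": document})
        if grade.binary_score.lower() == "yes":
            themes.add(chunk.metadata["theme"])
    return themes
```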
planning_ai/states.py CHANGED
@@ -1,9 +1,39 @@
1
  import operator
2
  from pathlib import Path
3
- from typing import Annotated, List, TypedDict
 
 
 
4
 
5
  from planning_ai.chains.hallucination_chain import HallucinationChecker
6
- from planning_ai.chains.map_chain import BriefSummary
7
 
8
 
9
  class OverallState(TypedDict):
@@ -23,35 +53,17 @@ class OverallState(TypedDict):
23
  iterations (list[int]): A list of iteration counts for processing each document.
24
  """
25
 
26
- documents: list[str]
27
-
28
- final_summary: str
29
- summaries: Annotated[list, operator.add]
30
- summaries_fixed: Annotated[list, operator.add]
31
- hallucinations: Annotated[list, operator.add]
32
-
33
- filenames: List[Path]
34
- iterations: list[int]
35
-
36
-
37
- class DocumentState(TypedDict):
38
- """Represents the state of an individual document during processing.
39
-
40
- This class is a TypedDict that encapsulates the state of a single document
41
- during the processing workflow. It includes the document text, summary,
42
- hallucination details, filename, and iteration count.
43
-
44
- Attributes:
45
- document (str): The text of the document.
46
- summary (BriefSummary): The summary of the document.
47
- hallucination (HallucinationChecker): The hallucination details for the document's summary.
48
- filename (Path): The file path of the document.
49
- iteration (int): The current iteration count for processing the document.
50
- """
51
 
52
- document: str
53
- summary: BriefSummary
54
- hallucination: HallucinationChecker
55
 
56
- filename: Path
57
- iteration: int
1
  import operator
2
  from pathlib import Path
3
+ from typing import Annotated, List, Optional, TypedDict
4
+
5
+ from langchain_core.documents import Document
6
+ from pydantic import BaseModel
7
 
8
  from planning_ai.chains.hallucination_chain import HallucinationChecker
9
+ from planning_ai.common.utils import filename_reducer
10
+
11
+
12
+ class DocumentState(TypedDict):
13
+ """Represents the state of an individual document during processing.
14
+
15
+ This class is a TypedDict that encapsulates the state of a single document
16
+ during the processing workflow. It includes the document text, summary,
17
+ hallucination details, filename, and iteration count.
18
+
19
+ Attributes:
20
+ document (str): The text of the document.
21
+ summary (BriefSummary): The summary of the document.
22
+ hallucination (HallucinationChecker): The hallucination details for the document's summary.
23
+ filename (Path): The file path of the document.
24
+ iteration (int): The current iteration count for processing the document.
25
+ """
26
+
27
+ document: Document
28
+ filename: Path
29
+
30
+ entities: list[dict]
31
+ themes: set[str]
32
+ summary: BaseModel
33
+ theme_docs: list[Document]
34
+ hallucination: HallucinationChecker
35
+
36
+ iteration: int
37
 
38
 
39
  class OverallState(TypedDict):
 
53
  iterations (list[int]): A list of iteration counts for processing each document.
54
  """
55
 
56
+ documents: Annotated[list[DocumentState], filename_reducer]
57
58
 
59
+ final_summary: Optional[str]
 
 
60
 
61
+ # documents_related: Annotated[list, operator.add]
62
+ # related_theme_docs: Annotated[list, operator.add]
63
+ #
64
+ # summaries: Annotated[list, operator.add]
65
+ # summaries_fixed: Annotated[list, operator.add]
66
+ # hallucinations: Annotated[list, operator.add]
67
+ #
68
+ # filenames: Annotated[list, operator.add]
69
+ # iterations: list[int]
planning_ai/themes.py CHANGED
@@ -1,134 +1,76 @@
1
- from enum import Enum
2
-
3
- from pydantic import BaseModel, field_validator
4
-
5
-
6
- class Theme(str, Enum):
7
- climate = "Climate change"
8
- biodiversity = "Biodiversity and green spaces"
9
- wellbeing = "Wellbeing and social inclusion"
10
- great_places = "Great places"
11
- jobs = "Jobs"
12
- homes = "Homes"
13
- infrastructure = "Infrastructure"
14
-
15
-
16
- class ClimatePolicies(str, Enum):
17
- CC_NZ = "Net zero carbon new buildings"
18
- CC_WE = "Water efficiency in new developments"
19
- CC_DC = "Designing for a changing climate"
20
- CC_FM = "Flooding and integrated water management"
21
- CC_RE = "Renewable energy projects and infrastructure"
22
- CC_CE = "Reducing waste and supporting the circular economy"
23
- CC_CS = "Supporting land-based carbon sequestration"
24
-
25
-
26
- class BiodiversityPolicies(str, Enum):
27
- BG_BG = "Biodiversity and geodiversity"
28
- BG_GI = "Green infrastructure"
29
- BG_TC = "Improving Tree Canopy Cover and the Tree Population"
30
- BG_RC = "River corridors"
31
- BG_PO = "Protecting open spaces"
32
- BG_EO = "Providing and enhancing open spaces"
33
-
34
-
35
- class WellbeingPolicies(str, Enum):
36
- WS_HD = "Creating healthy new developments"
37
- WS_CF = "Community, sports and leisure facilities"
38
- WS_MU = "Meanwhile uses during long term redevelopments"
39
- WS_IO = "Creating inclusive employment and business opportunities through new developments"
40
- WS_HS = "Pollution, health and safety"
41
-
42
-
43
- class GreatPlacesPolicies(str, Enum):
44
- GP_PP = "People and place responsive design"
45
- GP_LC = "Protection and enhancement of landscape character"
46
- GP_GB = "Protection and enhancement of the Cambridge Green Belt"
47
- GP_QD = "Achieving high quality development"
48
- GP_QP = "Establishing high quality landscape and public realm"
49
- GP_HA = "Conservation and enhancement of heritage assets"
50
- GP_CC = "Adapting heritage assets to climate change"
51
- GP_PH = "Protection of public houses"
52
-
53
-
54
- class JobsPolicies(str, Enum):
55
- J_NE = "New employment and development proposals"
56
- J_RE = "Supporting the rural economy"
57
- J_AL = "Protecting the best agricultural land"
58
- J_PB = "Protecting existing business space"
59
- J_RW = "Enabling remote working"
60
- J_AW = "Affordable workspace and creative industries"
61
- J_EP = "Supporting a range of facilities in employment parks"
62
- J_RC = "Retail and centres"
63
- J_VA = "Visitor accommodation, attractions and facilities"
64
- J_FD = "Faculty development and specialist / language schools"
65
-
66
-
67
- class HomesPolicies(str, Enum):
68
- H_AH = "Affordable housing"
69
- H_ES = "Exception sites for affordable housing"
70
- H_HM = "Housing mix"
71
- H_HD = "Housing density"
72
- H_GL = "Garden land and subdivision of existing plots"
73
- H_SS = "Residential space standards and accessible homes"
74
- H_SH = "Specialist housing and homes for older people"
75
- H_CB = "Self and custom build homes"
76
- H_BR = "Build to rent homes"
77
- H_MO = "Houses in multiple occupation (HMOs)"
78
- H_SA = "Student accommodation"
79
- H_DC = "Dwellings in the countryside"
80
- H_RM = "Residential moorings"
81
- H_RC = "Residential caravan sites"
82
- H_GT = "Gypsy and Traveller and Travelling Showpeople sites"
83
- H_CH = "Community-led housing"
84
-
85
-
86
- class InfrastructurePolicies(str, Enum):
87
- I_ST = "Sustainable transport and connectivity"
88
- I_EV = "Parking and electric vehicles"
89
- I_FD = "Freight and delivery consolidation"
90
- I_SI = "Safeguarding important infrastructure"
91
- I_AD = "Aviation development"
92
- I_EI = "Energy infrastructure masterplanning"
93
- I_ID = "Infrastructure and delivery"
94
- I_DI = "Digital infrastructure"
95
-
96
-
97
- THEME_TO_POLICY_GROUP = {
98
- Theme.climate: ClimatePolicies,
99
- Theme.biodiversity: BiodiversityPolicies,
100
- Theme.wellbeing: WellbeingPolicies,
101
- Theme.great_places: GreatPlacesPolicies,
102
- Theme.jobs: JobsPolicies,
103
- Theme.homes: HomesPolicies,
104
- Theme.infrastructure: InfrastructurePolicies,
105
  }
106
-
107
-
108
- class PolicyDetail(BaseModel):
109
- policy: str
110
- details: list[str]
111
-
112
-
113
- class PolicySelection(BaseModel):
114
- theme: Theme
115
- policies: list[PolicyDetail]
116
-
117
- @field_validator("policies", mode="before")
118
- @classmethod
119
- def validate_policies(cls, policies, info):
120
- """Ensure policies match the selected theme."""
121
- if not isinstance(policies, list):
122
- raise ValueError("Policies must be provided as a list.")
123
-
124
- theme = info.data.get("theme")
125
- if not theme:
126
- raise ValueError("Theme must be provided before validating policies.")
127
-
128
- allowed_policies = [p.value for p in THEME_TO_POLICY_GROUP[theme]]
129
- for policy in policies:
130
- if policy["policy"] not in allowed_policies:
131
- raise ValueError(
132
- f"Policy '{policy['policy']}' is not valid for theme '{theme.value}'."
133
- )
134
- return policies
 
1
+ THEMES_AND_POLICIES = {
2
+ "Climate Change": [
3
+ "Net zero carbon new buildings",
4
+ "Water efficiency in new developments",
5
+ "Designing for a changing climate",
6
+ "Flooding and integrated water management",
7
+ "Renewable energy projects and infrastructure",
8
+ "Reducing waste and supporting the circular economy",
9
+ "Supporting land-based carbon sequestration",
10
+ ],
11
+ "Biodiversity and Green Spaces": [
12
+ "Biodiversity and geodiversity",
13
+ "Green infrastructure",
14
+ "Improving Tree Canopy Cover and the Tree Population",
15
+ "River corridors",
16
+ "Protecting open spaces",
17
+ "Providing and enhancing open spaces",
18
+ ],
19
+ "Wellbeing and Social Inclusion": [
20
+ "Creating healthy new developments",
21
+ "Community, sports and leisure facilities",
22
+ "Meanwhile uses during long term redevelopments",
23
+ "Creating inclusive employment and business opportunities through new developments",
24
+ "Pollution, health and safety",
25
+ ],
26
+ "Great Places": [
27
+ "People and place responsive design",
28
+ "Protection and enhancement of landscape character",
29
+ "Protection and enhancement of the Cambridge Green Belt",
30
+ "Achieving high quality development",
31
+ "Establishing high quality landscape and public realm",
32
+ "Conservation and enhancement of heritage assets",
33
+ "Adapting heritage assets to climate change",
34
+ "Protection of public houses",
35
+ ],
36
+ "Jobs": [
37
+ "New employment and development proposals",
38
+ "Supporting the rural economy",
39
+ "Protecting the best agricultural land",
40
+ "Protecting existing business space",
41
+ "Enabling remote working",
42
+ "Affordable workspace and creative industries",
43
+ "Supporting a range of facilities in employment parks",
44
+ "Retail and centres",
45
+ "Visitor accommodation, attractions and facilities",
46
+ "Faculty development and specialist / language schools",
47
+ ],
48
+ "Homes": [
49
+ "Affordable housing",
50
+ "Exception sites for affordable housing",
51
+ "Housing mix",
52
+ "Housing density",
53
+ "Garden land and subdivision of existing plots",
54
+ "Residential space standards and accessible homes",
55
+ "Specialist housing and homes for older people",
56
+ "Self and custom build homes",
57
+ "Build to rent homes",
58
+ "Houses in multiple occupation (HMOs)",
59
+ "Student accommodation",
60
+ "Dwellings in the countryside",
61
+ "Residential moorings",
62
+ "Residential caravan sites",
63
+ "Gypsy and Traveller and Travelling Showpeople sites",
64
+ "Community-led housing",
65
+ ],
66
+ "Infrastructure": [
67
+ "Sustainable transport and connectivity",
68
+ "Parking and electric vehicles",
69
+ "Freight and delivery consolidation",
70
+ "Safeguarding important infrastructure",
71
+ "Aviation development",
72
+ "Energy infrastructure masterplanning",
73
+ "Infrastructure and delivery",
74
+ "Digital infrastructure",
75
+ ],
 
76
  }
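Because policies are now selected automatically, `THEMES_AND_POLICIES` can also serve as a validity check on model output. A minimal sketch; the function name is illustrative and not part of this commit:

```python
def invalid_policies(theme: str, policies: list[str]) -> list[str]:
    """Return the selected policies that do not belong to the given theme."""
    allowed = set(THEMES_AND_POLICIES.get(theme, []))
    return [p for p in policies if p not in allowed]

# Example:
# invalid_policies("Homes", ["Affordable housing", "Digital infrastructure"])
# -> ["Digital infrastructure"]
```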