Spaces:
Sleeping
Sleeping
docs: add docstrings
Browse files
Former-commit-id: 911e225c96e47b8d034b22d01b39c349fec66480 [formerly bb5a07ee701317833fadcd936f5a3ca7dd5dc00a]
Former-commit-id: 25e8de686c9a6ffc9d589923e1e53e26e5ee1dbb
- planning_ai/chains/map_chain.py +2 -2
- planning_ai/chains/policy_chain.py +1 -1
- planning_ai/common/utils.py +18 -3
- planning_ai/documents/document.py +41 -3
- planning_ai/eval/compare_summaries.py +30 -1
- planning_ai/graph.py +1 -1
- planning_ai/nodes/hallucination_node.py +18 -0
- planning_ai/nodes/map_node.py +35 -2
- planning_ai/nodes/reduce_node.py +1 -1
- planning_ai/preprocessing/gcpt3.py +0 -1
- planning_ai/preprocessing/prompts/ocr.txt +0 -10
- planning_ai/retrievers/theme_retriever.py +0 -84
- planning_ai/states.py +0 -2
planning_ai/chains/map_chain.py
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
from enum import Enum, auto
|
2 |
-
from typing import
|
3 |
|
4 |
from langchain_core.prompts import ChatPromptTemplate
|
5 |
-
from pydantic import BaseModel,
|
6 |
|
7 |
from planning_ai.common.utils import Paths
|
8 |
from planning_ai.llms.llm import GPT4o
|
|
|
1 |
from enum import Enum, auto
|
2 |
+
from typing import Type
|
3 |
|
4 |
from langchain_core.prompts import ChatPromptTemplate
|
5 |
+
from pydantic import BaseModel, create_model
|
6 |
|
7 |
from planning_ai.common.utils import Paths
|
8 |
from planning_ai.llms.llm import GPT4o
|
planning_ai/chains/policy_chain.py
CHANGED
@@ -36,6 +36,6 @@ if __name__ == "__main__":
|
|
36 |
test_docids = [1, 13, 21]
|
37 |
|
38 |
result = policy_chain.invoke(
|
39 |
-
{"theme": "Climate Change", "policy": test_policy, "details":
|
40 |
)
|
41 |
print(result)
|
|
|
36 |
test_docids = [1, 13, 21]
|
37 |
|
38 |
result = policy_chain.invoke(
|
39 |
+
{"theme": "Climate Change", "policy": test_policy, "details": test_bullet}
|
40 |
)
|
41 |
print(result)
|
planning_ai/common/utils.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
import shutil
|
2 |
from pathlib import Path
|
3 |
|
4 |
import polars as pl
|
@@ -13,6 +12,17 @@ pl.Config(
|
|
13 |
|
14 |
|
15 |
def filename_reducer(docs_a, docs_b):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
if docs_a == []:
|
17 |
return docs_b
|
18 |
b_dict = {d["filename"]: d for d in docs_b}
|
@@ -25,6 +35,10 @@ def filename_reducer(docs_a, docs_b):
|
|
25 |
|
26 |
|
27 |
class Paths:
|
|
|
|
|
|
|
|
|
28 |
DATA = Path("data")
|
29 |
|
30 |
RAW = DATA / "raw"
|
@@ -40,8 +54,9 @@ class Paths:
|
|
40 |
|
41 |
@classmethod
|
42 |
def ensure_directories_exist(cls):
|
43 |
-
|
44 |
-
|
|
|
45 |
for path in [
|
46 |
cls.DATA,
|
47 |
cls.RAW,
|
|
|
|
|
1 |
from pathlib import Path
|
2 |
|
3 |
import polars as pl
|
|
|
12 |
|
13 |
|
14 |
def filename_reducer(docs_a, docs_b):
|
15 |
+
"""
|
16 |
+
Reduces two lists of document dictionaries by updating docs_a with entries from docs_b
|
17 |
+
based on matching filenames.
|
18 |
+
|
19 |
+
Args:
|
20 |
+
docs_a (list): A list of dictionaries, each containing a "filename" key.
|
21 |
+
docs_b (list): A list of dictionaries, each containing a "filename" key.
|
22 |
+
|
23 |
+
Returns:
|
24 |
+
list: The updated list of dictionaries from docs_a with entries from docs_b.
|
25 |
+
"""
|
26 |
if docs_a == []:
|
27 |
return docs_b
|
28 |
b_dict = {d["filename"]: d for d in docs_b}
|
|
|
35 |
|
36 |
|
37 |
class Paths:
|
38 |
+
"""
|
39 |
+
A utility class for managing directory paths used in the project.
|
40 |
+
"""
|
41 |
+
|
42 |
DATA = Path("data")
|
43 |
|
44 |
RAW = DATA / "raw"
|
|
|
54 |
|
55 |
@classmethod
|
56 |
def ensure_directories_exist(cls):
|
57 |
+
"""
|
58 |
+
Ensures that all necessary directories exist, creating them if necessary.
|
59 |
+
"""
|
60 |
for path in [
|
61 |
cls.DATA,
|
62 |
cls.RAW,
|
planning_ai/documents/document.py
CHANGED
@@ -2,7 +2,6 @@ import logging
|
|
2 |
import re
|
3 |
from collections import Counter
|
4 |
|
5 |
-
|
6 |
import geopandas as gpd
|
7 |
import matplotlib as mpl
|
8 |
import matplotlib.pyplot as plt
|
@@ -36,6 +35,14 @@ WARDS = [
|
|
36 |
|
37 |
|
38 |
def _process_postcodes(final):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
documents = final["documents"]
|
40 |
postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
|
41 |
postcodes = (
|
@@ -52,9 +59,18 @@ def _process_postcodes(final):
|
|
52 |
|
53 |
|
54 |
def _process_policies(final):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
def process_policy_group(policy_group, theme, stance):
|
56 |
details = "".join(
|
57 |
-
f
|
58 |
+ "".join(
|
59 |
f"- {detail} {doc_id}\n"
|
60 |
for detail, doc_id in zip(row["detail"], row["doc_id"])
|
@@ -83,6 +99,14 @@ def _process_policies(final):
|
|
83 |
|
84 |
|
85 |
def _process_stances(final):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
documents = final["documents"]
|
87 |
stances = [
|
88 |
doc["document"].metadata["representations_support/object"] for doc in documents
|
@@ -105,6 +129,14 @@ def _process_stances(final):
|
|
105 |
|
106 |
|
107 |
def _process_themes(final):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
108 |
documents = final["documents"]
|
109 |
themes = Counter(
|
110 |
[theme["theme"].value for doc in documents for theme in doc["themes"]]
|
@@ -121,6 +153,11 @@ def _process_themes(final):
|
|
121 |
|
122 |
|
123 |
def fig_oa(postcodes):
|
|
|
|
|
|
|
|
|
|
|
124 |
oa_lookup = pl.read_csv(
|
125 |
Paths.RAW
|
126 |
/ "Output_Area_to_Local_Authority_District_(April_2023)_Lookup_in_England_and_Wales.csv"
|
@@ -247,7 +284,6 @@ def fig_wards(postcodes):
|
|
247 |
ax=ax,
|
248 |
column="count",
|
249 |
legend=True,
|
250 |
-
vmax=20,
|
251 |
legend_kwds={"label": "Number of Representations"},
|
252 |
)
|
253 |
ward_boundaries.plot(ax=ax, color="none", edgecolor="gray")
|
@@ -389,6 +425,8 @@ def build_final_report(out, rep):
|
|
389 |
"## Unused Documents\n\n"
|
390 |
"Please note that the following documents were not used to produce this report:\n\n"
|
391 |
f"{str(unused_documents)}"
|
|
|
|
|
392 |
)
|
393 |
|
394 |
out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"
|
|
|
2 |
import re
|
3 |
from collections import Counter
|
4 |
|
|
|
5 |
import geopandas as gpd
|
6 |
import matplotlib as mpl
|
7 |
import matplotlib.pyplot as plt
|
|
|
35 |
|
36 |
|
37 |
def _process_postcodes(final):
|
38 |
+
"""Processes postcodes from the final document data.
|
39 |
+
|
40 |
+
Args:
|
41 |
+
final (dict): A dictionary containing document data.
|
42 |
+
|
43 |
+
Returns:
|
44 |
+
pl.DataFrame: A DataFrame with postcodes and their counts joined with ONSPD data.
|
45 |
+
"""
|
46 |
documents = final["documents"]
|
47 |
postcodes = [doc["document"].metadata["respondentpostcode"] for doc in documents]
|
48 |
postcodes = (
|
|
|
59 |
|
60 |
|
61 |
def _process_policies(final):
|
62 |
+
"""Processes policies from the final document data.
|
63 |
+
|
64 |
+
Args:
|
65 |
+
final (dict): A dictionary containing document data.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
tuple: A tuple containing strings of support, object, and other policies.
|
69 |
+
"""
|
70 |
+
|
71 |
def process_policy_group(policy_group, theme, stance):
|
72 |
details = "".join(
|
73 |
+
f"\n### {row['policies']}\n\n"
|
74 |
+ "".join(
|
75 |
f"- {detail} {doc_id}\n"
|
76 |
for detail, doc_id in zip(row["detail"], row["doc_id"])
|
|
|
99 |
|
100 |
|
101 |
def _process_stances(final):
|
102 |
+
"""Processes stances from the final document data.
|
103 |
+
|
104 |
+
Args:
|
105 |
+
final (dict): A dictionary containing document data.
|
106 |
+
|
107 |
+
Returns:
|
108 |
+
str: A formatted string of stances with their percentages and counts.
|
109 |
+
"""
|
110 |
documents = final["documents"]
|
111 |
stances = [
|
112 |
doc["document"].metadata["representations_support/object"] for doc in documents
|
|
|
129 |
|
130 |
|
131 |
def _process_themes(final):
|
132 |
+
"""Processes themes from the final document data.
|
133 |
+
|
134 |
+
Args:
|
135 |
+
final (dict): A dictionary containing document data.
|
136 |
+
|
137 |
+
Returns:
|
138 |
+
str: A markdown table of themes with their counts and percentages.
|
139 |
+
"""
|
140 |
documents = final["documents"]
|
141 |
themes = Counter(
|
142 |
[theme["theme"].value for doc in documents for theme in doc["themes"]]
|
|
|
153 |
|
154 |
|
155 |
def fig_oa(postcodes):
|
156 |
+
"""Generates a figure for Output Area (OA) classifications.
|
157 |
+
|
158 |
+
Args:
|
159 |
+
postcodes (pl.DataFrame): A DataFrame containing postcode data.
|
160 |
+
"""
|
161 |
oa_lookup = pl.read_csv(
|
162 |
Paths.RAW
|
163 |
/ "Output_Area_to_Local_Authority_District_(April_2023)_Lookup_in_England_and_Wales.csv"
|
|
|
284 |
ax=ax,
|
285 |
column="count",
|
286 |
legend=True,
|
|
|
287 |
legend_kwds={"label": "Number of Representations"},
|
288 |
)
|
289 |
ward_boundaries.plot(ax=ax, color="none", edgecolor="gray")
|
|
|
425 |
"## Unused Documents\n\n"
|
426 |
"Please note that the following documents were not used to produce this report:\n\n"
|
427 |
f"{str(unused_documents)}"
|
428 |
+
"Documents are excluded if they provide no relevant information. These documents "
|
429 |
+
"are typically very short, and contain information that provides no relation to policies or themes."
|
430 |
)
|
431 |
|
432 |
out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"
|
planning_ai/eval/compare_summaries.py
CHANGED
@@ -8,10 +8,21 @@ from planning_ai.llms.llm import GPT4o
|
|
8 |
|
9 |
|
10 |
class SummaryEvaluator(BaseModel):
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
|
14 |
def load_templates():
|
|
|
|
|
|
|
|
|
|
|
15 |
with open("./planning_ai/eval/eval.txt", "r") as f:
|
16 |
compare_template = f.read()
|
17 |
with open("./planning_ai/eval/summary.txt", "r") as f:
|
@@ -20,6 +31,15 @@ def load_templates():
|
|
20 |
|
21 |
|
22 |
def initialize_chains(compare_template, summary_template):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)
|
24 |
compare_prompt = ChatPromptTemplate([("system", compare_template)])
|
25 |
compare_chain = compare_prompt | SLLM
|
@@ -31,6 +51,15 @@ def initialize_chains(compare_template, summary_template):
|
|
31 |
|
32 |
|
33 |
def process_summaries(compare_chain, summary_chain):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
|
35 |
pl.col("attachments_id").is_null()
|
36 |
)
|
|
|
8 |
|
9 |
|
10 |
class SummaryEvaluator(BaseModel):
|
11 |
+
"""Model for evaluating summaries.
|
12 |
+
|
13 |
+
Attributes:
|
14 |
+
score (int): The number of the best summary.
|
15 |
+
"""
|
16 |
+
|
17 |
+
score: int = Field(...)
|
18 |
|
19 |
|
20 |
def load_templates():
|
21 |
+
"""Loads the comparison and summary templates from files.
|
22 |
+
|
23 |
+
Returns:
|
24 |
+
tuple: A tuple containing the compare template and summary template as strings.
|
25 |
+
"""
|
26 |
with open("./planning_ai/eval/eval.txt", "r") as f:
|
27 |
compare_template = f.read()
|
28 |
with open("./planning_ai/eval/summary.txt", "r") as f:
|
|
|
31 |
|
32 |
|
33 |
def initialize_chains(compare_template, summary_template):
|
34 |
+
"""Initializes the comparison and summary chains.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
compare_template (str): The template for comparison.
|
38 |
+
summary_template (str): The template for summary.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
tuple: A tuple containing the compare chain and summary chain.
|
42 |
+
"""
|
43 |
SLLM = GPT4o.with_structured_output(SummaryEvaluator, strict=True)
|
44 |
compare_prompt = ChatPromptTemplate([("system", compare_template)])
|
45 |
compare_chain = compare_prompt | SLLM
|
|
|
51 |
|
52 |
|
53 |
def process_summaries(compare_chain, summary_chain):
|
54 |
+
"""Processes summaries by comparing and scoring them.
|
55 |
+
|
56 |
+
Args:
|
57 |
+
compare_chain: The chain used for comparing summaries.
|
58 |
+
summary_chain: The chain used for generating summaries.
|
59 |
+
|
60 |
+
Returns:
|
61 |
+
polars.DataFrame: A DataFrame containing the original text, summaries, and scores.
|
62 |
+
"""
|
63 |
original = pl.read_parquet(Paths.STAGING / "gcpt3.parquet").filter(
|
64 |
pl.col("attachments_id").is_null()
|
65 |
)
|
planning_ai/graph.py
CHANGED
@@ -7,7 +7,7 @@ from planning_ai.nodes.hallucination_node import (
|
|
7 |
map_check,
|
8 |
map_fix,
|
9 |
)
|
10 |
-
from planning_ai.nodes.map_node import
|
11 |
from planning_ai.nodes.reduce_node import generate_final_report
|
12 |
from planning_ai.states import OverallState
|
13 |
|
|
|
7 |
map_check,
|
8 |
map_fix,
|
9 |
)
|
10 |
+
from planning_ai.nodes.map_node import generate_summary, map_documents
|
11 |
from planning_ai.nodes.reduce_node import generate_final_report
|
12 |
from planning_ai.states import OverallState
|
13 |
|
planning_ai/nodes/hallucination_node.py
CHANGED
@@ -111,10 +111,28 @@ def fix_hallucination(state: DocumentState):
|
|
111 |
|
112 |
|
113 |
def map_check(state: OverallState):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
114 |
return [Send("check_hallucination", doc) for doc in state["documents"]]
|
115 |
|
116 |
|
117 |
def map_fix(state: OverallState):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
return [
|
119 |
Send("fix_hallucination", doc)
|
120 |
for doc in state["documents"]
|
|
|
111 |
|
112 |
|
113 |
def map_check(state: OverallState):
|
114 |
+
"""Maps the check_hallucination function to each document in the overall state.
|
115 |
+
|
116 |
+
Args:
|
117 |
+
state (OverallState): The overall state containing multiple documents.
|
118 |
+
|
119 |
+
Returns:
|
120 |
+
list: A list of Send objects, each representing a request to check for hallucinations
|
121 |
+
in a document.
|
122 |
+
"""
|
123 |
return [Send("check_hallucination", doc) for doc in state["documents"]]
|
124 |
|
125 |
|
126 |
def map_fix(state: OverallState):
|
127 |
+
"""Maps the fix_hallucination function to each hallucinated document that is not processed.
|
128 |
+
|
129 |
+
Args:
|
130 |
+
state (OverallState): The overall state containing multiple documents.
|
131 |
+
|
132 |
+
Returns:
|
133 |
+
list: A list of Send objects, each representing a request to fix hallucinations
|
134 |
+
in a document that is hallucinated and not yet processed.
|
135 |
+
"""
|
136 |
return [
|
137 |
Send("fix_hallucination", doc)
|
138 |
for doc in state["documents"]
|
planning_ai/nodes/map_node.py
CHANGED
@@ -16,6 +16,17 @@ nlp = spacy.load("en_core_web_lg")
|
|
16 |
|
17 |
|
18 |
def retrieve_themes(state: DocumentState) -> DocumentState:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
try:
|
20 |
result = themes_chain.invoke({"document": state["document"].page_content})
|
21 |
if not result.themes:
|
@@ -34,6 +45,17 @@ def retrieve_themes(state: DocumentState) -> DocumentState:
|
|
34 |
|
35 |
|
36 |
def add_entities(state: OverallState) -> OverallState:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
logger.info("Adding entities to all documents.")
|
38 |
for idx, document in enumerate(
|
39 |
nlp.pipe(
|
@@ -47,7 +69,7 @@ def add_entities(state: OverallState) -> OverallState:
|
|
47 |
|
48 |
|
49 |
def remove_pii(document: str) -> str:
|
50 |
-
"""
|
51 |
|
52 |
This function uses the Presidio Analyzer and Anonymizer to detect and anonymize
|
53 |
PII such as names, phone numbers, and email addresses in the given document.
|
@@ -67,7 +89,7 @@ def remove_pii(document: str) -> str:
|
|
67 |
|
68 |
|
69 |
def generate_summary(state: DocumentState) -> dict:
|
70 |
-
"""
|
71 |
|
72 |
This function first anonymizes the document to remove PII, then generates a summary
|
73 |
using the `map_chain`. The summary is added to the document state.
|
@@ -136,5 +158,16 @@ def generate_summary(state: DocumentState) -> dict:
|
|
136 |
|
137 |
|
138 |
def map_documents(state: OverallState) -> list[Send]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
logger.info("Mapping documents to generate summaries.")
|
140 |
return [Send("generate_summary", document) for document in state["documents"]]
|
|
|
16 |
|
17 |
|
18 |
def retrieve_themes(state: DocumentState) -> DocumentState:
|
19 |
+
"""Retrieve themes from a document's content.
|
20 |
+
|
21 |
+
This function uses the `themes_chain` to extract themes from the document's
|
22 |
+
page content. It updates the document state with the themes and their scores.
|
23 |
+
|
24 |
+
Args:
|
25 |
+
state (DocumentState): The current state of the document, including its content.
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
DocumentState: The updated document state with themes and scores.
|
29 |
+
"""
|
30 |
try:
|
31 |
result = themes_chain.invoke({"document": state["document"].page_content})
|
32 |
if not result.themes:
|
|
|
45 |
|
46 |
|
47 |
def add_entities(state: OverallState) -> OverallState:
|
48 |
+
"""Add named entities to all documents in the state.
|
49 |
+
|
50 |
+
This function processes each document using a spaCy NLP pipeline to extract
|
51 |
+
named entities and adds them to the document state.
|
52 |
+
|
53 |
+
Args:
|
54 |
+
state (OverallState): The overall state containing multiple documents.
|
55 |
+
|
56 |
+
Returns:
|
57 |
+
OverallState: The updated state with entities added to each document.
|
58 |
+
"""
|
59 |
logger.info("Adding entities to all documents.")
|
60 |
for idx, document in enumerate(
|
61 |
nlp.pipe(
|
|
|
69 |
|
70 |
|
71 |
def remove_pii(document: str) -> str:
|
72 |
+
"""Remove personally identifiable information (PII) from a document.
|
73 |
|
74 |
This function uses the Presidio Analyzer and Anonymizer to detect and anonymize
|
75 |
PII such as names, phone numbers, and email addresses in the given document.
|
|
|
89 |
|
90 |
|
91 |
def generate_summary(state: DocumentState) -> dict:
|
92 |
+
"""Generate a summary for a document after removing PII.
|
93 |
|
94 |
This function first anonymizes the document to remove PII, then generates a summary
|
95 |
using the `map_chain`. The summary is added to the document state.
|
|
|
158 |
|
159 |
|
160 |
def map_documents(state: OverallState) -> list[Send]:
|
161 |
+
"""Map documents to generate summaries.
|
162 |
+
|
163 |
+
This function prepares a list of `Send` objects to trigger the summary generation
|
164 |
+
process for each document in the state.
|
165 |
+
|
166 |
+
Args:
|
167 |
+
state (OverallState): The overall state containing multiple documents.
|
168 |
+
|
169 |
+
Returns:
|
170 |
+
list[Send]: A list of `Send` objects for summary generation.
|
171 |
+
"""
|
172 |
logger.info("Mapping documents to generate summaries.")
|
173 |
return [Send("generate_summary", document) for document in state["documents"]]
|
planning_ai/nodes/reduce_node.py
CHANGED
@@ -82,7 +82,7 @@ def batch_generate_executive_summaries(summaries):
|
|
82 |
batch_size = 50
|
83 |
for i in range(0, len(summaries_text), batch_size):
|
84 |
logger.info(
|
85 |
-
f"Processing batches... {int(i/50)+1}/{(len(summaries_text)//batch_size)+1}"
|
86 |
)
|
87 |
batch = summaries_text[i : i + batch_size]
|
88 |
response = reduce_chain.invoke({"context": batch})
|
|
|
82 |
batch_size = 50
|
83 |
for i in range(0, len(summaries_text), batch_size):
|
84 |
logger.info(
|
85 |
+
f"Processing batches... {int(i / 50) + 1}/{(len(summaries_text) // batch_size) + 1}"
|
86 |
)
|
87 |
batch = summaries_text[i : i + batch_size]
|
88 |
response = reduce_chain.invoke({"context": batch})
|
planning_ai/preprocessing/gcpt3.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import logging
|
2 |
-
import textwrap
|
3 |
from io import BytesIO
|
4 |
from pathlib import Path
|
5 |
from typing import Any
|
|
|
1 |
import logging
|
|
|
2 |
from io import BytesIO
|
3 |
from pathlib import Path
|
4 |
from typing import Any
|
planning_ai/preprocessing/prompts/ocr.txt
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
The images provided are from a planning response form filled out by a member of the public, containing free-form responses related to a planning application. These responses may be handwritten or typed.
|
2 |
-
|
3 |
-
Please follow these instructions to process the images:
|
4 |
-
|
5 |
-
1. **Extract Free-Form Information Only**: Focus on extracting and outputting the free-form written content from the images. Do not include single-word answers, brief responses, or any extra content that is not part of the detailed responses.
|
6 |
-
2. **Verbatim Output**: Ensure that the extracted information is output exactly as it appears in the images. Add a heading before each section of free-form text if it helps with organisation, but ensure the heading is not added by the model itself. Ignore blank sections entirely—do not generate or include any additional thoughts or content.
|
7 |
-
3. **Sequential Processing**: The images are sequentially ordered. A response might continue from one image to the next, so capture the full context across multiple images if necessary.
|
8 |
-
4. **Ignore Non-Relevant Content**: Exclude any content that does not fit the criteria of free-form, detailed responses.
|
9 |
-
|
10 |
-
Thank you for your attention to these details.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
planning_ai/retrievers/theme_retriever.py
DELETED
@@ -1,84 +0,0 @@
|
|
1 |
-
import logging
|
2 |
-
from pathlib import Path
|
3 |
-
|
4 |
-
from chromadb import PersistentClient
|
5 |
-
from langchain_community.document_loaders import PyPDFLoader
|
6 |
-
from langchain_community.vectorstores import Chroma
|
7 |
-
from langchain_core.prompts import PromptTemplate
|
8 |
-
from langchain_openai import OpenAIEmbeddings
|
9 |
-
from pydantic import BaseModel, Field
|
10 |
-
|
11 |
-
from planning_ai.llms.llm import GPT4o
|
12 |
-
|
13 |
-
# See: https://consultations.greatercambridgeplanning.org/greater-cambridge-local-plan-preferred-options/supporting-documents
|
14 |
-
|
15 |
-
PDFS = {
|
16 |
-
"Biodiversity and Green Spaces": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPBiodiversityandGreenSpacesAug21v2Nov21_0.pdf",
|
17 |
-
"Climate Change": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPClimateChangeAug21v2Nov21_0.pdf",
|
18 |
-
"Great Places": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPGreatPlacesAug21v1Aug21.pdf",
|
19 |
-
"Homes": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPHomesAug21v2Nov21.pdf",
|
20 |
-
"Infrastructure": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPInfrastructureAug21v2Nov21.pdf",
|
21 |
-
"Jobs": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPJobsAug21v2Nov21.pdf",
|
22 |
-
# "Strategy topic paper": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPStrategyAug21v3Nov21_0.pdf",
|
23 |
-
"Wellbeing and Social Inclusion": "https://consultations.greatercambridgeplanning.org/sites/gcp/files/2021-11/TPWellbeingAug21v2Nov21.pdf",
|
24 |
-
}
|
25 |
-
|
26 |
-
|
27 |
-
class Grade(BaseModel):
|
28 |
-
"""Binary score for relevance check."""
|
29 |
-
|
30 |
-
binary_score: str = Field(description="Relevance score 'yes' or 'no'")
|
31 |
-
|
32 |
-
|
33 |
-
def create_db():
|
34 |
-
chroma_dir = Path("./chroma_themesdb")
|
35 |
-
if chroma_dir.exists():
|
36 |
-
persistent_client = PersistentClient(path="./chroma_themesdb")
|
37 |
-
vectorstore = Chroma(
|
38 |
-
client=persistent_client,
|
39 |
-
collection_name="themes-chroma",
|
40 |
-
embedding_function=OpenAIEmbeddings(),
|
41 |
-
)
|
42 |
-
|
43 |
-
else:
|
44 |
-
docs = []
|
45 |
-
for name, pdf in PDFS.items():
|
46 |
-
doc = PyPDFLoader(pdf).load()[5:]
|
47 |
-
for d in doc:
|
48 |
-
d.metadata["theme"] = name
|
49 |
-
docs.extend(doc)
|
50 |
-
|
51 |
-
logging.warning(f"Building ChromaDB...")
|
52 |
-
vectorstore = Chroma.from_documents(
|
53 |
-
documents=docs,
|
54 |
-
collection_name="themes-chroma",
|
55 |
-
embedding=OpenAIEmbeddings(),
|
56 |
-
persist_directory="./chroma_themesdb",
|
57 |
-
)
|
58 |
-
return vectorstore
|
59 |
-
|
60 |
-
|
61 |
-
grade_template = PromptTemplate(
|
62 |
-
template="""You are a grader assessing relevance of a retrieved document to a user question. \n
|
63 |
-
Here is the retrieved document: \n\n {context} \n\n
|
64 |
-
Here is the original document: {document} \n
|
65 |
-
If the retrieved document contains keyword(s) or semantic meaning related to the original, grade it as relevant. \n
|
66 |
-
Give a binary score 'yes' or 'no' score to indicate whether the retrieved document is relevant to the original.""",
|
67 |
-
input_variables=["context", "document"],
|
68 |
-
)
|
69 |
-
|
70 |
-
|
71 |
-
SLLM = GPT4o.with_structured_output(Grade, strict=True)
|
72 |
-
grade_chain = grade_template | SLLM
|
73 |
-
|
74 |
-
vectorstore = create_db()
|
75 |
-
theme_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
|
76 |
-
logging.warning(f"Finished building ChromaDB...")
|
77 |
-
|
78 |
-
if __name__ == "__main__":
|
79 |
-
test_content = """
|
80 |
-
We would certainly support this and would emphasise the importance of trying
|
81 |
-
to solve the severance problems created by the M11 and A14.
|
82 |
-
"""
|
83 |
-
|
84 |
-
len(theme_retriever.invoke(input=test_content))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
planning_ai/states.py
CHANGED
@@ -1,4 +1,3 @@
|
|
1 |
-
from pathlib import Path
|
2 |
from typing import Annotated, TypedDict
|
3 |
|
4 |
import polars as pl
|
@@ -6,7 +5,6 @@ from langchain_core.documents import Document
|
|
6 |
from pydantic import BaseModel
|
7 |
|
8 |
from planning_ai.chains.hallucination_chain import HallucinationChecker
|
9 |
-
from planning_ai.chains.themes_chain import ThemeScore
|
10 |
from planning_ai.common.utils import filename_reducer
|
11 |
|
12 |
|
|
|
|
|
1 |
from typing import Annotated, TypedDict
|
2 |
|
3 |
import polars as pl
|
|
|
5 |
from pydantic import BaseModel
|
6 |
|
7 |
from planning_ai.chains.hallucination_chain import HallucinationChecker
|
|
|
8 |
from planning_ai.common.utils import filename_reducer
|
9 |
|
10 |
|