Spaces:
Build error
Build error
feat: add second document for summaries
Browse files- planning_ai/main.py +120 -59
planning_ai/main.py
CHANGED
@@ -18,7 +18,8 @@ load_dotenv()
|
|
18 |
|
19 |
|
20 |
def build_quarto_doc(doc_title, out):
|
21 |
-
final = out["
|
|
|
22 |
|
23 |
quarto_doc = (
|
24 |
"---\n"
|
@@ -34,18 +35,16 @@ def build_quarto_doc(doc_title, out):
|
|
34 |
" - Scale=0.55\n"
|
35 |
"---\n\n"
|
36 |
f"{final['executive']}\n\n"
|
37 |
-
"
|
38 |
-
"
|
|
|
39 |
" within each Ward that had at least one response.\n\n"
|
40 |
f"{{#fig-wards}}\n\n"
|
41 |
-
"
|
42 |
" within each IMD quintile.\n\n"
|
43 |
f"{{#fig-imd}}\n\n"
|
44 |
"# Themes and Policies\n\n"
|
45 |
-
"
|
46 |
-
f"{final['policies_support']}"
|
47 |
-
"## Object\n\n"
|
48 |
-
f"{final['policies_object']}"
|
49 |
)
|
50 |
|
51 |
with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
|
@@ -53,43 +52,45 @@ def build_quarto_doc(doc_title, out):
|
|
53 |
|
54 |
|
55 |
def read_docs():
|
56 |
-
df = pl.read_parquet(Paths.STAGING / "
|
57 |
-
|
58 |
-
int(pdf.stem) if pdf.stem.isdigit() else 0
|
59 |
-
for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
|
60 |
-
]
|
61 |
-
pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure", silent_errors=True)
|
62 |
-
out = pdf_loader.load()
|
63 |
-
|
64 |
-
pdfs_combined = {}
|
65 |
-
for page in out:
|
66 |
-
id = Path(page.metadata["source"]).stem
|
67 |
-
if id in pdfs_combined:
|
68 |
-
pdfs_combined[id] = pdfs_combined[id] + page.page_content
|
69 |
-
else:
|
70 |
-
pdfs_combined[id] = page.page_content
|
71 |
-
|
72 |
-
pdfs_combined = (
|
73 |
-
pl.from_dict(pdfs_combined)
|
74 |
-
.transpose(include_header=True)
|
75 |
-
.rename({"column": "attachments_id", "column_0": "pdf_text"})
|
76 |
-
.with_columns(pl.col("attachments_id").cast(int))
|
77 |
-
)
|
78 |
-
|
79 |
-
df = (
|
80 |
-
df.filter(
|
81 |
-
(
|
82 |
-
pl.col("representations_document")
|
83 |
-
== "Greater Cambridge Local Plan Preferred Options"
|
84 |
-
)
|
85 |
-
& (pl.col("attachments_id").is_in(pdf_ids))
|
86 |
-
)
|
87 |
-
.unique("id")
|
88 |
-
.with_row_index()
|
89 |
-
)
|
90 |
-
df = df.join(pdfs_combined, on="attachments_id").with_columns(
|
91 |
-
pl.col("text") + "\n\n" + pl.col("pdf_text")
|
92 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
|
94 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
95 |
|
@@ -119,8 +120,13 @@ def process_postcodes(documents):
|
|
119 |
def wards_pop(postcodes):
|
120 |
wards = (
|
121 |
pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
|
122 |
-
.
|
123 |
-
|
|
|
|
|
|
|
|
|
|
|
124 |
.sum()
|
125 |
)
|
126 |
postcodes = postcodes.join(wards, on="OSWARD").with_columns(
|
@@ -129,13 +135,23 @@ def wards_pop(postcodes):
|
|
129 |
ward_boundaries = gpd.read_file(
|
130 |
Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
|
131 |
)
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
139 |
camb_ward_boundaries = ward_boundaries[
|
140 |
ward_boundaries["WD21CD"].isin(camb_ward_codes)
|
141 |
]
|
@@ -148,9 +164,8 @@ def wards_pop(postcodes):
|
|
148 |
camb_ward_boundaries.plot(ax=ax, color="white", edgecolor="black")
|
149 |
ward_boundaries_prop.plot(ax=ax, column="prop", legend=True)
|
150 |
|
151 |
-
__import__("ipdb").set_trace()
|
152 |
bounds = camb_ward_boundaries.total_bounds
|
153 |
-
buffer =
|
154 |
ax.set_xlim([bounds[0] - buffer, bounds[2] + buffer])
|
155 |
ax.set_ylim([bounds[1] - buffer, bounds[3] + buffer])
|
156 |
|
@@ -191,18 +206,62 @@ def imd_bar(postcodes):
|
|
191 |
ax1.set_ylabel("Proporition of Population (%)")
|
192 |
ax1.tick_params(axis="y")
|
193 |
|
194 |
-
plt.title("Comparison of Responses by IMD
|
195 |
|
196 |
plt.tight_layout()
|
197 |
plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
|
198 |
|
199 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
200 |
def main():
|
201 |
-
docs = read_docs()[:
|
202 |
n_docs = len(docs)
|
203 |
|
204 |
logging.warning(f"{n_docs} documents being processed!")
|
205 |
-
|
206 |
app = create_graph()
|
207 |
|
208 |
step = None
|
@@ -212,10 +271,11 @@ def main():
|
|
212 |
if step is None:
|
213 |
raise ValueError("No steps were processed!")
|
214 |
|
215 |
-
postcodes = process_postcodes(step["
|
216 |
wards_pop(postcodes)
|
217 |
imd_bar(postcodes)
|
218 |
build_quarto_doc(doc_title, step)
|
|
|
219 |
return step
|
220 |
|
221 |
|
@@ -226,4 +286,5 @@ if __name__ == "__main__":
|
|
226 |
out = main()
|
227 |
toc = time.time()
|
228 |
|
|
|
229 |
print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
|
|
|
18 |
|
19 |
|
20 |
def build_quarto_doc(doc_title, out):
|
21 |
+
final = out["generate_final_documents"]
|
22 |
+
policies = process_policies(out)
|
23 |
|
24 |
quarto_doc = (
|
25 |
"---\n"
|
|
|
35 |
" - Scale=0.55\n"
|
36 |
"---\n\n"
|
37 |
f"{final['executive']}\n\n"
|
38 |
+
r"\newpage"
|
39 |
+
"\n# Figures\n\n"
|
40 |
+
"@fig-wards shows the percentage of responses by total population"
|
41 |
" within each Ward that had at least one response.\n\n"
|
42 |
f"{{#fig-wards}}\n\n"
|
43 |
+
"@fig-imd shows the percentage of responses by total population"
|
44 |
" within each IMD quintile.\n\n"
|
45 |
f"{{#fig-imd}}\n\n"
|
46 |
"# Themes and Policies\n\n"
|
47 |
+
f"{policies}"
|
|
|
|
|
|
|
48 |
)
|
49 |
|
50 |
with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
|
|
|
52 |
|
53 |
|
54 |
def read_docs():
|
55 |
+
df = pl.read_parquet(Paths.STAGING / "gcpt3_testing.parquet").drop_nulls(
|
56 |
+
subset="text"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
)
|
58 |
+
# pdf_ids = [
|
59 |
+
# int(pdf.stem) if pdf.stem.isdigit() else 0
|
60 |
+
# for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
|
61 |
+
# ]
|
62 |
+
# pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure", silent_errors=True)
|
63 |
+
# out = pdf_loader.load()
|
64 |
+
#
|
65 |
+
# pdfs_combined = {}
|
66 |
+
# for page in out:
|
67 |
+
# id = Path(page.metadata["source"]).stem
|
68 |
+
# if id in pdfs_combined:
|
69 |
+
# pdfs_combined[id] = pdfs_combined[id] + page.page_content
|
70 |
+
# else:
|
71 |
+
# pdfs_combined[id] = page.page_content
|
72 |
+
#
|
73 |
+
# pdfs_combined = (
|
74 |
+
# pl.from_dict(pdfs_combined)
|
75 |
+
# .transpose(include_header=True)
|
76 |
+
# .rename({"column": "attachments_id", "column_0": "pdf_text"})
|
77 |
+
# .with_columns(pl.col("attachments_id").cast(int))
|
78 |
+
# )
|
79 |
+
#
|
80 |
+
# df = (
|
81 |
+
# df.filter(
|
82 |
+
# (
|
83 |
+
# pl.col("representations_document")
|
84 |
+
# == "Greater Cambridge Local Plan Preferred Options"
|
85 |
+
# )
|
86 |
+
# & (pl.col("attachments_id").is_in(pdf_ids))
|
87 |
+
# )
|
88 |
+
# .unique("id")
|
89 |
+
# .with_row_index()
|
90 |
+
# )
|
91 |
+
# df = df.join(pdfs_combined, on="attachments_id").with_columns(
|
92 |
+
# pl.col("text") + "\n\n" + pl.col("pdf_text")
|
93 |
+
# )
|
94 |
|
95 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
96 |
|
|
|
120 |
def wards_pop(postcodes):
|
121 |
wards = (
|
122 |
pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
|
123 |
+
.rename(
|
124 |
+
{
|
125 |
+
"Electoral wards and divisions Code": "OSWARD",
|
126 |
+
"Electoral wards and divisions": "WARDNAME",
|
127 |
+
}
|
128 |
+
)
|
129 |
+
.group_by(["OSWARD", "WARDNAME"])
|
130 |
.sum()
|
131 |
)
|
132 |
postcodes = postcodes.join(wards, on="OSWARD").with_columns(
|
|
|
135 |
ward_boundaries = gpd.read_file(
|
136 |
Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
|
137 |
)
|
138 |
+
|
139 |
+
camb_ward_codes = [
|
140 |
+
"E05013050",
|
141 |
+
"E05013051",
|
142 |
+
"E05013052",
|
143 |
+
"E05013053",
|
144 |
+
"E05013054",
|
145 |
+
"E05013055",
|
146 |
+
"E05013056",
|
147 |
+
"E05013057",
|
148 |
+
"E05013058",
|
149 |
+
"E05013059",
|
150 |
+
"E05013060",
|
151 |
+
"E05013061",
|
152 |
+
"E05013062",
|
153 |
+
"E05013063",
|
154 |
+
]
|
155 |
camb_ward_boundaries = ward_boundaries[
|
156 |
ward_boundaries["WD21CD"].isin(camb_ward_codes)
|
157 |
]
|
|
|
164 |
camb_ward_boundaries.plot(ax=ax, color="white", edgecolor="black")
|
165 |
ward_boundaries_prop.plot(ax=ax, column="prop", legend=True)
|
166 |
|
|
|
167 |
bounds = camb_ward_boundaries.total_bounds
|
168 |
+
buffer = 1000
|
169 |
ax.set_xlim([bounds[0] - buffer, bounds[2] + buffer])
|
170 |
ax.set_ylim([bounds[1] - buffer, bounds[3] + buffer])
|
171 |
|
|
|
206 |
ax1.set_ylabel("Proporition of Population (%)")
|
207 |
ax1.tick_params(axis="y")
|
208 |
|
209 |
+
plt.title("Comparison of Responses by IMD Quintile")
|
210 |
|
211 |
plt.tight_layout()
|
212 |
plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
|
213 |
|
214 |
|
215 |
+
def process_policies(step):
    """Render the grouped policies table as Markdown text.

    Reads ``step["generate_final_documents"]["policies"]`` and groups it by
    the ``themes`` and ``stance`` columns, keeping the order in which the
    groups first appear. Every group becomes a ``## <theme> - <stance>``
    section; every row inside a group becomes a ``### <policy>`` heading
    followed by one bullet per ``(detail, doc_id)`` pair taken in lockstep
    from the row's ``details`` and ``doc_id`` values.

    Args:
        step: Mapping holding the final graph output; only the nested
            ``"policies"`` table is used here.

    Returns:
        A single string with all sections concatenated. It is empty when
        the table yields no groups.
    """
    policies_table = step["generate_final_documents"]["policies"]
    grouped = policies_table.group_by(["themes", "stance"], maintain_order=True)

    sections = []
    for (theme, stance), group in grouped:
        body_parts = []
        for record in group.rows(named=True):
            # Heading for the policy, then its supporting bullet points.
            body_parts.append(f'\n### {record["policies"]}\n\n')
            for detail, doc_id in zip(record["details"], record["doc_id"]):
                body_parts.append(f"- {detail} {doc_id}\n")
        body = "".join(body_parts)
        sections.append(f"## {theme} - {stance}\n\n{body}\n")

    return "".join(sections)
|
232 |
+
|
233 |
+
|
234 |
+
def build_quarto_summaries_doc(out):
    """Write a Quarto file pairing each original document with its summary.

    Args:
        out: Mapping holding the final graph output.
            ``out["generate_final_documents"]["documents"]`` is iterated;
            each item is indexed with ``"document"`` (an object exposing
            ``metadata["index"]`` and ``page_content``) and ``"summary"``
            (an object exposing ``summary``).

    Returns:
        None. The rendered text is written to
        ``Paths.SUMMARY / "Summary_Documents.qmd"``, replacing any existing
        file of that name.
    """
    quarto_header = (
        "---\n"
        "title: 'Summary Documents'\n"
        "format:\n"
        "  PrettyPDF-pdf:\n"
        "    papersize: A4\n"
        "execute:\n"
        "  freeze: auto\n"
        "  echo: false\n"
        "monofont: 'JetBrains Mono'\n"
        "monofontoptions:\n"
        "  - Scale=0.55\n"
        "---\n\n"
    )

    # One section per document: its ID, the original text, then the summary.
    # Identified entities are currently left out of the section.
    full_text = "".join(
        f"**Document ID**: {document['document'].metadata['index']}\n\n"
        f"**Original Document**\n\n{document['document'].page_content}\n\n"
        f"**Summarised Document**\n\n{document['summary'].summary}\n\n"
        for document in out["generate_final_documents"]["documents"]
    )

    # Pin the encoding: the document text is free-form and may contain
    # non-ASCII characters, and the platform default encoding is not
    # guaranteed to be able to represent them.
    with open(
        Paths.SUMMARY / "Summary_Documents.qmd", "w", encoding="utf-8"
    ) as f:
        f.write(f"{quarto_header}{full_text}")
|
258 |
+
|
259 |
+
|
260 |
def main():
|
261 |
+
docs = read_docs()[:200]
|
262 |
n_docs = len(docs)
|
263 |
|
264 |
logging.warning(f"{n_docs} documents being processed!")
|
|
|
265 |
app = create_graph()
|
266 |
|
267 |
step = None
|
|
|
271 |
if step is None:
|
272 |
raise ValueError("No steps were processed!")
|
273 |
|
274 |
+
postcodes = process_postcodes(step["generate_final_documents"]["documents"])
|
275 |
wards_pop(postcodes)
|
276 |
imd_bar(postcodes)
|
277 |
build_quarto_doc(doc_title, step)
|
278 |
+
build_quarto_summaries_doc(step)
|
279 |
return step
|
280 |
|
281 |
|
|
|
286 |
out = main()
|
287 |
toc = time.time()
|
288 |
|
289 |
+
out["generate_final_documents"]["documents"][0]
|
290 |
print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
|