cjber committed on
Commit
d1abf02
·
1 Parent(s): 1af4802

feat: add second document for summaries

Browse files
Files changed (1) hide show
  1. planning_ai/main.py +120 -59
planning_ai/main.py CHANGED
@@ -18,7 +18,8 @@ load_dotenv()
18
 
19
 
20
  def build_quarto_doc(doc_title, out):
21
- final = out["generate_final_summary"]
 
22
 
23
  quarto_doc = (
24
  "---\n"
@@ -34,18 +35,16 @@ def build_quarto_doc(doc_title, out):
34
  " - Scale=0.55\n"
35
  "---\n\n"
36
  f"{final['executive']}\n\n"
37
- "# Figures\n\n"
38
- "Figure @fig-wards shows the percentage of responses by total population"
 
39
  " within each Ward that had at least one response.\n\n"
40
  f"![Ward Proportions](./figs/wards.png){{#fig-wards}}\n\n"
41
- "Figure @fig-imd shows the percentage of responses by total population"
42
  " within each IMD quintile.\n\n"
43
  f"![IMD Quintile Props](./figs/imd_decile.png){{#fig-imd}}\n\n"
44
  "# Themes and Policies\n\n"
45
- "## Support\n\n"
46
- f"{final['policies_support']}"
47
- "## Object\n\n"
48
- f"{final['policies_object']}"
49
  )
50
 
51
  with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
@@ -53,43 +52,45 @@ def build_quarto_doc(doc_title, out):
53
 
54
 
55
  def read_docs():
56
- df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
57
- pdf_ids = [
58
- int(pdf.stem) if pdf.stem.isdigit() else 0
59
- for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
60
- ]
61
- pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure", silent_errors=True)
62
- out = pdf_loader.load()
63
-
64
- pdfs_combined = {}
65
- for page in out:
66
- id = Path(page.metadata["source"]).stem
67
- if id in pdfs_combined:
68
- pdfs_combined[id] = pdfs_combined[id] + page.page_content
69
- else:
70
- pdfs_combined[id] = page.page_content
71
-
72
- pdfs_combined = (
73
- pl.from_dict(pdfs_combined)
74
- .transpose(include_header=True)
75
- .rename({"column": "attachments_id", "column_0": "pdf_text"})
76
- .with_columns(pl.col("attachments_id").cast(int))
77
- )
78
-
79
- df = (
80
- df.filter(
81
- (
82
- pl.col("representations_document")
83
- == "Greater Cambridge Local Plan Preferred Options"
84
- )
85
- & (pl.col("attachments_id").is_in(pdf_ids))
86
- )
87
- .unique("id")
88
- .with_row_index()
89
- )
90
- df = df.join(pdfs_combined, on="attachments_id").with_columns(
91
- pl.col("text") + "\n\n" + pl.col("pdf_text")
92
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
  loader = PolarsDataFrameLoader(df, page_content_column="text")
95
 
@@ -119,8 +120,13 @@ def process_postcodes(documents):
119
  def wards_pop(postcodes):
120
  wards = (
121
  pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
122
- .with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
123
- .group_by("OSWARD")
 
 
 
 
 
124
  .sum()
125
  )
126
  postcodes = postcodes.join(wards, on="OSWARD").with_columns(
@@ -129,13 +135,23 @@ def wards_pop(postcodes):
129
  ward_boundaries = gpd.read_file(
130
  Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
131
  )
132
- camb_ward_codes = (
133
- wards.filter(pl.col("Electoral wards and divisions").str.contains("Cambridge"))[
134
- "Electoral wards and divisions Code"
135
- ]
136
- .unique()
137
- .to_list()
138
- )
 
 
 
 
 
 
 
 
 
 
139
  camb_ward_boundaries = ward_boundaries[
140
  ward_boundaries["WD21CD"].isin(camb_ward_codes)
141
  ]
@@ -148,9 +164,8 @@ def wards_pop(postcodes):
148
  camb_ward_boundaries.plot(ax=ax, color="white", edgecolor="black")
149
  ward_boundaries_prop.plot(ax=ax, column="prop", legend=True)
150
 
151
- __import__("ipdb").set_trace()
152
  bounds = camb_ward_boundaries.total_bounds
153
- buffer = 0.1
154
  ax.set_xlim([bounds[0] - buffer, bounds[2] + buffer])
155
  ax.set_ylim([bounds[1] - buffer, bounds[3] + buffer])
156
 
@@ -191,18 +206,62 @@ def imd_bar(postcodes):
191
  ax1.set_ylabel("Proporition of Population (%)")
192
  ax1.tick_params(axis="y")
193
 
194
- plt.title("Comparison of Responses by IMD Decile")
195
 
196
  plt.tight_layout()
197
  plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
198
 
199
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
  def main():
201
- docs = read_docs()[:5]
202
  n_docs = len(docs)
203
 
204
  logging.warning(f"{n_docs} documents being processed!")
205
-
206
  app = create_graph()
207
 
208
  step = None
@@ -212,10 +271,11 @@ def main():
212
  if step is None:
213
  raise ValueError("No steps were processed!")
214
 
215
- postcodes = process_postcodes(step["generate_final_summary"]["documents"])
216
  wards_pop(postcodes)
217
  imd_bar(postcodes)
218
  build_quarto_doc(doc_title, step)
 
219
  return step
220
 
221
 
@@ -226,4 +286,5 @@ if __name__ == "__main__":
226
  out = main()
227
  toc = time.time()
228
 
 
229
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
 
18
 
19
 
20
  def build_quarto_doc(doc_title, out):
21
+ final = out["generate_final_documents"]
22
+ policies = process_policies(out)
23
 
24
  quarto_doc = (
25
  "---\n"
 
35
  " - Scale=0.55\n"
36
  "---\n\n"
37
  f"{final['executive']}\n\n"
38
+ r"\newpage"
39
+ "\n# Figures\n\n"
40
+ "@fig-wards shows the percentage of responses by total population"
41
  " within each Ward that had at least one response.\n\n"
42
  f"![Ward Proportions](./figs/wards.png){{#fig-wards}}\n\n"
43
+ "@fig-imd shows the percentage of responses by total population"
44
  " within each IMD quintile.\n\n"
45
  f"![IMD Quintile Props](./figs/imd_decile.png){{#fig-imd}}\n\n"
46
  "# Themes and Policies\n\n"
47
+ f"{policies}"
 
 
 
48
  )
49
 
50
  with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
 
52
 
53
 
54
  def read_docs():
55
+ df = pl.read_parquet(Paths.STAGING / "gcpt3_testing.parquet").drop_nulls(
56
+ subset="text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  )
58
+ # pdf_ids = [
59
+ # int(pdf.stem) if pdf.stem.isdigit() else 0
60
+ # for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
61
+ # ]
62
+ # pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure", silent_errors=True)
63
+ # out = pdf_loader.load()
64
+ #
65
+ # pdfs_combined = {}
66
+ # for page in out:
67
+ # id = Path(page.metadata["source"]).stem
68
+ # if id in pdfs_combined:
69
+ # pdfs_combined[id] = pdfs_combined[id] + page.page_content
70
+ # else:
71
+ # pdfs_combined[id] = page.page_content
72
+ #
73
+ # pdfs_combined = (
74
+ # pl.from_dict(pdfs_combined)
75
+ # .transpose(include_header=True)
76
+ # .rename({"column": "attachments_id", "column_0": "pdf_text"})
77
+ # .with_columns(pl.col("attachments_id").cast(int))
78
+ # )
79
+ #
80
+ # df = (
81
+ # df.filter(
82
+ # (
83
+ # pl.col("representations_document")
84
+ # == "Greater Cambridge Local Plan Preferred Options"
85
+ # )
86
+ # & (pl.col("attachments_id").is_in(pdf_ids))
87
+ # )
88
+ # .unique("id")
89
+ # .with_row_index()
90
+ # )
91
+ # df = df.join(pdfs_combined, on="attachments_id").with_columns(
92
+ # pl.col("text") + "\n\n" + pl.col("pdf_text")
93
+ # )
94
 
95
  loader = PolarsDataFrameLoader(df, page_content_column="text")
96
 
 
120
  def wards_pop(postcodes):
121
  wards = (
122
  pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
123
+ .rename(
124
+ {
125
+ "Electoral wards and divisions Code": "OSWARD",
126
+ "Electoral wards and divisions": "WARDNAME",
127
+ }
128
+ )
129
+ .group_by(["OSWARD", "WARDNAME"])
130
  .sum()
131
  )
132
  postcodes = postcodes.join(wards, on="OSWARD").with_columns(
 
135
  ward_boundaries = gpd.read_file(
136
  Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
137
  )
138
+
139
+ camb_ward_codes = [
140
+ "E05013050",
141
+ "E05013051",
142
+ "E05013052",
143
+ "E05013053",
144
+ "E05013054",
145
+ "E05013055",
146
+ "E05013056",
147
+ "E05013057",
148
+ "E05013058",
149
+ "E05013059",
150
+ "E05013060",
151
+ "E05013061",
152
+ "E05013062",
153
+ "E05013063",
154
+ ]
155
  camb_ward_boundaries = ward_boundaries[
156
  ward_boundaries["WD21CD"].isin(camb_ward_codes)
157
  ]
 
164
  camb_ward_boundaries.plot(ax=ax, color="white", edgecolor="black")
165
  ward_boundaries_prop.plot(ax=ax, column="prop", legend=True)
166
 
 
167
  bounds = camb_ward_boundaries.total_bounds
168
+ buffer = 1000
169
  ax.set_xlim([bounds[0] - buffer, bounds[2] + buffer])
170
  ax.set_ylim([bounds[1] - buffer, bounds[3] + buffer])
171
 
 
206
  ax1.set_ylabel("Proporition of Population (%)")
207
  ax1.tick_params(axis="y")
208
 
209
+ plt.title("Comparison of Responses by IMD Quintile")
210
 
211
  plt.tight_layout()
212
  plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
213
 
214
 
215
def process_policies(step):
    """Render the grouped policies table as Markdown text.

    Reads ``step["generate_final_documents"]["policies"]`` and groups it by
    the ``themes`` and ``stance`` columns, keeping the groups in their
    original order. Each group becomes a ``## <theme> - <stance>`` section,
    each row of the group becomes a ``### <policy>`` sub-heading, and every
    (detail, doc_id) pair of that row becomes one bullet line.

    Args:
        step: Mapping holding the final graph output; the value under
            ``["generate_final_documents"]["policies"]`` must support
            ``group_by(..., maintain_order=True)`` and yield
            ``((theme, stance), group)`` pairs whose groups offer
            ``rows(named=True)``.
            NOTE(review): presumably a polars DataFrame -- confirm.

    Returns:
        A single string with all sections concatenated; empty when there
        are no groups.
    """
    policies_table = step["generate_final_documents"]["policies"]
    grouped = policies_table.group_by(["themes", "stance"], maintain_order=True)

    sections = []
    for (theme, stance), group in grouped:
        blocks = []
        for record in group.rows(named=True):
            heading = f'\n### {record["policies"]}\n\n'
            # zip pairs each detail with its doc_id positionally.
            bullets = [
                f"- {detail} {doc_id}\n"
                for detail, doc_id in zip(record["details"], record["doc_id"])
            ]
            blocks.append(heading + "".join(bullets))
        details = "".join(blocks)
        sections.append(f"## {theme} - {stance}\n\n{details}\n")
    return "".join(sections)
232
+
233
+
234
def build_quarto_summaries_doc(out):
    """Write a Quarto file listing every document next to its summary.

    For each entry in ``out["generate_final_documents"]["documents"]`` the
    output contains the document index, the original text and the generated
    summary. The result is written to ``Paths.SUMMARY / "Summary_Documents.qmd"``
    behind a fixed Quarto YAML header.

    Args:
        out: Mapping holding the final graph output. Each entry of
            ``["generate_final_documents"]["documents"]`` is indexed with
            ``"document"`` (an object exposing ``metadata["index"]`` and
            ``page_content``) and ``"summary"`` (an object exposing
            ``summary``).

    Returns:
        None. The only effect is the file written to disk.
    """
    entries = out["generate_final_documents"]["documents"]
    # NOTE: identified entities (entry["entities"]) are deliberately left out
    # of the rendered output.
    full_text = "".join(
        f"**Document ID**: {document['document'].metadata['index']}\n\n"
        f"**Original Document**\n\n{document['document'].page_content}\n\n"
        f"**Summarised Document**\n\n{document['summary'].summary}\n\n"
        for document in entries
    )
    quarto_header = (
        "---\n"
        "title: 'Summary Documents'\n"
        "format:\n"
        "  PrettyPDF-pdf:\n"
        "    papersize: A4\n"
        "execute:\n"
        "  freeze: auto\n"
        "  echo: false\n"
        "monofont: 'JetBrains Mono'\n"
        "monofontoptions:\n"
        "  - Scale=0.55\n"
        "---\n\n"
    )
    # Pin UTF-8: the document text is arbitrary and the platform default
    # encoding may fail on, or mis-encode, non-ASCII characters.
    with open(
        Paths.SUMMARY / "Summary_Documents.qmd", "w", encoding="utf-8"
    ) as f:
        f.write(f"{quarto_header}{full_text}")
258
+
259
+
260
  def main():
261
+ docs = read_docs()[:200]
262
  n_docs = len(docs)
263
 
264
  logging.warning(f"{n_docs} documents being processed!")
 
265
  app = create_graph()
266
 
267
  step = None
 
271
  if step is None:
272
  raise ValueError("No steps were processed!")
273
 
274
+ postcodes = process_postcodes(step["generate_final_documents"]["documents"])
275
  wards_pop(postcodes)
276
  imd_bar(postcodes)
277
  build_quarto_doc(doc_title, step)
278
+ build_quarto_summaries_doc(step)
279
  return step
280
 
281
 
 
286
  out = main()
287
  toc = time.time()
288
 
289
+ out["generate_final_documents"]["documents"][0]
290
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")