cjber committed on
Commit
44a2630
·
1 Parent(s): 648f926

remove redundant code

Browse files
Files changed (1) hide show
  1. planning_ai/main.py +111 -149
planning_ai/main.py CHANGED
@@ -1,146 +1,21 @@
1
  import logging
2
- import os
3
- import re
4
  import time
5
- from collections import Counter
6
- from itertools import groupby
7
- from pathlib import Path
8
 
9
- # import geopandas as gpd
10
  import matplotlib.pyplot as plt
11
  import polars as pl
12
  from dotenv import load_dotenv
13
- from langchain_community.document_loaders import (
14
- DirectoryLoader,
15
- PolarsDataFrameLoader,
16
- TextLoader,
17
- )
18
- from langchain_text_splitters import CharacterTextSplitter, markdown
19
 
20
  from planning_ai.common.utils import Paths
21
  from planning_ai.graph import create_graph
22
- from planning_ai.themes import THEMES_AND_POLICIES
23
-
24
- # from opencage.geocoder import OpenCageGeocode
25
-
26
 
27
  load_dotenv()
28
 
29
 
30
- def _geocode_points(x):
31
- api_key = os.getenv("OPENCAGE_API_KEY")
32
- geocoder = OpenCageGeocode(key=api_key)
33
- out = geocoder.geocode(x)
34
- if out:
35
- return out[0]["geometry"]
36
- else:
37
- return {"lat": -99.0, "lng": -99.0}
38
-
39
-
40
- def map_locations(places_df: pl.DataFrame):
41
- lad = gpd.read_file(Paths.RAW / "LAD_BUC_2022.gpkg").to_crs("epsg:4326")
42
- lad_camb = lad[lad["LAD22NM"].str.contains("Cambridge")]
43
- places_df = places_df.with_columns(
44
- pl.col("Place")
45
- .map_elements(
46
- lambda x: _geocode_points(x),
47
- return_dtype=pl.Struct,
48
- )
49
- .alias("geometry")
50
- ).with_columns(pl.col("geometry").struct[0], pl.col("geometry").struct[1])
51
-
52
- places_pd = places_df.to_pandas()
53
- places_gdf = (
54
- gpd.GeoDataFrame(
55
- places_pd,
56
- geometry=gpd.points_from_xy(x=places_df["lng"], y=places_df["lat"]),
57
- )
58
- .set_crs("epsg:4326")
59
- .clip(lad)
60
- )
61
-
62
- _, ax = plt.subplots()
63
- lad.plot(ax=ax, color="white", edgecolor="gray")
64
- lad_camb.plot(ax=ax, color="white", edgecolor="black")
65
- places_gdf.plot(ax=ax, column="Mean Sentiment", markersize=5, legend=True)
66
-
67
- bounds = lad_camb.total_bounds
68
- buffer = 0.1
69
- ax.set_xlim([bounds[0] - buffer, bounds[2] + buffer])
70
- ax.set_ylim([bounds[1] - buffer, bounds[3] + buffer])
71
- plt.axis("off")
72
- plt.savefig(Paths.SUMMARY / "figs" / "places.png")
73
-
74
-
75
  def build_quarto_doc(doc_title, out):
76
  final = out["generate_final_summary"]
77
 
78
- # value_counts = Counter(aims)
79
- # total_values = sum(value_counts.values())
80
- # percentages = {
81
- # key: {"count": count, "percentage": (count / total_values)}
82
- # for key, count in value_counts.items()
83
- # }
84
- # top_5 = sorted(percentages.items(), key=lambda x: x[1]["percentage"], reverse=True)[
85
- # :5
86
- # ]
87
- # thematic_breakdown = "| **Aim** | **Percentage** | **Count** |\n|---|---|---|\n"
88
- # thematic_breakdown += "\n".join(
89
- # [f"| {item} | {d['percentage']:.2%} | {d['count']} |" for item, d in top_5]
90
- # )
91
- #
92
- # places_df = (
93
- # pl.DataFrame(
94
- # [
95
- # place.dict()
96
- # for summary in final["summaries_fixed"]
97
- # for place in summary["summary"].places
98
- # ]
99
- # )
100
- # .group_by("place")
101
- # .agg(
102
- # pl.col("place").len().alias("Count"),
103
- # pl.col("sentiment").mean().alias("Mean Sentiment"),
104
- # )
105
- # .rename({"place": "Place"})
106
- # )
107
- #
108
- # map_locations(places_df)
109
- #
110
- # places_breakdown = (
111
- # places_df.sort("Count", descending=True)
112
- # .head()
113
- # .to_pandas()
114
- # .to_markdown(index=False)
115
- # )
116
- #
117
- # stances = [summary["summary"].stance for summary in final["summaries_fixed"]]
118
- # value_counts = Counter(stances)
119
- # total_values = sum(value_counts.values())
120
- # percentages = {
121
- # key: {"count": count, "percentage": (count / total_values)}
122
- # for key, count in value_counts.items()
123
- # }
124
- # stances_top = sorted(
125
- # percentages.items(), key=lambda x: x[1]["percentage"], reverse=True
126
- # )
127
- # stances_breakdown = " | ".join(
128
- # [
129
- # f"**{item}**: {stance['percentage']:.2%} _({stance['count']})_"
130
- # for item, stance in stances_top
131
- # ]
132
- # )
133
- #
134
- # short_summaries = "\n\n".join(
135
- # [
136
- # f"#### **TODO**\n"
137
- # f"{summary['summary'].summary}\n\n"
138
- # f"**Stance**: {summary['summary'].stance}\n\n"
139
- # f"**Constructiveness**: {summary['summary'].rating}\n\n"
140
- # for summary in final["summaries_fixed"]
141
- # ]
142
- # )
143
-
144
  quarto_doc = (
145
  "---\n"
146
  f"title: '{doc_title}'\n"
@@ -154,21 +29,19 @@ def build_quarto_doc(doc_title, out):
154
  "monofontoptions:\n"
155
  " - Scale=0.55\n"
156
  "---\n\n"
157
- f"{final['final_summary']}\n\n"
158
- f"{final['policies']}"
159
- # f"{executive_summary}\n\n"
160
- # f"{stances_breakdown}\n\n"
161
- # "## Aim Breakdown\n\n"
162
- # "The aim breakdown identifies which aims are mentioned "
163
- # "within each response. "
164
- # "A single response may discuss multiple topics.\n"
165
- # f"\n\n{thematic_breakdown}\n\n"
166
- # f"\n\n{places_breakdown}\n\n"
167
- # f"![Locations mentioned by sentiment](./figs/places.png)\n\n"
168
- # "## Key points raised in support\n\n"
169
- # f"{key_points}\n\n"
170
- # "## Summaries\n"
171
- # f"{short_summaries}"
172
  )
173
 
174
  with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
@@ -177,23 +50,111 @@ def build_quarto_doc(doc_title, out):
177
 
178
  def read_docs():
179
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
180
- df = df.filter(
181
- pl.col("representations_document") == "Local Plan Issues and Options Report"
182
- ).unique("id")
 
 
 
 
183
  loader = PolarsDataFrameLoader(df, page_content_column="text")
184
 
185
  docs = list(
186
  {
187
  doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
188
  for doc in loader.load()
189
- if doc.page_content and len(doc.page_content.split(" ")) > 5
190
  }.values()
191
  )
192
  return docs
193
 
194
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
195
  def main():
196
- docs = read_docs()
197
  n_docs = len(docs)
198
 
199
  logging.warning(f"{n_docs} documents being processed!")
@@ -219,9 +180,10 @@ if __name__ == "__main__":
219
 
220
  tic = time.time()
221
  out = main()
 
 
 
222
  build_quarto_doc(doc_title, out)
223
- print(out["generate_final_summary"]["final_summary"])
224
-
225
  toc = time.time()
226
 
227
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
 
1
  import logging
 
 
2
  import time
 
 
 
3
 
4
+ import geopandas as gpd
5
  import matplotlib.pyplot as plt
6
  import polars as pl
7
  from dotenv import load_dotenv
8
+ from langchain_community.document_loaders import PolarsDataFrameLoader
 
 
 
 
 
9
 
10
  from planning_ai.common.utils import Paths
11
  from planning_ai.graph import create_graph
 
 
 
 
12
 
13
  load_dotenv()
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def build_quarto_doc(doc_title, out):
17
  final = out["generate_final_summary"]
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  quarto_doc = (
20
  "---\n"
21
  f"title: '{doc_title}'\n"
 
29
  "monofontoptions:\n"
30
  " - Scale=0.55\n"
31
  "---\n\n"
32
+ f"{final['executive']}\n\n"
33
+ "# Figures\n\n"
34
+ "Figure @fig-wards shows the percentage of responses by total population"
35
+ " within each Ward that had at least one response.\n\n"
36
+ f"![Ward Proportions](./figs/wards.png){{#fig-wards}}\n\n"
37
+ "Figure @fig-imd shows the percentage of responses by total population"
38
+ " within each IMD quintile.\n\n"
39
+ f"![IMD Quintile Props](./figs/imd_decile.png){{#fig-imd}}\n\n"
40
+ "# Themes and Policies\n\n"
41
+ "## Support\n\n"
42
+ f"{final['policies_support']}"
43
+ "## Object\n\n"
44
+ f"{final['policies_object']}"
 
 
45
  )
46
 
47
  with open(Paths.SUMMARY / f"{doc_title.replace(' ', '_')}.qmd", "w") as f:
 
50
 
51
  def read_docs():
52
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
53
+ df = (
54
+ df.filter(
55
+ pl.col("representations_document") == "Local Plan Issues and Options Report"
56
+ )
57
+ .unique("id")
58
+ .with_row_index()
59
+ )
60
  loader = PolarsDataFrameLoader(df, page_content_column="text")
61
 
62
  docs = list(
63
  {
64
  doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
65
  for doc in loader.load()
66
+ if doc.page_content and len(doc.page_content.split(" ")) > 25
67
  }.values()
68
  )
69
  return docs
70
 
71
 
72
def process_postcodes(documents, onspd_path="./data/raw/onspd/ONSPD_FEB_2024.csv"):
    """Count responses per respondent postcode and attach area codes.

    Args:
        documents: Iterable of dicts whose ``"document"`` value exposes a
            ``metadata`` mapping with a ``"respondentpostcode"`` entry.
        onspd_path: CSV providing the ``PCD``, ``OSWARD`` and ``LSOA11``
            columns. Defaults to the location that was previously hard-coded.

    Returns:
        A polars DataFrame with one row per distinct normalised postcode that
        also appears in the lookup (inner join), holding ``postcode``,
        ``count``, ``PCD``, ``OSWARD`` and ``LSOA11``.
    """
    raw_postcodes = [
        doc["document"].metadata["respondentpostcode"] for doc in documents
    ]

    # Normalise BEFORE counting. Counting first and stripping spaces afterwards
    # leaves "CB1 2AB" and "CB12AB" as two separate rows that both match the
    # same lookup row, which duplicates that postcode in every later join.
    # The explicit string dtype keeps the ``.str`` calls valid when the list is
    # empty or contains only nulls.
    # NOTE(review): assumes postcodes are str or None — confirm against the
    # staged parquet schema.
    counts = (
        pl.Series("postcode", raw_postcodes, dtype=pl.Utf8)
        .str.replace_all(" ", "")
        .str.to_uppercase()
        .value_counts()
    )

    # The lookup key gets the same normalisation, so the match ignores both
    # spacing and letter case.
    onspd = pl.read_csv(onspd_path, columns=["PCD", "OSWARD", "LSOA11"]).with_columns(
        pl.col("PCD").str.replace_all(" ", "").str.to_uppercase().alias("postcode")
    )
    return counts.join(onspd, on="postcode")
84
+
85
+
86
def wards_pop(postcodes):
    """Map responses as a percentage of ward population and save the figure.

    Args:
        postcodes: polars DataFrame with at least ``OSWARD`` (ward code) and
            ``count`` (responses) columns, as returned by
            ``process_postcodes``.

    Writes ``figs/wards.png`` under ``Paths.SUMMARY``. Returns nothing.
    """
    # Total the population observations for each ward code.
    ward_population = (
        pl.read_csv("./data/raw/TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
        .with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
        .group_by("OSWARD")
        .agg(pl.col("Observation").sum())
    )

    # ``postcodes`` has one row per postcode and many postcodes share a ward.
    # Sum the responses per ward before dividing; otherwise every postcode
    # produces its own copy of the ward polygon, coloured by that single
    # postcode's share instead of the ward total.
    ward_responses = (
        postcodes.group_by("OSWARD")
        .agg(pl.col("count").sum())
        .join(ward_population, on="OSWARD")
        .with_columns(((pl.col("count") / pl.col("Observation")) * 100).alias("prop"))
    )

    ward_boundaries = gpd.read_file(
        "./data/raw/Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
    )
    # Inner merge: only wards that received at least one response are drawn.
    ward_boundaries = ward_boundaries.merge(
        ward_responses.to_pandas(), left_on="WD21CD", right_on="OSWARD"
    )

    fig, ax = plt.subplots()
    ward_boundaries.plot(ax=ax, column="prop", legend=True)

    plt.axis("off")
    plt.savefig(Paths.SUMMARY / "figs" / "wards.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
108
+
109
+
110
def imd_bar(postcodes):
    """Plot responses as a percentage of population per IMD quintile.

    Args:
        postcodes: polars DataFrame with at least ``LSOA11`` and ``count``
            columns, as returned by ``process_postcodes``.

    Writes ``figs/imd_decile.png`` under ``Paths.SUMMARY`` (the file name is
    kept because the generated report links to it). Returns nothing.
    """
    # Collapse deciles into quintiles in place: 1-2 -> 1, 3-4 -> 2, and so on.
    # The column keeps the name ``LA_decile`` although it now holds quintiles.
    imd = pl.read_csv(
        "./data/raw/uk_imd2019.csv", columns=["LSOA", "LA_decile"]
    ).with_columns(((pl.col("LA_decile") - 1) // 2) + 1)
    pops = pl.read_excel(
        "./data/raw/sapelsoabroadage20112022.xlsx",
        sheet_name="Mid-2022 LSOA 2021",
        read_options={"header_row": 3},
        columns=["LSOA 2021 Code", "Total"],
    )

    # ``postcodes`` has one row per postcode. Sum responses per LSOA first so
    # that an LSOA's population enters the quintile total once, not once per
    # responding postcode inside it.
    responses_by_lsoa = postcodes.group_by("LSOA11").agg(pl.col("count").sum())

    # NOTE(review): the column names suggest ``LSOA11`` holds 2011 codes while
    # the population sheet is keyed by 2021 codes — confirm the codes line up.
    # NOTE(review): the denominator only covers LSOAs with at least one
    # response (inner joins) — confirm that is the intended population base.
    by_quintile = (
        responses_by_lsoa.join(imd, left_on="LSOA11", right_on="LSOA")
        .join(pops, left_on="LSOA11", right_on="LSOA 2021 Code")
        .group_by("LA_decile")
        .agg(pl.col("count").sum(), pl.col("LSOA11").count(), pl.col("Total").sum())
        .sort("LA_decile")
        .with_columns(((pl.col("count") / pl.col("Total")) * 100).alias("prop"))
    )

    # matplotlib is fed from pandas here.
    by_quintile_pd = by_quintile.to_pandas()

    fig, ax1 = plt.subplots()

    # One bar per quintile showing responses as a share of population.
    ax1.bar(
        by_quintile_pd["LA_decile"],
        by_quintile_pd["prop"],
        label="Percentage of Population (%)",
    )
    ax1.set_xlabel("IMD Quintile")
    ax1.set_ylabel("Proportion of Population (%)")
    ax1.tick_params(axis="y")

    plt.title("Comparison of Responses by IMD Quintile")

    plt.tight_layout()
    plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
154
+
155
+
156
  def main():
157
+ docs = read_docs()[:500]
158
  n_docs = len(docs)
159
 
160
  logging.warning(f"{n_docs} documents being processed!")
 
180
 
181
  tic = time.time()
182
  out = main()
183
+ postcodes = process_postcodes(out["generate_final_summary"]["documents"])
184
+ wards_pop(postcodes)
185
+ imd_bar(postcodes)
186
  build_quarto_doc(doc_title, out)
 
 
187
  toc = time.time()
188
 
189
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")