cjber committed on
Commit
2d9d5d2
·
1 Parent(s): 76aaf8d

refactor to use Paths class

Browse files
Files changed (1) hide show
  1. planning_ai/main.py +12 -24
planning_ai/main.py CHANGED
@@ -59,14 +59,13 @@ def read_docs():
59
  )
60
  loader = PolarsDataFrameLoader(df, page_content_column="text")
61
 
62
- docs = list(
63
  {
64
  doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
65
  for doc in loader.load()
66
  if doc.page_content and len(doc.page_content.split(" ")) > 25
67
  }.values()
68
  )
69
- return docs
70
 
71
 
72
  def process_postcodes(documents):
@@ -77,7 +76,7 @@ def process_postcodes(documents):
77
  .with_columns(pl.col("postcode").str.replace_all(" ", ""))
78
  )
79
  onspd = pl.read_csv(
80
- "./data/raw/onspd/ONSPD_FEB_2024.csv", columns=["PCD", "OSWARD", "LSOA11"]
81
  ).with_columns(pl.col("PCD").str.replace_all(" ", "").alias("postcode"))
82
  postcodes = postcodes.join(onspd, on="postcode")
83
  return postcodes
@@ -85,7 +84,7 @@ def process_postcodes(documents):
85
 
86
  def wards_pop(postcodes):
87
  wards = (
88
- pl.read_csv("./data/raw/TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
89
  .with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
90
  .group_by("OSWARD")
91
  .sum()
@@ -94,7 +93,7 @@ def wards_pop(postcodes):
94
  ((pl.col("count") / pl.col("Observation")) * 100).alias("prop")
95
  )
96
  ward_boundaries = gpd.read_file(
97
- "./data/raw/Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
98
  )
99
  ward_boundaries = ward_boundaries.merge(
100
  postcodes.to_pandas(), left_on="WD21CD", right_on="OSWARD"
@@ -108,18 +107,16 @@ def wards_pop(postcodes):
108
 
109
 
110
  def imd_bar(postcodes):
111
- # Load the IMD data
112
  imd = pl.read_csv(
113
- "./data/raw/uk_imd2019.csv", columns=["LSOA", "LA_decile"]
114
  ).with_columns(((pl.col("LA_decile") - 1) // 2) + 1)
115
  pops = pl.read_excel(
116
- "./data/raw/sapelsoabroadage20112022.xlsx",
117
  sheet_name="Mid-2022 LSOA 2021",
118
  read_options={"header_row": 3},
119
  columns=["LSOA 2021 Code", "Total"],
120
  )
121
 
122
- # Join the postcodes data with IMD decile data
123
  postcodes = (
124
  postcodes.join(imd, left_on="LSOA11", right_on="LSOA")
125
  .join(pops, left_on="LSOA11", right_on="LSOA 2021 Code")
@@ -129,13 +126,10 @@ def imd_bar(postcodes):
129
  .with_columns(((pl.col("count") / pl.col("Total")) * 100).alias("prop"))
130
  )
131
 
132
- # Convert the Polars DataFrame to a Pandas DataFrame for plotting
133
  postcodes_pd = postcodes.to_pandas()
134
 
135
- # Create a figure with two y-axes
136
- fig, ax1 = plt.subplots()
137
 
138
- # Plot the number of responses
139
  ax1.bar(
140
  postcodes_pd["LA_decile"],
141
  postcodes_pd["prop"],
@@ -147,10 +141,8 @@ def imd_bar(postcodes):
147
 
148
  plt.title("Comparison of Responses by IMD Decile")
149
 
150
- # Save the figure
151
  plt.tight_layout()
152
  plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
153
- # plt.show()
154
 
155
 
156
  def main():
@@ -159,11 +151,6 @@ def main():
159
 
160
  logging.warning(f"{n_docs} documents being processed!")
161
 
162
- # text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
163
- # chunk_size=10_240, chunk_overlap=0
164
- # )
165
- # split_docs = text_splitter.split_documents(docs)
166
-
167
  app = create_graph()
168
 
169
  step = None
@@ -172,6 +159,11 @@ def main():
172
 
173
  if step is None:
174
  raise ValueError("No steps were processed!")
 
 
 
 
 
175
  return step
176
 
177
 
@@ -180,10 +172,6 @@ if __name__ == "__main__":
180
 
181
  tic = time.time()
182
  out = main()
183
- postcodes = process_postcodes(out["generate_final_summary"]["documents"])
184
- wards_pop(postcodes)
185
- imd_bar(postcodes)
186
- build_quarto_doc(doc_title, out)
187
  toc = time.time()
188
 
189
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
 
59
  )
60
  loader = PolarsDataFrameLoader(df, page_content_column="text")
61
 
62
+ return list(
63
  {
64
  doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
65
  for doc in loader.load()
66
  if doc.page_content and len(doc.page_content.split(" ")) > 25
67
  }.values()
68
  )
 
69
 
70
 
71
  def process_postcodes(documents):
 
76
  .with_columns(pl.col("postcode").str.replace_all(" ", ""))
77
  )
78
  onspd = pl.read_csv(
79
+ Paths.RAW / "onspd" / "ONSPD_FEB_2024.csv", columns=["PCD", "OSWARD", "LSOA11"]
80
  ).with_columns(pl.col("PCD").str.replace_all(" ", "").alias("postcode"))
81
  postcodes = postcodes.join(onspd, on="postcode")
82
  return postcodes
 
84
 
85
  def wards_pop(postcodes):
86
  wards = (
87
+ pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
88
  .with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
89
  .group_by("OSWARD")
90
  .sum()
 
93
  ((pl.col("count") / pl.col("Observation")) * 100).alias("prop")
94
  )
95
  ward_boundaries = gpd.read_file(
96
+ Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
97
  )
98
  ward_boundaries = ward_boundaries.merge(
99
  postcodes.to_pandas(), left_on="WD21CD", right_on="OSWARD"
 
107
 
108
 
109
  def imd_bar(postcodes):
 
110
  imd = pl.read_csv(
111
+ Paths.RAW / "uk_imd2019.csv", columns=["LSOA", "LA_decile"]
112
  ).with_columns(((pl.col("LA_decile") - 1) // 2) + 1)
113
  pops = pl.read_excel(
114
+ Paths.RAW / "sapelsoabroadage20112022.xlsx",
115
  sheet_name="Mid-2022 LSOA 2021",
116
  read_options={"header_row": 3},
117
  columns=["LSOA 2021 Code", "Total"],
118
  )
119
 
 
120
  postcodes = (
121
  postcodes.join(imd, left_on="LSOA11", right_on="LSOA")
122
  .join(pops, left_on="LSOA11", right_on="LSOA 2021 Code")
 
126
  .with_columns(((pl.col("count") / pl.col("Total")) * 100).alias("prop"))
127
  )
128
 
 
129
  postcodes_pd = postcodes.to_pandas()
130
 
131
+ _, ax1 = plt.subplots()
 
132
 
 
133
  ax1.bar(
134
  postcodes_pd["LA_decile"],
135
  postcodes_pd["prop"],
 
141
 
142
  plt.title("Comparison of Responses by IMD Decile")
143
 
 
144
  plt.tight_layout()
145
  plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
 
146
 
147
 
148
  def main():
 
151
 
152
  logging.warning(f"{n_docs} documents being processed!")
153
 
 
 
 
 
 
154
  app = create_graph()
155
 
156
  step = None
 
159
 
160
  if step is None:
161
  raise ValueError("No steps were processed!")
162
+
163
+ postcodes = process_postcodes(step["generate_final_summary"]["documents"])
164
+ wards_pop(postcodes)
165
+ imd_bar(postcodes)
166
+ build_quarto_doc(doc_title, step)
167
  return step
168
 
169
 
 
172
 
173
  tic = time.time()
174
  out = main()
 
 
 
 
175
  toc = time.time()
176
 
177
  print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")