Spaces:
Build error
Build error
refactor to use Paths class
Browse files
- planning_ai/main.py +12 -24
planning_ai/main.py
CHANGED
@@ -59,14 +59,13 @@ def read_docs():
|
|
59 |
)
|
60 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
61 |
|
62 |
-
|
63 |
{
|
64 |
doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
|
65 |
for doc in loader.load()
|
66 |
if doc.page_content and len(doc.page_content.split(" ")) > 25
|
67 |
}.values()
|
68 |
)
|
69 |
-
return docs
|
70 |
|
71 |
|
72 |
def process_postcodes(documents):
|
@@ -77,7 +76,7 @@ def process_postcodes(documents):
|
|
77 |
.with_columns(pl.col("postcode").str.replace_all(" ", ""))
|
78 |
)
|
79 |
onspd = pl.read_csv(
|
80 |
-
"
|
81 |
).with_columns(pl.col("PCD").str.replace_all(" ", "").alias("postcode"))
|
82 |
postcodes = postcodes.join(onspd, on="postcode")
|
83 |
return postcodes
|
@@ -85,7 +84,7 @@ def process_postcodes(documents):
|
|
85 |
|
86 |
def wards_pop(postcodes):
|
87 |
wards = (
|
88 |
-
pl.read_csv("
|
89 |
.with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
|
90 |
.group_by("OSWARD")
|
91 |
.sum()
|
@@ -94,7 +93,7 @@ def wards_pop(postcodes):
|
|
94 |
((pl.col("count") / pl.col("Observation")) * 100).alias("prop")
|
95 |
)
|
96 |
ward_boundaries = gpd.read_file(
|
97 |
-
"
|
98 |
)
|
99 |
ward_boundaries = ward_boundaries.merge(
|
100 |
postcodes.to_pandas(), left_on="WD21CD", right_on="OSWARD"
|
@@ -108,18 +107,16 @@ def wards_pop(postcodes):
|
|
108 |
|
109 |
|
110 |
def imd_bar(postcodes):
|
111 |
-
# Load the IMD data
|
112 |
imd = pl.read_csv(
|
113 |
-
"
|
114 |
).with_columns(((pl.col("LA_decile") - 1) // 2) + 1)
|
115 |
pops = pl.read_excel(
|
116 |
-
"
|
117 |
sheet_name="Mid-2022 LSOA 2021",
|
118 |
read_options={"header_row": 3},
|
119 |
columns=["LSOA 2021 Code", "Total"],
|
120 |
)
|
121 |
|
122 |
-
# Join the postcodes data with IMD decile data
|
123 |
postcodes = (
|
124 |
postcodes.join(imd, left_on="LSOA11", right_on="LSOA")
|
125 |
.join(pops, left_on="LSOA11", right_on="LSOA 2021 Code")
|
@@ -129,13 +126,10 @@ def imd_bar(postcodes):
|
|
129 |
.with_columns(((pl.col("count") / pl.col("Total")) * 100).alias("prop"))
|
130 |
)
|
131 |
|
132 |
-
# Convert the Polars DataFrame to a Pandas DataFrame for plotting
|
133 |
postcodes_pd = postcodes.to_pandas()
|
134 |
|
135 |
-
|
136 |
-
fig, ax1 = plt.subplots()
|
137 |
|
138 |
-
# Plot the number of responses
|
139 |
ax1.bar(
|
140 |
postcodes_pd["LA_decile"],
|
141 |
postcodes_pd["prop"],
|
@@ -147,10 +141,8 @@ def imd_bar(postcodes):
|
|
147 |
|
148 |
plt.title("Comparison of Responses by IMD Decile")
|
149 |
|
150 |
-
# Save the figure
|
151 |
plt.tight_layout()
|
152 |
plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
|
153 |
-
# plt.show()
|
154 |
|
155 |
|
156 |
def main():
|
@@ -159,11 +151,6 @@ def main():
|
|
159 |
|
160 |
logging.warning(f"{n_docs} documents being processed!")
|
161 |
|
162 |
-
# text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
|
163 |
-
# chunk_size=10_240, chunk_overlap=0
|
164 |
-
# )
|
165 |
-
# split_docs = text_splitter.split_documents(docs)
|
166 |
-
|
167 |
app = create_graph()
|
168 |
|
169 |
step = None
|
@@ -172,6 +159,11 @@ def main():
|
|
172 |
|
173 |
if step is None:
|
174 |
raise ValueError("No steps were processed!")
|
|
|
|
|
|
|
|
|
|
|
175 |
return step
|
176 |
|
177 |
|
@@ -180,10 +172,6 @@ if __name__ == "__main__":
|
|
180 |
|
181 |
tic = time.time()
|
182 |
out = main()
|
183 |
-
postcodes = process_postcodes(out["generate_final_summary"]["documents"])
|
184 |
-
wards_pop(postcodes)
|
185 |
-
imd_bar(postcodes)
|
186 |
-
build_quarto_doc(doc_title, out)
|
187 |
toc = time.time()
|
188 |
|
189 |
print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
|
|
|
59 |
)
|
60 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
61 |
|
62 |
+
return list(
|
63 |
{
|
64 |
doc.page_content: {"document": doc, "filename": doc.metadata["id"]}
|
65 |
for doc in loader.load()
|
66 |
if doc.page_content and len(doc.page_content.split(" ")) > 25
|
67 |
}.values()
|
68 |
)
|
|
|
69 |
|
70 |
|
71 |
def process_postcodes(documents):
|
|
|
76 |
.with_columns(pl.col("postcode").str.replace_all(" ", ""))
|
77 |
)
|
78 |
onspd = pl.read_csv(
|
79 |
+
Paths.RAW / "onspd" / "ONSPD_FEB_2024.csv", columns=["PCD", "OSWARD", "LSOA11"]
|
80 |
).with_columns(pl.col("PCD").str.replace_all(" ", "").alias("postcode"))
|
81 |
postcodes = postcodes.join(onspd, on="postcode")
|
82 |
return postcodes
|
|
|
84 |
|
85 |
def wards_pop(postcodes):
|
86 |
wards = (
|
87 |
+
pl.read_csv(Paths.RAW / "TS001-2021-3-filtered-2025-01-09T11_07_15Z.csv")
|
88 |
.with_columns(pl.col("Electoral wards and divisions Code").alias("OSWARD"))
|
89 |
.group_by("OSWARD")
|
90 |
.sum()
|
|
|
93 |
((pl.col("count") / pl.col("Observation")) * 100).alias("prop")
|
94 |
)
|
95 |
ward_boundaries = gpd.read_file(
|
96 |
+
Paths.RAW / "Wards_December_2021_GB_BFE_2022_7523259277605796091.zip"
|
97 |
)
|
98 |
ward_boundaries = ward_boundaries.merge(
|
99 |
postcodes.to_pandas(), left_on="WD21CD", right_on="OSWARD"
|
|
|
107 |
|
108 |
|
109 |
def imd_bar(postcodes):
|
|
|
110 |
imd = pl.read_csv(
|
111 |
+
Paths.RAW / "uk_imd2019.csv", columns=["LSOA", "LA_decile"]
|
112 |
).with_columns(((pl.col("LA_decile") - 1) // 2) + 1)
|
113 |
pops = pl.read_excel(
|
114 |
+
Paths.RAW / "sapelsoabroadage20112022.xlsx",
|
115 |
sheet_name="Mid-2022 LSOA 2021",
|
116 |
read_options={"header_row": 3},
|
117 |
columns=["LSOA 2021 Code", "Total"],
|
118 |
)
|
119 |
|
|
|
120 |
postcodes = (
|
121 |
postcodes.join(imd, left_on="LSOA11", right_on="LSOA")
|
122 |
.join(pops, left_on="LSOA11", right_on="LSOA 2021 Code")
|
|
|
126 |
.with_columns(((pl.col("count") / pl.col("Total")) * 100).alias("prop"))
|
127 |
)
|
128 |
|
|
|
129 |
postcodes_pd = postcodes.to_pandas()
|
130 |
|
131 |
+
_, ax1 = plt.subplots()
|
|
|
132 |
|
|
|
133 |
ax1.bar(
|
134 |
postcodes_pd["LA_decile"],
|
135 |
postcodes_pd["prop"],
|
|
|
141 |
|
142 |
plt.title("Comparison of Responses by IMD Decile")
|
143 |
|
|
|
144 |
plt.tight_layout()
|
145 |
plt.savefig(Paths.SUMMARY / "figs" / "imd_decile.png")
|
|
|
146 |
|
147 |
|
148 |
def main():
|
|
|
151 |
|
152 |
logging.warning(f"{n_docs} documents being processed!")
|
153 |
|
|
|
|
|
|
|
|
|
|
|
154 |
app = create_graph()
|
155 |
|
156 |
step = None
|
|
|
159 |
|
160 |
if step is None:
|
161 |
raise ValueError("No steps were processed!")
|
162 |
+
|
163 |
+
postcodes = process_postcodes(step["generate_final_summary"]["documents"])
|
164 |
+
wards_pop(postcodes)
|
165 |
+
imd_bar(postcodes)
|
166 |
+
build_quarto_doc(doc_title, step)
|
167 |
return step
|
168 |
|
169 |
|
|
|
172 |
|
173 |
tic = time.time()
|
174 |
out = main()
|
|
|
|
|
|
|
|
|
175 |
toc = time.time()
|
176 |
|
177 |
print(f"Time taken: {(toc - tic) / 60:.2f} minutes.")
|