Spaces:
Sleeping
Sleeping
fix: change data files
Browse files
Former-commit-id: bf8b15fd17fa43a865f2f53f1413514cfe71f9e8 [formerly 3905c023f81520f802a7b5190e1342dcd2e9abc3]
Former-commit-id: 8974bf77bf276e48b87bd7943fb14346271440f4
- planning_ai/documents/document.py +2 -13
- planning_ai/main.py +1 -1
- planning_ai/preprocessing/azure_doc.py +10 -18
planning_ai/documents/document.py
CHANGED
@@ -290,19 +290,8 @@ def fig_wards(postcodes, rep):
|
|
290 |
|
291 |
|
292 |
def fig_imd(postcodes, rep):
|
293 |
-
imd = pl.
|
294 |
-
|
295 |
-
).with_columns(((pl.col("SOA_decile") - 1) // 2) + 1)
|
296 |
-
|
297 |
-
lsoa_camb = pl.read_parquet(Paths.RAW / "lsoa_camb.parquet")
|
298 |
-
|
299 |
-
imd = imd.join(lsoa_camb, left_on="LSOA", right_on="LSOA11CD")
|
300 |
-
pops = pl.read_excel(
|
301 |
-
Paths.RAW / "sapelsoabroadage20112022.xlsx",
|
302 |
-
sheet_name="Mid-2022 LSOA 2021",
|
303 |
-
read_options={"header_row": 3},
|
304 |
-
columns=["LSOA 2021 Code", "Total"],
|
305 |
-
)
|
306 |
imd = (
|
307 |
postcodes.join(imd, left_on="LSOA11", right_on="LSOA", how="right")
|
308 |
.join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
|
|
|
290 |
|
291 |
|
292 |
def fig_imd(postcodes, rep):
|
293 |
+
imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet")
|
294 |
+
pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
295 |
imd = (
|
296 |
postcodes.join(imd, left_on="LSOA11", right_on="LSOA", how="right")
|
297 |
.join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
|
planning_ai/main.py
CHANGED
@@ -48,7 +48,7 @@ def read_docs(representations_document: str):
|
|
48 |
else ""
|
49 |
),
|
50 |
}
|
51 |
-
# for now
|
52 |
# to instead combine pdfs somehow
|
53 |
pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
|
54 |
|
|
|
48 |
else ""
|
49 |
),
|
50 |
}
|
51 |
+
# for now concat page number to keep all pdf pages separate. might want
|
52 |
# to instead combine pdfs somehow
|
53 |
pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
|
54 |
|
planning_ai/preprocessing/azure_doc.py
CHANGED
@@ -63,32 +63,24 @@ def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
|
|
63 |
def azure_process_pdfs():
|
64 |
pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
|
65 |
|
66 |
-
for pdf_path in tqdm(pdfs):
|
67 |
print(f"Processing {pdf_path}")
|
68 |
|
69 |
out_pdf = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.pdf")
|
70 |
failed_txt = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.txt")
|
71 |
|
72 |
text, reader = read_pdf(pdf_path)
|
73 |
-
if text is None:
|
74 |
-
|
75 |
-
f.
|
76 |
-
|
77 |
-
|
78 |
-
|
|
|
|
|
|
|
79 |
write_pdf(reader, out_pdf)
|
80 |
|
81 |
-
if out_pdf.exists() or failed_txt.exists():
|
82 |
-
continue
|
83 |
-
|
84 |
-
if pdf_path.stat().st_size > 1_000_000:
|
85 |
-
with open(failed_txt, "w") as f:
|
86 |
-
f.write("")
|
87 |
-
print("PDF too large!")
|
88 |
-
continue
|
89 |
-
|
90 |
-
analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
|
91 |
-
|
92 |
|
93 |
if __name__ == "__main__":
|
94 |
azure_process_pdfs()
|
|
|
63 |
def azure_process_pdfs():
|
64 |
pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
|
65 |
|
66 |
+
for pdf_path in tqdm(list(pdfs)):
|
67 |
print(f"Processing {pdf_path}")
|
68 |
|
69 |
out_pdf = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.pdf")
|
70 |
failed_txt = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.txt")
|
71 |
|
72 |
text, reader = read_pdf(pdf_path)
|
73 |
+
if text is None or len(text) < 1_000:
|
74 |
+
if pdf_path.stat().st_size > 1_000_000:
|
75 |
+
print(f"Processing {pdf_path} failed.")
|
76 |
+
continue
|
77 |
+
try:
|
78 |
+
analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
|
79 |
+
except Exception as e:
|
80 |
+
print(f"Failed to use Azure: {e}")
|
81 |
+
else:
|
82 |
write_pdf(reader, out_pdf)
|
83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
|
85 |
if __name__ == "__main__":
|
86 |
azure_process_pdfs()
|