cjber committed · Commit ef249f5 · 1 Parent(s): 27250c3

fix: change data files


Former-commit-id: bf8b15fd17fa43a865f2f53f1413514cfe71f9e8 [formerly 3905c023f81520f802a7b5190e1342dcd2e9abc3]
Former-commit-id: 8974bf77bf276e48b87bd7943fb14346271440f4

planning_ai/documents/document.py CHANGED
@@ -290,19 +290,8 @@ def fig_wards(postcodes, rep):
 
 
 def fig_imd(postcodes, rep):
-    imd = pl.read_csv(
-        Paths.RAW / "uk_imd2019.csv", columns=["LSOA", "SOA_decile"]
-    ).with_columns(((pl.col("SOA_decile") - 1) // 2) + 1)
-
-    lsoa_camb = pl.read_parquet(Paths.RAW / "lsoa_camb.parquet")
-
-    imd = imd.join(lsoa_camb, left_on="LSOA", right_on="LSOA11CD")
-    pops = pl.read_excel(
-        Paths.RAW / "sapelsoabroadage20112022.xlsx",
-        sheet_name="Mid-2022 LSOA 2021",
-        read_options={"header_row": 3},
-        columns=["LSOA 2021 Code", "Total"],
-    )
+    imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet")
+    pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet")
     imd = (
         postcodes.join(imd, left_on="LSOA11", right_on="LSOA", how="right")
         .join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
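The joining and cleaning work removed from `fig_imd` presumably now happens in a separate preprocessing step that writes the two new parquet inputs; that step is not part of this commit. A minimal sketch of what it might look like, reusing the logic deleted above (function names and the `raw` argument are illustrative, not from the repo):

```python
from pathlib import Path

import polars as pl


def build_imd_camb(raw: Path) -> None:
    """Collapse IMD 2019 deciles to quintiles and keep only Cambridgeshire LSOAs."""
    imd = (
        pl.read_csv(raw / "uk_imd2019.csv", columns=["LSOA", "SOA_decile"])
        .with_columns(((pl.col("SOA_decile") - 1) // 2) + 1)
        .join(
            pl.read_parquet(raw / "lsoa_camb.parquet"),
            left_on="LSOA",
            right_on="LSOA11CD",
        )
    )
    imd.write_parquet(raw / "imd_camb.parquet")


def build_pops_camb(raw: Path) -> None:
    """Pull mid-2022 LSOA population totals out of the ONS workbook."""
    pops = pl.read_excel(
        raw / "sapelsoabroadage20112022.xlsx",
        sheet_name="Mid-2022 LSOA 2021",
        read_options={"header_row": 3},
        columns=["LSOA 2021 Code", "Total"],
    )
    pops.write_parquet(raw / "pops_camb.parquet")
```

Moving the CSV/Excel parsing out of the figure function means `fig_imd` now only performs the postcode joins, and the slow Excel read happens once during preprocessing rather than every time a report is built.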
planning_ai/main.py CHANGED
@@ -48,7 +48,7 @@ def read_docs(representations_document: str):
             else ""
         ),
     }
-    # for now I concat page number to keep all pdf pages separate. I might want
+    # for now concat page number to keep all pdf pages separate. might want
    # to instead combine pdfs somehow
     pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
 
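The comment refers to the scheme on the line below it: the document id and page number are joined with a literal `999` so each PDF page gets a distinct integer filename. A quick illustration with made-up values:

```python
# illustrative values only; real ids and pages come from pdf.metadata
pdf_id, page = 1234, 5
filename = int(f"{pdf_id}999{page}")
print(filename)  # 12349995 -> id 1234, separator 999, page 5
```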
planning_ai/preprocessing/azure_doc.py CHANGED
@@ -63,32 +63,24 @@ def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
 def azure_process_pdfs():
     pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
 
-    for pdf_path in tqdm(pdfs):
+    for pdf_path in tqdm(list(pdfs)):
         print(f"Processing {pdf_path}")
 
         out_pdf = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.pdf")
         failed_txt = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.txt")
 
         text, reader = read_pdf(pdf_path)
-        if text is None:
-            with open(failed_txt, "w") as f:
-                f.write("")
-            continue
-
-        if len(text) > 10_000:
+        if text is None or len(text) < 1_000:
+            if pdf_path.stat().st_size > 1_000_000:
+                print(f"Processing {pdf_path} failed.")
+                continue
+            try:
+                analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
+            except Exception as e:
+                print(f"Failed to use Azure: {e}")
+        else:
             write_pdf(reader, out_pdf)
 
-        if out_pdf.exists() or failed_txt.exists():
-            continue
-
-        if pdf_path.stat().st_size > 1_000_000:
-            with open(failed_txt, "w") as f:
-                f.write("")
-            print("PDF too large!")
-            continue
-
-        analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
-
 
 if __name__ == "__main__":
     azure_process_pdfs()
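Notes on the new flow: wrapping the glob in `list()` gives `tqdm` a known length so the progress bar shows a total, and PDFs whose embedded text is missing or shorter than 1,000 characters are now sent to Azure Document Intelligence unless the file exceeds 1 MB (in which case they are skipped); the empty `failed_txt` marker files are no longer written. The helpers `read_pdf` and `write_pdf` live elsewhere in the module and are not shown in this commit; a rough sketch of how they might behave, written here with `pypdf` purely as an assumption:

```python
from pathlib import Path

from pypdf import PdfReader, PdfWriter


def read_pdf(pdf_path: Path):
    """Return (extracted_text, reader), or (None, None) if the PDF is unreadable."""
    try:
        reader = PdfReader(pdf_path)
        text = "".join(page.extract_text() or "" for page in reader.pages)
        return text, reader
    except Exception:
        return None, None


def write_pdf(reader: PdfReader, out_pdf: Path) -> None:
    """Copy a PDF with usable embedded text straight into the staging directory."""
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    out_pdf.parent.mkdir(parents=True, exist_ok=True)
    with open(out_pdf, "wb") as f:
        writer.write(f)
```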