cjber committed · Commit ef249f5 · 1 Parent(s): 27250c3

fix: change data files


Former-commit-id: bf8b15fd17fa43a865f2f53f1413514cfe71f9e8 [formerly 3905c023f81520f802a7b5190e1342dcd2e9abc3]
Former-commit-id: 8974bf77bf276e48b87bd7943fb14346271440f4

planning_ai/documents/document.py CHANGED
@@ -290,19 +290,8 @@ def fig_wards(postcodes, rep):
 
 
 def fig_imd(postcodes, rep):
-    imd = pl.read_csv(
-        Paths.RAW / "uk_imd2019.csv", columns=["LSOA", "SOA_decile"]
-    ).with_columns(((pl.col("SOA_decile") - 1) // 2) + 1)
-
-    lsoa_camb = pl.read_parquet(Paths.RAW / "lsoa_camb.parquet")
-
-    imd = imd.join(lsoa_camb, left_on="LSOA", right_on="LSOA11CD")
-    pops = pl.read_excel(
-        Paths.RAW / "sapelsoabroadage20112022.xlsx",
-        sheet_name="Mid-2022 LSOA 2021",
-        read_options={"header_row": 3},
-        columns=["LSOA 2021 Code", "Total"],
-    )
+    imd = pl.read_parquet(Paths.RAW / "imd_camb.parquet")
+    pops = pl.read_parquet(Paths.RAW / "pops_camb.parquet")
     imd = (
         postcodes.join(imd, left_on="LSOA11", right_on="LSOA", how="right")
         .join(pops, left_on="LSOA", right_on="LSOA 2021 Code")
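The joining and cleaning work removed from `fig_imd` presumably now happens in a separate preprocessing step that writes the two new parquet inputs; that step is not part of this commit. A minimal sketch of what it might look like, reusing the logic deleted above (function names and the `raw` argument are illustrative, not from the repo):

```python
from pathlib import Path

import polars as pl


def build_imd_camb(raw: Path) -> None:
    """Collapse IMD 2019 deciles to quintiles and keep only Cambridgeshire LSOAs."""
    imd = (
        pl.read_csv(raw / "uk_imd2019.csv", columns=["LSOA", "SOA_decile"])
        .with_columns(((pl.col("SOA_decile") - 1) // 2) + 1)
        .join(
            pl.read_parquet(raw / "lsoa_camb.parquet"),
            left_on="LSOA",
            right_on="LSOA11CD",
        )
    )
    imd.write_parquet(raw / "imd_camb.parquet")


def build_pops_camb(raw: Path) -> None:
    """Pull mid-2022 LSOA population totals out of the ONS workbook."""
    pops = pl.read_excel(
        raw / "sapelsoabroadage20112022.xlsx",
        sheet_name="Mid-2022 LSOA 2021",
        read_options={"header_row": 3},
        columns=["LSOA 2021 Code", "Total"],
    )
    pops.write_parquet(raw / "pops_camb.parquet")
```

Moving the CSV/Excel parsing out of the figure function means `fig_imd` now only performs the postcode joins, and the slow Excel read happens once during preprocessing rather than every time a report is built.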
planning_ai/main.py CHANGED
@@ -48,7 +48,7 @@ def read_docs(representations_document: str):
             else ""
         ),
     }
-    # for now I concat page number to keep all pdf pages separate. I might want
+    # for now concat page number to keep all pdf pages separate. might want
    # to instead combine pdfs somehow
     pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
 
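The comment refers to the scheme on the line below it: the document id and page number are joined with a literal `999` so each PDF page gets a distinct integer filename. A quick illustration with made-up values:

```python
# illustrative values only; real ids and pages come from pdf.metadata
pdf_id, page = 1234, 5
filename = int(f"{pdf_id}999{page}")
print(filename)  # 12349995 -> id 1234, separator 999, page 5
```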
planning_ai/preprocessing/azure_doc.py CHANGED
@@ -63,32 +63,24 @@ def analyze_document_with_azure(pdf_path, out_pdf, failed_txt):
 def azure_process_pdfs():
     pdfs = (Paths.RAW / "pdfs").glob("*.pdf")
 
-    for pdf_path in tqdm(pdfs):
+    for pdf_path in tqdm(list(pdfs)):
         print(f"Processing {pdf_path}")
 
         out_pdf = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.pdf")
         failed_txt = Path(f"./data/staging/pdfs_azure/{pdf_path.stem}.txt")
 
         text, reader = read_pdf(pdf_path)
-        if text is None:
-            with open(failed_txt, "w") as f:
-                f.write("")
-            continue
-
-        if len(text) > 10_000:
+        if text is None or len(text) < 1_000:
+            if pdf_path.stat().st_size > 1_000_000:
+                print(f"Processing {pdf_path} failed.")
+                continue
+            try:
+                analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
+            except Exception as e:
+                print(f"Failed to use Azure: {e}")
+        else:
             write_pdf(reader, out_pdf)
 
-        if out_pdf.exists() or failed_txt.exists():
-            continue
-
-        if pdf_path.stat().st_size > 1_000_000:
-            with open(failed_txt, "w") as f:
-                f.write("")
-            print("PDF too large!")
-            continue
-
-        analyze_document_with_azure(pdf_path, out_pdf, failed_txt)
-
 
 if __name__ == "__main__":
     azure_process_pdfs()
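Notes on the new flow: wrapping the glob in `list()` gives `tqdm` a known length so the progress bar shows a total, and PDFs whose embedded text is missing or shorter than 1,000 characters are now sent to Azure Document Intelligence unless the file exceeds 1 MB (in which case they are skipped); the empty `failed_txt` marker files are no longer written. The helpers `read_pdf` and `write_pdf` live elsewhere in the module and are not shown in this commit; a rough sketch of how they might behave, written here with `pypdf` purely as an assumption:

```python
from pathlib import Path

from pypdf import PdfReader, PdfWriter


def read_pdf(pdf_path: Path):
    """Return (extracted_text, reader), or (None, None) if the PDF is unreadable."""
    try:
        reader = PdfReader(pdf_path)
        text = "".join(page.extract_text() or "" for page in reader.pages)
        return text, reader
    except Exception:
        return None, None


def write_pdf(reader: PdfReader, out_pdf: Path) -> None:
    """Copy a PDF with usable embedded text straight into the staging directory."""
    writer = PdfWriter()
    for page in reader.pages:
        writer.add_page(page)
    out_pdf.parent.mkdir(parents=True, exist_ok=True)
    with open(out_pdf, "wb") as f:
        writer.write(f)
```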