Spaces:

cjber
/

planning-ai

Build error

App Files Files Community

cjber commited on Jan 14

Commit

eec0aab

1 Parent(s): e843ca2

add pdf content to documents

Browse files

Files changed (1) hide show

planning_ai/main.py +37 -3

planning_ai/main.py CHANGED Viewed

@@ -1,11 +1,15 @@
 import logging
 import time
 import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
-from langchain_community.document_loaders import PolarsDataFrameLoader
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
@@ -50,13 +54,43 @@ def build_quarto_doc(doc_title, out):
 def read_docs():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
     df = (
         df.filter(
-            pl.col("representations_document") == "Local Plan Issues and Options Report"
         )
         .unique("id")
         .with_row_index()
     )
     loader = PolarsDataFrameLoader(df, page_content_column="text")
     return list(
@@ -146,7 +180,7 @@ def imd_bar(postcodes):
 def main():
-    docs = read_docs()[:500]
     n_docs = len(docs)
     logging.warning(f"{n_docs} documents being processed!")

 import logging
 import time
+from pathlib import Path
 import geopandas as gpd
 import matplotlib.pyplot as plt
 import polars as pl
 from dotenv import load_dotenv
+from langchain_community.document_loaders import (
+    PolarsDataFrameLoader,
+    PyPDFDirectoryLoader,
+)
 from planning_ai.common.utils import Paths
 from planning_ai.graph import create_graph
 def read_docs():
     df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
+    pdf_ids = [
+        int(pdf.stem) if pdf.stem.isdigit() else 0
+        for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
+    ]
+    pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure")
+    out = pdf_loader.load()
+    pdfs_combined = {}
+    for page in out:
+        id = Path(page.metadata["source"]).stem
+        if id in pdfs_combined:
+            pdfs_combined[id] = pdfs_combined[id] + page.page_content
+        else:
+            pdfs_combined[id] = page.page_content
+    pdfs_combined = (
+        pl.from_dict(pdfs_combined)
+        .transpose(include_header=True)
+        .rename({"column": "attachments_id", "column_0": "pdf_text"})
+        .with_columns(pl.col("attachments_id").cast(int))
+    )
     df = (
         df.filter(
+            (
+                pl.col("representations_document")
+                == "Greater Cambridge Local Plan Preferred Options"
+            )
+            & (pl.col("attachments_id").is_in(pdf_ids))
         )
         .unique("id")
         .with_row_index()
     )
+    df = df.join(pdfs_combined, on="attachments_id").with_columns(
+        pl.col("text") + "\n\n" + pl.col("pdf_text")
+    )
     loader = PolarsDataFrameLoader(df, page_content_column="text")
     return list(
 def main():
+    docs = read_docs()
     n_docs = len(docs)
     logging.warning(f"{n_docs} documents being processed!")