cjber committed on
Commit
eec0aab
·
1 Parent(s): e843ca2

add pdf content to documents

Browse files
Files changed (1) hide show
  1. planning_ai/main.py +37 -3
planning_ai/main.py CHANGED
@@ -1,11 +1,15 @@
1
  import logging
2
  import time
 
3
 
4
  import geopandas as gpd
5
  import matplotlib.pyplot as plt
6
  import polars as pl
7
  from dotenv import load_dotenv
8
- from langchain_community.document_loaders import PolarsDataFrameLoader
 
 
 
9
 
10
  from planning_ai.common.utils import Paths
11
  from planning_ai.graph import create_graph
@@ -50,13 +54,43 @@ def build_quarto_doc(doc_title, out):
50
 
51
  def read_docs():
52
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  df = (
54
  df.filter(
55
- pl.col("representations_document") == "Local Plan Issues and Options Report"
 
 
 
 
56
  )
57
  .unique("id")
58
  .with_row_index()
59
  )
 
 
 
 
60
  loader = PolarsDataFrameLoader(df, page_content_column="text")
61
 
62
  return list(
@@ -146,7 +180,7 @@ def imd_bar(postcodes):
146
 
147
 
148
  def main():
149
- docs = read_docs()[:500]
150
  n_docs = len(docs)
151
 
152
  logging.warning(f"{n_docs} documents being processed!")
 
1
  import logging
2
  import time
3
+ from pathlib import Path
4
 
5
  import geopandas as gpd
6
  import matplotlib.pyplot as plt
7
  import polars as pl
8
  from dotenv import load_dotenv
9
+ from langchain_community.document_loaders import (
10
+ PolarsDataFrameLoader,
11
+ PyPDFDirectoryLoader,
12
+ )
13
 
14
  from planning_ai.common.utils import Paths
15
  from planning_ai.graph import create_graph
 
54
 
55
  def read_docs():
56
  df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
57
+ pdf_ids = [
58
+ int(pdf.stem) if pdf.stem.isdigit() else 0
59
+ for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
60
+ ]
61
+ pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure")
62
+ out = pdf_loader.load()
63
+
64
+ pdfs_combined = {}
65
+ for page in out:
66
+ id = Path(page.metadata["source"]).stem
67
+ if id in pdfs_combined:
68
+ pdfs_combined[id] = pdfs_combined[id] + page.page_content
69
+ else:
70
+ pdfs_combined[id] = page.page_content
71
+
72
+ pdfs_combined = (
73
+ pl.from_dict(pdfs_combined)
74
+ .transpose(include_header=True)
75
+ .rename({"column": "attachments_id", "column_0": "pdf_text"})
76
+ .with_columns(pl.col("attachments_id").cast(int))
77
+ )
78
+
79
  df = (
80
  df.filter(
81
+ (
82
+ pl.col("representations_document")
83
+ == "Greater Cambridge Local Plan Preferred Options"
84
+ )
85
+ & (pl.col("attachments_id").is_in(pdf_ids))
86
  )
87
  .unique("id")
88
  .with_row_index()
89
  )
90
+ df = df.join(pdfs_combined, on="attachments_id").with_columns(
91
+ pl.col("text") + "\n\n" + pl.col("pdf_text")
92
+ )
93
+
94
  loader = PolarsDataFrameLoader(df, page_content_column="text")
95
 
96
  return list(
 
180
 
181
 
182
  def main():
183
+ docs = read_docs()
184
  n_docs = len(docs)
185
 
186
  logging.warning(f"{n_docs} documents being processed!")