Spaces:
Build error
Build error
add pdf content to documents
Browse files — planning_ai/main.py (+37 -3)
planning_ai/main.py
CHANGED
@@ -1,11 +1,15 @@
|
|
1 |
import logging
|
2 |
import time
|
|
|
3 |
|
4 |
import geopandas as gpd
|
5 |
import matplotlib.pyplot as plt
|
6 |
import polars as pl
|
7 |
from dotenv import load_dotenv
|
8 |
-
from langchain_community.document_loaders import PolarsDataFrameLoader
|
|
|
|
|
|
|
9 |
|
10 |
from planning_ai.common.utils import Paths
|
11 |
from planning_ai.graph import create_graph
|
@@ -50,13 +54,43 @@ def build_quarto_doc(doc_title, out):
|
|
50 |
|
51 |
def read_docs():
|
52 |
df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
df = (
|
54 |
df.filter(
|
55 |
-
|
|
|
|
|
|
|
|
|
56 |
)
|
57 |
.unique("id")
|
58 |
.with_row_index()
|
59 |
)
|
|
|
|
|
|
|
|
|
60 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
61 |
|
62 |
return list(
|
@@ -146,7 +180,7 @@ def imd_bar(postcodes):
|
|
146 |
|
147 |
|
148 |
def main():
|
149 |
-
docs = read_docs()
|
150 |
n_docs = len(docs)
|
151 |
|
152 |
logging.warning(f"{n_docs} documents being processed!")
|
|
|
1 |
import logging
|
2 |
import time
|
3 |
+
from pathlib import Path
|
4 |
|
5 |
import geopandas as gpd
|
6 |
import matplotlib.pyplot as plt
|
7 |
import polars as pl
|
8 |
from dotenv import load_dotenv
|
9 |
+
from langchain_community.document_loaders import (
|
10 |
+
PolarsDataFrameLoader,
|
11 |
+
PyPDFDirectoryLoader,
|
12 |
+
)
|
13 |
|
14 |
from planning_ai.common.utils import Paths
|
15 |
from planning_ai.graph import create_graph
|
|
|
54 |
|
55 |
def read_docs():
|
56 |
df = pl.read_parquet(Paths.STAGING / "gcpt3.parquet")
|
57 |
+
pdf_ids = [
|
58 |
+
int(pdf.stem) if pdf.stem.isdigit() else 0
|
59 |
+
for pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf")
|
60 |
+
]
|
61 |
+
pdf_loader = PyPDFDirectoryLoader(Paths.STAGING / "pdfs_azure")
|
62 |
+
out = pdf_loader.load()
|
63 |
+
|
64 |
+
pdfs_combined = {}
|
65 |
+
for page in out:
|
66 |
+
id = Path(page.metadata["source"]).stem
|
67 |
+
if id in pdfs_combined:
|
68 |
+
pdfs_combined[id] = pdfs_combined[id] + page.page_content
|
69 |
+
else:
|
70 |
+
pdfs_combined[id] = page.page_content
|
71 |
+
|
72 |
+
pdfs_combined = (
|
73 |
+
pl.from_dict(pdfs_combined)
|
74 |
+
.transpose(include_header=True)
|
75 |
+
.rename({"column": "attachments_id", "column_0": "pdf_text"})
|
76 |
+
.with_columns(pl.col("attachments_id").cast(int))
|
77 |
+
)
|
78 |
+
|
79 |
df = (
|
80 |
df.filter(
|
81 |
+
(
|
82 |
+
pl.col("representations_document")
|
83 |
+
== "Greater Cambridge Local Plan Preferred Options"
|
84 |
+
)
|
85 |
+
& (pl.col("attachments_id").is_in(pdf_ids))
|
86 |
)
|
87 |
.unique("id")
|
88 |
.with_row_index()
|
89 |
)
|
90 |
+
df = df.join(pdfs_combined, on="attachments_id").with_columns(
|
91 |
+
pl.col("text") + "\n\n" + pl.col("pdf_text")
|
92 |
+
)
|
93 |
+
|
94 |
loader = PolarsDataFrameLoader(df, page_content_column="text")
|
95 |
|
96 |
return list(
|
|
|
180 |
|
181 |
|
182 |
def main():
|
183 |
+
docs = read_docs()
|
184 |
n_docs = len(docs)
|
185 |
|
186 |
logging.warning(f"{n_docs} documents being processed!")
|