fix: add timer
Files changed:

- app.py (+21 -6)
- planning_ai/chains/prompts/reduce_final.txt (+4 -2)
- planning_ai/documents/document.py (+3 -0)
- planning_ai/main.py (+5 -4)
app.py

```diff
@@ -1,3 +1,4 @@
+import time
 from os import getenv
 
 import polars as pl
@@ -57,6 +58,10 @@ def initialize_session_state():
         st.session_state["files_extracted"] = False
     if "completed" not in st.session_state:
         st.session_state["completed"] = False
+    if "start_time" not in st.session_state:
+        st.session_state["start_time"] = None
+    if "end_time" not in st.session_state:
+        st.session_state["end_time"] = None
 
 
 def upload_and_extract_files():
@@ -106,7 +111,7 @@ This program allows you to process JDI `.json` files automatically, to extract d
         """
     )
     if uploaded_file := st.file_uploader("Choose a `.7z` file:", type="7z"):
-        with st.spinner("Extracting files..."):
+        with st.spinner("Extracting files...", show_time=True):
             try:
                 # Remove old files
                 _ = [file.unlink() for file in UPLOAD_DIR.glob("*.json")]
@@ -130,21 +135,31 @@ def build_report():
         "Do **not** close this page while the report is being built."
     )
     if st.button("Build Report", type="primary"):
-        with st.spinner("Preprocessing files..."):
+        st.session_state["start_time"] = time.time()
+        with st.spinner("Preprocessing files...", show_time=True):
             try:
                 preprocess_main()
-                st.success("Preprocessing completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Preprocessing completed successfully in {time_taken:.1f} seconds!"
+                )
             except Exception as e:
                 st.error(f"An error occurred during preprocessing: {e}")
-        with st.spinner("Extracting text from PDFs..."):
+        with st.spinner("Extracting text from PDFs...", show_time=True):
             try:
                 azure_process_pdfs()
-                st.success("Text extraction completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Text extraction completed successfully in {time_taken:.1f} seconds!"
+                )
             except Exception as e:
                 st.error(f"An error occurred during PDF text extraction: {e}")
-        with st.spinner("Building report..."):
+        with st.spinner("Building report...", show_time=True):
             report_main()
+            st.session_state["end_time"] = time.time()
             st.session_state["completed"] = True
+            total_time = st.session_state["end_time"] - st.session_state["start_time"]
+            st.success(f"Report building completed in {total_time:.1f} seconds!")
 
 
 def display_download_buttons():
```
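The timing change in app.py boils down to: record a start timestamp in `st.session_state` when the button is pressed, then report the elapsed time after each stage. A minimal sketch of that pattern, assuming a recent Streamlit release (the `show_time` argument to `st.spinner` is not available in older versions); `run_stage` and the stand-in stage function are illustrative, not names from the repo:

```python
import time

import streamlit as st

# Keep the start timestamp across reruns so every stage can report elapsed time.
if "start_time" not in st.session_state:
    st.session_state["start_time"] = None


def run_stage(label: str, fn) -> None:
    """Run one pipeline stage inside a spinner and report how long it took."""
    with st.spinner(label, show_time=True):
        try:
            fn()
            elapsed = time.time() - st.session_state["start_time"]
            st.success(f"{label} finished after {elapsed:.1f} seconds.")
        except Exception as e:
            st.error(f"An error occurred: {e}")


if st.button("Build Report", type="primary"):
    st.session_state["start_time"] = time.time()
    run_stage("Preprocessing files...", lambda: time.sleep(1))  # stand-in for preprocess_main()
```

Note that because every stage subtracts the same `start_time`, the figures reported after each stage are cumulative times since the button press rather than per-stage durations; only the final message is a true total.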
planning_ai/chains/prompts/reduce_final.txt

```diff
@@ -1,9 +1,11 @@
 The following contains a collection of documents that each summarise a different collection of planning responses:
 
+---
+
 {context}
 
-
+---
 
-
+Your task is to **reduce** this collection of summaries into a single page **executive summary** that captures the key points raised. This summary **must** preserve the correct inline citations used in the provided summaries. You may combine citations that relate to the same point, do not include more than 5 inline citations for a single point. Attempt to only select the most important.
 
 Do **not** include any headings. **Only** include the text and citations.
```
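For context, a `{context}` placeholder like the one in this prompt is substituted at run time. Below is a hedged sketch of that substitution step using LangChain's prompt templates; the actual chain wiring in planning_ai may differ, and the example summaries are made up:

```python
from pathlib import Path

from langchain_core.prompts import ChatPromptTemplate

# Load the prompt file; {context} becomes the template's only input variable.
template_text = Path("planning_ai/chains/prompts/reduce_final.txt").read_text()
prompt = ChatPromptTemplate.from_template(template_text)

# Stand-ins for the per-batch summaries produced earlier in the pipeline.
summaries = ["Summary of batch A [1].", "Summary of batch B [2]."]
messages = prompt.format_messages(context="\n\n".join(summaries))
print(messages[0].content)
```

The added `---` rules simply fence the injected summaries off from the task text, which should make it easier for the model to tell context apart from instructions.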
planning_ai/documents/document.py

```diff
@@ -400,6 +400,9 @@ def build_final_report(out, rep):
         f"{str(unused_documents)}\n\n"
         "Documents are excluded if they provide no relevant information. These documents "
         "are typically very short, and contain information that provides no relation to policies or themes."
+        "Unused document numbers relate to the 'id' column for 'text' submissions. For pdfs this 'id' "
+        "is combined with the page number, separated by '999'. For example: 175933-999-12 refers "
+        "to page 12 of the document attached to representation 175933."
     )
 
     out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"
```
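The added wording documents the composite ids used for unused PDF pages: the representation id and the page number joined by a literal `999`, as produced in `read_docs` in main.py. A purely illustrative helper for unpacking such an id (not something that exists in the repo):

```python
def split_pdf_filename(filename: int) -> tuple[int, int]:
    """Split a composite id of the form <representation_id>999<page> into its parts.

    The split is ambiguous if the representation id or page number itself
    contains "999"; this assumes neither does.
    """
    rep_id, _, page = str(filename).rpartition("999")
    return int(rep_id), int(page)


assert split_pdf_filename(17593399912) == (175933, 12)
```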
planning_ai/main.py

```diff
@@ -2,7 +2,6 @@ import time
 from pathlib import Path
 
 import polars as pl
-from dotenv import load_dotenv
 from langchain_community.document_loaders import (
     PolarsDataFrameLoader,
     PyPDFDirectoryLoader,
@@ -13,8 +12,6 @@ from planning_ai.documents.document import build_final_report, build_summaries_d
 from planning_ai.graph import create_graph
 from planning_ai.logging import logger
 
-load_dotenv()
-
 
 def read_docs(representations_document: str):
     logger.warning("Reading documents...")
@@ -31,8 +28,11 @@ def read_docs(representations_document: str):
     logger.warning("Loading PDFs...")
     pdfs = pdf_loader.load()
 
+    pdfs_filtered = []
     for pdf in pdfs:
         pdf.metadata["id"] = Path(pdf.metadata["source"]).stem
+        if int(pdf.metadata["id"]) not in df["attachments_id"]:
+            continue
         meta = (
             df.filter(pl.col("attachments_id") == int(pdf.metadata["id"]))
             .select(["respondentpostcode", "representations_support/object"])
@@ -51,13 +51,14 @@ def read_docs(representations_document: str):
         # for now concat page number to keep all pdf pages separate. might want
         # to instead combine pdfs somehow
         pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
+        pdfs_filtered.append(pdf)
 
     df = df.unique("id").with_columns(filename=pl.col("id"))
 
     loader = PolarsDataFrameLoader(df, page_content_column="text")
     logger.warning("Loading text files...")
     text = loader.load()
-    out = text + pdfs
+    out = text + pdfs_filtered
 
     # removes duplicates documents based on page_content
     docs = list(
```
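The new guard skips PDFs whose filename stem has no matching `attachments_id` row, so only matched PDF pages are appended and concatenated with the text documents. Below is a hedged variant of the same filter, not the repo's code, that collects the valid ids into a Python set once instead of testing membership against the polars Series on every iteration (the metadata enrichment from the real loop is omitted):

```python
from pathlib import Path

import polars as pl
from langchain_core.documents import Document


def filter_pdfs(pdfs: list[Document], df: pl.DataFrame) -> list[Document]:
    """Keep only PDFs whose filename stem matches an attachments_id in df."""
    valid_ids = set(df["attachments_id"].drop_nulls().to_list())
    kept = []
    for pdf in pdfs:
        pdf.metadata["id"] = Path(pdf.metadata["source"]).stem
        if int(pdf.metadata["id"]) not in valid_ids:
            continue  # no matching representation row; skip this PDF page
        kept.append(pdf)
    return kept
```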