Commit a56fc0e · 1 parent: c548b99 · committed by cjber

fix: add timer

app.py CHANGED
@@ -1,3 +1,4 @@
+import time
 from os import getenv
 
 import polars as pl
@@ -57,6 +58,10 @@ def initialize_session_state():
         st.session_state["files_extracted"] = False
     if "completed" not in st.session_state:
         st.session_state["completed"] = False
+    if "start_time" not in st.session_state:
+        st.session_state["start_time"] = None
+    if "end_time" not in st.session_state:
+        st.session_state["end_time"] = None
 
 
 def upload_and_extract_files():
@@ -106,7 +111,7 @@ This program allows you to process JDI `.json` files automatically, to extract d
     """
     )
     if uploaded_file := st.file_uploader("Choose a `.7z` file:", type="7z"):
-        with st.spinner("Extracting files..."):
+        with st.spinner("Extracting files...", show_time=True):
             try:
                 # Remove old files
                 _ = [file.unlink() for file in UPLOAD_DIR.glob("*.json")]
@@ -130,21 +135,31 @@ def build_report():
         "Do **not** close this page while the report is being built."
     )
     if st.button("Build Report", type="primary"):
-        with st.spinner("Preprocessing files..."):
+        st.session_state["start_time"] = time.time()
+        with st.spinner("Preprocessing files...", show_time=True):
             try:
                 preprocess_main()
-                st.success("Preprocessing completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Preprocessing completed successfully in {time_taken:.1f} seconds!"
+                )
             except Exception as e:
                 st.error(f"An error occurred during preprocessing: {e}")
-        with st.spinner("Extracting text from PDFs..."):
+        with st.spinner("Extracting text from PDFs...", show_time=True):
             try:
                 azure_process_pdfs()
-                st.success("Text extraction completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Text extraction completed successfully in {time_taken:.1f} seconds!"
+                )
            except Exception as e:
                 st.error(f"An error occurred during PDF text extraction: {e}")
-        with st.spinner("Building report..."):
+        with st.spinner("Building report...", show_time=True):
             report_main()
+        st.session_state["end_time"] = time.time()
         st.session_state["completed"] = True
+        total_time = st.session_state["end_time"] - st.session_state["start_time"]
+        st.success(f"Report building completed in {total_time:.1f} seconds!")
 
 
 def display_download_buttons():
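The app.py change stores a single start timestamp in st.session_state when the button is pressed, passes show_time=True to st.spinner (a live elapsed-time counter available in recent Streamlit releases), and reports elapsed seconds in each st.success message. Note that because every message subtracts the same start_time, the durations shown are cumulative from the button press rather than per stage. A minimal, self-contained sketch of the same pattern (the stage names and slow_stage helper are placeholders, not part of the app):

import time

import streamlit as st


def slow_stage() -> None:
    """Placeholder for preprocess_main(), azure_process_pdfs(), or report_main()."""
    time.sleep(2)


if st.button("Build Report", type="primary"):
    st.session_state["start_time"] = time.time()
    for stage in ("Preprocessing files...", "Extracting text from PDFs...", "Building report..."):
        with st.spinner(stage, show_time=True):  # show_time adds a live timer to the spinner
            slow_stage()
        # Cumulative time since the button press, mirroring the commit's messages
        elapsed = time.time() - st.session_state["start_time"]
        st.success(f"{stage} completed after {elapsed:.1f} seconds")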
planning_ai/chains/prompts/reduce_final.txt CHANGED
@@ -1,9 +1,11 @@
 The following contains a collection of documents that each summarise a different collection of planning responses:
 
+---
+
 {context}
 
-As a representative of the Cambridgeshire Council, your task is to craft a **comprehensive and articulate executive summary**. This summary will serve as the introductory section of a major report, highlighting the key themes and concerns raised in the public responses. Ensure that the summary is clear, concise, and professional, reflecting the tone and standards expected in official council documents. **Do not add, infer, or create information.** Use only the content explicitly mentioned in the above context. Adhere to British English conventions.
+---
 
-Within each summary there are inline citations. When aggregating these summaries into a final executive summary, ensure that these original citations are preserved after each related paragraph. If there are more than 5 citations within a single citation block, keep only the top 5 most relevant citations.
+Your task is to **reduce** this collection of summaries into a single-page **executive summary** that captures the key points raised. This summary **must** preserve the correct inline citations used in the provided summaries. You may combine citations that relate to the same point; do not include more than 5 inline citations for a single point. Attempt to select only the most important points.
 
 Do **not** include any headings. **Only** include the text and citations.
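The rewritten prompt keeps a single {context} placeholder, now set off by --- delimiters. As a rough sketch of how a prompt file like this is typically loaded and filled with LangChain (assumed wiring; this diff does not show the chain code, and the path comes from the repo layout):

from pathlib import Path

from langchain_core.prompts import PromptTemplate

template = Path("planning_ai/chains/prompts/reduce_final.txt").read_text()
prompt = PromptTemplate.from_template(template)

# {context} is replaced with the concatenated map-stage summaries
filled = prompt.format(context="Summary one...\n\nSummary two...")
print(filled)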
planning_ai/documents/document.py CHANGED
@@ -400,6 +400,9 @@ def build_final_report(out, rep):
         f"{str(unused_documents)}\n\n"
         "Documents are excluded if they provide no relevant information. These documents "
         "are typically very short, and contain information that provides no relation to policies or themes."
+        " Unused document numbers relate to the 'id' column for 'text' submissions. For PDFs this 'id' "
+        "is combined with the page number, separated by '999'. For example: 175933-999-12 refers "
+        "to page 12 of the document attached to representation 175933."
     )
 
     out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"
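The added sentences document the id scheme that planning_ai/main.py uses for PDF pages: the attachment id and the page number are concatenated with the literal separator 999 and stored as a single integer. A small sketch of the encoding, using the example values from the string:

# Encoding from read_docs() in planning_ai/main.py
pdf_id, page = 175933, 12
filename = int(f"{pdf_id}999{page}")
assert filename == 17593399912  # written as "175933-999-12" in the report prose

One caveat worth noting: decoding by splitting on "999" is ambiguous whenever the id or page number itself contains that substring.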
planning_ai/main.py CHANGED
@@ -2,7 +2,6 @@ import time
 from pathlib import Path
 
 import polars as pl
-from dotenv import load_dotenv
 from langchain_community.document_loaders import (
     PolarsDataFrameLoader,
     PyPDFDirectoryLoader,
@@ -13,8 +12,6 @@ from planning_ai.documents.document import build_final_report, build_summaries_d
 from planning_ai.graph import create_graph
 from planning_ai.logging import logger
 
-load_dotenv()
-
 
 def read_docs(representations_document: str):
     logger.warning("Reading documents...")
@@ -31,8 +28,11 @@ def read_docs(representations_document: str):
     logger.warning("Loading PDFs...")
     pdfs = pdf_loader.load()
 
+    pdfs_filtered = []
     for pdf in pdfs:
         pdf.metadata["id"] = Path(pdf.metadata["source"]).stem
+        if int(pdf.metadata["id"]) not in df["attachments_id"]:
+            continue
         meta = (
             df.filter(pl.col("attachments_id") == int(pdf.metadata["id"]))
             .select(["respondentpostcode", "representations_support/object"])
@@ -51,13 +51,14 @@
         # for now concat page number to keep all pdf pages separate. might want
         # to instead combine pdfs somehow
         pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
+        pdfs_filtered.append(pdf)
 
     df = df.unique("id").with_columns(filename=pl.col("id"))
 
     loader = PolarsDataFrameLoader(df, page_content_column="text")
     logger.warning("Loading text files...")
     text = loader.load()
-    out = text + pdfs
+    out = text + pdfs_filtered
 
     # removes duplicates documents based on page_content
     docs = list(
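Besides dropping the unused dotenv setup, the main.py change skips any loaded PDF whose filename stem has no matching attachments_id row, so only PDFs with metadata end up in the combined output. A minimal sketch of that membership filter with made-up ids (polars Series support the `in` operator directly, as the diff relies on; collecting the ids into a set first is a possible refinement for large inputs, not something this commit does):

import polars as pl

df = pl.DataFrame({"attachments_id": [175933, 180001]})

# Membership test as written in the diff: `in` works on a polars Series
assert 175933 in df["attachments_id"]

# Each `in` check scans the column; a set makes repeated lookups cheap
valid_ids = set(df["attachments_id"].to_list())

pdf_ids = [175933, 424242]  # 424242 stands in for a PDF with no matching row
pdfs_filtered = [i for i in pdf_ids if i in valid_ids]
assert pdfs_filtered == [175933]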