Commit a56fc0e · 1 parent: c548b99 · committed by cjber

fix: add timer

app.py CHANGED
@@ -1,3 +1,4 @@
+import time
 from os import getenv
 
 import polars as pl
@@ -57,6 +58,10 @@ def initialize_session_state():
         st.session_state["files_extracted"] = False
     if "completed" not in st.session_state:
         st.session_state["completed"] = False
+    if "start_time" not in st.session_state:
+        st.session_state["start_time"] = None
+    if "end_time" not in st.session_state:
+        st.session_state["end_time"] = None
 
 
 def upload_and_extract_files():
@@ -106,7 +111,7 @@ This program allows you to process JDI `.json` files automatically, to extract d
     """
     )
     if uploaded_file := st.file_uploader("Choose a `.7z` file:", type="7z"):
-        with st.spinner("Extracting files..."):
+        with st.spinner("Extracting files...", show_time=True):
             try:
                 # Remove old files
                 _ = [file.unlink() for file in UPLOAD_DIR.glob("*.json")]
@@ -130,21 +135,31 @@ def build_report():
         "Do **not** close this page while the report is being built."
     )
     if st.button("Build Report", type="primary"):
-        with st.spinner("Preprocessing files..."):
+        st.session_state["start_time"] = time.time()
+        with st.spinner("Preprocessing files...", show_time=True):
             try:
                 preprocess_main()
-                st.success("Preprocessing completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Preprocessing completed successfully in {time_taken:.1f} seconds!"
+                )
             except Exception as e:
                 st.error(f"An error occurred during preprocessing: {e}")
-        with st.spinner("Extracting text from PDFs..."):
+        with st.spinner("Extracting text from PDFs...", show_time=True):
             try:
                 azure_process_pdfs()
-                st.success("Text extraction completed successfully!")
+                time_taken = time.time() - st.session_state["start_time"]
+                st.success(
+                    f"Text extraction completed successfully in {time_taken:.1f} seconds!"
+                )
            except Exception as e:
                 st.error(f"An error occurred during PDF text extraction: {e}")
-        with st.spinner("Building report..."):
+        with st.spinner("Building report...", show_time=True):
             report_main()
+        st.session_state["end_time"] = time.time()
         st.session_state["completed"] = True
+        total_time = st.session_state["end_time"] - st.session_state["start_time"]
+        st.success(f"Report building completed in {total_time:.1f} seconds!")
 
 
 def display_download_buttons():
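The app.py change stores a single start timestamp in st.session_state when the button is pressed, passes show_time=True to st.spinner (a live elapsed-time counter available in recent Streamlit releases), and reports elapsed seconds in each st.success message. Note that because every message subtracts the same start_time, the durations shown are cumulative from the button press rather than per stage. A minimal, self-contained sketch of the same pattern (the stage names and slow_stage helper are placeholders, not part of the app):

import time

import streamlit as st


def slow_stage() -> None:
    """Placeholder for preprocess_main(), azure_process_pdfs(), or report_main()."""
    time.sleep(2)


if st.button("Build Report", type="primary"):
    st.session_state["start_time"] = time.time()
    for stage in ("Preprocessing files...", "Extracting text from PDFs...", "Building report..."):
        with st.spinner(stage, show_time=True):  # show_time adds a live timer to the spinner
            slow_stage()
        # Cumulative time since the button press, mirroring the commit's messages
        elapsed = time.time() - st.session_state["start_time"]
        st.success(f"{stage} completed after {elapsed:.1f} seconds")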
planning_ai/chains/prompts/reduce_final.txt CHANGED
@@ -1,9 +1,11 @@
 The following contains a collection of documents that each summarise a different collection of planning responses:
 
+---
+
 {context}
 
-As a representative of the Cambridgeshire Council, your task is to craft a **comprehensive and articulate executive summary**. This summary will serve as the introductory section of a major report, highlighting the key themes and concerns raised in the public responses. Ensure that the summary is clear, concise, and professional, reflecting the tone and standards expected in official council documents. **Do not add, infer, or create information.** Use only the content explicitly mentioned in the above context. Adhere to British English conventions.
+---
 
-Within each summary there are inline citations. When aggregating these summaries into a final executive summary, ensure that these original citations are preserved after each related paragraph. If there are more than 5 citations within a single citation block, keep only the top 5 most relevant citations.
+Your task is to **reduce** this collection of summaries into a single-page **executive summary** that captures the key points raised. This summary **must** preserve the correct inline citations used in the provided summaries. You may combine citations that relate to the same point; do not include more than 5 inline citations for a single point. Attempt to select only the most important points.
 
 Do **not** include any headings. **Only** include the text and citations.
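The rewritten prompt keeps a single {context} placeholder, now set off by --- delimiters. As a rough sketch of how a prompt file like this is typically loaded and filled with LangChain (assumed wiring; this diff does not show the chain code, and the path comes from the repo layout):

from pathlib import Path

from langchain_core.prompts import PromptTemplate

template = Path("planning_ai/chains/prompts/reduce_final.txt").read_text()
prompt = PromptTemplate.from_template(template)

# {context} is replaced with the concatenated map-stage summaries
filled = prompt.format(context="Summary one...\n\nSummary two...")
print(filled)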
planning_ai/documents/document.py CHANGED
@@ -400,6 +400,9 @@ def build_final_report(out, rep):
         f"{str(unused_documents)}\n\n"
         "Documents are excluded if they provide no relevant information. These documents "
         "are typically very short, and contain information that provides no relation to policies or themes."
+        " Unused document numbers relate to the 'id' column for 'text' submissions. For PDFs this 'id' "
+        "is combined with the page number, separated by '999'. For example: 175933-999-12 refers "
+        "to page 12 of the document attached to representation 175933."
     )
 
     out_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.md"
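The added sentences document the id scheme that planning_ai/main.py uses for PDF pages: the attachment id and the page number are concatenated with the literal separator 999 and stored as a single integer. A small sketch of the encoding, using the example values from the string:

# Encoding from read_docs() in planning_ai/main.py
pdf_id, page = 175933, 12
filename = int(f"{pdf_id}999{page}")
assert filename == 17593399912  # written as "175933-999-12" in the report prose

One caveat worth noting: decoding by splitting on "999" is ambiguous whenever the id or page number itself contains that substring.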
planning_ai/main.py CHANGED
@@ -2,7 +2,6 @@ import time
 from pathlib import Path
 
 import polars as pl
-from dotenv import load_dotenv
 from langchain_community.document_loaders import (
     PolarsDataFrameLoader,
     PyPDFDirectoryLoader,
@@ -13,8 +12,6 @@ from planning_ai.documents.document import build_final_report, build_summaries_d
 from planning_ai.graph import create_graph
 from planning_ai.logging import logger
 
-load_dotenv()
-
 
 def read_docs(representations_document: str):
     logger.warning("Reading documents...")
@@ -31,8 +28,11 @@ def read_docs(representations_document: str):
     logger.warning("Loading PDFs...")
     pdfs = pdf_loader.load()
 
+    pdfs_filtered = []
     for pdf in pdfs:
         pdf.metadata["id"] = Path(pdf.metadata["source"]).stem
+        if int(pdf.metadata["id"]) not in df["attachments_id"]:
+            continue
         meta = (
             df.filter(pl.col("attachments_id") == int(pdf.metadata["id"]))
             .select(["respondentpostcode", "representations_support/object"])
@@ -51,13 +51,14 @@
         # for now concat page number to keep all pdf pages separate. might want
         # to instead combine pdfs somehow
         pdf.metadata["filename"] = int(f"{pdf.metadata['id']}999{pdf.metadata['page']}")
+        pdfs_filtered.append(pdf)
 
     df = df.unique("id").with_columns(filename=pl.col("id"))
 
     loader = PolarsDataFrameLoader(df, page_content_column="text")
     logger.warning("Loading text files...")
     text = loader.load()
-    out = text + pdfs
+    out = text + pdfs_filtered
 
     # removes duplicates documents based on page_content
     docs = list(
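Besides dropping the unused dotenv setup, the main.py change skips any loaded PDF whose filename stem has no matching attachments_id row, so only PDFs with metadata end up in the combined output. A minimal sketch of that membership filter with made-up ids (polars Series support the `in` operator directly, as the diff relies on; collecting the ids into a set first is a possible refinement for large inputs, not something this commit does):

import polars as pl

df = pl.DataFrame({"attachments_id": [175933, 180001]})

# Membership test as written in the diff: `in` works on a polars Series
assert 175933 in df["attachments_id"]

# Each `in` check scans the column; a set makes repeated lookups cheap
valid_ids = set(df["attachments_id"].to_list())

pdf_ids = [175933, 424242]  # 424242 stands in for a PDF with no matching row
pdfs_filtered = [i for i in pdf_ids if i in valid_ids]
assert pdfs_filtered == [175933]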