cjber committed on
Commit
7ba5cef
·
1 Parent(s): 07f0f16

fix: remove caching, as cached results don't regenerate automatically with new files

Browse files

Former-commit-id: 22799dd0ee191972843945e1fa918ac0d8667ad4 [formerly b24513f77292e73dffdc51ccc84a096143561a6a]
Former-commit-id: b708c40962ede960a2550b6cbaa34222e1ee9d2a

Files changed (1) hide show
  1. app.py +56 -67
app.py CHANGED
@@ -86,28 +86,16 @@ def upload_and_extract_files():
86
  archive.extractall(path=UPLOAD_DIR)
87
  st.session_state["files_extracted"] = True
88
  st.success(f"Extracted `{len(list(UPLOAD_DIR.glob('*.json')))}` files.")
 
89
  except Exception as e:
90
  st.error(f"Failed to extract files {e}")
91
 
92
 
93
- @st.cache_data
94
- def cache_preprocess_main():
95
- return preprocess_main()
96
-
97
-
98
- @st.cache_data
99
- def cache_process_pdfs():
100
- return azure_process_pdfs()
101
-
102
-
103
- @st.cache_resource
104
- def cache_report_main():
105
- return report_main()
106
-
107
-
108
  def build_report():
109
  """Build the report from extracted files."""
110
  if st.session_state["files_extracted"] and not st.session_state["completed"]:
 
 
111
  st.title("Build Report")
112
  st.write(
113
  "Once the files are extracted, click the button below to build the report."
@@ -115,64 +103,71 @@ def build_report():
115
  if st.button("Build Report", type="primary"):
116
  with st.spinner("Preprocessing files..."):
117
  try:
118
- cache_preprocess_main()
119
  st.success("Preprocessing completed successfully!")
120
  except Exception as e:
121
  st.error(f"An error occurred during preprocessing: {e}")
122
  with st.spinner("Extracting text from PDFs..."):
123
  try:
124
- cache_process_pdfs()
125
  st.success("Text extraction completed successfully!")
126
  except Exception as e:
127
  st.error(f"An error occurred during PDF text extraction: {e}")
128
  with st.spinner("Building report..."):
129
- cache_report_main()
130
  st.session_state["completed"] = True
131
 
132
 
133
  def display_download_buttons():
134
  """Display download buttons for the generated reports."""
135
- if st.session_state["completed"]:
136
- representations_documents = (
137
- pl.scan_parquet(Paths.STAGING / "gcpt3.parquet")
138
- .select(pl.col("representations_document"))
139
- .unique()
140
- .collect()["representations_document"]
141
- .to_list()
142
- )
143
-
144
- st.success("Reports built successfully! Please click download buttons below.")
145
- for rep in representations_documents:
146
- report_path = Paths.SUMMARY / f"Summary_Documents-{rep}.pdf"
147
- summaries_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.pdf"
148
-
149
- col1, col2 = st.columns(2, border=True)
150
- with col1:
151
- with open(summaries_path, "rb") as pdf_file:
152
- st.markdown("**Executive Report Download**")
153
- st.download_button(
154
- label=f"{rep}",
155
- data=pdf_file,
156
- file_name=f"Summary_of_Submitted_Responses-{rep}.pdf",
157
- mime="application/pdf",
158
- type="primary",
159
- )
160
- with col2:
161
- with open(report_path, "rb") as pdf_file:
162
- st.markdown("**Represtations Summary Download**")
163
- st.download_button(
164
- label=f"{rep}",
165
- data=pdf_file,
166
- file_name=f"Summary_Documents-{rep}.pdf",
167
- mime="application/pdf",
168
- type="primary",
169
- )
170
-
171
-
172
- def reset_session_state():
173
- """Reset session state after report generation."""
174
- st.session_state["files_extracted"] = False
175
- st.session_state["completed"] = False
 
 
 
 
 
 
 
176
 
177
 
178
  def main():
@@ -190,14 +185,8 @@ def main():
190
  elif st.session_state["authentication_status"] is None:
191
  st.warning("Please enter your username and password")
192
 
193
- display_download_buttons()
194
-
195
- st.write("---")
196
- if st.session_state.get("completed"):
197
- st.warning(
198
- "You **must** clear the memory if you are processing a new collection of representations."
199
- )
200
- st.button("Clear Memory", on_click=reset_session_state, type="primary")
201
 
202
 
203
  if __name__ == "__main__":
 
86
  archive.extractall(path=UPLOAD_DIR)
87
  st.session_state["files_extracted"] = True
88
  st.success(f"Extracted `{len(list(UPLOAD_DIR.glob('*.json')))}` files.")
89
+ st.session_state["completed"] = False
90
  except Exception as e:
91
  st.error(f"Failed to extract files {e}")
92
 
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  def build_report():
95
  """Build the report from extracted files."""
96
  if st.session_state["files_extracted"] and not st.session_state["completed"]:
97
+ # Remove old files
98
+ _ = [file.unlink() for file in (Paths.OUT / "summaries").rglob("*.pdf")]
99
  st.title("Build Report")
100
  st.write(
101
  "Once the files are extracted, click the button below to build the report."
 
103
  if st.button("Build Report", type="primary"):
104
  with st.spinner("Preprocessing files..."):
105
  try:
106
+ preprocess_main()
107
  st.success("Preprocessing completed successfully!")
108
  except Exception as e:
109
  st.error(f"An error occurred during preprocessing: {e}")
110
  with st.spinner("Extracting text from PDFs..."):
111
  try:
112
+ azure_process_pdfs()
113
  st.success("Text extraction completed successfully!")
114
  except Exception as e:
115
  st.error(f"An error occurred during PDF text extraction: {e}")
116
  with st.spinner("Building report..."):
117
+ report_main()
118
  st.session_state["completed"] = True
119
 
120
 
121
  def display_download_buttons():
122
  """Display download buttons for the generated reports."""
123
+ representations_documents = (
124
+ pl.scan_parquet(Paths.STAGING / "gcpt3.parquet")
125
+ .select(pl.col("representations_document"))
126
+ .unique()
127
+ .collect()["representations_document"]
128
+ .to_list()
129
+ )
130
+
131
+ # remove some old intermediate files
132
+ _ = [file.unlink() for file in (Paths.STAGING / "pdfs_azure").glob("*.pdf")]
133
+ with open((Paths.RAW / "failed_downloads.txt"), "w") as f:
134
+ f.write("")
135
+
136
+ st.success("Reports built successfully! Please click download buttons below.")
137
+ st.write("---")
138
+ st.header("Download Reports")
139
+ st.markdown(
140
+ f"""
141
+ The processing has produced {len(representations_documents)} reports based on the different
142
+ representation documents. The following download buttons provides links to all of these reports,
143
+ alongside summaries for each representation used to form those reports.
144
+ """
145
+ )
146
+ for rep in representations_documents:
147
+ report_path = Paths.SUMMARY / f"Summary_Documents-{rep}.pdf"
148
+ summaries_path = Paths.SUMMARY / f"Summary_of_Submitted_Responses-{rep}.pdf"
149
+
150
+ col1, col2 = st.columns(2, border=True)
151
+ with col1:
152
+ with open(summaries_path, "rb") as pdf_file:
153
+ st.markdown("**Executive Report Download**")
154
+ st.download_button(
155
+ label=f"{rep}",
156
+ data=pdf_file,
157
+ file_name=f"Summary_of_Submitted_Responses-{rep}.pdf",
158
+ mime="application/pdf",
159
+ type="primary",
160
+ )
161
+ with col2:
162
+ with open(report_path, "rb") as pdf_file:
163
+ st.markdown("**Represtations Summary Download**")
164
+ st.download_button(
165
+ label=f"{rep}",
166
+ data=pdf_file,
167
+ file_name=f"Summary_Documents-{rep}.pdf",
168
+ mime="application/pdf",
169
+ type="primary",
170
+ )
171
 
172
 
173
  def main():
 
185
  elif st.session_state["authentication_status"] is None:
186
  st.warning("Please enter your username and password")
187
 
188
+ if st.session_state["completed"]:
189
+ display_download_buttons()
 
 
 
 
 
 
190
 
191
 
192
  if __name__ == "__main__":