Spaces:
Build error
Build error
fix: remove caching as it doesn't regenerate automatically with new files
Browse files
Former-commit-id: 22799dd0ee191972843945e1fa918ac0d8667ad4 [formerly b24513f77292e73dffdc51ccc84a096143561a6a]
Former-commit-id: b708c40962ede960a2550b6cbaa34222e1ee9d2a
app.py
CHANGED
@@ -86,28 +86,16 @@ def upload_and_extract_files():
|
|
86 |
archive.extractall(path=UPLOAD_DIR)
|
87 |
st.session_state["files_extracted"] = True
|
88 |
st.success(f"Extracted `{len(list(UPLOAD_DIR.glob('*.json')))}` files.")
|
|
|
89 |
except Exception as e:
|
90 |
st.error(f"Failed to extract files {e}")
|
91 |
|
92 |
|
93 |
-
@st.cache_data
|
94 |
-
def cache_preprocess_main():
|
95 |
-
return preprocess_main()
|
96 |
-
|
97 |
-
|
98 |
-
@st.cache_data
|
99 |
-
def cache_process_pdfs():
|
100 |
-
return azure_process_pdfs()
|
101 |
-
|
102 |
-
|
103 |
-
@st.cache_resource
|
104 |
-
def cache_report_main():
|
105 |
-
return report_main()
|
106 |
-
|
107 |
-
|
108 |
def build_report():
|
109 |
"""Build the report from extracted files."""
|
110 |
if st.session_state["files_extracted"] and not st.session_state["completed"]:
|
|
|
|
|
111 |
st.title("Build Report")
|
112 |
st.write(
|
113 |
"Once the files are extracted, click the button below to build the report."
|
@@ -115,64 +103,71 @@ def build_report():
|
|
115 |
if st.button("Build Report", type="primary"):
|
116 |
with st.spinner("Preprocessing files..."):
|
117 |
try:
|
118 |
-
|
119 |
st.success("Preprocessing completed successfully!")
|
120 |
except Exception as e:
|
121 |
st.error(f"An error occurred during preprocessing: {e}")
|
122 |
with st.spinner("Extracting text from PDFs..."):
|
123 |
try:
|
124 |
-
|
125 |
st.success("Text extraction completed successfully!")
|
126 |
except Exception as e:
|
127 |
st.error(f"An error occurred during PDF text extraction: {e}")
|
128 |
with st.spinner("Building report..."):
|
129 |
-
|
130 |
st.session_state["completed"] = True
|
131 |
|
132 |
|
133 |
def display_download_buttons():
|
134 |
"""Display download buttons for the generated reports."""
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
|
139 |
-
|
140 |
-
|
141 |
-
|
142 |
-
|
143 |
-
|
144 |
-
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
|
149 |
-
|
150 |
-
|
151 |
-
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
166 |
-
|
167 |
-
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
|
172 |
-
|
173 |
-
|
174 |
-
|
175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
176 |
|
177 |
|
178 |
def main():
|
@@ -190,14 +185,8 @@ def main():
|
|
190 |
elif st.session_state["authentication_status"] is None:
|
191 |
st.warning("Please enter your username and password")
|
192 |
|
193 |
-
|
194 |
-
|
195 |
-
st.write("---")
|
196 |
-
if st.session_state.get("completed"):
|
197 |
-
st.warning(
|
198 |
-
"You **must** clear the memory if you are processing a new collection of representations."
|
199 |
-
)
|
200 |
-
st.button("Clear Memory", on_click=reset_session_state, type="primary")
|
201 |
|
202 |
|
203 |
if __name__ == "__main__":
|
|
|
86 |
archive.extractall(path=UPLOAD_DIR)
|
87 |
st.session_state["files_extracted"] = True
|
88 |
st.success(f"Extracted `{len(list(UPLOAD_DIR.glob('*.json')))}` files.")
|
89 |
+
st.session_state["completed"] = False
|
90 |
except Exception as e:
|
91 |
st.error(f"Failed to extract files {e}")
|
92 |
|
93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
def _run_stage(spinner_text, stage, error_prefix, success_text=None):
    """Run one pipeline stage under a spinner and report whether it succeeded.

    Args:
        spinner_text: Text shown next to the spinner while ``stage`` runs.
        stage: Zero-argument callable that performs the stage.
        error_prefix: Text placed before the exception message in ``st.error``.
        success_text: Optional message shown with ``st.success`` on success.

    Returns:
        ``True`` if ``stage`` returned without raising, ``False`` otherwise.
    """
    with st.spinner(spinner_text):
        try:
            stage()
        except Exception as e:
            # UI boundary: surface the failure to the user instead of letting
            # the traceback replace the whole page.
            st.error(f"{error_prefix}: {e}")
            return False
        if success_text:
            st.success(success_text)
        return True


def build_report():
    """Build the report from extracted files.

    Does nothing unless files have been extracted and no report has been
    completed yet in this session. Otherwise it deletes previously generated
    summary PDFs, renders the "Build Report" button and, when the button is
    clicked, runs ``preprocess_main``, ``azure_process_pdfs`` and
    ``report_main`` in that order.

    The run stops at the first stage that raises, and
    ``st.session_state["completed"]`` is set to ``True`` only when every stage
    succeeded, so the download section is never shown for a failed build.
    """
    if not st.session_state["files_extracted"] or st.session_state["completed"]:
        return

    # Remove old files so a new build cannot serve summaries from a previous run.
    for stale_pdf in (Paths.OUT / "summaries").rglob("*.pdf"):
        stale_pdf.unlink(missing_ok=True)

    st.title("Build Report")
    st.write(
        "Once the files are extracted, click the button below to build the report."
    )

    if not st.button("Build Report", type="primary"):
        return

    stages = (
        (
            "Preprocessing files...",
            preprocess_main,
            "An error occurred during preprocessing",
            "Preprocessing completed successfully!",
        ),
        (
            "Extracting text from PDFs...",
            azure_process_pdfs,
            "An error occurred during PDF text extraction",
            "Text extraction completed successfully!",
        ),
        (
            "Building report...",
            report_main,
            "An error occurred while building the report",
            None,
        ),
    )
    for spinner_text, stage, error_prefix, success_text in stages:
        if not _run_stage(spinner_text, stage, error_prefix, success_text):
            # Later stages consume the output of earlier ones; do not continue
            # (or mark the build completed) after a failure.
            return

    st.session_state["completed"] = True
|
119 |
|
120 |
|
121 |
def _render_download(heading, label, path, file_name):
    """Render a heading and one PDF download button inside the current container.

    If ``path`` cannot be read, a warning is shown in place of the button so a
    single missing report does not abort the rest of the page.
    """
    st.markdown(heading)
    try:
        with open(path, "rb") as pdf_file:
            pdf_bytes = pdf_file.read()
    except OSError as e:
        st.warning(f"`{file_name}` is not available: {e}")
        return
    st.download_button(
        label=label,
        data=pdf_bytes,
        file_name=file_name,
        mime="application/pdf",
        type="primary",
    )


def display_download_buttons():
    """Display download buttons for the generated reports.

    Reads the distinct ``representations_document`` values from
    ``Paths.STAGING / "gcpt3.parquet"`` and, for each one, renders two
    download buttons side by side: ``Summary_of_Submitted_Responses-<rep>.pdf``
    and ``Summary_Documents-<rep>.pdf``, both looked up in ``Paths.SUMMARY``.

    As a side effect it deletes the intermediate PDFs in
    ``Paths.STAGING / "pdfs_azure"`` and truncates
    ``Paths.RAW / "failed_downloads.txt"``.
    """
    representations_documents = (
        pl.scan_parquet(Paths.STAGING / "gcpt3.parquet")
        .select(pl.col("representations_document"))
        # Keep first-seen order so the buttons do not shuffle between reruns.
        .unique(maintain_order=True)
        .collect()["representations_document"]
        .to_list()
    )

    # remove some old intermediate files
    for intermediate_pdf in (Paths.STAGING / "pdfs_azure").glob("*.pdf"):
        intermediate_pdf.unlink(missing_ok=True)
    # Opening in "w" mode truncates the list of failed downloads.
    with open((Paths.RAW / "failed_downloads.txt"), "w") as f:
        f.write("")

    st.success("Reports built successfully! Please click download buttons below.")
    st.write("---")
    st.header("Download Reports")
    st.markdown(
        f"The processing has produced {len(representations_documents)} reports "
        "based on the different representation documents. The following "
        "download buttons provide links to all of these reports, alongside "
        "summaries for each representation used to form those reports."
    )

    for rep in representations_documents:
        report_name = f"Summary_Documents-{rep}.pdf"
        summaries_name = f"Summary_of_Submitted_Responses-{rep}.pdf"

        col1, col2 = st.columns(2, border=True)
        with col1:
            _render_download(
                "**Executive Report Download**",
                f"{rep}",
                Paths.SUMMARY / summaries_name,
                summaries_name,
            )
        with col2:
            _render_download(
                "**Representations Summary Download**",
                f"{rep}",
                Paths.SUMMARY / report_name,
                report_name,
            )
|
171 |
|
172 |
|
173 |
def main():
|
|
|
185 |
elif st.session_state["authentication_status"] is None:
|
186 |
st.warning("Please enter your username and password")
|
187 |
|
188 |
+
if st.session_state["completed"]:
|
189 |
+
display_download_buttons()
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
|
191 |
|
192 |
if __name__ == "__main__":
|