cjber committed on
Commit
62a60b6
·
1 Parent(s): ea59777

feat: single click to process doc

Browse files
Files changed (1) hide show
  1. app.py +41 -66
app.py CHANGED
@@ -10,52 +10,48 @@ UPLOAD_DIR = Paths.RAW / "gcpt3"
10
 
11
  if "files_extracted" not in st.session_state:
12
  st.session_state["files_extracted"] = False
13
- if "files_processed" not in st.session_state:
14
- st.session_state["files_processed"] = False
15
- if "pdfs_processed" not in st.session_state:
16
- st.session_state["pdfs_processed"] = False
17
 
18
  st.title("Planning AI")
19
 
20
 
21
- st.header("1. Upload JDL response `.json` files")
22
  st.write(
23
- """
24
- Upload your `.json` files here as a `7zip` file, they will be saved to the `data/raw/gcpt3` directory.
25
-
26
- The `.json` files should look like the following:
27
 
28
- ```json
29
- {
30
- "id": 10008,
31
- "method": "Paper",
32
- "respondentpostcode": "CB2 9NE",
33
- "text": "",
34
- "attachments": [
35
- {
36
- "id": 3803,
37
- "url": "http:\/\/www.cambridge.gov.uk\/public\/ldf\/localplan2031\/15417.pdf",
38
- "published": false
39
- }
40
- ],
41
- "representations": [
42
- {
43
- "id": 15417,
44
- "support\/object": "Object",
45
- "document": "Issues and Options Report",
46
- "documentelementid": 29785,
47
- "documentelementtitle": "3 - Spatial Strategy, Question 3.10",
48
- "summary": "No more green belt taken away, which is prime agricultural land. Noise pollution & light pollution for surrounding villages and new houses being built, no bus services either!"
49
- },
50
- ]
51
- }
52
- ```
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  """
55
- )
56
- uploaded_file = st.file_uploader("Choose a `.7z` file:", type="7z")
57
-
58
- if uploaded_file and not st.session_state["files_extracted"]:
59
  with st.spinner("Extracting files..."):
60
  try:
61
  with py7zr.SevenZipFile(uploaded_file, mode="r") as archive:
@@ -70,49 +66,28 @@ if uploaded_file and not st.session_state["files_extracted"]:
70
  if not st.session_state["files_extracted"]:
71
  st.write("No files uploaded yet.")
72
 
 
 
73
  if st.session_state["files_extracted"]:
74
- st.header("2. Process uploaded `.json` files")
75
  st.write(
76
- "Once the files are extracted, click the button below to start preprocessing the `.json` files."
77
  )
78
- if st.button("Process Files"):
79
- with st.spinner("Running preprocessing..."):
80
  try:
81
  preprocess_main()
82
- st.session_state["files_processed"] = True
83
  st.success("Preprocessing completed successfully!")
84
  except Exception as e:
85
  st.error(f"An error occurred during preprocessing: {e}")
86
-
87
- if st.session_state["files_extracted"] and st.session_state["files_processed"]:
88
- st.header("3. Extract text from PDFs.")
89
- st.write(
90
- "After preprocessing the `.json` files, you can now extract text from the PDFs by clicking the button below."
91
- )
92
- if st.button("Process PDFs"):
93
  with st.spinner("Extracting text from PDFs..."):
94
  try:
95
  azure_process_pdfs()
96
- st.session_state["pdfs_processed"] = True
97
  st.success("Text extraction completed successfully!")
98
  except Exception as e:
99
  st.error(f"An error occurred during PDF text extraction: {e}")
100
-
101
- if (
102
- st.session_state["files_extracted"]
103
- and st.session_state["files_processed"]
104
- and st.session_state["pdfs_processed"]
105
- ):
106
- st.title("Build final report.")
107
- st.write(
108
- "After extracting text from PDFs, you can now run the full report building pipeline!"
109
- )
110
- if st.button("Build Report", type="primary"):
111
  with st.spinner("Building report..."):
112
- try:
113
- report_main()
114
- except Exception as e:
115
- st.error(f"An error occurred during report building: {e}")
116
  report_path = Paths.SUMMARY / "Summary_Documents.pdf"
117
  summaries_path = Paths.SUMMARY / "Summary_of_Submitted_Responses.pdf"
118
 
 
10
 
11
  if "files_extracted" not in st.session_state:
12
  st.session_state["files_extracted"] = False
 
 
 
 
13
 
14
  st.title("Planning AI")
15
 
16
 
17
+ st.header("Upload JDL response `.json` files")
18
  st.write(
19
+ "Upload your `.json` files here as a `7zip` file, they will be saved to the `data/raw/gcpt3` directory."
20
+ )
 
 
21
 
22
+ with st.expander("File Format"):
23
+ st.write(
24
+ """
25
+ The `.json` files should look like the following:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
+ ```json
28
+ {
29
+ "id": 10008,
30
+ "method": "Paper",
31
+ "respondentpostcode": "CB2 9NE",
32
+ "text": "",
33
+ "attachments": [
34
+ {
35
+ "id": 3803,
36
+ "url": "http:\/\/www.cambridge.gov.uk\/public\/ldf\/localplan2031\/15417.pdf",
37
+ "published": false
38
+ }
39
+ ],
40
+ "representations": [
41
+ {
42
+ "id": 15417,
43
+ "support\/object": "Object",
44
+ "document": "Issues and Options Report",
45
+ "documentelementid": 29785,
46
+ "documentelementtitle": "3 - Spatial Strategy, Question 3.10",
47
+ "summary": "No more green belt taken away, which is prime agricultural land. Noise pollution & light pollution for surrounding villages and new houses being built, no bus services either!"
48
+ },
49
+ ]
50
+ }
51
+ ```
52
  """
53
+ )
54
+ if uploaded_file := st.file_uploader("Choose a `.7z` file:", type="7z"):
 
 
55
  with st.spinner("Extracting files..."):
56
  try:
57
  with py7zr.SevenZipFile(uploaded_file, mode="r") as archive:
 
66
  if not st.session_state["files_extracted"]:
67
  st.write("No files uploaded yet.")
68
 
69
+ st.write("---")
70
+
71
  if st.session_state["files_extracted"]:
72
+ st.title("Build Report")
73
  st.write(
74
+ "Once the files are extracted, click the button below to build the report."
75
  )
76
+ if st.button("Build Report", type="primary"):
77
+ with st.spinner("Preprocessing files..."):
78
  try:
79
  preprocess_main()
 
80
  st.success("Preprocessing completed successfully!")
81
  except Exception as e:
82
  st.error(f"An error occurred during preprocessing: {e}")
 
 
 
 
 
 
 
83
  with st.spinner("Extracting text from PDFs..."):
84
  try:
85
  azure_process_pdfs()
 
86
  st.success("Text extraction completed successfully!")
87
  except Exception as e:
88
  st.error(f"An error occurred during PDF text extraction: {e}")
 
 
 
 
 
 
 
 
 
 
 
89
  with st.spinner("Building report..."):
90
+ report_main()
 
 
 
91
  report_path = Paths.SUMMARY / "Summary_Documents.pdf"
92
  summaries_path = Paths.SUMMARY / "Summary_of_Submitted_Responses.pdf"
93