Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -108,91 +108,66 @@ class StreamlitDocProcessor:
|
|
108 |
def upload_and_process_page(self):
|
109 |
st.header("Upload and Process Documents")
|
110 |
|
111 |
-
# Add instructions
|
112 |
-
with st.expander("Instructions", expanded=True):
|
113 |
-
st.write("""
|
114 |
-
1. Click 'Browse files' to select documents
|
115 |
-
2. You can select multiple files at once
|
116 |
-
3. Supported formats: PDF and XML
|
117 |
-
4. Wait for processing to complete
|
118 |
-
5. Processed files will be listed below
|
119 |
-
""")
|
120 |
-
|
121 |
-
# File uploader
|
122 |
uploaded_files = st.file_uploader(
|
123 |
"Upload PDF or XML files",
|
124 |
type=['pdf', 'xml'],
|
125 |
-
accept_multiple_files=True
|
126 |
-
help="Select one or more PDF or XML files to upload"
|
127 |
)
|
128 |
-
|
129 |
if uploaded_files:
|
130 |
for uploaded_file in uploaded_files:
|
131 |
-
# Create progress
|
132 |
-
|
133 |
-
|
134 |
-
progress_bar = st.progress(0)
|
135 |
-
with col2:
|
136 |
-
status_text = st.empty()
|
137 |
|
138 |
-
|
139 |
-
file_ext = os.path.splitext(uploaded_file.name)[1][1:] # Get extension without dot
|
140 |
if uploaded_file.name not in st.session_state.processed_files.get(file_ext, []):
|
141 |
try:
|
142 |
-
# Create a temporary file
|
143 |
with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as tmp_file:
|
144 |
tmp_file.write(uploaded_file.getbuffer())
|
145 |
temp_path = tmp_file.name
|
146 |
-
|
147 |
-
|
148 |
-
status_text.info('Processing...')
|
149 |
-
progress_bar.progress(25)
|
150 |
|
151 |
-
|
152 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
153 |
|
154 |
if result['success']:
|
155 |
if file_ext not in st.session_state.processed_files:
|
156 |
st.session_state.processed_files[file_ext] = []
|
157 |
st.session_state.processed_files[file_ext].append(uploaded_file.name)
|
158 |
progress_bar.progress(100)
|
159 |
-
|
160 |
else:
|
161 |
progress_bar.progress(100)
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
except Exception as e:
|
166 |
-
|
167 |
-
st.error(f"Error processing {uploaded_file.name}: {str(e)}")
|
168 |
finally:
|
169 |
-
# Clean up temporary file
|
170 |
try:
|
171 |
os.unlink(temp_path)
|
172 |
except:
|
173 |
pass
|
174 |
else:
|
175 |
-
|
176 |
progress_bar.progress(100)
|
177 |
|
178 |
-
# Display processed files
|
179 |
-
if any(st.session_state.processed_files.values()):
|
180 |
-
st.subheader("Processed Files")
|
181 |
-
|
182 |
-
col1, col2 = st.columns(2)
|
183 |
-
|
184 |
-
with col1:
|
185 |
-
if st.session_state.processed_files.get('xml'):
|
186 |
-
st.write("π± XML Files:")
|
187 |
-
for file in sorted(st.session_state.processed_files['xml']):
|
188 |
-
st.text(f" β’ {file}")
|
189 |
-
|
190 |
-
with col2:
|
191 |
-
if st.session_state.processed_files.get('pdf'):
|
192 |
-
st.write("π PDF Files:")
|
193 |
-
for file in sorted(st.session_state.processed_files['pdf']):
|
194 |
-
st.text(f" β’ {file}")
|
195 |
-
|
196 |
def qa_page(self):
|
197 |
st.header("Query Documents")
|
198 |
|
|
|
108 |
def upload_and_process_page(self):
    """Render the upload page and process every newly uploaded file.

    Shows a multi-file uploader restricted to PDF and XML. For each
    uploaded file a progress bar and a status placeholder are created,
    the upload is copied to a temporary file on disk, and that path is
    handed to ``st.session_state.processor.process_file``. Files that
    process successfully are recorded by name in
    ``st.session_state.processed_files`` under their (lower-cased)
    extension, so a file already recorded there is not processed again.

    Returns:
        None. All output goes to the Streamlit page and session state.
    """
    st.header("Upload and Process Documents")

    uploaded_files = st.file_uploader(
        "Upload PDF or XML files",
        type=['pdf', 'xml'],
        accept_multiple_files=True,
    )

    if not uploaded_files:
        return

    for uploaded_file in uploaded_files:
        # Per-file progress widgets.
        progress_bar = st.progress(0)
        status_container = st.empty()

        suffix = os.path.splitext(uploaded_file.name)[1]
        # Normalise case so "REPORT.XML" takes the XML branch and is
        # tracked under the same key as "report.xml".
        file_ext = suffix[1:].lower()

        already_processed = st.session_state.processed_files.get(file_ext, [])
        if uploaded_file.name in already_processed:
            status_container.info(f"{uploaded_file.name} has already been processed")
            progress_bar.progress(100)
            continue

        # Bound before the try so the cleanup in `finally` never hits an
        # unassigned name when temp-file creation itself fails.
        temp_path = None
        try:
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
                # Record the path first: if the write fails, the file
                # still gets removed in `finally`.
                temp_path = tmp_file.name
                tmp_file.write(uploaded_file.getbuffer())

            status_container.text(f'Processing {uploaded_file.name}...')

            is_xml = file_ext == 'xml'
            if is_xml:
                status_container.text('Parsing XML...')
                progress_bar.progress(10)

            result = st.session_state.processor.process_file(temp_path)

            if result['success']:
                if is_xml:
                    # NOTE: process_file has already returned here, so this
                    # loop replays chunk progress rather than tracking it live.
                    total_chunks = result['total_chunks']
                    # Guard against a zero chunk count to avoid ZeroDivisionError.
                    divisor = max(total_chunks, 1)
                    for i, _ in enumerate(result['results']):
                        progress = min(95, int(10 + (85 * (i / divisor))))
                        progress_bar.progress(progress)
                        status_container.text(f'Processing chunk {i+1}/{total_chunks}...')

                st.session_state.processed_files.setdefault(file_ext, []).append(
                    uploaded_file.name
                )
                progress_bar.progress(100)
                status_container.success(f"Successfully processed {uploaded_file.name}")
            else:
                progress_bar.progress(100)
                # .get() so a result without an 'error' key still reports
                # the failure instead of raising KeyError.
                error_detail = result.get('error', 'unknown error')
                status_container.error(
                    f"Failed to process {uploaded_file.name}: {error_detail}"
                )

        except Exception as e:
            # UI boundary: surface any processing failure for this file and
            # carry on with the remaining uploads.
            status_container.error(f"Error processing {uploaded_file.name}: {str(e)}")
        finally:
            # Best-effort removal of the temporary copy.
            if temp_path is not None:
                try:
                    os.unlink(temp_path)
                except OSError:
                    pass
|
170 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
def qa_page(self):
|
172 |
st.header("Query Documents")
|
173 |
|