Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -6,10 +6,7 @@ import dotenv
|
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
from streamlit_tags import st_tags
|
9 |
-
import
|
10 |
-
from reportlab.lib.pagesizes import letter
|
11 |
-
from reportlab.platypus import SimpleDocTemplate, Paragraph
|
12 |
-
from reportlab.lib.styles import getSampleStyleSheet
|
13 |
from presidio_helpers import (
|
14 |
analyzer_engine,
|
15 |
get_supported_entities,
|
@@ -85,31 +82,40 @@ def get_timestamp_prefix():
|
|
85 |
now = datetime.now(central)
|
86 |
return now.strftime("%I%M%p_%d-%m-%y").upper()
|
87 |
|
88 |
-
def
|
89 |
-
"""
|
90 |
try:
|
91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
92 |
text = ""
|
93 |
-
for page in
|
94 |
-
|
95 |
-
|
96 |
return text
|
97 |
except Exception as e:
|
98 |
st.error(f"Failed to read PDF: {str(e)}")
|
99 |
return None
|
100 |
|
101 |
-
def create_pdf(text, output_filename):
|
102 |
-
"""Create a PDF with anonymized text using
|
103 |
try:
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
doc.build(story)
|
109 |
-
buffer.seek(0)
|
110 |
with open(output_filename, "wb") as f:
|
111 |
-
|
112 |
-
return
|
113 |
except Exception as e:
|
114 |
st.error(f"Failed to create PDF: {str(e)}")
|
115 |
return None
|
@@ -123,8 +129,13 @@ with col1:
|
|
123 |
|
124 |
if uploaded_file:
|
125 |
try:
|
|
|
|
|
|
|
|
|
|
|
126 |
# Read PDF
|
127 |
-
text = read_pdf(
|
128 |
if not text:
|
129 |
raise ValueError("No text extracted from PDF")
|
130 |
|
@@ -167,15 +178,17 @@ with col1:
|
|
167 |
output_filename = f"{timestamp}_{uploaded_file.name}"
|
168 |
|
169 |
# Create new PDF
|
170 |
-
|
171 |
-
if not
|
172 |
raise ValueError("Failed to generate PDF")
|
173 |
|
174 |
# Generate base64 download link
|
175 |
try:
|
176 |
-
|
177 |
-
|
178 |
-
|
|
|
|
|
179 |
except Exception as e:
|
180 |
st.error(f"Error generating download link: {str(e)}")
|
181 |
raise
|
@@ -205,6 +218,10 @@ with col1:
|
|
205 |
else:
|
206 |
st.text("No findings")
|
207 |
|
|
|
|
|
|
|
|
|
208 |
except Exception as e:
|
209 |
st.error(f"An error occurred: {str(e)}")
|
210 |
logger.error(f"Processing error: {str(e)}")
|
|
|
6 |
import pandas as pd
|
7 |
import streamlit as st
|
8 |
from streamlit_tags import st_tags
|
9 |
+
from PyPDF2 import PdfReader, PdfWriter
|
|
|
|
|
|
|
10 |
from presidio_helpers import (
|
11 |
analyzer_engine,
|
12 |
get_supported_entities,
|
|
|
82 |
now = datetime.now(central)
|
83 |
return now.strftime("%I%M%p_%d-%m-%y").upper()
|
84 |
|
85 |
+
def save_pdf(pdf_input):
    """Save an uploaded PDF into the current working directory.

    Args:
        pdf_input: Uploaded file-like object exposing ``name`` and ``read()``.

    Returns:
        The path the bytes were written to (the bare file name of the
        upload), or ``None`` if saving failed. Failures are reported to the
        user through ``st.error``.
    """
    import os  # function-scope import keeps this block self-contained

    try:
        # The name is supplied by the client: drop any directory components
        # (either separator style) so the write cannot land outside the
        # working directory.
        safe_name = os.path.basename(str(pdf_input.name).replace("\\", "/"))
        if not safe_name or safe_name in {".", ".."}:
            raise ValueError("Uploaded file has no usable name")
        with open(safe_name, "wb") as f:
            f.write(pdf_input.read())
        return safe_name
    except Exception as e:
        st.error(f"Failed to save PDF: {str(e)}")
        return None
|
95 |
+
|
96 |
+
def read_pdf(pdf_path):
    """Extract the text of every page of a PDF using PyPDF2.

    Args:
        pdf_path: Path of the PDF, passed straight to ``PdfReader``.

    Returns:
        The page texts in order, each followed by a newline. An empty string
        is returned when no page yields any non-whitespace text, so callers
        can detect "nothing extracted" with a plain truthiness check.
        ``None`` is returned if reading fails; the failure is reported to the
        user through ``st.error``.
    """
    try:
        reader = PdfReader(pdf_path)
        # A page whose extract_text() result is falsy contributes only its
        # trailing newline.
        text = "".join(
            (page.extract_text() or "") + "\n" for page in reader.pages
        )
        # Without this check a PDF with pages but no extractable text would
        # return a newline-only string, which is truthy and slips past the
        # caller's `if not text` guard.
        return text if text.strip() else ""
    except Exception as e:
        st.error(f"Failed to read PDF: {str(e)}")
        return None
|
108 |
|
109 |
+
def create_pdf(text, input_path, output_filename):
    """Write a page-for-page copy of ``input_path`` to ``output_filename``.

    NOTE(review): ``text`` is accepted but never used. Every page of the
    input PDF is copied unchanged, so the output contains the original
    content rather than the anonymized text. Confirm whether this is
    intended before relying on the result as de-identified.

    Args:
        text: Anonymized text supplied by the caller (currently unused).
        input_path: Path of the source PDF whose pages are copied.
        output_filename: Path the new PDF is written to.

    Returns:
        ``output_filename`` on success, or ``None`` if reading or writing
        fails; the failure is reported to the user through ``st.error``.
    """
    try:
        source = PdfReader(input_path)
        copier = PdfWriter()
        for source_page in source.pages:
            copier.add_page(source_page)
        with open(output_filename, "wb") as out_file:
            copier.write(out_file)
    except Exception as e:
        st.error(f"Failed to create PDF: {str(e)}")
        return None
    return output_filename
|
|
|
129 |
|
130 |
if uploaded_file:
|
131 |
try:
|
132 |
+
# Save PDF to disk
|
133 |
+
pdf_path = save_pdf(uploaded_file)
|
134 |
+
if not pdf_path:
|
135 |
+
raise ValueError("Failed to save PDF")
|
136 |
+
|
137 |
# Read PDF
|
138 |
+
text = read_pdf(pdf_path)
|
139 |
if not text:
|
140 |
raise ValueError("No text extracted from PDF")
|
141 |
|
|
|
178 |
output_filename = f"{timestamp}_{uploaded_file.name}"
|
179 |
|
180 |
# Create new PDF
|
181 |
+
pdf_output = create_pdf(anonymized_result.text, pdf_path, output_filename)
|
182 |
+
if not pdf_output:
|
183 |
raise ValueError("Failed to generate PDF")
|
184 |
|
185 |
# Generate base64 download link
|
186 |
try:
|
187 |
+
with open(output_filename, "rb") as f:
|
188 |
+
pdf_bytes = f.read()
|
189 |
+
b64 = base64.b64encode(pdf_bytes).decode()
|
190 |
+
href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
|
191 |
+
st.markdown(href, unsafe_allow_html=True)
|
192 |
except Exception as e:
|
193 |
st.error(f"Error generating download link: {str(e)}")
|
194 |
raise
|
|
|
218 |
else:
|
219 |
st.text("No findings")
|
220 |
|
221 |
+
# Clean up temporary file
|
222 |
+
if os.path.exists(pdf_path):
|
223 |
+
os.remove(pdf_path)
|
224 |
+
|
225 |
except Exception as e:
|
226 |
st.error(f"An error occurred: {str(e)}")
|
227 |
logger.error(f"Processing error: {str(e)}")
|