awacke1 committed on
Commit
77c02fb
·
verified ·
1 Parent(s): a525aac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -26
app.py CHANGED
@@ -6,10 +6,7 @@ import dotenv
6
  import pandas as pd
7
  import streamlit as st
8
  from streamlit_tags import st_tags
9
- import fitz
10
- from reportlab.lib.pagesizes import letter
11
- from reportlab.platypus import SimpleDocTemplate, Paragraph
12
- from reportlab.lib.styles import getSampleStyleSheet
13
  from presidio_helpers import (
14
  analyzer_engine,
15
  get_supported_entities,
@@ -85,31 +82,40 @@ def get_timestamp_prefix():
85
  now = datetime.now(central)
86
  return now.strftime("%I%M%p_%d-%m-%y").upper()
87
 
88
- def read_pdf(pdf_file):
89
- """Read text from a PDF using fitz (PyMuPDF)."""
90
  try:
91
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
 
 
 
 
 
 
 
 
 
 
 
92
  text = ""
93
- for page in doc:
94
- text += page.get_text() + "\n"
95
- doc.close()
96
  return text
97
  except Exception as e:
98
  st.error(f"Failed to read PDF: {str(e)}")
99
  return None
100
 
101
- def create_pdf(text, output_filename):
102
- """Create a PDF with anonymized text using reportlab."""
103
  try:
104
- buffer = io.BytesIO()
105
- doc = SimpleDocTemplate(buffer, pagesize=letter)
106
- styles = getSampleStyleSheet()
107
- story = [Paragraph(text.replace("\n", "<br/>"), styles["Normal"])]
108
- doc.build(story)
109
- buffer.seek(0)
110
  with open(output_filename, "wb") as f:
111
- f.write(buffer.getvalue())
112
- return buffer.getvalue()
113
  except Exception as e:
114
  st.error(f"Failed to create PDF: {str(e)}")
115
  return None
@@ -123,8 +129,13 @@ with col1:
123
 
124
  if uploaded_file:
125
  try:
 
 
 
 
 
126
  # Read PDF
127
- text = read_pdf(uploaded_file)
128
  if not text:
129
  raise ValueError("No text extracted from PDF")
130
 
@@ -167,15 +178,17 @@ with col1:
167
  output_filename = f"{timestamp}_{uploaded_file.name}"
168
 
169
  # Create new PDF
170
- pdf_bytes = create_pdf(anonymized_result.text, output_filename)
171
- if not pdf_bytes:
172
  raise ValueError("Failed to generate PDF")
173
 
174
  # Generate base64 download link
175
  try:
176
- b64 = base64.b64encode(pdf_bytes).decode()
177
- href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
178
- st.markdown(href, unsafe_allow_html=True)
 
 
179
  except Exception as e:
180
  st.error(f"Error generating download link: {str(e)}")
181
  raise
@@ -205,6 +218,10 @@ with col1:
205
  else:
206
  st.text("No findings")
207
 
 
 
 
 
208
  except Exception as e:
209
  st.error(f"An error occurred: {str(e)}")
210
  logger.error(f"Processing error: {str(e)}")
 
6
  import pandas as pd
7
  import streamlit as st
8
  from streamlit_tags import st_tags
9
+ from PyPDF2 import PdfReader, PdfWriter
 
 
 
10
  from presidio_helpers import (
11
  analyzer_engine,
12
  get_supported_entities,
 
82
  now = datetime.now(central)
83
  return now.strftime("%I%M%p_%d-%m-%y").upper()
84
 
85
+ def save_pdf(pdf_input):
86
+ """Save uploaded PDF to disk."""
87
  try:
88
+ original_name = pdf_input.name
89
+ with open(original_name, "wb") as f:
90
+ f.write(pdf_input.read())
91
+ return original_name
92
+ except Exception as e:
93
+ st.error(f"Failed to save PDF: {str(e)}")
94
+ return None
95
+
96
+ def read_pdf(pdf_path):
97
+ """Read text from a PDF using PyPDF2."""
98
+ try:
99
+ reader = PdfReader(pdf_path)
100
  text = ""
101
+ for page in reader.pages:
102
+ page_text = page.extract_text() or ""
103
+ text += page_text + "\n"
104
  return text
105
  except Exception as e:
106
  st.error(f"Failed to read PDF: {str(e)}")
107
  return None
108
 
109
+ def create_pdf(text, input_path, output_filename):
110
+ """Create a PDF with anonymized text using PyPDF2."""
111
  try:
112
+ reader = PdfReader(input_path)
113
+ writer = PdfWriter()
114
+ for page in reader.pages:
115
+ writer.add_page(page)
 
 
116
  with open(output_filename, "wb") as f:
117
+ writer.write(f)
118
+ return output_filename
119
  except Exception as e:
120
  st.error(f"Failed to create PDF: {str(e)}")
121
  return None
 
129
 
130
  if uploaded_file:
131
  try:
132
+ # Save PDF to disk
133
+ pdf_path = save_pdf(uploaded_file)
134
+ if not pdf_path:
135
+ raise ValueError("Failed to save PDF")
136
+
137
  # Read PDF
138
+ text = read_pdf(pdf_path)
139
  if not text:
140
  raise ValueError("No text extracted from PDF")
141
 
 
178
  output_filename = f"{timestamp}_{uploaded_file.name}"
179
 
180
  # Create new PDF
181
+ pdf_output = create_pdf(anonymized_result.text, pdf_path, output_filename)
182
+ if not pdf_output:
183
  raise ValueError("Failed to generate PDF")
184
 
185
  # Generate base64 download link
186
  try:
187
+ with open(output_filename, "rb") as f:
188
+ pdf_bytes = f.read()
189
+ b64 = base64.b64encode(pdf_bytes).decode()
190
+ href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
191
+ st.markdown(href, unsafe_allow_html=True)
192
  except Exception as e:
193
  st.error(f"Error generating download link: {str(e)}")
194
  raise
 
218
  else:
219
  st.text("No findings")
220
 
221
+ # Clean up temporary file
222
+ if os.path.exists(pdf_path):
223
+ os.remove(pdf_path)
224
+
225
  except Exception as e:
226
  st.error(f"An error occurred: {str(e)}")
227
  logger.error(f"Processing error: {str(e)}")