presidio-de-identify

Running on CPU Upgrade

App Files Files Community

awacke1 committed on Apr 14

Commit

77c02fb

verified ·

1 Parent(s): a525aac

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -26

app.py CHANGED Viewed

@@ -6,10 +6,7 @@ import dotenv
 import pandas as pd
 import streamlit as st
 from streamlit_tags import st_tags
-import fitz
-from reportlab.lib.pagesizes import letter
-from reportlab.platypus import SimpleDocTemplate, Paragraph
-from reportlab.lib.styles import getSampleStyleSheet
 from presidio_helpers import (
     analyzer_engine,
     get_supported_entities,
@@ -85,31 +82,40 @@ def get_timestamp_prefix():
     now = datetime.now(central)
     return now.strftime("%I%M%p_%d-%m-%y").upper()
-def read_pdf(pdf_file):
-    """Read text from a PDF using fitz (PyMuPDF)."""
     try:
-        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
         text = ""
-        for page in doc:
-            text += page.get_text() + "\n"
-        doc.close()
         return text
     except Exception as e:
         st.error(f"Failed to read PDF: {str(e)}")
         return None
-def create_pdf(text, output_filename):
-    """Create a PDF with anonymized text using reportlab."""
     try:
-        buffer = io.BytesIO()
-        doc = SimpleDocTemplate(buffer, pagesize=letter)
-        styles = getSampleStyleSheet()
-        story = [Paragraph(text.replace("\n", "<br/>"), styles["Normal"])]
-        doc.build(story)
-        buffer.seek(0)
         with open(output_filename, "wb") as f:
-            f.write(buffer.getvalue())
-        return buffer.getvalue()
     except Exception as e:
         st.error(f"Failed to create PDF: {str(e)}")
         return None
@@ -123,8 +129,13 @@ with col1:
     if uploaded_file:
         try:
             # Read PDF
-            text = read_pdf(uploaded_file)
             if not text:
                 raise ValueError("No text extracted from PDF")
@@ -167,15 +178,17 @@ with col1:
             output_filename = f"{timestamp}_{uploaded_file.name}"
             # Create new PDF
-            pdf_bytes = create_pdf(anonymized_result.text, output_filename)
-            if not pdf_bytes:
                 raise ValueError("Failed to generate PDF")
             # Generate base64 download link
             try:
-                b64 = base64.b64encode(pdf_bytes).decode()
-                href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
-                st.markdown(href, unsafe_allow_html=True)
             except Exception as e:
                 st.error(f"Error generating download link: {str(e)}")
                 raise
@@ -205,6 +218,10 @@ with col1:
                 else:
                     st.text("No findings")
         except Exception as e:
             st.error(f"An error occurred: {str(e)}")
             logger.error(f"Processing error: {str(e)}")

 import pandas as pd
 import streamlit as st
 from streamlit_tags import st_tags
+from PyPDF2 import PdfReader, PdfWriter
 from presidio_helpers import (
     analyzer_engine,
     get_supported_entities,
     now = datetime.now(central)
     return now.strftime("%I%M%p_%d-%m-%y").upper()
+def save_pdf(pdf_input):
+    """Save uploaded PDF to disk."""
     try:
+        original_name = pdf_input.name
+        with open(original_name, "wb") as f:
+            f.write(pdf_input.read())
+        return original_name
+    except Exception as e:
+        st.error(f"Failed to save PDF: {str(e)}")
+        return None
+def read_pdf(pdf_path):
+    """Read text from a PDF using PyPDF2."""
+    try:
+        reader = PdfReader(pdf_path)
         text = ""
+        for page in reader.pages:
+            page_text = page.extract_text() or ""
+            text += page_text + "\n"
         return text
     except Exception as e:
         st.error(f"Failed to read PDF: {str(e)}")
         return None
+def create_pdf(text, input_path, output_filename):
+    """Create a PDF with anonymized text using PyPDF2."""
     try:
+        reader = PdfReader(input_path)
+        writer = PdfWriter()
+        for page in reader.pages:
+            writer.add_page(page)
         with open(output_filename, "wb") as f:
+            writer.write(f)
+        return output_filename
     except Exception as e:
         st.error(f"Failed to create PDF: {str(e)}")
         return None
     if uploaded_file:
         try:
+            # Save PDF to disk
+            pdf_path = save_pdf(uploaded_file)
+            if not pdf_path:
+                raise ValueError("Failed to save PDF")
             # Read PDF
+            text = read_pdf(pdf_path)
             if not text:
                 raise ValueError("No text extracted from PDF")
             output_filename = f"{timestamp}_{uploaded_file.name}"
             # Create new PDF
+            pdf_output = create_pdf(anonymized_result.text, pdf_path, output_filename)
+            if not pdf_output:
                 raise ValueError("Failed to generate PDF")
             # Generate base64 download link
             try:
+                with open(output_filename, "rb") as f:
+                    pdf_bytes = f.read()
+                    b64 = base64.b64encode(pdf_bytes).decode()
+                    href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
+                    st.markdown(href, unsafe_allow_html=True)
             except Exception as e:
                 st.error(f"Error generating download link: {str(e)}")
                 raise
                 else:
                     st.text("No findings")
+            # Clean up temporary file
+            if os.path.exists(pdf_path):
+                os.remove(pdf_path)
         except Exception as e:
             st.error(f"An error occurred: {str(e)}")
             logger.error(f"Processing error: {str(e)}")