awacke1 committed on
Commit 35c70df · verified · 1 Parent(s): 75fec96

Create app.py

Files changed (1)
  1. app.py +167 -0
app.py ADDED
@@ -0,0 +1,167 @@
+ import logging
+ import os
+ import base64
+ import datetime
+ import dotenv
+ import pandas as pd
+ import streamlit as st
+ import streamlit.components.v1 as components
+ from annotated_text import annotated_text
+ from streamlit_tags import st_tags
+ from PyPDF2 import PdfReader, PdfWriter
+ from presidio_helpers import (
+     get_supported_entities,
+     analyze,
+     anonymize,
+     annotate,
+     analyzer_engine,
+ )
+
+ st.set_page_config(
+     page_title="Presidio PHI De-identification",
+     layout="wide",
+     initial_sidebar_state="expanded",
+     menu_items={"About": "https://microsoft.github.io/presidio/"},
+ )
+
+ dotenv.load_dotenv()
+ logger = logging.getLogger("presidio-streamlit")
+
+ # Sidebar
+ st.sidebar.header("PHI De-identification with Presidio")
+
+ model_help_text = "Select Named Entity Recognition (NER) model for PHI detection."
+ model_list = [
+     ("spaCy/en_core_web_lg", "https://huggingface.co/spacy/en_core_web_lg"),
+     ("HuggingFace/obi/deid_roberta_i2b2", "https://huggingface.co/obi/deid_roberta_i2b2"),
+     ("flair/ner-english-large", "https://huggingface.co/flair/ner-english-large"),
+     ("HuggingFace/StanfordAIMI/stanford-deidentifier-base", "https://huggingface.co/StanfordAIMI/stanford-deidentifier-base"),
+ ]
+
+ st_model = st.sidebar.selectbox(
+     "NER model package",
+     [model[0] for model in model_list],
+     index=1,
+     help=model_help_text,
+ )
+
+ # Display HuggingFace link for selected model
+ selected_model_url = next(url for model, url in model_list if model == st_model)
+ st.sidebar.markdown(f"[View model on HuggingFace]({selected_model_url})")
+
+ # Extract model package
+ st_model_package = st_model.split("/")[0]
+ st_model = st_model if st_model_package.lower() not in ("spacy", "huggingface") else "/".join(st_model.split("/")[1:])
+
+ analyzer_params = (st_model_package, st_model, "", "")
+ st.sidebar.warning("Note: Models might take some time to download.")
+
+ st_operator = st.sidebar.selectbox(
+     "De-identification approach",
+     ["replace", "redact", "mask"],
+     index=0,
+     help="Select PHI manipulation method.",
+ )
+
+ st_threshold = st.sidebar.slider(
+     label="Acceptance threshold",
+     min_value=0.0,
+     max_value=1.0,
+     value=0.35,
+ )
+
+ st_return_decision_process = st.sidebar.checkbox(
+     "Add analysis explanations",
+     value=False,
+ )
+
+ # Allow and deny lists
+ with st.sidebar.expander("Allowlists and denylists", expanded=False):
+     st_allow_list = st_tags(label="Add words to allowlist", text="Enter word and press enter.")
+     st_deny_list = st_tags(label="Add words to denylist", text="Enter word and press enter.")
+
+ # Main panel
+ col1, col2 = st.columns(2)
+
+ with col1:
+     st.subheader("Input")
+     uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+
+ if uploaded_file:
+     # Read PDF
+     pdf_reader = PdfReader(uploaded_file)
+     text = ""
+     for page in pdf_reader.pages:
+         text += page.extract_text() + "\n"
+
+     # Analyze
+     analyzer = analyzer_engine(*analyzer_params)
+     st_analyze_results = analyze(
+         *analyzer_params,
+         text=text,
+         entities=get_supported_entities(*analyzer_params),
+         language="en",
+         score_threshold=st_threshold,
+         return_decision_process=st_return_decision_process,
+         allow_list=st_allow_list,
+         deny_list=st_deny_list,
+     )
+
+     # Process results
+     phi_types = set(res.entity_type for res in st_analyze_results)
+     if phi_types:
+         st.success(f"Detected PHI types: {', '.join(phi_types)}")
+     else:
+         st.info("No PHI detected")
+
+     # Anonymize
+     anonymized_result = anonymize(
+         text=text,
+         operator=st_operator,
+         analyze_results=st_analyze_results,
+     )
+
+     # Create new PDF (original pages are copied unchanged)
+     pdf_writer = PdfWriter()
+     for page in pdf_reader.pages:
+         pdf_writer.add_page(page)
+
+     # Generate output filename with timestamp
+     timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
+     output_filename = f"{timestamp}_{uploaded_file.name}"
+
+     # Save modified PDF
+     with open(output_filename, "wb") as f:
+         pdf_writer.write(f)
+
+     # Generate base64 download link
+     with open(output_filename, "rb") as f:
+         pdf_bytes = f.read()
+     b64 = base64.b64encode(pdf_bytes).decode()
+     href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
+     st.markdown(href, unsafe_allow_html=True)
+
+     # Display findings
+     with col2:
+         st.subheader("Findings")
+         if st_analyze_results:
+             df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
+             df["text"] = [text[res.start:res.end] for res in st_analyze_results]
+             df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
+                 {
+                     "entity_type": "Entity type",
+                     "text": "Text",
+                     "start": "Start",
+                     "end": "End",
+                     "score": "Confidence",
+                 },
+                 axis=1,
+             )
+             if st_return_decision_process:
+                 analysis_explanation_df = pd.DataFrame.from_records(
+                     [r.analysis_explanation.to_dict() for r in st_analyze_results]
+                 )
+                 df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
+             st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
+         else:
+             st.text("No findings")
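
app.py imports analyze, anonymize, get_supported_entities, annotate, and analyzer_engine from a local presidio_helpers module that is not part of this commit. As a rough guide to the contract app.py appears to expect, here is a minimal, hypothetical sketch built directly on presidio-analyzer and presidio-anonymizer; it ignores the model-selection parameters (model_package, model_path) and always uses Presidio's default spaCy pipeline, so treat it as an assumption rather than the module this Space actually ships.

# presidio_helpers.py -- hypothetical minimal sketch, not the committed module.
from presidio_analyzer import AnalyzerEngine, PatternRecognizer
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig

_analyzer = None

def analyzer_engine(model_package, model_path, ta_key="", ta_endpoint=""):
    """Return a cached AnalyzerEngine; model selection is ignored in this sketch."""
    global _analyzer
    if _analyzer is None:
        _analyzer = AnalyzerEngine()
    return _analyzer

def get_supported_entities(model_package, model_path, ta_key="", ta_endpoint=""):
    """Entity types the analyzer can detect."""
    return analyzer_engine(model_package, model_path, ta_key, ta_endpoint).get_supported_entities()

def analyze(model_package, model_path, ta_key="", ta_endpoint="", deny_list=None, **kwargs):
    """Run PHI detection; a non-empty deny list becomes an ad-hoc recognizer."""
    ad_hoc = []
    if deny_list:
        ad_hoc.append(PatternRecognizer(supported_entity="GENERIC_PII", deny_list=deny_list))
    return analyzer_engine(model_package, model_path, ta_key, ta_endpoint).analyze(
        ad_hoc_recognizers=ad_hoc, **kwargs
    )

def anonymize(text, operator, analyze_results):
    """Apply the chosen operator (replace / redact / mask) to the detected spans."""
    params = None
    if operator == "mask":
        params = {"masking_char": "*", "chars_to_mask": 100, "from_end": False}
    return AnonymizerEngine().anonymize(
        text=text,
        analyzer_results=analyze_results,
        operators={"DEFAULT": OperatorConfig(operator, params)},
    )

def annotate(text, analyze_results):
    """Interleave plain text with (span, entity_type) tuples for annotated_text()."""
    tokens, last = [], 0
    for res in sorted(analyze_results, key=lambda r: r.start):
        tokens.append(text[last:res.start])
        tokens.append((text[res.start:res.end], res.entity_type))
        last = res.end
    tokens.append(text[last:])
    return tokens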
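
The download built above copies the original pages into PdfWriter, so the file offered for download still contains the source text; anonymized_result is computed but never written out. One way to emit the de-identified text instead is sketched below, reusing anonymized_result and timestamp from app.py and assuming the fpdf2 package is installed (source layout, fonts, and images are not preserved).

# Sketch only: regenerate the download from the anonymized text rather than
# copying the original pages. Assumes `pip install fpdf2`.
from fpdf import FPDF

deid_pdf = FPDF()
deid_pdf.add_page()
deid_pdf.set_font("Helvetica", size=11)
for line in anonymized_result.text.splitlines():
    # The built-in PDF fonts cover Latin-1 only, so replace anything outside it.
    safe_line = line.encode("latin-1", "replace").decode("latin-1")
    deid_pdf.multi_cell(0, 6, safe_line)
deid_pdf.output(f"{timestamp}_deidentified.pdf")

The base64 download link can then read this file in place of the PdfWriter output.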