awacke1 committed on
Commit
bbda733
·
verified ·
1 Parent(s): 5b4c45e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -79
app.py CHANGED
@@ -5,8 +5,6 @@ import datetime
5
  import dotenv
6
  import pandas as pd
7
  import streamlit as st
8
- import streamlit.components.v1 as components
9
- from annotated_text import annotated_text
10
  from streamlit_tags import st_tags
11
  from PyPDF2 import PdfReader, PdfWriter
12
  from presidio_helpers import (
@@ -53,7 +51,7 @@ st_model_package = st_model.split("/")[0]
53
  st_model = st_model if st_model_package.lower() not in ("spacy", "huggingface") else "/".join(st_model.split("/")[1:])
54
 
55
  analyzer_params = (st_model_package, st_model)
56
- st.sidebar.warning("Note: Models might take some time to download.")
57
 
58
  st_operator = st.sidebar.selectbox(
59
  "De-identification approach",
@@ -87,80 +85,101 @@ with col1:
87
  uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
88
 
89
  if uploaded_file:
90
- # Read PDF
91
- pdf_reader = PdfReader(uploaded_file)
92
- text = ""
93
- for page in pdf_reader.pages:
94
- text += page.extract_text() + "\n"
95
-
96
- # Analyze
97
- analyzer = analyzer_engine(*analyzer_params)
98
- st_analyze_results = analyze(
99
- analyzer=analyzer,
100
- text=text,
101
- entities=get_supported_entities(*analyzer_params),
102
- language="en",
103
- score_threshold=st_threshold,
104
- return_decision_process=st_return_decision_process,
105
- allow_list=st_allow_list,
106
- deny_list=st_deny_list,
107
- )
108
-
109
- # Process results
110
- phi_types = set(res.entity_type for res in st_analyze_results)
111
- if phi_types:
112
- st.success(f"Removed PHI types: {', '.join(phi_types)}")
113
- else:
114
- st.info("No PHI detected")
115
-
116
- # Anonymize
117
- anonymized_result = anonymize(
118
- text=text,
119
- operator=st_operator,
120
- analyze_results=st_analyze_results,
121
- )
122
-
123
- # Create new PDF
124
- pdf_writer = PdfWriter()
125
- for page in pdf_reader.pages:
126
- pdf_writer.add_page(page)
127
-
128
- # Generate output filename with timestamp
129
- timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
130
- output_filename = f"{timestamp}_{uploaded_file.name}"
131
-
132
- # Save modified PDF
133
- with open(output_filename, "wb") as f:
134
- pdf_writer.write(f)
135
-
136
- # Generate base64 download link
137
- with open(output_filename, "rb") as f:
138
- pdf_bytes = f.read()
139
- b64 = base64.b64encode(pdf_bytes).decode()
140
- href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
141
- st.markdown(href, unsafe_allow_html=True)
142
-
143
- # Display findings
144
- with col2:
145
- st.subheader("Findings")
146
- if st_analyze_results:
147
- df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
148
- df["text"] = [text[res.start:res.end] for res in st_analyze_results]
149
- df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
150
- {
151
- "entity_type": "Entity type",
152
- "text": "Text",
153
- "start": "Start",
154
- "end": "End",
155
- "score": "Confidence",
156
- },
157
- axis=1,
158
- )
159
- if st_return_decision_process:
160
- analysis_explanation_df = pd.DataFrame.from_records(
161
- [r.analysis_explanation.to_dict() for r in st_analyze_results]
162
- )
163
- df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
164
- st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
165
  else:
166
- st.text("No findings")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  import dotenv
6
  import pandas as pd
7
  import streamlit as st
 
 
8
  from streamlit_tags import st_tags
9
  from PyPDF2 import PdfReader, PdfWriter
10
  from presidio_helpers import (
 
51
  st_model = st_model if st_model_package.lower() not in ("spacy", "huggingface") else "/".join(st_model.split("/")[1:])
52
 
53
  analyzer_params = (st_model_package, st_model)
54
+ st.sidebar.warning("Note: Models might take some time to download on first run.")
55
 
56
  st_operator = st.sidebar.selectbox(
57
  "De-identification approach",
 
85
  uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
86
 
87
  if uploaded_file:
88
+ try:
89
+ # Read PDF
90
+ pdf_reader = PdfReader(uploaded_file)
91
+ text = ""
92
+ for page in pdf_reader.pages:
93
+ text += page.extract_text() + "\n"
94
+
95
+ # Initialize analyzer
96
+ try:
97
+ analyzer = analyzer_engine(*analyzer_params)
98
+ except Exception as e:
99
+ st.error(f"Failed to load model: {str(e)}")
100
+ st.info("Ensure models are downloaded (e.g., 'python -m spacy download en_core_web_lg') and check network/permissions.")
101
+ raise
102
+
103
+ # Analyze
104
+ st_analyze_results = analyze(
105
+ analyzer=analyzer,
106
+ text=text,
107
+ entities=get_supported_entities(*analyzer_params),
108
+ language="en",
109
+ score_threshold=st_threshold,
110
+ return_decision_process=st_return_decision_process,
111
+ allow_list=st_allow_list,
112
+ deny_list=st_deny_list,
113
+ )
114
+
115
+ # Process results
116
+ phi_types = set(res.entity_type for res in st_analyze_results)
117
+ if phi_types:
118
+ st.success(f"Removed PHI types: {', '.join(phi_types)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  else:
120
+ st.info("No PHI detected")
121
+
122
+ # Anonymize
123
+ anonymized_result = anonymize(
124
+ text=text,
125
+ operator=st_operator,
126
+ analyze_results=st_analyze_results,
127
+ )
128
+
129
+ # Create new PDF
130
+ pdf_writer = PdfWriter()
131
+ for page in pdf_reader.pages:
132
+ pdf_writer.add_page(page)
133
+
134
+ # Generate output filename with timestamp
135
+ timestamp = datetime.datetime.now().strftime("%I%M%p_%d-%m-%y")
136
+ output_filename = f"{timestamp}_{uploaded_file.name}"
137
+
138
+ # Save modified PDF
139
+ try:
140
+ with open(output_filename, "wb") as f:
141
+ pdf_writer.write(f)
142
+ except PermissionError as e:
143
+ st.error(f"Permission denied when saving PDF: {str(e)}")
144
+ st.info("Check write permissions in the current directory.")
145
+ raise
146
+
147
+ # Generate base64 download link
148
+ try:
149
+ with open(output_filename, "rb") as f:
150
+ pdf_bytes = f.read()
151
+ b64 = base64.b64encode(pdf_bytes).decode()
152
+ href = f'<a href="data:application/pdf;base64,{b64}" download="{output_filename}">Download de-identified PDF</a>'
153
+ st.markdown(href, unsafe_allow_html=True)
154
+ except Exception as e:
155
+ st.error(f"Error generating download link: {str(e)}")
156
+ raise
157
+
158
+ # Display findings
159
+ with col2:
160
+ st.subheader("Findings")
161
+ if st_analyze_results:
162
+ df = pd.DataFrame.from_records([r.to_dict() for r in st_analyze_results])
163
+ df["text"] = [text[res.start:res.end] for res in st_analyze_results]
164
+ df_subset = df[["entity_type", "text", "start", "end", "score"]].rename(
165
+ {
166
+ "entity_type": "Entity type",
167
+ "text": "Text",
168
+ "start": "Start",
169
+ "end": "End",
170
+ "score": "Confidence",
171
+ },
172
+ axis=1,
173
+ )
174
+ if st_return_decision_process:
175
+ analysis_explanation_df = pd.DataFrame.from_records(
176
+ [r.analysis_explanation.to_dict() for r in st_analyze_results]
177
+ )
178
+ df_subset = pd.concat([df_subset, analysis_explanation_df], axis=1)
179
+ st.dataframe(df_subset.reset_index(drop=True), use_container_width=True)
180
+ else:
181
+ st.text("No findings")
182
+
183
+ except Exception as e:
184
+ st.error(f"An error occurred: {str(e)}")
185
+ logger.error(f"Processing error: {str(e)}")