Commit
·
a265560
1
Parent(s):
cb349ad
Added tab to be able to compare pages across multiple documents and redact duplicates
Browse files- Dockerfile +3 -0
- app.py +40 -13
- requirements.txt +1 -0
- tools/file_redaction.py +38 -204
- tools/find_duplicate_pages.py +274 -0
- tools/helper_functions.py +1 -1
- tools/redaction_review.py +61 -1
Dockerfile
CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
|
|
|
|
|
|
63 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
64 |
COPY entrypoint.sh /entrypoint.sh
|
65 |
|
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
63 |
+
# Download NLTK data packages
|
64 |
+
RUN python -m nltk.downloader punkt stopwords punkt_tab
|
65 |
+
|
66 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
67 |
COPY entrypoint.sh /entrypoint.sh
|
68 |
|
app.py
CHANGED
@@ -19,6 +19,7 @@ from tools.data_anonymise import anonymise_data_files
|
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
|
|
22 |
|
23 |
today_rev = datetime.now().strftime("%Y%m%d")
|
24 |
|
@@ -68,9 +69,9 @@ with app:
|
|
68 |
all_image_annotations_state = gr.State([])
|
69 |
|
70 |
|
71 |
-
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
72 |
-
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
73 |
-
review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
74 |
|
75 |
session_hash_state = gr.State()
|
76 |
s3_output_folder_state = gr.State()
|
@@ -129,16 +130,16 @@ with app:
|
|
129 |
## Settings page variables
|
130 |
default_allow_list_file_name = "default_allow_list.csv"
|
131 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
132 |
-
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
|
133 |
|
134 |
default_deny_list_file_name = "default_deny_list.csv"
|
135 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
136 |
-
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
|
137 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
138 |
|
139 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
140 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
141 |
-
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
|
142 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
143 |
|
144 |
# S3 settings for default allow list load
|
@@ -149,6 +150,10 @@ with app:
|
|
149 |
# Base dataframe for recognisers that is not modified subsequent to load
|
150 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
151 |
|
|
|
|
|
|
|
|
|
152 |
###
|
153 |
# UI DESIGN
|
154 |
###
|
@@ -164,8 +169,10 @@ with app:
|
|
164 |
|
165 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
with gr.Accordion("Redact document", open = True):
|
170 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
171 |
if RUN_AWS_FUNCTIONS == "1":
|
@@ -194,7 +201,9 @@ with app:
|
|
194 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
195 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
196 |
|
197 |
-
|
|
|
|
|
198 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
199 |
|
200 |
with gr.Accordion(label = "Review redaction file", open=True):
|
@@ -215,7 +224,6 @@ with app:
|
|
215 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
216 |
|
217 |
with gr.Row():
|
218 |
-
|
219 |
with gr.Column(scale=1):
|
220 |
|
221 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -249,8 +257,9 @@ with app:
|
|
249 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
250 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
251 |
|
252 |
-
|
253 |
# TEXT / TABULAR DATA TAB
|
|
|
254 |
with gr.Tab(label="Open text or Excel/csv files"):
|
255 |
gr.Markdown(
|
256 |
"""
|
@@ -280,7 +289,20 @@ with app:
|
|
280 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
281 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# SETTINGS TAB
|
|
|
284 |
with gr.Tab(label="Redaction settings"):
|
285 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
286 |
with gr.Row():
|
@@ -319,7 +341,7 @@ with app:
|
|
319 |
###
|
320 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
321 |
|
322 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
|
323 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
324 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
325 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
@@ -410,10 +432,15 @@ with app:
|
|
410 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
411 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
412 |
|
|
|
|
|
|
|
|
|
|
|
413 |
###
|
414 |
# SETTINGS PAGE INPUT / OUTPUT
|
415 |
###
|
416 |
-
# If a custom allow list is uploaded
|
417 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
418 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
419 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
|
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
22 |
+
from tools.find_duplicate_pages import identify_similar_pages
|
23 |
|
24 |
today_rev = datetime.now().strftime("%Y%m%d")
|
25 |
|
|
|
69 |
all_image_annotations_state = gr.State([])
|
70 |
|
71 |
|
72 |
+
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
73 |
+
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
+
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
75 |
|
76 |
session_hash_state = gr.State()
|
77 |
s3_output_folder_state = gr.State()
|
|
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
132 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
133 |
+
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
|
134 |
|
135 |
default_deny_list_file_name = "default_deny_list.csv"
|
136 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
137 |
+
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
138 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
139 |
|
140 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
141 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
142 |
+
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
|
143 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
144 |
|
145 |
# S3 settings for default allow list load
|
|
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
+
# Duplicate page detection
|
154 |
+
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
+
|
157 |
###
|
158 |
# UI DESIGN
|
159 |
###
|
|
|
169 |
|
170 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
171 |
|
172 |
+
###
|
173 |
+
# REDACTION PDF/IMAGES TABL
|
174 |
+
###
|
175 |
+
with gr.Tab("Redact PDFs/images"):
|
176 |
with gr.Accordion("Redact document", open = True):
|
177 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
178 |
if RUN_AWS_FUNCTIONS == "1":
|
|
|
201 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
202 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
203 |
|
204 |
+
###
|
205 |
+
# REVIEW REDACTIONS TAB
|
206 |
+
###
|
207 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
208 |
|
209 |
with gr.Accordion(label = "Review redaction file", open=True):
|
|
|
224 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
225 |
|
226 |
with gr.Row():
|
|
|
227 |
with gr.Column(scale=1):
|
228 |
|
229 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
257 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
258 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
259 |
|
260 |
+
###
|
261 |
# TEXT / TABULAR DATA TAB
|
262 |
+
###
|
263 |
with gr.Tab(label="Open text or Excel/csv files"):
|
264 |
gr.Markdown(
|
265 |
"""
|
|
|
289 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
290 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
291 |
|
292 |
+
###
|
293 |
+
# IDENTIFY DUPLICATE PAGES TAB
|
294 |
+
###
|
295 |
+
with gr.Tab(label="Identify duplicate pages"):
|
296 |
+
with gr.Accordion("Identify duplicate pages to redact", open = True):
|
297 |
+
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
298 |
+
|
299 |
+
find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
|
300 |
+
|
301 |
+
duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
302 |
+
|
303 |
+
###
|
304 |
# SETTINGS TAB
|
305 |
+
###
|
306 |
with gr.Tab(label="Redaction settings"):
|
307 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
308 |
with gr.Row():
|
|
|
341 |
###
|
342 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
343 |
|
344 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
345 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
346 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
347 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
|
|
432 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
433 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
434 |
|
435 |
+
###
|
436 |
+
# IDENTIFY DUPLICATE PAGES
|
437 |
+
###
|
438 |
+
find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
|
439 |
+
|
440 |
###
|
441 |
# SETTINGS PAGE INPUT / OUTPUT
|
442 |
###
|
443 |
+
# If a custom allow/deny/duplicate page list is uploaded
|
444 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
445 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
446 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
requirements.txt
CHANGED
@@ -7,6 +7,7 @@ presidio_anonymizer==2.2.355
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
|
|
10 |
spacy==3.8.3
|
11 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
+
nltk==3.9.1
|
11 |
spacy==3.8.3
|
12 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
tools/file_redaction.py
CHANGED
@@ -136,7 +136,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
139 |
-
print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
140 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
141 |
|
142 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
@@ -779,6 +779,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
779 |
|
780 |
return page, out_annotation_boxes
|
781 |
|
|
|
|
|
|
|
|
|
|
|
782 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
783 |
|
784 |
all_bboxes = []
|
@@ -1051,7 +1056,7 @@ def redact_image_pdf(file_path:str,
|
|
1051 |
|
1052 |
#print("Image is in range of pages to redact")
|
1053 |
if isinstance(image, str):
|
1054 |
-
print("image is a file path", image)
|
1055 |
image = Image.open(image)
|
1056 |
|
1057 |
# Need image size to convert textract OCR outputs to the correct sizes
|
@@ -1119,7 +1124,7 @@ def redact_image_pdf(file_path:str,
|
|
1119 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1120 |
|
1121 |
# Step 2: Analyze text and identify PII
|
1122 |
-
if chosen_redact_entities:
|
1123 |
|
1124 |
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
1125 |
line_level_ocr_results,
|
@@ -1309,7 +1314,7 @@ def redact_image_pdf(file_path:str,
|
|
1309 |
|
1310 |
|
1311 |
###
|
1312 |
-
# PIKEPDF TEXT
|
1313 |
###
|
1314 |
|
1315 |
def get_text_container_characters(text_container:LTTextContainer):
|
@@ -1485,182 +1490,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
1485 |
pikepdf_annotations_on_page.append(annotation)
|
1486 |
return pikepdf_annotations_on_page
|
1487 |
|
1488 |
-
# def run_page_text_redaction(language: str, # Language of the PDF content
|
1489 |
-
# chosen_redact_entities: List[str], # List of entities to be redacted
|
1490 |
-
# chosen_redact_comprehend_entities: List[str],
|
1491 |
-
# line_level_text_results_list: List[str],
|
1492 |
-
# line_characters: List,
|
1493 |
-
# page_analyser_results: List = [],
|
1494 |
-
# page_analysed_bounding_boxes: List = [],
|
1495 |
-
# comprehend_client = None, # Connection to AWS Comprehend
|
1496 |
-
# allow_list: List[str] = None, # Optional list of allowed entities
|
1497 |
-
# pii_identification_method: str = "Local"
|
1498 |
-
# ):
|
1499 |
-
|
1500 |
-
# # Initialize batching variables
|
1501 |
-
# current_batch = ""
|
1502 |
-
# current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
|
1503 |
-
# all_text_line_results = [] # Store results for all lines
|
1504 |
-
# text_container_analyser_results = []
|
1505 |
-
# text_container_analysed_bounding_boxes = []
|
1506 |
-
|
1507 |
-
# # First pass: collect all lines into batches
|
1508 |
-
# for i, text_line in enumerate(line_level_text_results_list):
|
1509 |
-
# if chosen_redact_entities:
|
1510 |
-
# if pii_identification_method == "Local":
|
1511 |
-
|
1512 |
-
# #print("chosen_redact_entities:", chosen_redact_entities)
|
1513 |
-
|
1514 |
-
# # Process immediately for local analysis
|
1515 |
-
# text_line_analyser_result = nlp_analyser.analyze(
|
1516 |
-
# text=text_line.text,
|
1517 |
-
# language=language,
|
1518 |
-
# entities=chosen_redact_entities,
|
1519 |
-
# score_threshold=score_threshold,
|
1520 |
-
# return_decision_process=True,
|
1521 |
-
# allow_list=allow_list
|
1522 |
-
# )
|
1523 |
-
# all_text_line_results.append((i, text_line_analyser_result))
|
1524 |
-
|
1525 |
-
|
1526 |
-
# elif pii_identification_method == "AWS Comprehend":
|
1527 |
-
|
1528 |
-
# # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
|
1529 |
-
# custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
|
1530 |
-
|
1531 |
-
|
1532 |
-
# text_line_analyser_result = nlp_analyser.analyze(
|
1533 |
-
# text=text_line.text,
|
1534 |
-
# language=language,
|
1535 |
-
# entities=custom_redact_entities,
|
1536 |
-
# score_threshold=score_threshold,
|
1537 |
-
# return_decision_process=True,
|
1538 |
-
# allow_list=allow_list
|
1539 |
-
# )
|
1540 |
-
# all_text_line_results.append((i, text_line_analyser_result))
|
1541 |
-
|
1542 |
-
|
1543 |
-
# if len(text_line.text) >= 3:
|
1544 |
-
# # Add separator between lines
|
1545 |
-
# if current_batch:
|
1546 |
-
# current_batch += " | "
|
1547 |
-
|
1548 |
-
# start_pos = len(current_batch)
|
1549 |
-
# current_batch += text_line.text
|
1550 |
-
# current_batch_mapping.append((start_pos, i, text_line))
|
1551 |
-
|
1552 |
-
# # Process batch if approaching 300 characters or last line
|
1553 |
-
# if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
|
1554 |
-
# print("length of text for Comprehend:", len(current_batch))
|
1555 |
-
|
1556 |
-
# try:
|
1557 |
-
# response = comprehend_client.detect_pii_entities(
|
1558 |
-
# Text=current_batch,
|
1559 |
-
# LanguageCode=language
|
1560 |
-
# )
|
1561 |
-
# except Exception as e:
|
1562 |
-
# print(e)
|
1563 |
-
# time.sleep(3)
|
1564 |
-
# response = comprehend_client.detect_pii_entities(
|
1565 |
-
# Text=current_batch,
|
1566 |
-
# LanguageCode=language
|
1567 |
-
# )
|
1568 |
-
|
1569 |
-
# comprehend_query_number += 1
|
1570 |
-
|
1571 |
-
# # Process response and map back to original lines
|
1572 |
-
# if response and "Entities" in response:
|
1573 |
-
# for entity in response["Entities"]:
|
1574 |
-
# entity_start = entity["BeginOffset"]
|
1575 |
-
# entity_end = entity["EndOffset"]
|
1576 |
-
|
1577 |
-
# # Find which line this entity belongs to
|
1578 |
-
# for batch_start, line_idx, original_line in current_batch_mapping:
|
1579 |
-
# batch_end = batch_start + len(original_line.text)
|
1580 |
-
|
1581 |
-
# # Check if entity belongs to this line
|
1582 |
-
# if batch_start <= entity_start < batch_end:
|
1583 |
-
# # Adjust offsets relative to original line
|
1584 |
-
# relative_start = entity_start - batch_start
|
1585 |
-
# relative_end = min(entity_end - batch_start, len(original_line.text))
|
1586 |
-
|
1587 |
-
# result_text = original_line.text[relative_start:relative_end]
|
1588 |
-
|
1589 |
-
# if result_text not in allow_list:
|
1590 |
-
# if entity.get("Type") in chosen_redact_comprehend_entities:
|
1591 |
-
# # Create adjusted entity
|
1592 |
-
# adjusted_entity = entity.copy()
|
1593 |
-
# adjusted_entity["BeginOffset"] = relative_start
|
1594 |
-
# adjusted_entity["EndOffset"] = relative_end
|
1595 |
-
|
1596 |
-
# recogniser_entity = recognizer_result_from_dict(adjusted_entity)
|
1597 |
-
|
1598 |
-
# # Add to results for this line
|
1599 |
-
# existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
|
1600 |
-
# if not existing_results:
|
1601 |
-
# all_text_line_results.append((line_idx, [recogniser_entity]))
|
1602 |
-
# else:
|
1603 |
-
# existing_results.append(recogniser_entity)
|
1604 |
-
|
1605 |
-
# # Reset batch
|
1606 |
-
# current_batch = ""
|
1607 |
-
# current_batch_mapping = []
|
1608 |
-
|
1609 |
-
# # Second pass: process results for each line
|
1610 |
-
# for i, text_line in enumerate(line_level_text_results_list):
|
1611 |
-
# text_line_analyser_result = []
|
1612 |
-
# text_line_bounding_boxes = []
|
1613 |
-
|
1614 |
-
# # Get results for this line
|
1615 |
-
# line_results = next((results for idx, results in all_text_line_results if idx == i), [])
|
1616 |
-
|
1617 |
-
# if line_results:
|
1618 |
-
# text_line_analyser_result = line_results
|
1619 |
-
|
1620 |
-
# #print("Analysed text container, now merging bounding boxes")
|
1621 |
-
|
1622 |
-
# # Merge bounding boxes if very close together
|
1623 |
-
# text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
|
1624 |
-
|
1625 |
-
# #print("merged bounding boxes")
|
1626 |
-
|
1627 |
-
# text_container_analyser_results.extend(text_line_analyser_result)
|
1628 |
-
# #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
|
1629 |
-
|
1630 |
-
# #print("text_container_analyser_results:", text_container_analyser_results)
|
1631 |
-
|
1632 |
-
# page_analyser_results.extend(text_container_analyser_results) # Add this line
|
1633 |
-
# page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
|
1634 |
-
|
1635 |
-
# return page_analysed_bounding_boxes
|
1636 |
-
|
1637 |
-
# def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
|
1638 |
-
# for entity in page_analyser_result:
|
1639 |
-
# entity_start = entity.start
|
1640 |
-
# entity_end = entity.end
|
1641 |
-
|
1642 |
-
# for batch_start, line_idx, original_line, chars in page_text_mapping:
|
1643 |
-
# batch_end = batch_start + len(original_line.text)
|
1644 |
-
|
1645 |
-
# if batch_start <= entity_start < batch_end:
|
1646 |
-
# relative_start = entity_start - batch_start
|
1647 |
-
# relative_end = min(entity_end - batch_start, len(original_line.text))
|
1648 |
-
|
1649 |
-
# adjusted_entity = copy.deepcopy(entity)
|
1650 |
-
# adjusted_entity.start = relative_start
|
1651 |
-
# adjusted_entity.end = relative_end
|
1652 |
-
|
1653 |
-
# existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
|
1654 |
-
|
1655 |
-
# if existing_entry is None:
|
1656 |
-
# all_text_line_results.append((line_idx, [adjusted_entity]))
|
1657 |
-
# else:
|
1658 |
-
# existing_entry.append(adjusted_entity)
|
1659 |
-
# break
|
1660 |
-
|
1661 |
-
# return all_text_line_results
|
1662 |
-
|
1663 |
-
|
1664 |
def redact_text_pdf(
|
1665 |
filename: str, # Path to the PDF file to be redacted
|
1666 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
@@ -1761,15 +1590,14 @@ def redact_text_pdf(
|
|
1761 |
for page_no in progress_bar:
|
1762 |
|
1763 |
reported_page_number = str(page_no + 1)
|
1764 |
-
print("Redacting page:", reported_page_number)
|
1765 |
|
1766 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1767 |
try:
|
1768 |
image = prepared_pdf_image_path[page_no]#.copy()
|
1769 |
#print("image:", image)
|
1770 |
except Exception as e:
|
1771 |
-
print("Could not redact page:", reported_page_number, "due to:")
|
1772 |
-
print(e)
|
1773 |
continue
|
1774 |
|
1775 |
image_annotations = {"image": image, "boxes": []}
|
@@ -1825,27 +1653,33 @@ def redact_text_pdf(
|
|
1825 |
|
1826 |
### REDACTION
|
1827 |
|
1828 |
-
|
1829 |
-
|
1830 |
-
|
1831 |
-
|
1832 |
-
|
1833 |
-
|
1834 |
-
|
1835 |
-
|
1836 |
-
|
1837 |
-
|
1838 |
-
|
1839 |
-
|
1840 |
-
|
1841 |
-
|
1842 |
-
|
1843 |
-
|
1844 |
-
|
1845 |
-
|
1846 |
-
|
1847 |
-
|
1848 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1849 |
|
1850 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1851 |
|
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
139 |
+
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
140 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
141 |
|
142 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
|
|
779 |
|
780 |
return page, out_annotation_boxes
|
781 |
|
782 |
+
###
|
783 |
+
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
784 |
+
###
|
785 |
+
|
786 |
+
|
787 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
788 |
|
789 |
all_bboxes = []
|
|
|
1056 |
|
1057 |
#print("Image is in range of pages to redact")
|
1058 |
if isinstance(image, str):
|
1059 |
+
#print("image is a file path", image)
|
1060 |
image = Image.open(image)
|
1061 |
|
1062 |
# Need image size to convert textract OCR outputs to the correct sizes
|
|
|
1124 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1125 |
|
1126 |
# Step 2: Analyze text and identify PII
|
1127 |
+
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1128 |
|
1129 |
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
1130 |
line_level_ocr_results,
|
|
|
1314 |
|
1315 |
|
1316 |
###
|
1317 |
+
# PIKEPDF TEXT DETECTION/REDACTION
|
1318 |
###
|
1319 |
|
1320 |
def get_text_container_characters(text_container:LTTextContainer):
|
|
|
1490 |
pikepdf_annotations_on_page.append(annotation)
|
1491 |
return pikepdf_annotations_on_page
|
1492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1493 |
def redact_text_pdf(
|
1494 |
filename: str, # Path to the PDF file to be redacted
|
1495 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
|
|
1590 |
for page_no in progress_bar:
|
1591 |
|
1592 |
reported_page_number = str(page_no + 1)
|
1593 |
+
#print("Redacting page:", reported_page_number)
|
1594 |
|
1595 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1596 |
try:
|
1597 |
image = prepared_pdf_image_path[page_no]#.copy()
|
1598 |
#print("image:", image)
|
1599 |
except Exception as e:
|
1600 |
+
print("Could not redact page:", reported_page_number, "due to:", e)
|
|
|
1601 |
continue
|
1602 |
|
1603 |
image_annotations = {"image": image, "boxes": []}
|
|
|
1653 |
|
1654 |
### REDACTION
|
1655 |
|
1656 |
+
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1657 |
+
#print("Identifying redactions on page.")
|
1658 |
+
|
1659 |
+
page_analysed_bounding_boxes = run_page_text_redaction(
|
1660 |
+
language,
|
1661 |
+
chosen_redact_entities,
|
1662 |
+
chosen_redact_comprehend_entities,
|
1663 |
+
all_line_level_text_results_list, #line_level_text_results_list,
|
1664 |
+
all_line_characters,
|
1665 |
+
page_analyser_results,
|
1666 |
+
page_analysed_bounding_boxes,
|
1667 |
+
comprehend_client,
|
1668 |
+
allow_list,
|
1669 |
+
pii_identification_method,
|
1670 |
+
nlp_analyser,
|
1671 |
+
score_threshold,
|
1672 |
+
custom_entities,
|
1673 |
+
comprehend_query_number
|
1674 |
+
)
|
1675 |
+
|
1676 |
+
|
1677 |
+
#print("page_analyser_results:", page_analyser_results)
|
1678 |
+
#print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
|
1679 |
+
#print("image:", image)
|
1680 |
+
else:
|
1681 |
+
page_analysed_bounding_boxes = []
|
1682 |
+
|
1683 |
|
1684 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1685 |
|
tools/find_duplicate_pages.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import argparse
|
3 |
+
import glob
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
from tools.helper_functions import output_folder
|
7 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from nltk.tokenize import word_tokenize
|
12 |
+
from nltk.stem import PorterStemmer
|
13 |
+
import numpy as np
|
14 |
+
import random
|
15 |
+
import string
|
16 |
+
from typing import List
|
17 |
+
|
18 |
+
nltk.download('punkt')
|
19 |
+
nltk.download('stopwords')
|
20 |
+
nltk.download('punkt_tab')
|
21 |
+
|
22 |
+
similarity_threshold = 0.9
|
23 |
+
|
24 |
+
stop_words = set(stopwords.words('english'))
|
25 |
+
# List of words to remove from the stopword set
|
26 |
+
#words_to_remove = ['no', 'nor', 'not', 'don', 'don't', 'wasn', 'wasn't', 'weren', 'weren't', "don't", "wasn't", "weren't"]
|
27 |
+
|
28 |
+
# Remove the specified words from the stopwords set
|
29 |
+
#for word in words_to_remove:
|
30 |
+
# stop_words.discard(word.lower())
|
31 |
+
|
32 |
+
stemmer = PorterStemmer()
|
33 |
+
vectorizer = TfidfVectorizer()
|
34 |
+
|
35 |
+
def combine_ocr_output_text(input_files):
    """
    Combine text from multiple OCR-output CSV files, grouped by file and page.

    Each input CSV must contain 'page' and 'text' columns; text belonging to
    the same page is concatenated with spaces. The combined result is written
    to a CSV in the output folder.

    Args:
        input_files (str | list): A single CSV path or a list of CSV paths
            (plain strings or objects with a .name attribute, e.g. Gradio files).

    Returns:
        tuple: (combined dataframe with columns [file, page, text],
                list containing the output CSV path)

    Raises:
        ValueError: If none of the inputs had the required columns.
    """
    all_data = []
    output_files = []

    # Accept either a single path or a list of paths/file objects.
    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        file_path = file if isinstance(file, str) else file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Skip files that do not have the expected schema.
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Empty cells are read as NaN (float); coerce to string so the
        # ' '.join aggregation below cannot raise a TypeError.
        df['text'] = df['text'].fillna('').astype(str)

        # Group by page and concatenate all text fragments on the same page.
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Record which source file each page came from.
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes and put columns in a readable order.
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
|
92 |
+
|
93 |
+
def process_data(df, column: str):
    '''
    Clean and stem a text column of a dataframe.

    Adds a 'text_clean' column containing the lowercased, HTML-stripped,
    tokenized, stopword-filtered and Porter-stemmed version of df[column].

    Args:
        df: Dataframe containing the text column.
        column: Name of the column to clean.

    Returns:
        The same dataframe with a 'text_clean' column added (mutated in place).
    '''

    def _clean_text(raw_text):
        # Remove HTML tags, then normalise line breaks and any stray angle
        # brackets left over from malformed markup. (The previous version
        # also substituted '<strong>'/'</strong>' afterwards, but those
        # patterns could never match once every '<' had been replaced, so
        # the dead substitutions have been removed — behavior is unchanged.)
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'<', ' ', clean)
        clean = re.sub(r'>', ' ', clean)

        # Replace non-breaking space \xa0 with a regular space
        clean = clean.replace(u'\xa0', u' ')
        # Collapse runs of whitespace
        clean = ' '.join(clean.split())

        # Tokenize the text
        words = word_tokenize(clean.lower())

        # Keep alphabetic tokens only (drops punctuation and numbers)
        words = [word for word in words if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(words)

    def _apply_stemming(text):
        # Porter-stem each token and rejoin into a single string.
        words = word_tokenize(text.lower())
        stemmed_words = [stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_stemming)

    return df
|
143 |
+
|
144 |
+
def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9):
    """
    Find near-duplicate pages across one or more OCR-output CSV files.

    Combines the text of all input files page by page, TF-IDF-vectorises the
    cleaned text, and flags page pairs whose cosine similarity exceeds the
    threshold. Writes the similarity table to CSV plus, for each file that
    contains duplicated pages, a headerless "whole page" redaction list.

    Args:
        input_files: CSV path(s) with 'page' and 'text' columns.
        similarity_threshold: Minimum cosine similarity (0-1) for two pages
            to be reported as duplicates. Defaults to 0.9 (previously a
            hard-coded module constant; parameterised for reuse).

    Returns:
        tuple: (similarity dataframe, list of all output file paths written)
    """
    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)
    output_paths.extend(output_files)

    # Clean and stem text so near-identical pages vectorise similarly
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate pairwise cosine similarity between all pages
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Ignore self-comparisons, then find pairs above the threshold
    np.fill_diagonal(similarity_matrix, 0)
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)

    # Create a DataFrame for similar pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Page1_File': similar_pages[:, 0],
        'Page2_File': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # The matrix is symmetric; keep only one direction of each pair
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the row indices back to their corresponding file, page and text
    similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    # For every file that appears as the "second" page of a duplicate pair,
    # write a headerless list of page numbers to redact wholesale.
    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, :][['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
|
209 |
+
|
210 |
+
# Perturb text
|
211 |
+
# Apply the perturbation function with a 10% error probability
|
212 |
+
def perturb_text_with_errors(series):
|
213 |
+
|
214 |
+
def _perturb_text(text, error_probability=0.1):
|
215 |
+
words = text.split() # Split text into words
|
216 |
+
perturbed_words = []
|
217 |
+
|
218 |
+
for word in words:
|
219 |
+
if random.random() < error_probability: # Add a random error
|
220 |
+
perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
|
221 |
+
|
222 |
+
if perturbation_type == 'char_error': # Introduce a character error
|
223 |
+
idx = random.randint(0, len(word) - 1)
|
224 |
+
char = random.choice(string.ascii_lowercase) # Add a random letter
|
225 |
+
word = word[:idx] + char + word[idx:]
|
226 |
+
|
227 |
+
elif perturbation_type == 'extra_space': # Add extra space around a word
|
228 |
+
word = ' ' + word + ' '
|
229 |
+
|
230 |
+
elif perturbation_type == 'extra_punctuation': # Add punctuation to the word
|
231 |
+
punctuation = random.choice(string.punctuation)
|
232 |
+
idx = random.randint(0, len(word)) # Insert punctuation randomly
|
233 |
+
word = word[:idx] + punctuation + word[idx:]
|
234 |
+
|
235 |
+
perturbed_words.append(word)
|
236 |
+
|
237 |
+
return ' '.join(perturbed_words)
|
238 |
+
|
239 |
+
series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
|
240 |
+
|
241 |
+
return series
|
242 |
+
|
243 |
+
# Run through command line
|
244 |
+
# def main():
|
245 |
+
# parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
|
246 |
+
# parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
|
247 |
+
# parser.add_argument('--output', '-o', default='combined_text.csv',
|
248 |
+
# help='Output CSV file path (default: combined_text.csv)')
|
249 |
+
|
250 |
+
# args = parser.parse_args()
|
251 |
+
|
252 |
+
# # Get list of input files
|
253 |
+
# input_files = glob.glob(args.input_pattern)
|
254 |
+
|
255 |
+
# if not input_files:
|
256 |
+
# print(f"No files found matching pattern: {args.input_pattern}")
|
257 |
+
# return
|
258 |
+
|
259 |
+
# print(f"Processing {len(input_files)} files...")
|
260 |
+
|
261 |
+
# try:
|
262 |
+
# # Combine the text from all files
|
263 |
+
# combined_df = combine_ocr_output_text(input_files)
|
264 |
+
|
265 |
+
# # Save to CSV
|
266 |
+
# combined_df.to_csv(args.output, index=False)
|
267 |
+
# print(f"Successfully created combined output: {args.output}")
|
268 |
+
# print(f"Total pages processed: {len(combined_df)}")
|
269 |
+
|
270 |
+
# except Exception as e:
|
271 |
+
# print(f"Error processing files: {str(e)}")
|
272 |
+
|
273 |
+
# if __name__ == "__main__":
|
274 |
+
# main()
|
tools/helper_functions.py
CHANGED
@@ -20,7 +20,7 @@ def reset_state_vars():
|
|
20 |
show_share_button=False,
|
21 |
show_remove_button=False,
|
22 |
interactive=False
|
23 |
-
), [], []
|
24 |
|
25 |
def get_or_create_env_var(var_name, default_value):
|
26 |
# Get the environment variable if it exists
|
|
|
20 |
show_share_button=False,
|
21 |
show_remove_button=False,
|
22 |
interactive=False
|
23 |
+
), [], [], [], pd.DataFrame(), pd.DataFrame()
|
24 |
|
25 |
def get_or_create_env_var(var_name, default_value):
|
26 |
# Get the environment variable if it exists
|
tools/redaction_review.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
|
|
|
|
|
|
4 |
from typing import List
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
7 |
-
|
8 |
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
9 |
from tools.helper_functions import get_file_path_end, output_folder
|
10 |
from tools.file_redaction import redact_page_with_pymupdf
|
@@ -381,3 +383,61 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
381 |
row_value_page = evt.row_value[0] # This is the page number value
|
382 |
return row_value_page
|
383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
+
from xml.etree.ElementTree import Element, SubElement, tostring
|
5 |
+
from xml.dom import minidom
|
6 |
+
import uuid
|
7 |
from typing import List
|
8 |
from gradio_image_annotation import image_annotator
|
9 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
|
|
10 |
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
11 |
from tools.helper_functions import get_file_path_end, output_folder
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
|
|
383 |
row_value_page = evt.row_value[0] # This is the page number value
|
384 |
return row_value_page
|
385 |
|
386 |
+
|
387 |
+
|
388 |
+
|
389 |
+
def create_xfdf(df, pdf_path):
    """Build an XFDF (XML) annotation document from a redaction dataframe.

    Expects rows with 'page' (1-based), 'xmin'/'ymin'/'xmax'/'ymax',
    'color' (a string such as "(255, 0, 0)"), 'label' and 'text' columns.

    Args:
        df: Dataframe of redaction annotations, one row per box.
        pdf_path: Path of the PDF the annotations refer to (stored in the
            XFDF header).

    Returns:
        The pretty-printed XFDF document as a string.
    """
    # Root element carrying the XFDF namespace
    xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")

    # Header records which PDF these annotations belong to
    header = SubElement(xfdf, 'header')
    header.set('pdf-filepath', pdf_path)

    # Container for all annotation elements
    annots = SubElement(xfdf, 'annots')

    # One <text> annotation per dataframe row
    for _, row in df.iterrows():
        annotation = SubElement(annots, 'text')

        # Every annotation needs a unique name
        annotation.set('name', str(uuid.uuid4()))

        # XFDF pages are 0-based; the dataframe stores 1-based pages
        annotation.set('page', str(int(row['page']) - 1))

        # Bounding box coordinates
        # NOTE(review): may need adjusting to the PDF's coordinate system
        annotation.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")

        # Colour arrives as a "(r, g, b)" string; reduce to "r,g,b"
        annotation.set('color', row['color'].strip('()').replace(' ', ''))

        # Human-readable contents: label plus the redacted text
        annotation.set('contents', f"{row['label']}: {row['text']}")

        # Fixed display properties
        annotation.set('flags', "print")
        annotation.set('date', "D:20240123000000")
        annotation.set('title', "Annotation")

    # Serialise and pretty-print
    return minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
|
432 |
+
|
433 |
+
# Convert a redaction dataframe into an Adobe XFDF annotation file on disk.
def convert_df_to_xfdf(df, pdf_path, output_path):
    """Write the annotations in *df* to *output_path* as an XFDF file.

    Args:
        df: Dataframe of redaction boxes (see create_xfdf for the expected
            columns).
        pdf_path: Path of the PDF the annotations refer to; recorded in the
            XFDF header so a viewer can associate the two.
        output_path: Destination path for the generated .xfdf file.
    """
    xfdf_content = create_xfdf(df, pdf_path)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(xfdf_content)

# Usage example:
# df = your_dataframe
# convert_df_to_xfdf(df, 'path/to/your.pdf', 'output.xfdf')
|