seanpedrickcase committed on
Commit
a265560
·
1 Parent(s): cb349ad

Added tab to be able to compare pages across multiple documents and redact duplicates

Browse files
Dockerfile CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
60
  # Copy installed packages from builder stage
61
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
62
 
 
 
 
63
  # Entrypoint helps to switch between Gradio and Lambda mode
64
  COPY entrypoint.sh /entrypoint.sh
65
 
 
60
  # Copy installed packages from builder stage
61
  COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
62
 
63
+ # Download NLTK data packages
64
+ RUN python -m nltk.downloader punkt stopwords punkt_tab
65
+
66
  # Entrypoint helps to switch between Gradio and Lambda mode
67
  COPY entrypoint.sh /entrypoint.sh
68
 
app.py CHANGED
@@ -19,6 +19,7 @@ from tools.data_anonymise import anonymise_data_files
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
21
  from tools.custom_csvlogger import CSVLogger_custom
 
22
 
23
  today_rev = datetime.now().strftime("%Y%m%d")
24
 
@@ -68,9 +69,9 @@ with app:
68
  all_image_annotations_state = gr.State([])
69
 
70
 
71
- all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
72
- all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
73
- review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
74
 
75
  session_hash_state = gr.State()
76
  s3_output_folder_state = gr.State()
@@ -129,16 +130,16 @@ with app:
129
  ## Settings page variables
130
  default_allow_list_file_name = "default_allow_list.csv"
131
  default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
132
- in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
133
 
134
  default_deny_list_file_name = "default_deny_list.csv"
135
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
136
- in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
137
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
138
 
139
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
140
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
141
- in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
142
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
143
 
144
  # S3 settings for default allow list load
@@ -149,6 +150,10 @@ with app:
149
  # Base dataframe for recognisers that is not modified subsequent to load
150
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
151
 
 
 
 
 
152
  ###
153
  # UI DESIGN
154
  ###
@@ -164,8 +169,10 @@ with app:
164
 
165
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
166
 
167
- # PDF / IMAGES TAB
168
- with gr.Tab("PDFs/images"):
 
 
169
  with gr.Accordion("Redact document", open = True):
170
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
171
  if RUN_AWS_FUNCTIONS == "1":
@@ -194,7 +201,9 @@ with app:
194
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
195
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
196
 
197
- # Object annotation
 
 
198
  with gr.Tab("Review redactions", id="tab_object_annotation"):
199
 
200
  with gr.Accordion(label = "Review redaction file", open=True):
@@ -215,7 +224,6 @@ with app:
215
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
216
 
217
  with gr.Row():
218
-
219
  with gr.Column(scale=1):
220
 
221
  zoom_str = str(annotator_zoom_number) + '%'
@@ -249,8 +257,9 @@ with app:
249
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
250
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
251
 
252
-
253
  # TEXT / TABULAR DATA TAB
 
254
  with gr.Tab(label="Open text or Excel/csv files"):
255
  gr.Markdown(
256
  """
@@ -280,7 +289,20 @@ with app:
280
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
281
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
282
 
 
 
 
 
 
 
 
 
 
 
 
 
283
  # SETTINGS TAB
 
284
  with gr.Tab(label="Redaction settings"):
285
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
286
  with gr.Row():
@@ -319,7 +341,7 @@ with app:
319
  ###
320
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
321
 
322
- document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
323
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
324
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
325
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
@@ -410,10 +432,15 @@ with app:
410
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
411
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
412
 
 
 
 
 
 
413
  ###
414
  # SETTINGS PAGE INPUT / OUTPUT
415
  ###
416
- # If a custom allow list is uploaded
417
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
418
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
419
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
 
19
  from tools.auth import authenticate_user
20
  from tools.load_spacy_model_custom_recognisers import custom_entities
21
  from tools.custom_csvlogger import CSVLogger_custom
22
+ from tools.find_duplicate_pages import identify_similar_pages
23
 
24
  today_rev = datetime.now().strftime("%Y%m%d")
25
 
 
69
  all_image_annotations_state = gr.State([])
70
 
71
 
72
+ all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
73
+ all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
74
+ review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
75
 
76
  session_hash_state = gr.State()
77
  s3_output_folder_state = gr.State()
 
130
  ## Settings page variables
131
  default_allow_list_file_name = "default_allow_list.csv"
132
  default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
133
+ in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
134
 
135
  default_deny_list_file_name = "default_deny_list.csv"
136
  default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
137
+ in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
138
  in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
139
 
140
  fully_redacted_list_file_name = "default_fully_redacted_list.csv"
141
  fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
142
+ in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
143
  in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
144
 
145
  # S3 settings for default allow list load
 
150
  # Base dataframe for recognisers that is not modified subsequent to load
151
  recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
152
 
153
+ # Duplicate page detection
154
+ in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
155
+ duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
156
+
157
  ###
158
  # UI DESIGN
159
  ###
 
169
 
170
  NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
171
 
172
+ ###
173
+ # REDACTION PDF/IMAGES TAB
174
+ ###
175
+ with gr.Tab("Redact PDFs/images"):
176
  with gr.Accordion("Redact document", open = True):
177
  in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
178
  if RUN_AWS_FUNCTIONS == "1":
 
201
  pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
202
  pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
203
 
204
+ ###
205
+ # REVIEW REDACTIONS TAB
206
+ ###
207
  with gr.Tab("Review redactions", id="tab_object_annotation"):
208
 
209
  with gr.Accordion(label = "Review redaction file", open=True):
 
224
  clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
225
 
226
  with gr.Row():
 
227
  with gr.Column(scale=1):
228
 
229
  zoom_str = str(annotator_zoom_number) + '%'
 
257
  recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
258
  recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
259
 
260
+ ###
261
  # TEXT / TABULAR DATA TAB
262
+ ###
263
  with gr.Tab(label="Open text or Excel/csv files"):
264
  gr.Markdown(
265
  """
 
289
  data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
290
  data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
291
 
292
+ ###
293
+ # IDENTIFY DUPLICATE PAGES TAB
294
+ ###
295
+ with gr.Tab(label="Identify duplicate pages"):
296
+ with gr.Accordion("Identify duplicate pages to redact", open = True):
297
+ in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
298
+
299
+ find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
300
+
301
+ duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
302
+
303
+ ###
304
  # SETTINGS TAB
305
+ ###
306
  with gr.Tab(label="Redaction settings"):
307
  with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
308
  with gr.Row():
 
341
  ###
342
  in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
343
 
344
+ document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
345
  then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
346
  then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
347
  outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
 
432
  text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
433
  then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
434
 
435
+ ###
436
+ # IDENTIFY DUPLICATE PAGES
437
+ ###
438
+ find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
439
+
440
  ###
441
  # SETTINGS PAGE INPUT / OUTPUT
442
  ###
443
+ # If a custom allow/deny/duplicate page list is uploaded
444
  in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
445
  in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
446
  in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
requirements.txt CHANGED
@@ -7,6 +7,7 @@ presidio_anonymizer==2.2.355
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
 
10
  spacy==3.8.3
11
  #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
12
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
 
7
  presidio-image-redactor==0.0.53
8
  pikepdf==8.15.1
9
  pandas==2.2.3
10
+ nltk==3.9.1
11
  spacy==3.8.3
12
  #en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
13
  en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
tools/file_redaction.py CHANGED
@@ -136,7 +136,7 @@ def choose_and_run_redactor(file_paths:List[str],
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
- print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
140
  review_out_file_paths = [prepared_pdf_file_paths[0]]
141
 
142
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
@@ -779,6 +779,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
779
 
780
  return page, out_annotation_boxes
781
 
 
 
 
 
 
782
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
783
 
784
  all_bboxes = []
@@ -1051,7 +1056,7 @@ def redact_image_pdf(file_path:str,
1051
 
1052
  #print("Image is in range of pages to redact")
1053
  if isinstance(image, str):
1054
- print("image is a file path", image)
1055
  image = Image.open(image)
1056
 
1057
  # Need image size to convert textract OCR outputs to the correct sizes
@@ -1119,7 +1124,7 @@ def redact_image_pdf(file_path:str,
1119
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1120
 
1121
  # Step 2: Analyze text and identify PII
1122
- if chosen_redact_entities:
1123
 
1124
  redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
1125
  line_level_ocr_results,
@@ -1309,7 +1314,7 @@ def redact_image_pdf(file_path:str,
1309
 
1310
 
1311
  ###
1312
- # PIKEPDF TEXT PDF REDACTION
1313
  ###
1314
 
1315
  def get_text_container_characters(text_container:LTTextContainer):
@@ -1485,182 +1490,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
1485
  pikepdf_annotations_on_page.append(annotation)
1486
  return pikepdf_annotations_on_page
1487
 
1488
- # def run_page_text_redaction(language: str, # Language of the PDF content
1489
- # chosen_redact_entities: List[str], # List of entities to be redacted
1490
- # chosen_redact_comprehend_entities: List[str],
1491
- # line_level_text_results_list: List[str],
1492
- # line_characters: List,
1493
- # page_analyser_results: List = [],
1494
- # page_analysed_bounding_boxes: List = [],
1495
- # comprehend_client = None, # Connection to AWS Comprehend
1496
- # allow_list: List[str] = None, # Optional list of allowed entities
1497
- # pii_identification_method: str = "Local"
1498
- # ):
1499
-
1500
- # # Initialize batching variables
1501
- # current_batch = ""
1502
- # current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
1503
- # all_text_line_results = [] # Store results for all lines
1504
- # text_container_analyser_results = []
1505
- # text_container_analysed_bounding_boxes = []
1506
-
1507
- # # First pass: collect all lines into batches
1508
- # for i, text_line in enumerate(line_level_text_results_list):
1509
- # if chosen_redact_entities:
1510
- # if pii_identification_method == "Local":
1511
-
1512
- # #print("chosen_redact_entities:", chosen_redact_entities)
1513
-
1514
- # # Process immediately for local analysis
1515
- # text_line_analyser_result = nlp_analyser.analyze(
1516
- # text=text_line.text,
1517
- # language=language,
1518
- # entities=chosen_redact_entities,
1519
- # score_threshold=score_threshold,
1520
- # return_decision_process=True,
1521
- # allow_list=allow_list
1522
- # )
1523
- # all_text_line_results.append((i, text_line_analyser_result))
1524
-
1525
-
1526
- # elif pii_identification_method == "AWS Comprehend":
1527
-
1528
- # # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
1529
- # custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
1530
-
1531
-
1532
- # text_line_analyser_result = nlp_analyser.analyze(
1533
- # text=text_line.text,
1534
- # language=language,
1535
- # entities=custom_redact_entities,
1536
- # score_threshold=score_threshold,
1537
- # return_decision_process=True,
1538
- # allow_list=allow_list
1539
- # )
1540
- # all_text_line_results.append((i, text_line_analyser_result))
1541
-
1542
-
1543
- # if len(text_line.text) >= 3:
1544
- # # Add separator between lines
1545
- # if current_batch:
1546
- # current_batch += " | "
1547
-
1548
- # start_pos = len(current_batch)
1549
- # current_batch += text_line.text
1550
- # current_batch_mapping.append((start_pos, i, text_line))
1551
-
1552
- # # Process batch if approaching 300 characters or last line
1553
- # if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
1554
- # print("length of text for Comprehend:", len(current_batch))
1555
-
1556
- # try:
1557
- # response = comprehend_client.detect_pii_entities(
1558
- # Text=current_batch,
1559
- # LanguageCode=language
1560
- # )
1561
- # except Exception as e:
1562
- # print(e)
1563
- # time.sleep(3)
1564
- # response = comprehend_client.detect_pii_entities(
1565
- # Text=current_batch,
1566
- # LanguageCode=language
1567
- # )
1568
-
1569
- # comprehend_query_number += 1
1570
-
1571
- # # Process response and map back to original lines
1572
- # if response and "Entities" in response:
1573
- # for entity in response["Entities"]:
1574
- # entity_start = entity["BeginOffset"]
1575
- # entity_end = entity["EndOffset"]
1576
-
1577
- # # Find which line this entity belongs to
1578
- # for batch_start, line_idx, original_line in current_batch_mapping:
1579
- # batch_end = batch_start + len(original_line.text)
1580
-
1581
- # # Check if entity belongs to this line
1582
- # if batch_start <= entity_start < batch_end:
1583
- # # Adjust offsets relative to original line
1584
- # relative_start = entity_start - batch_start
1585
- # relative_end = min(entity_end - batch_start, len(original_line.text))
1586
-
1587
- # result_text = original_line.text[relative_start:relative_end]
1588
-
1589
- # if result_text not in allow_list:
1590
- # if entity.get("Type") in chosen_redact_comprehend_entities:
1591
- # # Create adjusted entity
1592
- # adjusted_entity = entity.copy()
1593
- # adjusted_entity["BeginOffset"] = relative_start
1594
- # adjusted_entity["EndOffset"] = relative_end
1595
-
1596
- # recogniser_entity = recognizer_result_from_dict(adjusted_entity)
1597
-
1598
- # # Add to results for this line
1599
- # existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
1600
- # if not existing_results:
1601
- # all_text_line_results.append((line_idx, [recogniser_entity]))
1602
- # else:
1603
- # existing_results.append(recogniser_entity)
1604
-
1605
- # # Reset batch
1606
- # current_batch = ""
1607
- # current_batch_mapping = []
1608
-
1609
- # # Second pass: process results for each line
1610
- # for i, text_line in enumerate(line_level_text_results_list):
1611
- # text_line_analyser_result = []
1612
- # text_line_bounding_boxes = []
1613
-
1614
- # # Get results for this line
1615
- # line_results = next((results for idx, results in all_text_line_results if idx == i), [])
1616
-
1617
- # if line_results:
1618
- # text_line_analyser_result = line_results
1619
-
1620
- # #print("Analysed text container, now merging bounding boxes")
1621
-
1622
- # # Merge bounding boxes if very close together
1623
- # text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
1624
-
1625
- # #print("merged bounding boxes")
1626
-
1627
- # text_container_analyser_results.extend(text_line_analyser_result)
1628
- # #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
1629
-
1630
- # #print("text_container_analyser_results:", text_container_analyser_results)
1631
-
1632
- # page_analyser_results.extend(text_container_analyser_results) # Add this line
1633
- # page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
1634
-
1635
- # return page_analysed_bounding_boxes
1636
-
1637
- # def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
1638
- # for entity in page_analyser_result:
1639
- # entity_start = entity.start
1640
- # entity_end = entity.end
1641
-
1642
- # for batch_start, line_idx, original_line, chars in page_text_mapping:
1643
- # batch_end = batch_start + len(original_line.text)
1644
-
1645
- # if batch_start <= entity_start < batch_end:
1646
- # relative_start = entity_start - batch_start
1647
- # relative_end = min(entity_end - batch_start, len(original_line.text))
1648
-
1649
- # adjusted_entity = copy.deepcopy(entity)
1650
- # adjusted_entity.start = relative_start
1651
- # adjusted_entity.end = relative_end
1652
-
1653
- # existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
1654
-
1655
- # if existing_entry is None:
1656
- # all_text_line_results.append((line_idx, [adjusted_entity]))
1657
- # else:
1658
- # existing_entry.append(adjusted_entity)
1659
- # break
1660
-
1661
- # return all_text_line_results
1662
-
1663
-
1664
  def redact_text_pdf(
1665
  filename: str, # Path to the PDF file to be redacted
1666
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
@@ -1761,15 +1590,14 @@ def redact_text_pdf(
1761
  for page_no in progress_bar:
1762
 
1763
  reported_page_number = str(page_no + 1)
1764
- print("Redacting page:", reported_page_number)
1765
 
1766
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1767
  try:
1768
  image = prepared_pdf_image_path[page_no]#.copy()
1769
  #print("image:", image)
1770
  except Exception as e:
1771
- print("Could not redact page:", reported_page_number, "due to:")
1772
- print(e)
1773
  continue
1774
 
1775
  image_annotations = {"image": image, "boxes": []}
@@ -1825,27 +1653,33 @@ def redact_text_pdf(
1825
 
1826
  ### REDACTION
1827
 
1828
- page_analysed_bounding_boxes = run_page_text_redaction(
1829
- language,
1830
- chosen_redact_entities,
1831
- chosen_redact_comprehend_entities,
1832
- all_line_level_text_results_list, #line_level_text_results_list,
1833
- all_line_characters,
1834
- page_analyser_results,
1835
- page_analysed_bounding_boxes,
1836
- comprehend_client,
1837
- allow_list,
1838
- pii_identification_method,
1839
- nlp_analyser,
1840
- score_threshold,
1841
- custom_entities,
1842
- comprehend_query_number
1843
- )
1844
-
1845
-
1846
- #print("page_analyser_results:", page_analyser_results)
1847
- #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1848
- #print("image:", image)
 
 
 
 
 
 
1849
 
1850
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1851
 
 
136
  tic = time.perf_counter()
137
  all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
138
 
139
+ #print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
140
  review_out_file_paths = [prepared_pdf_file_paths[0]]
141
 
142
  if isinstance(custom_recogniser_word_list, pd.DataFrame):
 
779
 
780
  return page, out_annotation_boxes
781
 
782
+ ###
783
+ # IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
784
+ ###
785
+
786
+
787
  def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
788
 
789
  all_bboxes = []
 
1056
 
1057
  #print("Image is in range of pages to redact")
1058
  if isinstance(image, str):
1059
+ #print("image is a file path", image)
1060
  image = Image.open(image)
1061
 
1062
  # Need image size to convert textract OCR outputs to the correct sizes
 
1124
  line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
1125
 
1126
  # Step 2: Analyze text and identify PII
1127
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
1128
 
1129
  redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
1130
  line_level_ocr_results,
 
1314
 
1315
 
1316
  ###
1317
+ # PIKEPDF TEXT DETECTION/REDACTION
1318
  ###
1319
 
1320
  def get_text_container_characters(text_container:LTTextContainer):
 
1490
  pikepdf_annotations_on_page.append(annotation)
1491
  return pikepdf_annotations_on_page
1492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1493
  def redact_text_pdf(
1494
  filename: str, # Path to the PDF file to be redacted
1495
  prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
 
1590
  for page_no in progress_bar:
1591
 
1592
  reported_page_number = str(page_no + 1)
1593
+ #print("Redacting page:", reported_page_number)
1594
 
1595
  # Assuming prepared_pdf_file_paths[page_no] is a PIL image object
1596
  try:
1597
  image = prepared_pdf_image_path[page_no]#.copy()
1598
  #print("image:", image)
1599
  except Exception as e:
1600
+ print("Could not redact page:", reported_page_number, "due to:", e)
 
1601
  continue
1602
 
1603
  image_annotations = {"image": image, "boxes": []}
 
1653
 
1654
  ### REDACTION
1655
 
1656
+ if chosen_redact_entities or chosen_redact_comprehend_entities:
1657
+ #print("Identifying redactions on page.")
1658
+
1659
+ page_analysed_bounding_boxes = run_page_text_redaction(
1660
+ language,
1661
+ chosen_redact_entities,
1662
+ chosen_redact_comprehend_entities,
1663
+ all_line_level_text_results_list, #line_level_text_results_list,
1664
+ all_line_characters,
1665
+ page_analyser_results,
1666
+ page_analysed_bounding_boxes,
1667
+ comprehend_client,
1668
+ allow_list,
1669
+ pii_identification_method,
1670
+ nlp_analyser,
1671
+ score_threshold,
1672
+ custom_entities,
1673
+ comprehend_query_number
1674
+ )
1675
+
1676
+
1677
+ #print("page_analyser_results:", page_analyser_results)
1678
+ #print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
1679
+ #print("image:", image)
1680
+ else:
1681
+ page_analysed_bounding_boxes = []
1682
+
1683
 
1684
  page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
1685
 
tools/find_duplicate_pages.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import pandas as pd
import argparse
import glob
import os
import re
from tools.helper_functions import output_folder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import numpy as np
import random
import string
from typing import List

# Ensure tokeniser/stopword data is present. quiet=True stops these calls
# printing to the console on every module import; the Dockerfile pre-downloads
# the same packages, so inside the container these are near-instant no-ops.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Cosine-similarity cutoff above which two pages are reported as duplicates.
similarity_threshold = 0.9

stop_words = set(stopwords.words('english'))
# Words that could be removed from the stopword set (negations can flip
# meaning, so keeping them may matter for some corpora):
# words_to_remove = ['no', 'nor', 'not', "don't", "wasn't", "weren't"]
# for word in words_to_remove:
#     stop_words.discard(word.lower())

stemmer = PorterStemmer()
vectorizer = TfidfVectorizer()
def combine_ocr_output_text(input_files):
    """
    Combines text from multiple CSV files containing page and text columns.
    Groups text by file and page number, concatenating text within these groups.

    Args:
        input_files (str | list): Path, or list of paths / file objects, to
            OCR output CSV files.

    Returns:
        tuple:
            pd.DataFrame: Combined dataframe with columns [file, page, text].
            list: Paths of output files written (the combined CSV).

    Raises:
        ValueError: If none of the supplied CSVs contain the required columns.
    """
    all_data = []
    output_files = []

    # Accept either a single path or a list of paths/file objects
    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        # Gradio file objects expose their path via .name
        if isinstance(file, str):
            file_path = file
        else:
            file_path = file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Ensure required columns exist
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Empty OCR cells load as NaN (float) and would break ' '.join below,
        # so normalise the column to strings first
        df['text'] = df['text'].fillna('').astype(str)

        # Group by page and concatenate text
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Add filename column so rows remain traceable after concatenation
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes
    combined_df = pd.concat(all_data, ignore_index=True)

    # Reorder columns
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
def process_data(df, column: str):
    '''
    Clean and stem text columns in a data frame
    '''

    def _clean_text(raw_text):
        # Strip HTML tags and common HTML entities
        cleaned = re.sub(r'<.*?>', '', raw_text)
        cleaned = re.sub(r'&nbsp;', ' ', cleaned)
        cleaned = re.sub(r'\r\n', ' ', cleaned)
        cleaned = re.sub(r'&lt;', ' ', cleaned)
        cleaned = re.sub(r'&gt;', ' ', cleaned)
        cleaned = re.sub(r'<strong>', ' ', cleaned)
        cleaned = re.sub(r'</strong>', ' ', cleaned)

        # Replace non-breaking space \xa0 with a space, collapse whitespace
        cleaned = cleaned.replace(u'\xa0', u' ')
        cleaned = ' '.join(cleaned.split())

        # Tokenize, then keep only alphabetic, non-stopword tokens
        tokens = word_tokenize(cleaned.lower())
        kept_tokens = [tok for tok in tokens if tok.isalpha() and tok not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(kept_tokens)

    def _apply_stemming(text):
        # Tokenize and stem each word, then reassemble the string
        tokens = word_tokenize(text.lower())
        return ' '.join(stemmer.stem(tok) for tok in tokens)

    # Clean first, then stem the cleaned text
    df['text_clean'] = df[column].apply(_clean_text).apply(_apply_stemming)

    return df
def identify_similar_pages(input_files: List[str]):
    """
    Find near-duplicate pages across one or more OCR output CSV files.

    Pages are cleaned, stemmed, TF-IDF vectorised and compared pairwise with
    cosine similarity; pairs scoring above `similarity_threshold` are reported.

    Args:
        input_files: Path(s) to OCR output CSVs with 'page' and 'text' columns.

    Returns:
        tuple:
            pd.DataFrame: One row per similar page pair with files, pages,
                similarity score and both page texts.
            list: Paths of all files written (combined OCR CSV, similarity
                results CSV, plus one whole-page redaction list per file
                containing duplicates).
    """
    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)

    output_paths.extend(output_files)

    # Clean and stem text ready for vectorisation
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate cosine similarity between every pair of pages
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Find the indices of the most similar pages, ignoring self-comparisons
    np.fill_diagonal(similarity_matrix, 0)
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)

    # Create a DataFrame for similar pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # Filter out duplicate pairs (keep only one direction)
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the indices to their corresponding file, page and text
    similarity_df['Page1_File'] = similarity_df['Page1_Index'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_Index'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    # For each file that contains duplicate pages, write the list of whole
    # pages to redact (consumed downstream by the redaction pipeline)
    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, ['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=False, index=False)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
# Perturb text
# Apply the perturbation function with a configurable error probability
def perturb_text_with_errors(series, error_probability: float = 0.1):
    """
    Introduce random OCR-like noise into every string of a pandas Series.

    Each word independently has `error_probability` chance of receiving one
    random perturbation: an inserted character, surrounding spaces, or an
    inserted punctuation mark. Useful for testing duplicate-page detection
    against imperfect OCR output.

    Args:
        series (pd.Series): Series of strings to perturb.
        error_probability (float): Per-word chance of a perturbation.
            Defaults to 0.1, matching the previous hard-coded value.

    Returns:
        pd.Series: New series with perturbed strings.
    """

    def _perturb_text(text, error_probability=0.1):
        words = text.split()  # Split text into words
        perturbed_words = []

        for word in words:
            if random.random() < error_probability:  # Add a random error
                perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])

                if perturbation_type == 'char_error':  # Introduce a character error
                    idx = random.randint(0, len(word) - 1)
                    char = random.choice(string.ascii_lowercase)  # Add a random letter
                    word = word[:idx] + char + word[idx:]

                elif perturbation_type == 'extra_space':  # Add extra space around a word
                    word = ' ' + word + ' '

                elif perturbation_type == 'extra_punctuation':  # Add punctuation to the word
                    punctuation = random.choice(string.punctuation)
                    idx = random.randint(0, len(word))  # Insert punctuation randomly
                    word = word[:idx] + punctuation + word[idx:]

            perturbed_words.append(word)

        return ' '.join(perturbed_words)

    series = series.apply(lambda x: _perturb_text(x, error_probability=error_probability))

    return series
243
+ # Run through command line
244
+ # def main():
245
+ # parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
246
+ # parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
247
+ # parser.add_argument('--output', '-o', default='combined_text.csv',
248
+ # help='Output CSV file path (default: combined_text.csv)')
249
+
250
+ # args = parser.parse_args()
251
+
252
+ # # Get list of input files
253
+ # input_files = glob.glob(args.input_pattern)
254
+
255
+ # if not input_files:
256
+ # print(f"No files found matching pattern: {args.input_pattern}")
257
+ # return
258
+
259
+ # print(f"Processing {len(input_files)} files...")
260
+
261
+ # try:
262
+ # # Combine the text from all files
263
+ # combined_df = combine_ocr_output_text(input_files)
264
+
265
+ # # Save to CSV
266
+ # combined_df.to_csv(args.output, index=False)
267
+ # print(f"Successfully created combined output: {args.output}")
268
+ # print(f"Total pages processed: {len(combined_df)}")
269
+
270
+ # except Exception as e:
271
+ # print(f"Error processing files: {str(e)}")
272
+
273
+ # if __name__ == "__main__":
274
+ # main()
tools/helper_functions.py CHANGED
@@ -20,7 +20,7 @@ def reset_state_vars():
20
  show_share_button=False,
21
  show_remove_button=False,
22
  interactive=False
23
- ), [], []
24
 
25
  def get_or_create_env_var(var_name, default_value):
26
  # Get the environment variable if it exists
 
20
  show_share_button=False,
21
  show_remove_button=False,
22
  interactive=False
23
+ ), [], [], [], pd.DataFrame(), pd.DataFrame()
24
 
25
  def get_or_create_env_var(var_name, default_value):
26
  # Get the environment variable if it exists
tools/redaction_review.py CHANGED
@@ -1,10 +1,12 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
 
 
 
4
  from typing import List
5
  from gradio_image_annotation import image_annotator
6
  from gradio_image_annotation.image_annotator import AnnotatedImageData
7
-
8
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
9
  from tools.helper_functions import get_file_path_end, output_folder
10
  from tools.file_redaction import redact_page_with_pymupdf
@@ -381,3 +383,61 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
381
  row_value_page = evt.row_value[0] # This is the page number value
382
  return row_value_page
383
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import pandas as pd
3
  import numpy as np
4
+ from xml.etree.ElementTree import Element, SubElement, tostring
5
+ from xml.dom import minidom
6
+ import uuid
7
  from typing import List
8
  from gradio_image_annotation import image_annotator
9
  from gradio_image_annotation.image_annotator import AnnotatedImageData
 
10
  from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
11
  from tools.helper_functions import get_file_path_end, output_folder
12
  from tools.file_redaction import redact_page_with_pymupdf
 
383
  row_value_page = evt.row_value[0] # This is the page number value
384
  return row_value_page
385
 
386
+
387
+
388
+
def create_xfdf(df, pdf_path):
    """
    Build an Adobe XFDF annotation document from a redaction review dataframe.

    Args:
        df (pd.DataFrame): Rows with columns 'page' (1-based), 'xmin', 'ymin',
            'xmax', 'ymax', 'color' (string like "(255, 0, 0)"), 'label'
            and 'text'.
        pdf_path (str): Path of the PDF the annotations refer to; recorded in
            the XFDF header.

    Returns:
        str: Pretty-printed XFDF XML string.
    """
    # Create root element. The spec attribute is 'xml:space'; passing it as a
    # Python keyword argument would emit a literal 'xml_space' attribute
    # (colons are illegal in identifiers), so use an attribute dict instead.
    xfdf = Element('xfdf', {'xmlns': "http://ns.adobe.com/xfdf/", 'xml:space': "preserve"})

    # Add header recording the source PDF
    header = SubElement(xfdf, 'header')
    header.set('pdf-filepath', pdf_path)

    # Add annots container
    annots = SubElement(xfdf, 'annots')

    # Process each row in dataframe
    for _, row in df.iterrows():
        # Create text annotation
        text_annot = SubElement(annots, 'text')

        # Generate unique ID for each annotation
        annot_id = str(uuid.uuid4())
        text_annot.set('name', annot_id)

        # Set page number (subtract 1 as PDF pages are 0-based)
        text_annot.set('page', str(int(row['page']) - 1))

        # Set coordinates (convert to PDF coordinate system)
        # NOTE(review): may need adjusting for the PDF's actual page dimensions
        text_annot.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")

        # Set color (convert "(r, g, b)" tuple string to comma-separated values)
        color_str = row['color'].strip('()').replace(' ', '')
        text_annot.set('color', color_str)

        # Set text content
        text_annot.set('contents', f"{row['label']}: {row['text']}")

        # Set additional properties
        text_annot.set('flags', "print")
        text_annot.set('date', "D:20240123000000")
        text_annot.set('title', "Annotation")

    # Convert to pretty XML string
    xml_str = minidom.parseString(tostring(xfdf)).toprettyxml(indent="  ")

    return xml_str
# Example usage:
# Assuming your dataframe is named 'df' and you want to create annotations for 'example.pdf'
def convert_df_to_xfdf(df, pdf_path, output_path):
    """Render the dataframe's annotations as XFDF and save to output_path (UTF-8)."""
    with open(output_path, 'w', encoding='utf-8') as xfdf_file:
        xfdf_file.write(create_xfdf(df, pdf_path))

# Usage example:
# df = your_dataframe
# convert_df_to_xfdf(df, 'path/to/your.pdf', 'output.xfdf')