Commit
·
a265560
1
Parent(s):
cb349ad
Added tab to be able to compare pages across multiple documents and redact duplicates
Browse files- Dockerfile +3 -0
- app.py +40 -13
- requirements.txt +1 -0
- tools/file_redaction.py +38 -204
- tools/find_duplicate_pages.py +274 -0
- tools/helper_functions.py +1 -1
- tools/redaction_review.py +61 -1
Dockerfile
CHANGED
@@ -60,6 +60,9 @@ RUN mkdir -p /home/user/app/output \
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
|
|
|
|
|
|
63 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
64 |
COPY entrypoint.sh /entrypoint.sh
|
65 |
|
|
|
60 |
# Copy installed packages from builder stage
|
61 |
COPY --from=builder /install /usr/local/lib/python3.11/site-packages/
|
62 |
|
63 |
+
# Download NLTK data packages
|
64 |
+
RUN python -m nltk.downloader punkt stopwords punkt_tab
|
65 |
+
|
66 |
# Entrypoint helps to switch between Gradio and Lambda mode
|
67 |
COPY entrypoint.sh /entrypoint.sh
|
68 |
|
app.py
CHANGED
@@ -19,6 +19,7 @@ from tools.data_anonymise import anonymise_data_files
|
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
|
|
22 |
|
23 |
today_rev = datetime.now().strftime("%Y%m%d")
|
24 |
|
@@ -68,9 +69,9 @@ with app:
|
|
68 |
all_image_annotations_state = gr.State([])
|
69 |
|
70 |
|
71 |
-
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
72 |
-
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
73 |
-
review_file_state = gr.Dataframe(value=pd.DataFrame(), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
74 |
|
75 |
session_hash_state = gr.State()
|
76 |
s3_output_folder_state = gr.State()
|
@@ -129,16 +130,16 @@ with app:
|
|
129 |
## Settings page variables
|
130 |
default_allow_list_file_name = "default_allow_list.csv"
|
131 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
132 |
-
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_allow_list_df", visible=False, type="pandas")
|
133 |
|
134 |
default_deny_list_file_name = "default_deny_list.csv"
|
135 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
136 |
-
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_deny_list_df", visible=False, type="pandas")
|
137 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
138 |
|
139 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
140 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
141 |
-
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), label="in_full_redacted_list_df", visible=False, type="pandas")
|
142 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
143 |
|
144 |
# S3 settings for default allow list load
|
@@ -149,6 +150,10 @@ with app:
|
|
149 |
# Base dataframe for recognisers that is not modified subsequent to load
|
150 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
151 |
|
|
|
|
|
|
|
|
|
152 |
###
|
153 |
# UI DESIGN
|
154 |
###
|
@@ -164,8 +169,10 @@ with app:
|
|
164 |
|
165 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
166 |
|
167 |
-
|
168 |
-
|
|
|
|
|
169 |
with gr.Accordion("Redact document", open = True):
|
170 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
171 |
if RUN_AWS_FUNCTIONS == "1":
|
@@ -194,7 +201,9 @@ with app:
|
|
194 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
195 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
196 |
|
197 |
-
|
|
|
|
|
198 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
199 |
|
200 |
with gr.Accordion(label = "Review redaction file", open=True):
|
@@ -215,7 +224,6 @@ with app:
|
|
215 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
216 |
|
217 |
with gr.Row():
|
218 |
-
|
219 |
with gr.Column(scale=1):
|
220 |
|
221 |
zoom_str = str(annotator_zoom_number) + '%'
|
@@ -249,8 +257,9 @@ with app:
|
|
249 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
250 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
251 |
|
252 |
-
|
253 |
# TEXT / TABULAR DATA TAB
|
|
|
254 |
with gr.Tab(label="Open text or Excel/csv files"):
|
255 |
gr.Markdown(
|
256 |
"""
|
@@ -280,7 +289,20 @@ with app:
|
|
280 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
281 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
282 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
# SETTINGS TAB
|
|
|
284 |
with gr.Tab(label="Redaction settings"):
|
285 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
286 |
with gr.Row():
|
@@ -319,7 +341,7 @@ with app:
|
|
319 |
###
|
320 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
321 |
|
322 |
-
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state]).\
|
323 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
324 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
325 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
@@ -410,10 +432,15 @@ with app:
|
|
410 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
411 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
412 |
|
|
|
|
|
|
|
|
|
|
|
413 |
###
|
414 |
# SETTINGS PAGE INPUT / OUTPUT
|
415 |
###
|
416 |
-
# If a custom allow list is uploaded
|
417 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
418 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
419 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
|
|
19 |
from tools.auth import authenticate_user
|
20 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
21 |
from tools.custom_csvlogger import CSVLogger_custom
|
22 |
+
from tools.find_duplicate_pages import identify_similar_pages
|
23 |
|
24 |
today_rev = datetime.now().strftime("%Y%m%d")
|
25 |
|
|
|
69 |
all_image_annotations_state = gr.State([])
|
70 |
|
71 |
|
72 |
+
all_line_level_ocr_results_df_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_line_level_ocr_results_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
73 |
+
all_decision_process_table_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="all_decision_process_table", visible=False, type="pandas") # gr.State(pd.DataFrame())
|
74 |
+
review_file_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="review_file_df", visible=False, type="pandas") #gr.State(pd.DataFrame())
|
75 |
|
76 |
session_hash_state = gr.State()
|
77 |
s3_output_folder_state = gr.State()
|
|
|
130 |
## Settings page variables
|
131 |
default_allow_list_file_name = "default_allow_list.csv"
|
132 |
default_allow_list_loc = output_folder + "/" + default_allow_list_file_name
|
133 |
+
in_allow_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_allow_list_df", visible=False, type="pandas")
|
134 |
|
135 |
default_deny_list_file_name = "default_deny_list.csv"
|
136 |
default_deny_list_loc = output_folder + "/" + default_deny_list_file_name
|
137 |
+
in_deny_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
138 |
in_deny_list_text_in = gr.Textbox(value="Deny list", visible=False)
|
139 |
|
140 |
fully_redacted_list_file_name = "default_fully_redacted_list.csv"
|
141 |
fully_redacted_list_loc = output_folder + "/" + fully_redacted_list_file_name
|
142 |
+
in_fully_redacted_list_state = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_full_redacted_list_df", visible=False, type="pandas")
|
143 |
in_fully_redacted_text_in = gr.Textbox(value="Fully redacted page list", visible=False)
|
144 |
|
145 |
# S3 settings for default allow list load
|
|
|
150 |
# Base dataframe for recognisers that is not modified subsequent to load
|
151 |
recogniser_entity_dataframe_base = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", visible=False)
|
152 |
|
153 |
+
# Duplicate page detection
|
154 |
+
in_duplicate_pages_text = gr.Textbox(label="in_duplicate_pages_text", visible=False)
|
155 |
+
duplicate_pages_df = gr.Dataframe(value=pd.DataFrame(), headers=None, col_count=0, row_count = (0, "dynamic"), label="in_deny_list_df", visible=False, type="pandas")
|
156 |
+
|
157 |
###
|
158 |
# UI DESIGN
|
159 |
###
|
|
|
169 |
|
170 |
NOTE: The app is not 100% accurate, and it will miss some personal information. It is essential that all outputs are reviewed **by a human** before using the final outputs.""")
|
171 |
|
172 |
+
###
|
173 |
+
# REDACTION PDF/IMAGES TABL
|
174 |
+
###
|
175 |
+
with gr.Tab("Redact PDFs/images"):
|
176 |
with gr.Accordion("Redact document", open = True):
|
177 |
in_doc_files = gr.File(label="Choose a document or image file (PDF, JPG, PNG)", file_count= "single", file_types=['.pdf', '.jpg', '.png', '.json'], height=file_input_height)
|
178 |
if RUN_AWS_FUNCTIONS == "1":
|
|
|
201 |
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
202 |
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
203 |
|
204 |
+
###
|
205 |
+
# REVIEW REDACTIONS TAB
|
206 |
+
###
|
207 |
with gr.Tab("Review redactions", id="tab_object_annotation"):
|
208 |
|
209 |
with gr.Accordion(label = "Review redaction file", open=True):
|
|
|
224 |
clear_all_redactions_on_page_btn = gr.Button("Clear all redactions on page", visible=False)
|
225 |
|
226 |
with gr.Row():
|
|
|
227 |
with gr.Column(scale=1):
|
228 |
|
229 |
zoom_str = str(annotator_zoom_number) + '%'
|
|
|
257 |
recogniser_entity_dropdown = gr.Dropdown(label="Redaction category", value="ALL", allow_custom_value=True)
|
258 |
recogniser_entity_dataframe = gr.Dataframe(pd.DataFrame(data={"page":[], "label":[]}), col_count=2, type="pandas", label="Search results. Click to go to page")
|
259 |
|
260 |
+
###
|
261 |
# TEXT / TABULAR DATA TAB
|
262 |
+
###
|
263 |
with gr.Tab(label="Open text or Excel/csv files"):
|
264 |
gr.Markdown(
|
265 |
"""
|
|
|
289 |
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
290 |
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
291 |
|
292 |
+
###
|
293 |
+
# IDENTIFY DUPLICATE PAGES TAB
|
294 |
+
###
|
295 |
+
with gr.Tab(label="Identify duplicate pages"):
|
296 |
+
with gr.Accordion("Identify duplicate pages to redact", open = True):
|
297 |
+
in_duplicate_pages = gr.File(label="Upload multiple 'ocr_output.csv' data files from redaction jobs here to compare", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
298 |
+
|
299 |
+
find_duplicate_pages_btn = gr.Button(value="Identify duplicate pages", variant="primary")
|
300 |
+
|
301 |
+
duplicate_pages_out =gr.File(label="Duplicate pages analysis output", file_count="multiple", height=file_input_height, file_types=['.csv'])
|
302 |
+
|
303 |
+
###
|
304 |
# SETTINGS TAB
|
305 |
+
###
|
306 |
with gr.Tab(label="Redaction settings"):
|
307 |
with gr.Accordion("Custom allow, deny, and full page redaction lists", open = True):
|
308 |
with gr.Row():
|
|
|
341 |
###
|
342 |
in_doc_files.upload(fn=get_input_file_names, inputs=[in_doc_files], outputs=[doc_file_name_no_extension_textbox, doc_file_name_with_extension_textbox, doc_full_file_name_textbox, doc_file_name_textbox_list])
|
343 |
|
344 |
+
document_redact_btn.click(fn = reset_state_vars, outputs=[pdf_doc_state, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, textract_metadata_textbox, annotator, output_file_list_state, log_files_output_list_state, recogniser_entity_dropdown, recogniser_entity_dataframe, recogniser_entity_dataframe_base]).\
|
345 |
then(fn = prepare_image_or_pdf, inputs=[in_doc_files, in_redaction_method, in_allow_list, latest_file_completed_text, output_summary, first_loop_state, annotate_max_pages, current_loop_page_number, all_image_annotations_state], outputs=[output_summary, prepared_pdf_state, images_pdf_state, annotate_max_pages, annotate_max_pages_bottom, pdf_doc_state, all_image_annotations_state, review_file_state], api_name="prepare_doc").\
|
346 |
then(fn = choose_and_run_redactor, inputs=[in_doc_files, prepared_pdf_state, images_pdf_state, in_redact_language, in_redact_entities, in_redact_comprehend_entities, in_redaction_method, in_allow_list_state, in_deny_list_state, in_fully_redacted_list_state, latest_file_completed_text, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state, page_min, page_max, estimated_time_taken_number, handwrite_signature_checkbox, textract_metadata_textbox, all_image_annotations_state, all_line_level_ocr_results_df_state, all_decision_process_table_state, pdf_doc_state, current_loop_page_number, page_break_return, pii_identification_method_drop, comprehend_query_number],
|
347 |
outputs=[output_summary, output_file, output_file_list_state, latest_file_completed_text, log_files_output, log_files_output_list_state, estimated_time_taken_number, textract_metadata_textbox, pdf_doc_state, all_image_annotations_state, current_loop_page_number, page_break_return, all_line_level_ocr_results_df_state, all_decision_process_table_state, comprehend_query_number, output_review_files], api_name="redact_doc").\
|
|
|
432 |
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
433 |
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
434 |
|
435 |
+
###
|
436 |
+
# IDENTIFY DUPLICATE PAGES
|
437 |
+
###
|
438 |
+
find_duplicate_pages_btn.click(fn=identify_similar_pages, inputs=[in_duplicate_pages], outputs=[duplicate_pages_df, duplicate_pages_out])
|
439 |
+
|
440 |
###
|
441 |
# SETTINGS PAGE INPUT / OUTPUT
|
442 |
###
|
443 |
+
# If a custom allow/deny/duplicate page list is uploaded
|
444 |
in_allow_list.change(fn=custom_regex_load, inputs=[in_allow_list], outputs=[in_allow_list_text, in_allow_list_state])
|
445 |
in_deny_list.change(fn=custom_regex_load, inputs=[in_deny_list, in_deny_list_text_in], outputs=[in_deny_list_text, in_deny_list_state])
|
446 |
in_fully_redacted_list.change(fn=custom_regex_load, inputs=[in_fully_redacted_list, in_fully_redacted_text_in], outputs=[in_fully_redacted_list_text, in_fully_redacted_list_state])
|
requirements.txt
CHANGED
@@ -7,6 +7,7 @@ presidio_anonymizer==2.2.355
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
|
|
10 |
spacy==3.8.3
|
11 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
12 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
|
|
7 |
presidio-image-redactor==0.0.53
|
8 |
pikepdf==8.15.1
|
9 |
pandas==2.2.3
|
10 |
+
nltk==3.9.1
|
11 |
spacy==3.8.3
|
12 |
#en_core_web_lg @ https://github.com/explosion/spacy-#models/releases/download/en_core_web_lg-3.8.0/en_core_web_sm-#3.8.0.tar.gz
|
13 |
en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0.tar.gz
|
tools/file_redaction.py
CHANGED
@@ -136,7 +136,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
139 |
-
print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
140 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
141 |
|
142 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
@@ -779,6 +779,11 @@ def redact_page_with_pymupdf(page:Page, page_annotations:dict, image=None, custo
|
|
779 |
|
780 |
return page, out_annotation_boxes
|
781 |
|
|
|
|
|
|
|
|
|
|
|
782 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
783 |
|
784 |
all_bboxes = []
|
@@ -1051,7 +1056,7 @@ def redact_image_pdf(file_path:str,
|
|
1051 |
|
1052 |
#print("Image is in range of pages to redact")
|
1053 |
if isinstance(image, str):
|
1054 |
-
print("image is a file path", image)
|
1055 |
image = Image.open(image)
|
1056 |
|
1057 |
# Need image size to convert textract OCR outputs to the correct sizes
|
@@ -1119,7 +1124,7 @@ def redact_image_pdf(file_path:str,
|
|
1119 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1120 |
|
1121 |
# Step 2: Analyze text and identify PII
|
1122 |
-
if chosen_redact_entities:
|
1123 |
|
1124 |
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
1125 |
line_level_ocr_results,
|
@@ -1309,7 +1314,7 @@ def redact_image_pdf(file_path:str,
|
|
1309 |
|
1310 |
|
1311 |
###
|
1312 |
-
# PIKEPDF TEXT
|
1313 |
###
|
1314 |
|
1315 |
def get_text_container_characters(text_container:LTTextContainer):
|
@@ -1485,182 +1490,6 @@ def create_pikepdf_annotations_for_bounding_boxes(analysed_bounding_boxes):
|
|
1485 |
pikepdf_annotations_on_page.append(annotation)
|
1486 |
return pikepdf_annotations_on_page
|
1487 |
|
1488 |
-
# def run_page_text_redaction(language: str, # Language of the PDF content
|
1489 |
-
# chosen_redact_entities: List[str], # List of entities to be redacted
|
1490 |
-
# chosen_redact_comprehend_entities: List[str],
|
1491 |
-
# line_level_text_results_list: List[str],
|
1492 |
-
# line_characters: List,
|
1493 |
-
# page_analyser_results: List = [],
|
1494 |
-
# page_analysed_bounding_boxes: List = [],
|
1495 |
-
# comprehend_client = None, # Connection to AWS Comprehend
|
1496 |
-
# allow_list: List[str] = None, # Optional list of allowed entities
|
1497 |
-
# pii_identification_method: str = "Local"
|
1498 |
-
# ):
|
1499 |
-
|
1500 |
-
# # Initialize batching variables
|
1501 |
-
# current_batch = ""
|
1502 |
-
# current_batch_mapping = [] # List of (start_pos, line_index, OCRResult) tuples
|
1503 |
-
# all_text_line_results = [] # Store results for all lines
|
1504 |
-
# text_container_analyser_results = []
|
1505 |
-
# text_container_analysed_bounding_boxes = []
|
1506 |
-
|
1507 |
-
# # First pass: collect all lines into batches
|
1508 |
-
# for i, text_line in enumerate(line_level_text_results_list):
|
1509 |
-
# if chosen_redact_entities:
|
1510 |
-
# if pii_identification_method == "Local":
|
1511 |
-
|
1512 |
-
# #print("chosen_redact_entities:", chosen_redact_entities)
|
1513 |
-
|
1514 |
-
# # Process immediately for local analysis
|
1515 |
-
# text_line_analyser_result = nlp_analyser.analyze(
|
1516 |
-
# text=text_line.text,
|
1517 |
-
# language=language,
|
1518 |
-
# entities=chosen_redact_entities,
|
1519 |
-
# score_threshold=score_threshold,
|
1520 |
-
# return_decision_process=True,
|
1521 |
-
# allow_list=allow_list
|
1522 |
-
# )
|
1523 |
-
# all_text_line_results.append((i, text_line_analyser_result))
|
1524 |
-
|
1525 |
-
|
1526 |
-
# elif pii_identification_method == "AWS Comprehend":
|
1527 |
-
|
1528 |
-
# # First use the local Spacy model to pick up custom entities that AWS Comprehend can't search for.
|
1529 |
-
# custom_redact_entities = [entity for entity in chosen_redact_comprehend_entities if entity in custom_entities]
|
1530 |
-
|
1531 |
-
|
1532 |
-
# text_line_analyser_result = nlp_analyser.analyze(
|
1533 |
-
# text=text_line.text,
|
1534 |
-
# language=language,
|
1535 |
-
# entities=custom_redact_entities,
|
1536 |
-
# score_threshold=score_threshold,
|
1537 |
-
# return_decision_process=True,
|
1538 |
-
# allow_list=allow_list
|
1539 |
-
# )
|
1540 |
-
# all_text_line_results.append((i, text_line_analyser_result))
|
1541 |
-
|
1542 |
-
|
1543 |
-
# if len(text_line.text) >= 3:
|
1544 |
-
# # Add separator between lines
|
1545 |
-
# if current_batch:
|
1546 |
-
# current_batch += " | "
|
1547 |
-
|
1548 |
-
# start_pos = len(current_batch)
|
1549 |
-
# current_batch += text_line.text
|
1550 |
-
# current_batch_mapping.append((start_pos, i, text_line))
|
1551 |
-
|
1552 |
-
# # Process batch if approaching 300 characters or last line
|
1553 |
-
# if len(current_batch) >= 200 or i == len(line_level_text_results_list) - 1:
|
1554 |
-
# print("length of text for Comprehend:", len(current_batch))
|
1555 |
-
|
1556 |
-
# try:
|
1557 |
-
# response = comprehend_client.detect_pii_entities(
|
1558 |
-
# Text=current_batch,
|
1559 |
-
# LanguageCode=language
|
1560 |
-
# )
|
1561 |
-
# except Exception as e:
|
1562 |
-
# print(e)
|
1563 |
-
# time.sleep(3)
|
1564 |
-
# response = comprehend_client.detect_pii_entities(
|
1565 |
-
# Text=current_batch,
|
1566 |
-
# LanguageCode=language
|
1567 |
-
# )
|
1568 |
-
|
1569 |
-
# comprehend_query_number += 1
|
1570 |
-
|
1571 |
-
# # Process response and map back to original lines
|
1572 |
-
# if response and "Entities" in response:
|
1573 |
-
# for entity in response["Entities"]:
|
1574 |
-
# entity_start = entity["BeginOffset"]
|
1575 |
-
# entity_end = entity["EndOffset"]
|
1576 |
-
|
1577 |
-
# # Find which line this entity belongs to
|
1578 |
-
# for batch_start, line_idx, original_line in current_batch_mapping:
|
1579 |
-
# batch_end = batch_start + len(original_line.text)
|
1580 |
-
|
1581 |
-
# # Check if entity belongs to this line
|
1582 |
-
# if batch_start <= entity_start < batch_end:
|
1583 |
-
# # Adjust offsets relative to original line
|
1584 |
-
# relative_start = entity_start - batch_start
|
1585 |
-
# relative_end = min(entity_end - batch_start, len(original_line.text))
|
1586 |
-
|
1587 |
-
# result_text = original_line.text[relative_start:relative_end]
|
1588 |
-
|
1589 |
-
# if result_text not in allow_list:
|
1590 |
-
# if entity.get("Type") in chosen_redact_comprehend_entities:
|
1591 |
-
# # Create adjusted entity
|
1592 |
-
# adjusted_entity = entity.copy()
|
1593 |
-
# adjusted_entity["BeginOffset"] = relative_start
|
1594 |
-
# adjusted_entity["EndOffset"] = relative_end
|
1595 |
-
|
1596 |
-
# recogniser_entity = recognizer_result_from_dict(adjusted_entity)
|
1597 |
-
|
1598 |
-
# # Add to results for this line
|
1599 |
-
# existing_results = next((results for idx, results in all_text_line_results if idx == line_idx), [])
|
1600 |
-
# if not existing_results:
|
1601 |
-
# all_text_line_results.append((line_idx, [recogniser_entity]))
|
1602 |
-
# else:
|
1603 |
-
# existing_results.append(recogniser_entity)
|
1604 |
-
|
1605 |
-
# # Reset batch
|
1606 |
-
# current_batch = ""
|
1607 |
-
# current_batch_mapping = []
|
1608 |
-
|
1609 |
-
# # Second pass: process results for each line
|
1610 |
-
# for i, text_line in enumerate(line_level_text_results_list):
|
1611 |
-
# text_line_analyser_result = []
|
1612 |
-
# text_line_bounding_boxes = []
|
1613 |
-
|
1614 |
-
# # Get results for this line
|
1615 |
-
# line_results = next((results for idx, results in all_text_line_results if idx == i), [])
|
1616 |
-
|
1617 |
-
# if line_results:
|
1618 |
-
# text_line_analyser_result = line_results
|
1619 |
-
|
1620 |
-
# #print("Analysed text container, now merging bounding boxes")
|
1621 |
-
|
1622 |
-
# # Merge bounding boxes if very close together
|
1623 |
-
# text_line_bounding_boxes = merge_text_bounding_boxes(text_line_analyser_result, line_characters[i])
|
1624 |
-
|
1625 |
-
# #print("merged bounding boxes")
|
1626 |
-
|
1627 |
-
# text_container_analyser_results.extend(text_line_analyser_result)
|
1628 |
-
# #text_container_analysed_bounding_boxes.extend(text_line_bounding_boxes)
|
1629 |
-
|
1630 |
-
# #print("text_container_analyser_results:", text_container_analyser_results)
|
1631 |
-
|
1632 |
-
# page_analyser_results.extend(text_container_analyser_results) # Add this line
|
1633 |
-
# page_analysed_bounding_boxes.extend(text_line_bounding_boxes) # Add this line
|
1634 |
-
|
1635 |
-
# return page_analysed_bounding_boxes
|
1636 |
-
|
1637 |
-
# def map_back_entity_results(page_analyser_result, page_text_mapping, all_text_line_results):
|
1638 |
-
# for entity in page_analyser_result:
|
1639 |
-
# entity_start = entity.start
|
1640 |
-
# entity_end = entity.end
|
1641 |
-
|
1642 |
-
# for batch_start, line_idx, original_line, chars in page_text_mapping:
|
1643 |
-
# batch_end = batch_start + len(original_line.text)
|
1644 |
-
|
1645 |
-
# if batch_start <= entity_start < batch_end:
|
1646 |
-
# relative_start = entity_start - batch_start
|
1647 |
-
# relative_end = min(entity_end - batch_start, len(original_line.text))
|
1648 |
-
|
1649 |
-
# adjusted_entity = copy.deepcopy(entity)
|
1650 |
-
# adjusted_entity.start = relative_start
|
1651 |
-
# adjusted_entity.end = relative_end
|
1652 |
-
|
1653 |
-
# existing_entry = next((entry for idx, entry in all_text_line_results if idx == line_idx), None)
|
1654 |
-
|
1655 |
-
# if existing_entry is None:
|
1656 |
-
# all_text_line_results.append((line_idx, [adjusted_entity]))
|
1657 |
-
# else:
|
1658 |
-
# existing_entry.append(adjusted_entity)
|
1659 |
-
# break
|
1660 |
-
|
1661 |
-
# return all_text_line_results
|
1662 |
-
|
1663 |
-
|
1664 |
def redact_text_pdf(
|
1665 |
filename: str, # Path to the PDF file to be redacted
|
1666 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
@@ -1761,15 +1590,14 @@ def redact_text_pdf(
|
|
1761 |
for page_no in progress_bar:
|
1762 |
|
1763 |
reported_page_number = str(page_no + 1)
|
1764 |
-
print("Redacting page:", reported_page_number)
|
1765 |
|
1766 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1767 |
try:
|
1768 |
image = prepared_pdf_image_path[page_no]#.copy()
|
1769 |
#print("image:", image)
|
1770 |
except Exception as e:
|
1771 |
-
print("Could not redact page:", reported_page_number, "due to:")
|
1772 |
-
print(e)
|
1773 |
continue
|
1774 |
|
1775 |
image_annotations = {"image": image, "boxes": []}
|
@@ -1825,27 +1653,33 @@ def redact_text_pdf(
|
|
1825 |
|
1826 |
### REDACTION
|
1827 |
|
1828 |
-
|
1829 |
-
|
1830 |
-
|
1831 |
-
|
1832 |
-
|
1833 |
-
|
1834 |
-
|
1835 |
-
|
1836 |
-
|
1837 |
-
|
1838 |
-
|
1839 |
-
|
1840 |
-
|
1841 |
-
|
1842 |
-
|
1843 |
-
|
1844 |
-
|
1845 |
-
|
1846 |
-
|
1847 |
-
|
1848 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
1849 |
|
1850 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1851 |
|
|
|
136 |
tic = time.perf_counter()
|
137 |
all_request_metadata = all_request_metadata_str.split('\n') if all_request_metadata_str else []
|
138 |
|
139 |
+
#print("prepared_pdf_file_paths:", prepared_pdf_file_paths[0])
|
140 |
review_out_file_paths = [prepared_pdf_file_paths[0]]
|
141 |
|
142 |
if isinstance(custom_recogniser_word_list, pd.DataFrame):
|
|
|
779 |
|
780 |
return page, out_annotation_boxes
|
781 |
|
782 |
+
###
|
783 |
+
# IMAGE-BASED OCR PDF TEXT DETECTION/REDACTION WITH TESSERACT OR AWS TEXTRACT
|
784 |
+
###
|
785 |
+
|
786 |
+
|
787 |
def merge_img_bboxes(bboxes, combined_results: Dict, signature_recogniser_results=[], handwriting_recogniser_results=[], handwrite_signature_checkbox: List[str]=["Redact all identified handwriting", "Redact all identified signatures"], horizontal_threshold:int=50, vertical_threshold:int=12):
|
788 |
|
789 |
all_bboxes = []
|
|
|
1056 |
|
1057 |
#print("Image is in range of pages to redact")
|
1058 |
if isinstance(image, str):
|
1059 |
+
#print("image is a file path", image)
|
1060 |
image = Image.open(image)
|
1061 |
|
1062 |
# Need image size to convert textract OCR outputs to the correct sizes
|
|
|
1124 |
line_level_ocr_results, handwriting_or_signature_boxes, signature_recogniser_results, handwriting_recogniser_results, line_level_ocr_results_with_children = json_to_ocrresult(text_blocks, page_width, page_height, reported_page_number)
|
1125 |
|
1126 |
# Step 2: Analyze text and identify PII
|
1127 |
+
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1128 |
|
1129 |
redaction_bboxes, comprehend_query_number_new = image_analyser.analyze_text(
|
1130 |
line_level_ocr_results,
|
|
|
1314 |
|
1315 |
|
1316 |
###
|
1317 |
+
# PIKEPDF TEXT DETECTION/REDACTION
|
1318 |
###
|
1319 |
|
1320 |
def get_text_container_characters(text_container:LTTextContainer):
|
|
|
1490 |
pikepdf_annotations_on_page.append(annotation)
|
1491 |
return pikepdf_annotations_on_page
|
1492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1493 |
def redact_text_pdf(
|
1494 |
filename: str, # Path to the PDF file to be redacted
|
1495 |
prepared_pdf_image_path: str, # Path to the prepared PDF image for redaction
|
|
|
1590 |
for page_no in progress_bar:
|
1591 |
|
1592 |
reported_page_number = str(page_no + 1)
|
1593 |
+
#print("Redacting page:", reported_page_number)
|
1594 |
|
1595 |
# Assuming prepared_pdf_file_paths[page_no] is a PIL image object
|
1596 |
try:
|
1597 |
image = prepared_pdf_image_path[page_no]#.copy()
|
1598 |
#print("image:", image)
|
1599 |
except Exception as e:
|
1600 |
+
print("Could not redact page:", reported_page_number, "due to:", e)
|
|
|
1601 |
continue
|
1602 |
|
1603 |
image_annotations = {"image": image, "boxes": []}
|
|
|
1653 |
|
1654 |
### REDACTION
|
1655 |
|
1656 |
+
if chosen_redact_entities or chosen_redact_comprehend_entities:
|
1657 |
+
#print("Identifying redactions on page.")
|
1658 |
+
|
1659 |
+
page_analysed_bounding_boxes = run_page_text_redaction(
|
1660 |
+
language,
|
1661 |
+
chosen_redact_entities,
|
1662 |
+
chosen_redact_comprehend_entities,
|
1663 |
+
all_line_level_text_results_list, #line_level_text_results_list,
|
1664 |
+
all_line_characters,
|
1665 |
+
page_analyser_results,
|
1666 |
+
page_analysed_bounding_boxes,
|
1667 |
+
comprehend_client,
|
1668 |
+
allow_list,
|
1669 |
+
pii_identification_method,
|
1670 |
+
nlp_analyser,
|
1671 |
+
score_threshold,
|
1672 |
+
custom_entities,
|
1673 |
+
comprehend_query_number
|
1674 |
+
)
|
1675 |
+
|
1676 |
+
|
1677 |
+
#print("page_analyser_results:", page_analyser_results)
|
1678 |
+
#print("page_analysed_bounding_boxes:", page_analysed_bounding_boxes)
|
1679 |
+
#print("image:", image)
|
1680 |
+
else:
|
1681 |
+
page_analysed_bounding_boxes = []
|
1682 |
+
|
1683 |
|
1684 |
page_analysed_bounding_boxes = convert_pikepdf_decision_output_to_image_coords(pymupdf_page, page_analysed_bounding_boxes, image)
|
1685 |
|
tools/find_duplicate_pages.py
ADDED
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import argparse
|
3 |
+
import glob
|
4 |
+
import os
|
5 |
+
import re
|
6 |
+
from tools.helper_functions import output_folder
|
7 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
8 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
9 |
+
import nltk
|
10 |
+
from nltk.corpus import stopwords
|
11 |
+
from nltk.tokenize import word_tokenize
|
12 |
+
from nltk.stem import PorterStemmer
|
13 |
+
import numpy as np
|
14 |
+
import random
|
15 |
+
import string
|
16 |
+
from typing import List
|
17 |
+
|
18 |
+
nltk.download('punkt')
|
19 |
+
nltk.download('stopwords')
|
20 |
+
nltk.download('punkt_tab')
|
21 |
+
|
22 |
+
similarity_threshold = 0.9
|
23 |
+
|
24 |
+
stop_words = set(stopwords.words('english'))
|
25 |
+
# List of words to remove from the stopword set
|
26 |
+
#words_to_remove = ['no', 'nor', 'not', 'don', 'don't', 'wasn', 'wasn't', 'weren', 'weren't', "don't", "wasn't", "weren't"]
|
27 |
+
|
28 |
+
# Remove the specified words from the stopwords set
|
29 |
+
#for word in words_to_remove:
|
30 |
+
# stop_words.discard(word.lower())
|
31 |
+
|
32 |
+
stemmer = PorterStemmer()
|
33 |
+
vectorizer = TfidfVectorizer()
|
34 |
+
|
35 |
+
def combine_ocr_output_text(input_files):
    """
    Combine text from multiple OCR-output CSV files, grouped by file and page.

    Each input CSV must contain 'page' and 'text' columns; text belonging to
    the same page is concatenated with spaces. The combined result is written
    to a CSV in the output folder.

    Args:
        input_files (str | list): A single CSV path or a list of CSV paths
            (plain strings or objects with a .name attribute, e.g. Gradio files).

    Returns:
        tuple: (combined dataframe with columns [file, page, text],
                list containing the output CSV path)

    Raises:
        ValueError: If none of the inputs had the required columns.
    """
    all_data = []
    output_files = []

    # Accept either a single path or a list of paths/file objects.
    if isinstance(input_files, str):
        file_paths_list = [input_files]
    else:
        file_paths_list = input_files

    for file in file_paths_list:

        file_path = file if isinstance(file, str) else file.name

        # Read CSV file
        df = pd.read_csv(file_path)

        # Skip files that do not have the expected schema.
        if 'page' not in df.columns or 'text' not in df.columns:
            print(f"Warning: Skipping {file_path} - missing required columns 'page' and 'text'")
            continue

        # Empty cells are read as NaN (float); coerce to string so the
        # ' '.join aggregation below cannot raise a TypeError.
        df['text'] = df['text'].fillna('').astype(str)

        # Group by page and concatenate all text fragments on the same page.
        grouped = df.groupby('page')['text'].apply(' '.join).reset_index()

        # Record which source file each page came from.
        grouped['file'] = os.path.basename(file_path)

        all_data.append(grouped)

    if not all_data:
        raise ValueError("No valid CSV files were processed")

    # Combine all dataframes and put columns in a readable order.
    combined_df = pd.concat(all_data, ignore_index=True)
    combined_df = combined_df[['file', 'page', 'text']]

    output_combined_file_path = output_folder + "combined_ocr_output_files.csv"
    combined_df.to_csv(output_combined_file_path, index=None)

    output_files.append(output_combined_file_path)

    return combined_df, output_files
|
92 |
+
|
93 |
+
def process_data(df, column: str):
    '''
    Clean and stem a text column of a dataframe.

    Adds a 'text_clean' column containing the lowercased, HTML-stripped,
    tokenized, stopword-filtered and Porter-stemmed version of df[column].

    Args:
        df: Dataframe containing the text column.
        column: Name of the column to clean.

    Returns:
        The same dataframe with a 'text_clean' column added (mutated in place).
    '''

    def _clean_text(raw_text):
        # Remove HTML tags, then normalise line breaks and any stray angle
        # brackets left over from malformed markup. (The previous version
        # also substituted '<strong>'/'</strong>' afterwards, but those
        # patterns could never match once every '<' had been replaced, so
        # the dead substitutions have been removed — behavior is unchanged.)
        clean = re.sub(r'<.*?>', '', raw_text)
        clean = re.sub(r'\r\n', ' ', clean)
        clean = re.sub(r'<', ' ', clean)
        clean = re.sub(r'>', ' ', clean)

        # Replace non-breaking space \xa0 with a regular space
        clean = clean.replace(u'\xa0', u' ')
        # Collapse runs of whitespace
        clean = ' '.join(clean.split())

        # Tokenize the text
        words = word_tokenize(clean.lower())

        # Keep alphabetic tokens only (drops punctuation and numbers)
        words = [word for word in words if word.isalpha()]

        # Remove stopwords
        words = [word for word in words if word not in stop_words]

        # Join the cleaned words back into a string
        return ' '.join(words)

    def _apply_stemming(text):
        # Porter-stem each token and rejoin into a single string.
        words = word_tokenize(text.lower())
        stemmed_words = [stemmer.stem(word) for word in words]
        return ' '.join(stemmed_words)

    df['text_clean'] = df[column].apply(_clean_text)
    df['text_clean'] = df['text_clean'].apply(_apply_stemming)

    return df
|
143 |
+
|
144 |
+
def identify_similar_pages(input_files: List[str], similarity_threshold: float = 0.9):
    """
    Find near-duplicate pages across one or more OCR-output CSV files.

    Combines the text of all input files page by page, TF-IDF-vectorises the
    cleaned text, and flags page pairs whose cosine similarity exceeds the
    threshold. Writes the similarity table to CSV plus, for each file that
    contains duplicated pages, a headerless "whole page" redaction list.

    Args:
        input_files: CSV path(s) with 'page' and 'text' columns.
        similarity_threshold: Minimum cosine similarity (0-1) for two pages
            to be reported as duplicates. Defaults to 0.9 (previously a
            hard-coded module constant; parameterised for reuse).

    Returns:
        tuple: (similarity dataframe, list of all output file paths written)
    """
    output_paths = []

    df, output_files = combine_ocr_output_text(input_files)
    output_paths.extend(output_files)

    # Clean and stem text so near-identical pages vectorise similarly
    df = process_data(df, 'text')

    # Vectorise text
    tfidf_matrix = vectorizer.fit_transform(df['text_clean'])

    # Calculate pairwise cosine similarity between all pages
    similarity_matrix = cosine_similarity(tfidf_matrix)

    # Ignore self-comparisons, then find pairs above the threshold
    np.fill_diagonal(similarity_matrix, 0)
    similar_pages = np.argwhere(similarity_matrix > similarity_threshold)

    # Create a DataFrame for similar pairs and their scores
    similarity_df = pd.DataFrame({
        'Page1_Index': similar_pages[:, 0],
        'Page2_Index': similar_pages[:, 1],
        'Page1_File': similar_pages[:, 0],
        'Page2_File': similar_pages[:, 1],
        'Similarity_Score': similarity_matrix[similar_pages[:, 0], similar_pages[:, 1]]
    })

    # The matrix is symmetric; keep only one direction of each pair
    similarity_df = similarity_df[similarity_df['Page1_Index'] < similarity_df['Page2_Index']]

    # Map the row indices back to their corresponding file, page and text
    similarity_df['Page1_File'] = similarity_df['Page1_File'].map(df['file'])
    similarity_df['Page2_File'] = similarity_df['Page2_File'].map(df['file'])

    similarity_df['Page1_Page'] = similarity_df['Page1_Index'].map(df['page'])
    similarity_df['Page2_Page'] = similarity_df['Page2_Index'].map(df['page'])

    similarity_df['Page1_Text'] = similarity_df['Page1_Index'].map(df['text'])
    similarity_df['Page2_Text'] = similarity_df['Page2_Index'].map(df['text'])

    similarity_df_out = similarity_df[['Page1_File', 'Page1_Page', 'Page2_File', 'Page2_Page', 'Similarity_Score', 'Page1_Text', 'Page2_Text']]
    similarity_df_out = similarity_df_out.sort_values(["Page1_File", "Page1_Page", "Page2_File", "Page2_Page", "Similarity_Score"], ascending=[True, True, True, True, False])

    # Save detailed results to a CSV file
    similarity_file_output_path = output_folder + 'page_similarity_results.csv'
    similarity_df_out.to_csv(similarity_file_output_path, index=False)

    output_paths.append(similarity_file_output_path)

    # For every file that appears as the "second" page of a duplicate pair,
    # write a headerless list of page numbers to redact wholesale.
    if not similarity_df_out.empty:
        unique_files = similarity_df_out['Page2_File'].unique()
        for redact_file in unique_files:
            output_file_name = output_folder + redact_file + "_whole_page.csv"
            whole_pages_to_redact_df = similarity_df_out.loc[similarity_df_out['Page2_File'] == redact_file, :][['Page2_Page']]
            whole_pages_to_redact_df.to_csv(output_file_name, header=None, index=None)

            output_paths.append(output_file_name)

    return similarity_df_out, output_paths
|
209 |
+
|
210 |
+
# Perturb text
|
211 |
+
# Apply the perturbation function with a 10% error probability
|
212 |
+
def perturb_text_with_errors(series):
|
213 |
+
|
214 |
+
def _perturb_text(text, error_probability=0.1):
|
215 |
+
words = text.split() # Split text into words
|
216 |
+
perturbed_words = []
|
217 |
+
|
218 |
+
for word in words:
|
219 |
+
if random.random() < error_probability: # Add a random error
|
220 |
+
perturbation_type = random.choice(['char_error', 'extra_space', 'extra_punctuation'])
|
221 |
+
|
222 |
+
if perturbation_type == 'char_error': # Introduce a character error
|
223 |
+
idx = random.randint(0, len(word) - 1)
|
224 |
+
char = random.choice(string.ascii_lowercase) # Add a random letter
|
225 |
+
word = word[:idx] + char + word[idx:]
|
226 |
+
|
227 |
+
elif perturbation_type == 'extra_space': # Add extra space around a word
|
228 |
+
word = ' ' + word + ' '
|
229 |
+
|
230 |
+
elif perturbation_type == 'extra_punctuation': # Add punctuation to the word
|
231 |
+
punctuation = random.choice(string.punctuation)
|
232 |
+
idx = random.randint(0, len(word)) # Insert punctuation randomly
|
233 |
+
word = word[:idx] + punctuation + word[idx:]
|
234 |
+
|
235 |
+
perturbed_words.append(word)
|
236 |
+
|
237 |
+
return ' '.join(perturbed_words)
|
238 |
+
|
239 |
+
series = series.apply(lambda x: _perturb_text(x, error_probability=0.1))
|
240 |
+
|
241 |
+
return series
|
242 |
+
|
243 |
+
# Run through command line
|
244 |
+
# def main():
|
245 |
+
# parser = argparse.ArgumentParser(description='Combine text from multiple CSV files by page')
|
246 |
+
# parser.add_argument('input_pattern', help='Input file pattern (e.g., "input/*.csv")')
|
247 |
+
# parser.add_argument('--output', '-o', default='combined_text.csv',
|
248 |
+
# help='Output CSV file path (default: combined_text.csv)')
|
249 |
+
|
250 |
+
# args = parser.parse_args()
|
251 |
+
|
252 |
+
# # Get list of input files
|
253 |
+
# input_files = glob.glob(args.input_pattern)
|
254 |
+
|
255 |
+
# if not input_files:
|
256 |
+
# print(f"No files found matching pattern: {args.input_pattern}")
|
257 |
+
# return
|
258 |
+
|
259 |
+
# print(f"Processing {len(input_files)} files...")
|
260 |
+
|
261 |
+
# try:
|
262 |
+
# # Combine the text from all files
|
263 |
+
# combined_df = combine_ocr_output_text(input_files)
|
264 |
+
|
265 |
+
# # Save to CSV
|
266 |
+
# combined_df.to_csv(args.output, index=False)
|
267 |
+
# print(f"Successfully created combined output: {args.output}")
|
268 |
+
# print(f"Total pages processed: {len(combined_df)}")
|
269 |
+
|
270 |
+
# except Exception as e:
|
271 |
+
# print(f"Error processing files: {str(e)}")
|
272 |
+
|
273 |
+
# if __name__ == "__main__":
|
274 |
+
# main()
|
tools/helper_functions.py
CHANGED
@@ -20,7 +20,7 @@ def reset_state_vars():
|
|
20 |
show_share_button=False,
|
21 |
show_remove_button=False,
|
22 |
interactive=False
|
23 |
-
), [], []
|
24 |
|
25 |
def get_or_create_env_var(var_name, default_value):
|
26 |
# Get the environment variable if it exists
|
|
|
20 |
show_share_button=False,
|
21 |
show_remove_button=False,
|
22 |
interactive=False
|
23 |
+
), [], [], [], pd.DataFrame(), pd.DataFrame()
|
24 |
|
25 |
def get_or_create_env_var(var_name, default_value):
|
26 |
# Get the environment variable if it exists
|
tools/redaction_review.py
CHANGED
@@ -1,10 +1,12 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
|
|
|
|
|
|
4 |
from typing import List
|
5 |
from gradio_image_annotation import image_annotator
|
6 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
7 |
-
|
8 |
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
9 |
from tools.helper_functions import get_file_path_end, output_folder
|
10 |
from tools.file_redaction import redact_page_with_pymupdf
|
@@ -381,3 +383,61 @@ def df_select_callback(df: pd.DataFrame, evt: gr.SelectData):
|
|
381 |
row_value_page = evt.row_value[0] # This is the page number value
|
382 |
return row_value_page
|
383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import numpy as np
|
4 |
+
from xml.etree.ElementTree import Element, SubElement, tostring
|
5 |
+
from xml.dom import minidom
|
6 |
+
import uuid
|
7 |
from typing import List
|
8 |
from gradio_image_annotation import image_annotator
|
9 |
from gradio_image_annotation.image_annotator import AnnotatedImageData
|
|
|
10 |
from tools.file_conversion import is_pdf, convert_review_json_to_pandas_df
|
11 |
from tools.helper_functions import get_file_path_end, output_folder
|
12 |
from tools.file_redaction import redact_page_with_pymupdf
|
|
|
383 |
row_value_page = evt.row_value[0] # This is the page number value
|
384 |
return row_value_page
|
385 |
|
386 |
+
|
387 |
+
|
388 |
+
|
389 |
+
def create_xfdf(df, pdf_path):
    """Build an XFDF (XML) annotation document from a redaction dataframe.

    Expects rows with 'page' (1-based), 'xmin'/'ymin'/'xmax'/'ymax',
    'color' (a string such as "(255, 0, 0)"), 'label' and 'text' columns.

    Args:
        df: Dataframe of redaction annotations, one row per box.
        pdf_path: Path of the PDF the annotations refer to (stored in the
            XFDF header).

    Returns:
        The pretty-printed XFDF document as a string.
    """
    # Root element carrying the XFDF namespace
    xfdf = Element('xfdf', xmlns="http://ns.adobe.com/xfdf/", xml_space="preserve")

    # Header records which PDF these annotations belong to
    header = SubElement(xfdf, 'header')
    header.set('pdf-filepath', pdf_path)

    # Container for all annotation elements
    annots = SubElement(xfdf, 'annots')

    # One <text> annotation per dataframe row
    for _, row in df.iterrows():
        annotation = SubElement(annots, 'text')

        # Every annotation needs a unique name
        annotation.set('name', str(uuid.uuid4()))

        # XFDF pages are 0-based; the dataframe stores 1-based pages
        annotation.set('page', str(int(row['page']) - 1))

        # Bounding box coordinates
        # NOTE(review): may need adjusting to the PDF's coordinate system
        annotation.set('rect', f"{row['xmin']},{row['ymin']},{row['xmax']},{row['ymax']}")

        # Colour arrives as a "(r, g, b)" string; reduce to "r,g,b"
        annotation.set('color', row['color'].strip('()').replace(' ', ''))

        # Human-readable contents: label plus the redacted text
        annotation.set('contents', f"{row['label']}: {row['text']}")

        # Fixed display properties
        annotation.set('flags', "print")
        annotation.set('date', "D:20240123000000")
        annotation.set('title', "Annotation")

    # Serialise and pretty-print
    return minidom.parseString(tostring(xfdf)).toprettyxml(indent=" ")
|
432 |
+
|
433 |
+
# Convert a redaction dataframe into an Adobe XFDF annotation file on disk.
def convert_df_to_xfdf(df, pdf_path, output_path):
    """Write the annotations in *df* to *output_path* as an XFDF file.

    Args:
        df: Dataframe of redaction boxes (see create_xfdf for the expected
            columns).
        pdf_path: Path of the PDF the annotations refer to; recorded in the
            XFDF header so a viewer can associate the two.
        output_path: Destination path for the generated .xfdf file.
    """
    xfdf_content = create_xfdf(df, pdf_path)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(xfdf_content)

# Usage example:
# df = your_dataframe
# convert_df_to_xfdf(df, 'path/to/your.pdf', 'output.xfdf')
|