Spaces:

seanpedrickcase
/

document_redaction

Running

App Files Community

seanpedrickcase committed on Aug 21, 2024

Commit

bbf818d

1 Parent(s): 93ac94f

Handles multiple runs with multiple files correctly now. Logging and feedback improvements.

Browse files

Files changed (6) hide show

app.py +28 -27
tools/aws_functions.py +3 -3
tools/data_anonymise.py +20 -16
tools/file_conversion.py +3 -5
tools/file_redaction.py +15 -18
tools/helper_functions.py +1 -1

app.py CHANGED Viewed

@@ -36,6 +36,7 @@ with app:
     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
     text_output_file_list_state = gr.State([])
     first_loop_state = gr.State(True)
     second_loop_state = gr.State(False)
@@ -66,16 +67,15 @@ with app:
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
             output_file = gr.File(label="Output files")
-            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
-        with gr.Row():
-            pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
-        with gr.Row():
-            pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
-            pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
         with gr.Row():
             s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
@@ -100,14 +100,13 @@ with app:
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
             text_output_file = gr.File(label="Output files")
-            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
-        with gr.Row():
-            data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
                 choices=["The results were good", "The results were not good"], visible=False)
-        with gr.Row():
-            data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
-            data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
@@ -124,6 +123,7 @@ with app:
             with gr.Row():
                 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
                 in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
         # Invisible text box to hold the session hash/username just for logging purposes
         session_hash_textbox = gr.Textbox(value="", visible=False)
@@ -143,23 +143,23 @@ with app:
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
-    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
-                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
-    then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
      # Tabular data redaction
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
-    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
     # If the output file count text box changes, keep going with redacting each data file until done
-    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
-    then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
     #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
     #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
@@ -169,19 +169,20 @@ with app:
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
     callback.setup([session_hash_textbox], logs_data_folder)
-    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
-    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], feedback_data_folder)
-    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
-    then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
-    data_callback.setup([data_feedback_radio, data_further_details_text], feedback_data_folder)
-    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
-    then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')

     output_image_files_state = gr.State([])
     output_file_list_state = gr.State([])
     text_output_file_list_state = gr.State([])
+    log_files_output_list_state = gr.State([])
     first_loop_state = gr.State(True)
     second_loop_state = gr.State(False)
         with gr.Row():
             output_summary = gr.Textbox(label="Output summary")
             output_file = gr.File(label="Output files")
+            text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
         with gr.Row():
             convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
+        pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
+        pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
+        pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+        pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
         with gr.Row():
             s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
         with gr.Row():
             text_output_summary = gr.Textbox(label="Output result")
             text_output_file = gr.File(label="Output files")
+            text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
+        data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
+        data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
                 choices=["The results were good", "The results were not good"], visible=False)
+        data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
+        data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
     with gr.Tab(label="Redaction settings"):
         gr.Markdown(
             with gr.Row():
                 in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
                 in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
+            log_files_output = gr.File(label="Log file output", interactive=False)
         # Invisible text box to hold the session hash/username just for logging purposes
         session_hash_textbox = gr.Textbox(value="", visible=False)
     # Document redaction
     redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
     # If the output file count text box changes, keep going with redacting each document until done
     text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
+    then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
+                    outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
+    then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
      # Tabular data redaction
     in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
+    tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_text")
     # If the output file count text box changes, keep going with redacting each data file until done
+    text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
+    then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
     #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
     #    then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
     # Log usernames and times of access to file (to know who is using the app when running on AWS)
     callback = gr.CSVLogger()
     callback.setup([session_hash_textbox], logs_data_folder)
+    session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
+    then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
     # User submitted feedback for pdf redactions
     pdf_callback = gr.CSVLogger()
+    pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_data_folder)
+    pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
+    then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
     # User submitted feedback for data redactions
     data_callback = gr.CSVLogger()
+    data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_data_folder)
+    data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
+    then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
 # Launch the Gradio app
 COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')

tools/aws_functions.py CHANGED Viewed

@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
 # Get AWS credentials if required
 bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
-aws_var_default = "1"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')
@@ -185,11 +185,11 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
             print("S3 key: ", s3_key_full)
             s3_client.upload_file(file, s3_bucket, s3_key_full)
-            out_message = "File " + file_name + " uploaded successfully to S3!"
             print(out_message)
         except Exception as e:
-            out_message = f"Error uploading file(s) to S3: {e}"
             print(out_message)
         final_out_message.append(out_message)

 # Get AWS credentials if required
 bucket_name=""
 aws_var = "RUN_AWS_FUNCTIONS"
+aws_var_default = "0"
 aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
 print(f'The value of {aws_var} is {aws_var_val}')
             print("S3 key: ", s3_key_full)
             s3_client.upload_file(file, s3_bucket, s3_key_full)
+            out_message = "File " + file_name + " uploaded successfully!"
             print(out_message)
         except Exception as e:
+            out_message = f"Error uploading file(s): {e}"
             print(out_message)
         final_out_message.append(out_message)

tools/data_anonymise.py CHANGED Viewed

@@ -69,23 +69,18 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
     # Run through each column to analyse for PII
     for i, result in enumerate(analyzer_results):
-        print("Looking at result:", str(i))
-        print("result:\n\n", result)
         # If a single result
         if isinstance(result, RecognizerResult):
-            print("Processing recogniser result as RecognizerResult:", str(i))
             decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
         # If a list of results
         elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
             for x, recognizer_result in enumerate(result.recognizer_results):
-                print("Processing recogniser result as List:", str(i))
                 decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
         else:
             try:
-                print("Processing recogniser result in other:", str(i))
                 decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
             except Exception as e:
                 print(e)
@@ -269,7 +264,8 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
     return scrubbed_df, key_string, decision_process_output_str
-def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
     def check_lists(list1, list2):
             return any(string in list2 for string in list1)
@@ -344,7 +340,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
             f.write(decision_process_output_str)
     out_file_paths.append(anon_export_file_name)
-    out_file_paths.append(decision_process_log_output_file)
     # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
     out_file_paths = list(set(out_file_paths))
@@ -353,9 +349,9 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
     if anon_file=='open_text':
         out_message = [anon_df_out['text'][0]]
-    return out_file_paths, out_message, key_string
-def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
     tic = time.perf_counter()
@@ -386,13 +382,15 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
             file_paths=['open_text']
         else:
             out_message = "Please enter text or a file to redact."
-            return out_message, out_file_paths, out_file_paths, latest_file_completed
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed == len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
         final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
     file_path_loop = [file_paths[int(latest_file_completed)]]
@@ -401,7 +399,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
         if anon_file=='open_text':
             anon_df = pd.DataFrame(data={'text':[in_text]})
             chosen_cols=['text']
             out_file_part = anon_file
         else:
             # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
             file_type = detect_file_type(anon_file)
@@ -419,7 +421,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
                 anon_xlsx = pd.ExcelFile(anon_file)
                 # Create xlsx file:
-                anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
                 from openpyxl import Workbook
@@ -440,13 +442,13 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
                     print(anon_df.head())  # Print the first few rows
-                    out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type,  anon_xlsx_export_file_name)
             else:
                 sheet_name = ""
                 anon_df = read_file(anon_file)
                 out_file_part = get_file_path_end(anon_file.name)
-                out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
         # Increase latest file completed count unless we are at the last file
         if latest_file_completed != len(file_paths):
@@ -464,5 +466,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
         out_message_out = '\n'.join(out_message)
         out_message_out = out_message_out + " " + out_time
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed

     # Run through each column to analyse for PII
     for i, result in enumerate(analyzer_results):
         # If a single result
         if isinstance(result, RecognizerResult):
             decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
         # If a list of results
         elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
             for x, recognizer_result in enumerate(result.recognizer_results):
                 decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
         else:
             try:
                 decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
             except Exception as e:
                 print(e)
     return scrubbed_df, key_string, decision_process_output_str
+def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
     def check_lists(list1, list2):
             return any(string in list2 for string in list1)
             f.write(decision_process_output_str)
     out_file_paths.append(anon_export_file_name)
+    log_files_output_paths.append(decision_process_log_output_file)
     # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
     out_file_paths = list(set(out_file_paths))
     if anon_file=='open_text':
         out_message = [anon_df_out['text'][0]]
+    return out_file_paths, out_message, key_string, log_files_output_paths
+def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
     tic = time.perf_counter()
             file_paths=['open_text']
         else:
             out_message = "Please enter text or a file to redact."
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
+        # Set to a very high number so as not to mess with subsequent file processing by the user
+        latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     file_path_loop = [file_paths[int(latest_file_completed)]]
         if anon_file=='open_text':
             anon_df = pd.DataFrame(data={'text':[in_text]})
             chosen_cols=['text']
+            sheet_name = ""
+            file_type = ""
             out_file_part = anon_file
+            out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
         else:
             # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
             file_type = detect_file_type(anon_file)
                 anon_xlsx = pd.ExcelFile(anon_file)
                 # Create xlsx file:
+                anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
                 from openpyxl import Workbook
                     print(anon_df.head())  # Print the first few rows
+                    out_file_paths, out_message, key_string, log_files_output_paths  = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type,  anon_xlsx_export_file_name, log_files_output_paths)
             else:
                 sheet_name = ""
                 anon_df = read_file(anon_file)
                 out_file_part = get_file_path_end(anon_file.name)
+                out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
         # Increase latest file completed count unless we are at the last file
         if latest_file_completed != len(file_paths):
         out_message_out = '\n'.join(out_message)
         out_message_out = out_message_out + " " + out_time
+        out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths

tools/file_conversion.py CHANGED Viewed

@@ -87,8 +87,6 @@ def process_file(file_path):
         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
-    print('Image object is:', img_object)
     return img_object
 def prepare_image_or_text_pdf(
@@ -129,7 +127,7 @@ def prepare_image_or_text_pdf(
         out_message = []
         out_file_paths = []
     else:
-        print("Now attempting file:", str(latest_file_completed + 1))
         out_file_paths = []
     if not file_paths:
@@ -140,7 +138,7 @@ def prepare_image_or_text_pdf(
     latest_file_completed = int(latest_file_completed)
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed == len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
         #final_out_message = '\n'.join(out_message)
         return out_message, out_file_paths
@@ -204,6 +202,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
     out_message = "PDF " + file_path_without_ext + " converted to image-based file."
     print(out_message)
-    print("Out file paths:", out_file_paths)
     return out_message, out_file_paths

         print(f"{file_path} is not an image or PDF file.")
         img_object = ['']
     return img_object
 def prepare_image_or_text_pdf(
         out_message = []
         out_file_paths = []
     else:
+        print("Now attempting file:", str(latest_file_completed))
         out_file_paths = []
     if not file_paths:
     latest_file_completed = int(latest_file_completed)
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
         print("Last file reached, returning files:", str(latest_file_completed))
         #final_out_message = '\n'.join(out_message)
         return out_message, out_file_paths
     out_message = "PDF " + file_path_without_ext + " converted to image-based file."
     print(out_message)
+    #print("Out file paths:", out_file_paths)
     return out_message, out_file_paths

tools/file_redaction.py CHANGED Viewed

@@ -18,10 +18,11 @@ from tools.data_anonymise import generate_decision_process_output
 import gradio as gr
-def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
@@ -35,15 +36,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     if not out_file_paths:
         out_file_paths = []
-    print("Latest file completed is:", str(latest_file_completed))
     latest_file_completed = int(latest_file_completed)
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
-    if latest_file_completed == len(file_paths):
-        print("Last file reached, returning files:", str(latest_file_completed))
         final_out_message = '\n'.join(out_message)
-        return final_out_message, out_file_paths, out_file_paths, latest_file_completed
     file_paths_loop = [file_paths[int(latest_file_completed)]]
@@ -51,8 +52,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
-    #print("File paths:", file_paths)
     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name
@@ -66,7 +65,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
         else:
             out_message = "No file selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image
@@ -85,7 +84,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(output_logs_str)
-            out_file_paths.append(logs_output_file_name)
             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
@@ -119,19 +118,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
             logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(output_logs_str)
-            out_file_paths.append(logs_output_file_name)
             # Add confirmation for converting to image if you want
             # out_message.append(img_output_summary)
             if latest_file_completed != len(file_paths):
-                print("Completed file number:", str(latest_file_completed))
                 latest_file_completed += 1
         else:
             out_message = "No redaction method selected"
             print(out_message)
-            return out_message, out_file_paths, out_file_paths, latest_file_completed
     toc = time.perf_counter()
@@ -141,7 +140,9 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time
-    return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
             merged_bboxes = []
@@ -388,13 +389,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
                 # Merge bounding boxes if very close together
                 text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
-                print("\n\nanalyzed_bounding_boxes_in_loop:", text_container_analyzed_bounding_boxes)
                 page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
                 page_analyzer_results.extend(text_container_analyzer_results)
-            print("analyzed_bounding_boxes_out_loop:\n\n", page_analyzed_bounding_boxes)
             decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
             annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)

 import gradio as gr
+def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
     tic = time.perf_counter()
     # If this is the first time around, set variables to 0/blank
     if first_loop_state==True:
         latest_file_completed = 0
     if not out_file_paths:
         out_file_paths = []
     latest_file_completed = int(latest_file_completed)
     # If we have already redacted the last file, return the input out_message and file list to the relevant components
+    if latest_file_completed >= len(file_paths):
+        print("Last file reached")
+        # Set to a very high number so as not to mess with subsequent file processing by the user
+        latest_file_completed = 99
         final_out_message = '\n'.join(out_message)
+        return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     file_paths_loop = [file_paths[int(latest_file_completed)]]
         in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
     for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
         file_path = file.name
         else:
             out_message = "No file selected"
             print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
         if in_redact_method == "Image analysis":
             # Analyse and redact image-based pdf or image
             logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(output_logs_str)
+            log_files_output_paths.append(logs_output_file_name)
             # Increase latest file completed count unless we are at the last file
             if latest_file_completed != len(file_paths):
             logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
             with open(logs_output_file_name, "w") as f:
                 f.write(output_logs_str)
+            log_files_output_paths.append(logs_output_file_name)
             # Add confirmation for converting to image if you want
             # out_message.append(img_output_summary)
             if latest_file_completed != len(file_paths):
+                print("Completed file number:", str(latest_file_completed), "more files to do")
                 latest_file_completed += 1
         else:
             out_message = "No redaction method selected"
             print(out_message)
+            return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
     toc = time.perf_counter()
     out_message_out = '\n'.join(out_message)
     out_message_out = out_message_out + " " + out_time
+    out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
+    return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
 def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
             merged_bboxes = []
                 # Merge bounding boxes if very close together
                 text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
                 page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
                 page_analyzer_results.extend(text_container_analyzer_results)
             decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
             annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)

tools/helper_functions.py CHANGED Viewed

@@ -141,7 +141,7 @@ def add_folder_to_path(folder_path: str):
 # Upon running a process, the feedback buttons are revealed.
 # This is the version of the function that the commit removes (note the '-' marker
 # on the return line): it takes no arguments and returns a 3-tuple of Gradio
 # components (Radio, Textbox, Button), each constructed with visible=True.
 # NOTE(review): presumably used as an event handler whose outputs are the feedback
 # widgets, in this same order - confirm against the wiring in app.py.
 def reveal_feedback_buttons():
-    return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True)
 def wipe_logs(feedback_logs_loc, usage_logs_loc):
     try:

 # Upon running a process, the feedback buttons are revealed.
 # Takes no arguments and returns a 4-tuple of Gradio components (Radio, Textbox,
 # Button, Markdown), each constructed with visible=True. The trailing Markdown
 # element is the one added by this commit (note the '+' marker on the return line),
 # so any caller must now supply four outputs instead of three.
 # NOTE(review): presumably used as an event handler whose outputs are the feedback
 # widgets, in this same order - confirm against the wiring in app.py.
 def reveal_feedback_buttons():
+    return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
 def wipe_logs(feedback_logs_loc, usage_logs_loc):
     try: