seanpedrickcase committed on
Commit
bbf818d
·
1 Parent(s): 93ac94f

Handles multiple runs with multiple files correctly now. Logging and feedback improvements.

Browse files
app.py CHANGED
@@ -36,6 +36,7 @@ with app:
36
  output_image_files_state = gr.State([])
37
  output_file_list_state = gr.State([])
38
  text_output_file_list_state = gr.State([])
 
39
  first_loop_state = gr.State(True)
40
  second_loop_state = gr.State(False)
41
 
@@ -66,16 +67,15 @@ with app:
66
  with gr.Row():
67
  output_summary = gr.Textbox(label="Output summary")
68
  output_file = gr.File(label="Output files")
69
- text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
70
 
71
  with gr.Row():
72
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
73
 
74
- with gr.Row():
75
- pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
76
- with gr.Row():
77
- pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
78
- pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
79
 
80
  with gr.Row():
81
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
@@ -100,14 +100,13 @@ with app:
100
  with gr.Row():
101
  text_output_summary = gr.Textbox(label="Output result")
102
  text_output_file = gr.File(label="Output files")
103
- text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
104
 
105
- with gr.Row():
106
- data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
107
  choices=["The results were good", "The results were not good"], visible=False)
108
- with gr.Row():
109
- data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
110
- data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
111
 
112
  with gr.Tab(label="Redaction settings"):
113
  gr.Markdown(
@@ -124,6 +123,7 @@ with app:
124
  with gr.Row():
125
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
126
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
 
127
 
128
  # Invisible text box to hold the session hash/username just for logging purposes
129
  session_hash_textbox = gr.Textbox(value="", visible=False)
@@ -143,23 +143,23 @@ with app:
143
 
144
  # Document redaction
145
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
146
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
147
- outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
148
 
149
  # If the output file count text box changes, keep going with redacting each document until done
150
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
151
- then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
152
- outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
153
- then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
154
 
155
  # Tabular data redaction
156
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
157
 
158
- tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
159
 
160
  # If the output file count text box changes, keep going with redacting each data file until done
161
- text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
162
- then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
163
 
164
  #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
165
  # then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
@@ -169,19 +169,20 @@ with app:
169
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
170
  callback = gr.CSVLogger()
171
  callback.setup([session_hash_textbox], logs_data_folder)
172
- session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
 
173
 
174
  # User submitted feedback for pdf redactions
175
  pdf_callback = gr.CSVLogger()
176
- pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], feedback_data_folder)
177
- pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
178
- then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
179
 
180
  # User submitted feedback for data redactions
181
  data_callback = gr.CSVLogger()
182
- data_callback.setup([data_feedback_radio, data_further_details_text], feedback_data_folder)
183
- data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
184
- then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
185
 
186
  # Launch the Gradio app
187
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
 
36
  output_image_files_state = gr.State([])
37
  output_file_list_state = gr.State([])
38
  text_output_file_list_state = gr.State([])
39
+ log_files_output_list_state = gr.State([])
40
  first_loop_state = gr.State(True)
41
  second_loop_state = gr.State(False)
42
 
 
67
  with gr.Row():
68
  output_summary = gr.Textbox(label="Output summary")
69
  output_file = gr.File(label="Output files")
70
+ text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
71
 
72
  with gr.Row():
73
  convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
74
 
75
+ pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
76
+ pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
77
+ pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
78
+ pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
79
 
80
  with gr.Row():
81
  s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
 
100
  with gr.Row():
101
  text_output_summary = gr.Textbox(label="Output result")
102
  text_output_file = gr.File(label="Output files")
103
+ text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
104
 
105
+ data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
106
+ data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
107
  choices=["The results were good", "The results were not good"], visible=False)
108
+ data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
109
+ data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
 
110
 
111
  with gr.Tab(label="Redaction settings"):
112
  gr.Markdown(
 
123
  with gr.Row():
124
  in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
125
  in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
126
+ log_files_output = gr.File(label="Log file output", interactive=False)
127
 
128
  # Invisible text box to hold the session hash/username just for logging purposes
129
  session_hash_textbox = gr.Textbox(value="", visible=False)
 
143
 
144
  # Document redaction
145
  redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
146
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
147
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
148
 
149
  # If the output file count text box changes, keep going with redacting each document until done
150
  text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
151
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
152
+ outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
153
+ then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
154
 
155
  # Tabular data redaction
156
  in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
157
 
158
+ tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_text")
159
 
160
  # If the output file count text box changes, keep going with redacting each data file until done
161
+ text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
162
+ then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
163
 
164
  #app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
165
  # then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
 
169
  # Log usernames and times of access to file (to know who is using the app when running on AWS)
170
  callback = gr.CSVLogger()
171
  callback.setup([session_hash_textbox], logs_data_folder)
172
+ session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
173
+ then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
174
 
175
  # User submitted feedback for pdf redactions
176
  pdf_callback = gr.CSVLogger()
177
+ pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_data_folder)
178
+ pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
179
+ then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
180
 
181
  # User submitted feedback for data redactions
182
  data_callback = gr.CSVLogger()
183
+ data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_data_folder)
184
+ data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
185
+ then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
186
 
187
  # Launch the Gradio app
188
  COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
tools/aws_functions.py CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
10
  # Get AWS credentials if required
11
  bucket_name=""
12
  aws_var = "RUN_AWS_FUNCTIONS"
13
- aws_var_default = "1"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
@@ -185,11 +185,11 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
185
  print("S3 key: ", s3_key_full)
186
 
187
  s3_client.upload_file(file, s3_bucket, s3_key_full)
188
- out_message = "File " + file_name + " uploaded successfully to S3!"
189
  print(out_message)
190
 
191
  except Exception as e:
192
- out_message = f"Error uploading file(s) to S3: {e}"
193
  print(out_message)
194
 
195
  final_out_message.append(out_message)
 
10
  # Get AWS credentials if required
11
  bucket_name=""
12
  aws_var = "RUN_AWS_FUNCTIONS"
13
+ aws_var_default = "0"
14
  aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
15
  print(f'The value of {aws_var} is {aws_var_val}')
16
 
 
185
  print("S3 key: ", s3_key_full)
186
 
187
  s3_client.upload_file(file, s3_bucket, s3_key_full)
188
+ out_message = "File " + file_name + " uploaded successfully!"
189
  print(out_message)
190
 
191
  except Exception as e:
192
+ out_message = f"Error uploading file(s): {e}"
193
  print(out_message)
194
 
195
  final_out_message.append(out_message)
tools/data_anonymise.py CHANGED
@@ -69,23 +69,18 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
69
 
70
  # Run through each column to analyse for PII
71
  for i, result in enumerate(analyzer_results):
72
- print("Looking at result:", str(i))
73
- print("result:\n\n", result)
74
 
75
  # If a single result
76
  if isinstance(result, RecognizerResult):
77
- print("Processing recogniser result as RecognizerResult:", str(i))
78
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
79
 
80
  # If a list of results
81
  elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
82
  for x, recognizer_result in enumerate(result.recognizer_results):
83
- print("Processing recogniser result as List:", str(i))
84
  decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
85
 
86
  else:
87
  try:
88
- print("Processing recogniser result in other:", str(i))
89
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
90
  except Exception as e:
91
  print(e)
@@ -269,7 +264,8 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
269
 
270
  return scrubbed_df, key_string, decision_process_output_str
271
 
272
- def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
 
273
  def check_lists(list1, list2):
274
  return any(string in list2 for string in list1)
275
 
@@ -344,7 +340,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
344
  f.write(decision_process_output_str)
345
 
346
  out_file_paths.append(anon_export_file_name)
347
- out_file_paths.append(decision_process_log_output_file)
348
 
349
  # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
350
  out_file_paths = list(set(out_file_paths))
@@ -353,9 +349,9 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
353
  if anon_file=='open_text':
354
  out_message = [anon_df_out['text'][0]]
355
 
356
- return out_file_paths, out_message, key_string
357
 
358
- def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
359
 
360
  tic = time.perf_counter()
361
 
@@ -386,13 +382,15 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
386
  file_paths=['open_text']
387
  else:
388
  out_message = "Please enter text or a file to redact."
389
- return out_message, out_file_paths, out_file_paths, latest_file_completed
390
 
391
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
392
- if latest_file_completed == len(file_paths):
393
  print("Last file reached, returning files:", str(latest_file_completed))
 
 
394
  final_out_message = '\n'.join(out_message)
395
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed
396
 
397
  file_path_loop = [file_paths[int(latest_file_completed)]]
398
 
@@ -401,7 +399,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
401
  if anon_file=='open_text':
402
  anon_df = pd.DataFrame(data={'text':[in_text]})
403
  chosen_cols=['text']
 
 
404
  out_file_part = anon_file
 
 
405
  else:
406
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
407
  file_type = detect_file_type(anon_file)
@@ -419,7 +421,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
419
  anon_xlsx = pd.ExcelFile(anon_file)
420
 
421
  # Create xlsx file:
422
- anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
423
 
424
  from openpyxl import Workbook
425
 
@@ -440,13 +442,13 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
440
  print(anon_df.head()) # Print the first few rows
441
 
442
 
443
- out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)
444
 
445
  else:
446
  sheet_name = ""
447
  anon_df = read_file(anon_file)
448
  out_file_part = get_file_path_end(anon_file.name)
449
- out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
450
 
451
  # Increase latest file completed count unless we are at the last file
452
  if latest_file_completed != len(file_paths):
@@ -464,5 +466,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
464
 
465
  out_message_out = '\n'.join(out_message)
466
  out_message_out = out_message_out + " " + out_time
 
 
467
 
468
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 
69
 
70
  # Run through each column to analyse for PII
71
  for i, result in enumerate(analyzer_results):
 
 
72
 
73
  # If a single result
74
  if isinstance(result, RecognizerResult):
 
75
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
76
 
77
  # If a list of results
78
  elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
79
  for x, recognizer_result in enumerate(result.recognizer_results):
 
80
  decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
81
 
82
  else:
83
  try:
 
84
  decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
85
  except Exception as e:
86
  print(e)
 
264
 
265
  return scrubbed_df, key_string, decision_process_output_str
266
 
267
+ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
268
+
269
  def check_lists(list1, list2):
270
  return any(string in list2 for string in list1)
271
 
 
340
  f.write(decision_process_output_str)
341
 
342
  out_file_paths.append(anon_export_file_name)
343
+ log_files_output_paths.append(decision_process_log_output_file)
344
 
345
  # As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
346
  out_file_paths = list(set(out_file_paths))
 
349
  if anon_file=='open_text':
350
  out_message = [anon_df_out['text'][0]]
351
 
352
+ return out_file_paths, out_message, key_string, log_files_output_paths
353
 
354
+ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
355
 
356
  tic = time.perf_counter()
357
 
 
382
  file_paths=['open_text']
383
  else:
384
  out_message = "Please enter text or a file to redact."
385
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
386
 
387
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
388
+ if latest_file_completed >= len(file_paths):
389
  print("Last file reached, returning files:", str(latest_file_completed))
390
+ # Set to a very high number so as not to mess with subsequent file processing by the user
391
+ latest_file_completed = 99
392
  final_out_message = '\n'.join(out_message)
393
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
394
 
395
  file_path_loop = [file_paths[int(latest_file_completed)]]
396
 
 
399
  if anon_file=='open_text':
400
  anon_df = pd.DataFrame(data={'text':[in_text]})
401
  chosen_cols=['text']
402
+ sheet_name = ""
403
+ file_type = ""
404
  out_file_part = anon_file
405
+
406
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
407
  else:
408
  # If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
409
  file_type = detect_file_type(anon_file)
 
421
  anon_xlsx = pd.ExcelFile(anon_file)
422
 
423
  # Create xlsx file:
424
+ anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
425
 
426
  from openpyxl import Workbook
427
 
 
442
  print(anon_df.head()) # Print the first few rows
443
 
444
 
445
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
446
 
447
  else:
448
  sheet_name = ""
449
  anon_df = read_file(anon_file)
450
  out_file_part = get_file_path_end(anon_file.name)
451
+ out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
452
 
453
  # Increase latest file completed count unless we are at the last file
454
  if latest_file_completed != len(file_paths):
 
466
 
467
  out_message_out = '\n'.join(out_message)
468
  out_message_out = out_message_out + " " + out_time
469
+
470
+ out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
471
 
472
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
tools/file_conversion.py CHANGED
@@ -87,8 +87,6 @@ def process_file(file_path):
87
  print(f"{file_path} is not an image or PDF file.")
88
  img_object = ['']
89
 
90
- print('Image object is:', img_object)
91
-
92
  return img_object
93
 
94
  def prepare_image_or_text_pdf(
@@ -129,7 +127,7 @@ def prepare_image_or_text_pdf(
129
  out_message = []
130
  out_file_paths = []
131
  else:
132
- print("Now attempting file:", str(latest_file_completed + 1))
133
  out_file_paths = []
134
 
135
  if not file_paths:
@@ -140,7 +138,7 @@ def prepare_image_or_text_pdf(
140
  latest_file_completed = int(latest_file_completed)
141
 
142
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
143
- if latest_file_completed == len(file_paths):
144
  print("Last file reached, returning files:", str(latest_file_completed))
145
  #final_out_message = '\n'.join(out_message)
146
  return out_message, out_file_paths
@@ -204,6 +202,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
204
  out_message = "PDF " + file_path_without_ext + " converted to image-based file."
205
  print(out_message)
206
 
207
- print("Out file paths:", out_file_paths)
208
 
209
  return out_message, out_file_paths
 
87
  print(f"{file_path} is not an image or PDF file.")
88
  img_object = ['']
89
 
 
 
90
  return img_object
91
 
92
  def prepare_image_or_text_pdf(
 
127
  out_message = []
128
  out_file_paths = []
129
  else:
130
+ print("Now attempting file:", str(latest_file_completed))
131
  out_file_paths = []
132
 
133
  if not file_paths:
 
138
  latest_file_completed = int(latest_file_completed)
139
 
140
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
141
+ if latest_file_completed >= len(file_paths):
142
  print("Last file reached, returning files:", str(latest_file_completed))
143
  #final_out_message = '\n'.join(out_message)
144
  return out_message, out_file_paths
 
202
  out_message = "PDF " + file_path_without_ext + " converted to image-based file."
203
  print(out_message)
204
 
205
+ #print("Out file paths:", out_file_paths)
206
 
207
  return out_message, out_file_paths
tools/file_redaction.py CHANGED
@@ -18,10 +18,11 @@ from tools.data_anonymise import generate_decision_process_output
18
  import gradio as gr
19
 
20
 
21
- def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
22
 
23
  tic = time.perf_counter()
24
 
 
25
  # If this is the first time around, set variables to 0/blank
26
  if first_loop_state==True:
27
  latest_file_completed = 0
@@ -35,15 +36,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
35
  if not out_file_paths:
36
  out_file_paths = []
37
 
38
- print("Latest file completed is:", str(latest_file_completed))
39
-
40
  latest_file_completed = int(latest_file_completed)
41
 
42
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
43
- if latest_file_completed == len(file_paths):
44
- print("Last file reached, returning files:", str(latest_file_completed))
 
 
45
  final_out_message = '\n'.join(out_message)
46
- return final_out_message, out_file_paths, out_file_paths, latest_file_completed
47
 
48
  file_paths_loop = [file_paths[int(latest_file_completed)]]
49
 
@@ -51,8 +52,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
51
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
52
 
53
 
54
- #print("File paths:", file_paths)
55
-
56
  for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
57
  file_path = file.name
58
 
@@ -66,7 +65,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
66
  else:
67
  out_message = "No file selected"
68
  print(out_message)
69
- return out_message, out_file_paths, out_file_paths, latest_file_completed
70
 
71
  if in_redact_method == "Image analysis":
72
  # Analyse and redact image-based pdf or image
@@ -85,7 +84,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
85
  logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
86
  with open(logs_output_file_name, "w") as f:
87
  f.write(output_logs_str)
88
- out_file_paths.append(logs_output_file_name)
89
 
90
  # Increase latest file completed count unless we are at the last file
91
  if latest_file_completed != len(file_paths):
@@ -119,19 +118,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
119
  logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
120
  with open(logs_output_file_name, "w") as f:
121
  f.write(output_logs_str)
122
- out_file_paths.append(logs_output_file_name)
123
 
124
  # Add confirmation for converting to image if you want
125
  # out_message.append(img_output_summary)
126
 
127
  if latest_file_completed != len(file_paths):
128
- print("Completed file number:", str(latest_file_completed))
129
  latest_file_completed += 1
130
 
131
  else:
132
  out_message = "No redaction method selected"
133
  print(out_message)
134
- return out_message, out_file_paths, out_file_paths, latest_file_completed
135
 
136
 
137
  toc = time.perf_counter()
@@ -141,7 +140,9 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
141
  out_message_out = '\n'.join(out_message)
142
  out_message_out = out_message_out + " " + out_time
143
 
144
- return out_message_out, out_file_paths, out_file_paths, latest_file_completed
 
 
145
 
146
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
147
  merged_bboxes = []
@@ -388,13 +389,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
388
  # Merge bounding boxes if very close together
389
  text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
390
 
391
- print("\n\nanalyzed_bounding_boxes_in_loop:", text_container_analyzed_bounding_boxes)
392
-
393
  page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
394
  page_analyzer_results.extend(text_container_analyzer_results)
395
 
396
- print("analyzed_bounding_boxes_out_loop:\n\n", page_analyzed_bounding_boxes)
397
-
398
  decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
399
 
400
  annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
 
18
  import gradio as gr
19
 
20
 
21
+ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
22
 
23
  tic = time.perf_counter()
24
 
25
+
26
  # If this is the first time around, set variables to 0/blank
27
  if first_loop_state==True:
28
  latest_file_completed = 0
 
36
  if not out_file_paths:
37
  out_file_paths = []
38
 
 
 
39
  latest_file_completed = int(latest_file_completed)
40
 
41
  # If we have already redacted the last file, return the input out_message and file list to the relevant components
42
+ if latest_file_completed >= len(file_paths):
43
+ print("Last file reached")
44
+ # Set to a very high number so as not to mess with subsequent file processing by the user
45
+ latest_file_completed = 99
46
  final_out_message = '\n'.join(out_message)
47
+ return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
48
 
49
  file_paths_loop = [file_paths[int(latest_file_completed)]]
50
 
 
52
  in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
53
 
54
 
 
 
55
  for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
56
  file_path = file.name
57
 
 
65
  else:
66
  out_message = "No file selected"
67
  print(out_message)
68
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
69
 
70
  if in_redact_method == "Image analysis":
71
  # Analyse and redact image-based pdf or image
 
84
  logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
85
  with open(logs_output_file_name, "w") as f:
86
  f.write(output_logs_str)
87
+ log_files_output_paths.append(logs_output_file_name)
88
 
89
  # Increase latest file completed count unless we are at the last file
90
  if latest_file_completed != len(file_paths):
 
118
  logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
119
  with open(logs_output_file_name, "w") as f:
120
  f.write(output_logs_str)
121
+ log_files_output_paths.append(logs_output_file_name)
122
 
123
  # Add confirmation for converting to image if you want
124
  # out_message.append(img_output_summary)
125
 
126
  if latest_file_completed != len(file_paths):
127
+ print("Completed file number:", str(latest_file_completed), "more files to do")
128
  latest_file_completed += 1
129
 
130
  else:
131
  out_message = "No redaction method selected"
132
  print(out_message)
133
+ return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
134
 
135
 
136
  toc = time.perf_counter()
 
140
  out_message_out = '\n'.join(out_message)
141
  out_message_out = out_message_out + " " + out_time
142
 
143
+ out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
144
+
145
+ return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
146
 
147
  def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
148
  merged_bboxes = []
 
389
  # Merge bounding boxes if very close together
390
  text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
391
 
 
 
392
  page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
393
  page_analyzer_results.extend(text_container_analyzer_results)
394
 
 
 
395
  decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
396
 
397
  annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
tools/helper_functions.py CHANGED
@@ -141,7 +141,7 @@ def add_folder_to_path(folder_path: str):
141
 
142
  # Upon running a process, the feedback buttons are revealed
143
  def reveal_feedback_buttons():
144
- return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True)
145
 
146
  def wipe_logs(feedback_logs_loc, usage_logs_loc):
147
  try:
 
141
 
142
  # Upon running a process, the feedback buttons are revealed
143
  def reveal_feedback_buttons():
144
+ return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
145
 
146
  def wipe_logs(feedback_logs_loc, usage_logs_loc):
147
  try: