Commit · bbf818d
1 Parent(s): 93ac94f
Handles multiple runs with multiple files correctly now. Logging and feedback improvements.
Browse files
- app.py +28 -27
- tools/aws_functions.py +3 -3
- tools/data_anonymise.py +20 -16
- tools/file_conversion.py +3 -5
- tools/file_redaction.py +15 -18
- tools/helper_functions.py +1 -1
app.py
CHANGED
@@ -36,6 +36,7 @@ with app:
|
|
36 |
output_image_files_state = gr.State([])
|
37 |
output_file_list_state = gr.State([])
|
38 |
text_output_file_list_state = gr.State([])
|
|
|
39 |
first_loop_state = gr.State(True)
|
40 |
second_loop_state = gr.State(False)
|
41 |
|
@@ -66,16 +67,15 @@ with app:
|
|
66 |
with gr.Row():
|
67 |
output_summary = gr.Textbox(label="Output summary")
|
68 |
output_file = gr.File(label="Output files")
|
69 |
-
text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False)
|
70 |
|
71 |
with gr.Row():
|
72 |
convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
|
73 |
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
79 |
|
80 |
with gr.Row():
|
81 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
@@ -100,14 +100,13 @@ with app:
|
|
100 |
with gr.Row():
|
101 |
text_output_summary = gr.Textbox(label="Output result")
|
102 |
text_output_file = gr.File(label="Output files")
|
103 |
-
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False)
|
104 |
|
105 |
-
|
106 |
-
|
107 |
choices=["The results were good", "The results were not good"], visible=False)
|
108 |
-
|
109 |
-
|
110 |
-
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
111 |
|
112 |
with gr.Tab(label="Redaction settings"):
|
113 |
gr.Markdown(
|
@@ -124,6 +123,7 @@ with app:
|
|
124 |
with gr.Row():
|
125 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
|
126 |
in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
|
|
|
127 |
|
128 |
# Invisible text box to hold the session hash/username just for logging purposes
|
129 |
session_hash_textbox = gr.Textbox(value="", visible=False)
|
@@ -143,23 +143,23 @@ with app:
|
|
143 |
|
144 |
# Document redaction
|
145 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
146 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, first_loop_state],
|
147 |
-
outputs=[output_summary, output_file, output_file_list_state, text_documents_done], api_name="redact_doc")
|
148 |
|
149 |
# If the output file count text box changes, keep going with redacting each document until done
|
150 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
151 |
-
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, second_loop_state],
|
152 |
-
outputs=[output_summary, output_file, output_file_list_state, text_documents_done]).\
|
153 |
-
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn])
|
154 |
|
155 |
# Tabular data redaction
|
156 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
|
157 |
|
158 |
-
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done], api_name="redact_text")
|
159 |
|
160 |
# If the output file count text box changes, keep going with redacting each data file until done
|
161 |
-
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done]).\
|
162 |
-
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn])
|
163 |
|
164 |
#app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
|
165 |
# then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
@@ -169,19 +169,20 @@ with app:
|
|
169 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
170 |
callback = gr.CSVLogger()
|
171 |
callback.setup([session_hash_textbox], logs_data_folder)
|
172 |
-
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False)
|
|
|
173 |
|
174 |
# User submitted feedback for pdf redactions
|
175 |
pdf_callback = gr.CSVLogger()
|
176 |
-
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text], feedback_data_folder)
|
177 |
-
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text], None, preprocess=False).\
|
178 |
-
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[
|
179 |
|
180 |
# User submitted feedback for data redactions
|
181 |
data_callback = gr.CSVLogger()
|
182 |
-
data_callback.setup([data_feedback_radio, data_further_details_text], feedback_data_folder)
|
183 |
-
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text], None, preprocess=False).\
|
184 |
-
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[
|
185 |
|
186 |
# Launch the Gradio app
|
187 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
|
|
36 |
output_image_files_state = gr.State([])
|
37 |
output_file_list_state = gr.State([])
|
38 |
text_output_file_list_state = gr.State([])
|
39 |
+
log_files_output_list_state = gr.State([])
|
40 |
first_loop_state = gr.State(True)
|
41 |
second_loop_state = gr.State(False)
|
42 |
|
|
|
67 |
with gr.Row():
|
68 |
output_summary = gr.Textbox(label="Output summary")
|
69 |
output_file = gr.File(label="Output files")
|
70 |
+
text_documents_done = gr.Number(value=0, label="Number of documents redacted", interactive=False, visible=False)
|
71 |
|
72 |
with gr.Row():
|
73 |
convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary", visible=False)
|
74 |
|
75 |
+
pdf_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
76 |
+
pdf_feedback_radio = gr.Radio(choices=["The results were good", "The results were not good"], visible=False)
|
77 |
+
pdf_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
78 |
+
pdf_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
|
|
79 |
|
80 |
with gr.Row():
|
81 |
s3_logs_output_textbox = gr.Textbox(label="Feedback submission logs", visible=False)
|
|
|
100 |
with gr.Row():
|
101 |
text_output_summary = gr.Textbox(label="Output result")
|
102 |
text_output_file = gr.File(label="Output files")
|
103 |
+
text_tabular_files_done = gr.Number(value=0, label="Number of tabular files redacted", interactive=False, visible=False)
|
104 |
|
105 |
+
data_feedback_title = gr.Markdown(value="## Please give feedback", visible=False)
|
106 |
+
data_feedback_radio = gr.Radio(label="Please give some feedback about the results of the redaction. A reminder that the app is only expected to identify about 60% of personally identifiable information in a given (typed) document.",
|
107 |
choices=["The results were good", "The results were not good"], visible=False)
|
108 |
+
data_further_details_text = gr.Textbox(label="Please give more detailed feedback about the results:", visible=False)
|
109 |
+
data_submit_feedback_btn = gr.Button(value="Submit feedback", visible=False)
|
|
|
110 |
|
111 |
with gr.Tab(label="Redaction settings"):
|
112 |
gr.Markdown(
|
|
|
123 |
with gr.Row():
|
124 |
in_redact_language = gr.Dropdown(value = "en", choices = ["en"], label="Redaction language (only English currently supported)", multiselect=False)
|
125 |
in_allow_list = gr.Dataframe(label="Allow list - enter a new term to ignore for redaction on each row e.g. Lambeth -> add new row -> Lambeth 2030", headers=["Allow list"], row_count=1, col_count=(1, 'fixed'), value=[[""]], type="array", column_widths=["100px"], datatype='str')
|
126 |
+
log_files_output = gr.File(label="Log file output", interactive=False)
|
127 |
|
128 |
# Invisible text box to hold the session hash/username just for logging purposes
|
129 |
session_hash_textbox = gr.Textbox(value="", visible=False)
|
|
|
143 |
|
144 |
# Document redaction
|
145 |
redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, first_loop_state], outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
|
146 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, first_loop_state],
|
147 |
+
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state], api_name="redact_doc")
|
148 |
|
149 |
# If the output file count text box changes, keep going with redacting each document until done
|
150 |
text_documents_done.change(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redaction_method, in_allow_list, text_documents_done, output_summary, second_loop_state], outputs=[output_summary, prepared_pdf_state]).\
|
151 |
+
then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list, text_documents_done, output_summary, output_file_list_state, log_files_output_list_state, second_loop_state],
|
152 |
+
outputs=[output_summary, output_file, output_file_list_state, text_documents_done, log_files_output, log_files_output_list_state]).\
|
153 |
+
then(fn = reveal_feedback_buttons, outputs=[pdf_feedback_radio, pdf_further_details_text, pdf_submit_feedback_btn, pdf_feedback_title])
|
154 |
|
155 |
# Tabular data redaction
|
156 |
in_data_files.upload(fn=put_columns_in_df, inputs=[in_data_files], outputs=[in_colnames, in_excel_sheets])
|
157 |
|
158 |
+
tabular_data_redact_btn.click(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, first_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state], api_name="redact_text")
|
159 |
|
160 |
# If the output file count text box changes, keep going with redacting each data file until done
|
161 |
+
text_tabular_files_done.change(fn=anonymise_data_files, inputs=[in_data_files, in_text, anon_strat, in_colnames, in_redact_language, in_redact_entities, in_allow_list, text_tabular_files_done, text_output_summary, text_output_file_list_state, log_files_output_list_state, in_excel_sheets, second_loop_state], outputs=[text_output_summary, text_output_file, text_output_file_list_state, text_tabular_files_done, log_files_output, log_files_output_list_state]).\
|
162 |
+
then(fn = reveal_feedback_buttons, outputs=[data_feedback_radio, data_further_details_text, data_submit_feedback_btn, data_feedback_title])
|
163 |
|
164 |
#app.load(wipe_logs, inputs=[feedback_logs_state, usage_logs_state], outputs=[]).\
|
165 |
# then(get_connection_params, inputs=None, outputs=[session_hash_state, s3_output_folder_state, session_hash_textbox])
|
|
|
169 |
# Log usernames and times of access to file (to know who is using the app when running on AWS)
|
170 |
callback = gr.CSVLogger()
|
171 |
callback.setup([session_hash_textbox], logs_data_folder)
|
172 |
+
session_hash_textbox.change(lambda *args: callback.flag(list(args)), [session_hash_textbox], None, preprocess=False).\
|
173 |
+
then(fn = upload_file_to_s3, inputs=[usage_logs_state, usage_s3_logs_loc_state], outputs=[s3_logs_output_textbox])
|
174 |
|
175 |
# User submitted feedback for pdf redactions
|
176 |
pdf_callback = gr.CSVLogger()
|
177 |
+
pdf_callback.setup([pdf_feedback_radio, pdf_further_details_text, in_file], feedback_data_folder)
|
178 |
+
pdf_submit_feedback_btn.click(lambda *args: pdf_callback.flag(list(args)), [pdf_feedback_radio, pdf_further_details_text, in_file], None, preprocess=False).\
|
179 |
+
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[pdf_further_details_text])
|
180 |
|
181 |
# User submitted feedback for data redactions
|
182 |
data_callback = gr.CSVLogger()
|
183 |
+
data_callback.setup([data_feedback_radio, data_further_details_text, in_data_files], feedback_data_folder)
|
184 |
+
data_submit_feedback_btn.click(lambda *args: data_callback.flag(list(args)), [data_feedback_radio, data_further_details_text, in_data_files], None, preprocess=False).\
|
185 |
+
then(fn = upload_file_to_s3, inputs=[feedback_logs_state, feedback_s3_logs_loc_state], outputs=[data_further_details_text])
|
186 |
|
187 |
# Launch the Gradio app
|
188 |
COGNITO_AUTH = get_or_create_env_var('COGNITO_AUTH', '0')
|
tools/aws_functions.py
CHANGED
@@ -10,7 +10,7 @@ PandasDataFrame = Type[pd.DataFrame]
|
|
10 |
# Get AWS credentials if required
|
11 |
bucket_name=""
|
12 |
aws_var = "RUN_AWS_FUNCTIONS"
|
13 |
-
aws_var_default = "
|
14 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
15 |
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
@@ -185,11 +185,11 @@ def upload_file_to_s3(local_file_paths:List[str], s3_key:str, s3_bucket:str=buck
|
|
185 |
print("S3 key: ", s3_key_full)
|
186 |
|
187 |
s3_client.upload_file(file, s3_bucket, s3_key_full)
|
188 |
-
out_message = "File " + file_name + " uploaded successfully
|
189 |
print(out_message)
|
190 |
|
191 |
except Exception as e:
|
192 |
-
out_message = f"Error uploading file(s)
|
193 |
print(out_message)
|
194 |
|
195 |
final_out_message.append(out_message)
|
|
|
10 |
# Get AWS credentials if required
|
11 |
bucket_name=""
|
12 |
aws_var = "RUN_AWS_FUNCTIONS"
|
13 |
+
aws_var_default = "0"
|
14 |
aws_var_val = get_or_create_env_var(aws_var, aws_var_default)
|
15 |
print(f'The value of {aws_var} is {aws_var_val}')
|
16 |
|
|
|
185 |
print("S3 key: ", s3_key_full)
|
186 |
|
187 |
s3_client.upload_file(file, s3_bucket, s3_key_full)
|
188 |
+
out_message = "File " + file_name + " uploaded successfully!"
|
189 |
print(out_message)
|
190 |
|
191 |
except Exception as e:
|
192 |
+
out_message = f"Error uploading file(s): {e}"
|
193 |
print(out_message)
|
194 |
|
195 |
final_out_message.append(out_message)
|
tools/data_anonymise.py
CHANGED
@@ -69,23 +69,18 @@ def generate_decision_process_output(analyzer_results: List[DictAnalyzerResult],
|
|
69 |
|
70 |
# Run through each column to analyse for PII
|
71 |
for i, result in enumerate(analyzer_results):
|
72 |
-
print("Looking at result:", str(i))
|
73 |
-
print("result:\n\n", result)
|
74 |
|
75 |
# If a single result
|
76 |
if isinstance(result, RecognizerResult):
|
77 |
-
print("Processing recogniser result as RecognizerResult:", str(i))
|
78 |
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
79 |
|
80 |
# If a list of results
|
81 |
elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
|
82 |
for x, recognizer_result in enumerate(result.recognizer_results):
|
83 |
-
print("Processing recogniser result as List:", str(i))
|
84 |
decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
|
85 |
|
86 |
else:
|
87 |
try:
|
88 |
-
print("Processing recogniser result in other:", str(i))
|
89 |
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
90 |
except Exception as e:
|
91 |
print(e)
|
@@ -269,7 +264,8 @@ def anonymise_script(df, anon_strat, language:str, chosen_redact_entities:List[s
|
|
269 |
|
270 |
return scrubbed_df, key_string, decision_process_output_str
|
271 |
|
272 |
-
def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name):
|
|
|
273 |
def check_lists(list1, list2):
|
274 |
return any(string in list2 for string in list1)
|
275 |
|
@@ -344,7 +340,7 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
344 |
f.write(decision_process_output_str)
|
345 |
|
346 |
out_file_paths.append(anon_export_file_name)
|
347 |
-
|
348 |
|
349 |
# As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
|
350 |
out_file_paths = list(set(out_file_paths))
|
@@ -353,9 +349,9 @@ def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_
|
|
353 |
if anon_file=='open_text':
|
354 |
out_message = [anon_df_out['text'][0]]
|
355 |
|
356 |
-
return out_file_paths, out_message, key_string
|
357 |
|
358 |
-
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
|
359 |
|
360 |
tic = time.perf_counter()
|
361 |
|
@@ -386,13 +382,15 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
386 |
file_paths=['open_text']
|
387 |
else:
|
388 |
out_message = "Please enter text or a file to redact."
|
389 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed
|
390 |
|
391 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
392 |
-
if latest_file_completed
|
393 |
print("Last file reached, returning files:", str(latest_file_completed))
|
|
|
|
|
394 |
final_out_message = '\n'.join(out_message)
|
395 |
-
return final_out_message, out_file_paths, out_file_paths, latest_file_completed
|
396 |
|
397 |
file_path_loop = [file_paths[int(latest_file_completed)]]
|
398 |
|
@@ -401,7 +399,11 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
401 |
if anon_file=='open_text':
|
402 |
anon_df = pd.DataFrame(data={'text':[in_text]})
|
403 |
chosen_cols=['text']
|
|
|
|
|
404 |
out_file_part = anon_file
|
|
|
|
|
405 |
else:
|
406 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
407 |
file_type = detect_file_type(anon_file)
|
@@ -419,7 +421,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
419 |
anon_xlsx = pd.ExcelFile(anon_file)
|
420 |
|
421 |
# Create xlsx file:
|
422 |
-
anon_xlsx_export_file_name = output_folder + out_file_part + ".xlsx"
|
423 |
|
424 |
from openpyxl import Workbook
|
425 |
|
@@ -440,13 +442,13 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
440 |
print(anon_df.head()) # Print the first few rows
|
441 |
|
442 |
|
443 |
-
out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name)
|
444 |
|
445 |
else:
|
446 |
sheet_name = ""
|
447 |
anon_df = read_file(anon_file)
|
448 |
out_file_part = get_file_path_end(anon_file.name)
|
449 |
-
out_file_paths, out_message, key_string = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "")
|
450 |
|
451 |
# Increase latest file completed count unless we are at the last file
|
452 |
if latest_file_completed != len(file_paths):
|
@@ -464,5 +466,7 @@ def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chos
|
|
464 |
|
465 |
out_message_out = '\n'.join(out_message)
|
466 |
out_message_out = out_message_out + " " + out_time
|
|
|
|
|
467 |
|
468 |
-
return out_message_out, out_file_paths, out_file_paths, latest_file_completed
|
|
|
69 |
|
70 |
# Run through each column to analyse for PII
|
71 |
for i, result in enumerate(analyzer_results):
|
|
|
|
|
72 |
|
73 |
# If a single result
|
74 |
if isinstance(result, RecognizerResult):
|
|
|
75 |
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
76 |
|
77 |
# If a list of results
|
78 |
elif isinstance(result, list) or isinstance(result, DictAnalyzerResult):
|
79 |
for x, recognizer_result in enumerate(result.recognizer_results):
|
|
|
80 |
decision_process_output.extend(process_recognizer_result(result, recognizer_result, x, i, df_dict, keys_to_keep))
|
81 |
|
82 |
else:
|
83 |
try:
|
|
|
84 |
decision_process_output.extend(process_recognizer_result(result, result, 0, i, df_dict, keys_to_keep))
|
85 |
except Exception as e:
|
86 |
print(e)
|
|
|
264 |
|
265 |
return scrubbed_df, key_string, decision_process_output_str
|
266 |
|
267 |
+
def anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, excel_sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths):
|
268 |
+
|
269 |
def check_lists(list1, list2):
|
270 |
return any(string in list2 for string in list1)
|
271 |
|
|
|
340 |
f.write(decision_process_output_str)
|
341 |
|
342 |
out_file_paths.append(anon_export_file_name)
|
343 |
+
log_files_output_paths.append(decision_process_log_output_file)
|
344 |
|
345 |
# As files are created in a loop, there is a risk of duplicate file names being output. Use set to keep uniques.
|
346 |
out_file_paths = list(set(out_file_paths))
|
|
|
349 |
if anon_file=='open_text':
|
350 |
out_message = [anon_df_out['text'][0]]
|
351 |
|
352 |
+
return out_file_paths, out_message, key_string, log_files_output_paths
|
353 |
|
354 |
+
def anonymise_data_files(file_paths:List[str], in_text:str, anon_strat:str, chosen_cols:List[str], language:str, chosen_redact_entities:List[str], in_allow_list:List[str]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list = [], log_files_output_paths:list = [], in_excel_sheets:list=[], first_loop_state:bool=False, progress=Progress(track_tqdm=True)):
|
355 |
|
356 |
tic = time.perf_counter()
|
357 |
|
|
|
382 |
file_paths=['open_text']
|
383 |
else:
|
384 |
out_message = "Please enter text or a file to redact."
|
385 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
386 |
|
387 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
388 |
+
if latest_file_completed >= len(file_paths):
|
389 |
print("Last file reached, returning files:", str(latest_file_completed))
|
390 |
+
# Set to a very high number so as not to mess with subsequent file processing by the user
|
391 |
+
latest_file_completed = 99
|
392 |
final_out_message = '\n'.join(out_message)
|
393 |
+
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
394 |
|
395 |
file_path_loop = [file_paths[int(latest_file_completed)]]
|
396 |
|
|
|
399 |
if anon_file=='open_text':
|
400 |
anon_df = pd.DataFrame(data={'text':[in_text]})
|
401 |
chosen_cols=['text']
|
402 |
+
sheet_name = ""
|
403 |
+
file_type = ""
|
404 |
out_file_part = anon_file
|
405 |
+
|
406 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
407 |
else:
|
408 |
# If file is an xlsx, we are going to run through all the Excel sheets to anonymise them separately.
|
409 |
file_type = detect_file_type(anon_file)
|
|
|
421 |
anon_xlsx = pd.ExcelFile(anon_file)
|
422 |
|
423 |
# Create xlsx file:
|
424 |
+
anon_xlsx_export_file_name = output_folder + out_file_part + "_redacted.xlsx"
|
425 |
|
426 |
from openpyxl import Workbook
|
427 |
|
|
|
442 |
print(anon_df.head()) # Print the first few rows
|
443 |
|
444 |
|
445 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, anon_xlsx_export_file_name, log_files_output_paths)
|
446 |
|
447 |
else:
|
448 |
sheet_name = ""
|
449 |
anon_df = read_file(anon_file)
|
450 |
out_file_part = get_file_path_end(anon_file.name)
|
451 |
+
out_file_paths, out_message, key_string, log_files_output_paths = anon_wrapper_func(anon_file, anon_df, chosen_cols, out_file_paths, out_file_part, out_message, sheet_name, anon_strat, language, chosen_redact_entities, in_allow_list, file_type, "", log_files_output_paths)
|
452 |
|
453 |
# Increase latest file completed count unless we are at the last file
|
454 |
if latest_file_completed != len(file_paths):
|
|
|
466 |
|
467 |
out_message_out = '\n'.join(out_message)
|
468 |
out_message_out = out_message_out + " " + out_time
|
469 |
+
|
470 |
+
out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
471 |
|
472 |
+
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
tools/file_conversion.py
CHANGED
@@ -87,8 +87,6 @@ def process_file(file_path):
|
|
87 |
print(f"{file_path} is not an image or PDF file.")
|
88 |
img_object = ['']
|
89 |
|
90 |
-
print('Image object is:', img_object)
|
91 |
-
|
92 |
return img_object
|
93 |
|
94 |
def prepare_image_or_text_pdf(
|
@@ -129,7 +127,7 @@ def prepare_image_or_text_pdf(
|
|
129 |
out_message = []
|
130 |
out_file_paths = []
|
131 |
else:
|
132 |
-
print("Now attempting file:", str(latest_file_completed
|
133 |
out_file_paths = []
|
134 |
|
135 |
if not file_paths:
|
@@ -140,7 +138,7 @@ def prepare_image_or_text_pdf(
|
|
140 |
latest_file_completed = int(latest_file_completed)
|
141 |
|
142 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
143 |
-
if latest_file_completed
|
144 |
print("Last file reached, returning files:", str(latest_file_completed))
|
145 |
#final_out_message = '\n'.join(out_message)
|
146 |
return out_message, out_file_paths
|
@@ -204,6 +202,6 @@ def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
|
|
204 |
out_message = "PDF " + file_path_without_ext + " converted to image-based file."
|
205 |
print(out_message)
|
206 |
|
207 |
-
print("Out file paths:", out_file_paths)
|
208 |
|
209 |
return out_message, out_file_paths
|
|
|
87 |
print(f"{file_path} is not an image or PDF file.")
|
88 |
img_object = ['']
|
89 |
|
|
|
|
|
90 |
return img_object
|
91 |
|
92 |
def prepare_image_or_text_pdf(
|
|
|
127 |
out_message = []
|
128 |
out_file_paths = []
|
129 |
else:
|
130 |
+
print("Now attempting file:", str(latest_file_completed))
|
131 |
out_file_paths = []
|
132 |
|
133 |
if not file_paths:
|
|
|
138 |
latest_file_completed = int(latest_file_completed)
|
139 |
|
140 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
141 |
+
if latest_file_completed >= len(file_paths):
|
142 |
print("Last file reached, returning files:", str(latest_file_completed))
|
143 |
#final_out_message = '\n'.join(out_message)
|
144 |
return out_message, out_file_paths
|
|
|
202 |
out_message = "PDF " + file_path_without_ext + " converted to image-based file."
|
203 |
print(out_message)
|
204 |
|
205 |
+
#print("Out file paths:", out_file_paths)
|
206 |
|
207 |
return out_message, out_file_paths
|
tools/file_redaction.py
CHANGED
@@ -18,10 +18,11 @@ from tools.data_anonymise import generate_decision_process_output
|
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
-
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list =
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
|
|
25 |
# If this is the first time around, set variables to 0/blank
|
26 |
if first_loop_state==True:
|
27 |
latest_file_completed = 0
|
@@ -35,15 +36,15 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
35 |
if not out_file_paths:
|
36 |
out_file_paths = []
|
37 |
|
38 |
-
print("Latest file completed is:", str(latest_file_completed))
|
39 |
-
|
40 |
latest_file_completed = int(latest_file_completed)
|
41 |
|
42 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
43 |
-
if latest_file_completed
|
44 |
-
print("Last file reached
|
|
|
|
|
45 |
final_out_message = '\n'.join(out_message)
|
46 |
-
return final_out_message, out_file_paths, out_file_paths, latest_file_completed
|
47 |
|
48 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
49 |
|
@@ -51,8 +52,6 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
51 |
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
52 |
|
53 |
|
54 |
-
#print("File paths:", file_paths)
|
55 |
-
|
56 |
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
57 |
file_path = file.name
|
58 |
|
@@ -66,7 +65,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
66 |
else:
|
67 |
out_message = "No file selected"
|
68 |
print(out_message)
|
69 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed
|
70 |
|
71 |
if in_redact_method == "Image analysis":
|
72 |
# Analyse and redact image-based pdf or image
|
@@ -85,7 +84,7 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
85 |
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
86 |
with open(logs_output_file_name, "w") as f:
|
87 |
f.write(output_logs_str)
|
88 |
-
|
89 |
|
90 |
# Increase latest file completed count unless we are at the last file
|
91 |
if latest_file_completed != len(file_paths):
|
@@ -119,19 +118,19 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
119 |
logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
|
120 |
with open(logs_output_file_name, "w") as f:
|
121 |
f.write(output_logs_str)
|
122 |
-
|
123 |
|
124 |
# Add confirmation for converting to image if you want
|
125 |
# out_message.append(img_output_summary)
|
126 |
|
127 |
if latest_file_completed != len(file_paths):
|
128 |
-
print("Completed file number:", str(latest_file_completed))
|
129 |
latest_file_completed += 1
|
130 |
|
131 |
else:
|
132 |
out_message = "No redaction method selected"
|
133 |
print(out_message)
|
134 |
-
return out_message, out_file_paths, out_file_paths, latest_file_completed
|
135 |
|
136 |
|
137 |
toc = time.perf_counter()
|
@@ -141,7 +140,9 @@ def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], languag
|
|
141 |
out_message_out = '\n'.join(out_message)
|
142 |
out_message_out = out_message_out + " " + out_time
|
143 |
|
144 |
-
|
|
|
|
|
145 |
|
146 |
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
147 |
merged_bboxes = []
|
@@ -388,13 +389,9 @@ def redact_text_pdf(filename:str, language:str, chosen_redact_entities:List[str]
|
|
388 |
# Merge bounding boxes if very close together
|
389 |
text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
|
390 |
|
391 |
-
print("\n\nanalyzed_bounding_boxes_in_loop:", text_container_analyzed_bounding_boxes)
|
392 |
-
|
393 |
page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
|
394 |
page_analyzer_results.extend(text_container_analyzer_results)
|
395 |
|
396 |
-
print("analyzed_bounding_boxes_out_loop:\n\n", page_analyzed_bounding_boxes)
|
397 |
-
|
398 |
decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
|
399 |
|
400 |
annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
|
|
|
18 |
import gradio as gr
|
19 |
|
20 |
|
21 |
+
def choose_and_run_redactor(file_paths:List[str], image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, latest_file_completed:int=0, out_message:list=[], out_file_paths:list=[], log_files_output_paths:list=[], first_loop_state:bool=False, progress=gr.Progress(track_tqdm=True)):
|
22 |
|
23 |
tic = time.perf_counter()
|
24 |
|
25 |
+
|
26 |
# If this is the first time around, set variables to 0/blank
|
27 |
if first_loop_state==True:
|
28 |
latest_file_completed = 0
|
|
|
36 |
if not out_file_paths:
|
37 |
out_file_paths = []
|
38 |
|
|
|
|
|
39 |
latest_file_completed = int(latest_file_completed)
|
40 |
|
41 |
# If we have already redacted the last file, return the input out_message and file list to the relevant components
|
42 |
+
if latest_file_completed >= len(file_paths):
|
43 |
+
print("Last file reached")
|
44 |
+
# Set to a very high number so as not to mess with subsequent file processing by the user
|
45 |
+
latest_file_completed = 99
|
46 |
final_out_message = '\n'.join(out_message)
|
47 |
+
return final_out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
48 |
|
49 |
file_paths_loop = [file_paths[int(latest_file_completed)]]
|
50 |
|
|
|
52 |
in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
|
53 |
|
54 |
|
|
|
|
|
55 |
for file in progress.tqdm(file_paths_loop, desc="Redacting files", unit = "files"):
|
56 |
file_path = file.name
|
57 |
|
|
|
65 |
else:
|
66 |
out_message = "No file selected"
|
67 |
print(out_message)
|
68 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
69 |
|
70 |
if in_redact_method == "Image analysis":
|
71 |
# Analyse and redact image-based pdf or image
|
|
|
84 |
logs_output_file_name = out_image_file_path + "_decision_process_output.txt"
|
85 |
with open(logs_output_file_name, "w") as f:
|
86 |
f.write(output_logs_str)
|
87 |
+
log_files_output_paths.append(logs_output_file_name)
|
88 |
|
89 |
# Increase latest file completed count unless we are at the last file
|
90 |
if latest_file_completed != len(file_paths):
|
|
|
118 |
logs_output_file_name = img_output_file_path[0] + "_decision_process_output.txt"
|
119 |
with open(logs_output_file_name, "w") as f:
|
120 |
f.write(output_logs_str)
|
121 |
+
log_files_output_paths.append(logs_output_file_name)
|
122 |
|
123 |
# Add confirmation for converting to image if you want
|
124 |
# out_message.append(img_output_summary)
|
125 |
|
126 |
if latest_file_completed != len(file_paths):
|
127 |
+
print("Completed file number:", str(latest_file_completed), "more files to do")
|
128 |
latest_file_completed += 1
|
129 |
|
130 |
else:
|
131 |
out_message = "No redaction method selected"
|
132 |
print(out_message)
|
133 |
+
return out_message, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
134 |
|
135 |
|
136 |
toc = time.perf_counter()
|
|
|
140 |
out_message_out = '\n'.join(out_message)
|
141 |
out_message_out = out_message_out + " " + out_time
|
142 |
|
143 |
+
out_message_out = out_message_out + "\n\nGo to to the Redaction settings tab to see redaction logs. Please give feedback on the results below to help improve this app."
|
144 |
+
|
145 |
+
return out_message_out, out_file_paths, out_file_paths, latest_file_completed, log_files_output_paths, log_files_output_paths
|
146 |
|
147 |
def merge_img_bboxes(bboxes, horizontal_threshold=150, vertical_threshold=25):
|
148 |
merged_bboxes = []
|
|
|
389 |
# Merge bounding boxes if very close together
|
390 |
text_container_analyzed_bounding_boxes = merge_bounding_boxes(text_container_analyzer_results, characters, combine_pixel_dist)
|
391 |
|
|
|
|
|
392 |
page_analyzed_bounding_boxes.extend(text_container_analyzed_bounding_boxes)
|
393 |
page_analyzer_results.extend(text_container_analyzer_results)
|
394 |
|
|
|
|
|
395 |
decision_process_table_on_page = create_text_redaction_process_results(page_analyzer_results, page_analyzed_bounding_boxes, page_num)
|
396 |
|
397 |
annotations_on_page = create_annotations_for_bounding_boxes(page_analyzed_bounding_boxes)
|
tools/helper_functions.py
CHANGED
@@ -141,7 +141,7 @@ def add_folder_to_path(folder_path: str):
|
|
141 |
|
142 |
# Upon running a process, the feedback buttons are revealed
|
143 |
def reveal_feedback_buttons():
|
144 |
-
return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True)
|
145 |
|
146 |
def wipe_logs(feedback_logs_loc, usage_logs_loc):
|
147 |
try:
|
|
|
141 |
|
142 |
# Upon running a process, the feedback buttons are revealed
|
143 |
def reveal_feedback_buttons():
|
144 |
+
return gr.Radio(visible=True), gr.Textbox(visible=True), gr.Button(visible=True), gr.Markdown(visible=True)
|
145 |
|
146 |
def wipe_logs(feedback_logs_loc, usage_logs_loc):
|
147 |
try:
|