seanpedrickcase committed on
Commit
0f18146
·
1 Parent(s): a63133d

Separated file preparation and file redaction functions. Hopefully STS endpoint access now works on AWS.

Browse files
app.py CHANGED
@@ -1,12 +1,9 @@
1
- from tools.file_redaction import redact_text_pdf, redact_image_pdf
2
- from tools.helper_functions import get_file_path_end
3
- from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
4
  from tools.aws_functions import load_data_from_aws
5
-
6
  from typing import List
7
  import pandas as pd
8
  import gradio as gr
9
- import time
10
 
11
  #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
12
 
@@ -14,66 +11,6 @@ chosen_redact_entities = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "
14
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
15
  language = 'en'
16
 
17
- def choose_and_run_redactor(file_path:str, language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
18
-
19
- tic = time.perf_counter()
20
-
21
- out_message = ''
22
- out_file_paths = []
23
-
24
- in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]
25
-
26
- if file_path:
27
- file_path_without_ext = get_file_path_end(file_path)
28
- else:
29
- out_message = "No file selected"
30
- print(out_message)
31
- return out_message, out_file_paths
32
-
33
- if in_redact_method == "Image analysis":
34
- # Analyse and redact image-based pdf or image
35
- if is_pdf_or_image(file_path) == False:
36
- return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None
37
-
38
- pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
39
- out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"
40
- pdf_images[0].save(out_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_images[1:])
41
-
42
- out_file_paths.append(out_image_file_path)
43
- out_message = "Image-based PDF successfully redacted and saved to file."
44
-
45
- elif in_redact_method == "Text analysis":
46
- if is_pdf(file_path) == False:
47
- return "Please upload a PDF file for text analysis.", None
48
-
49
- # Analyse text-based pdf
50
- pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
51
- out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
52
- pdf_text.save(out_text_file_path)
53
-
54
- out_file_paths.append(out_text_file_path)
55
-
56
- # Convert annotated text pdf back to image to give genuine redactions
57
- pdf_text_image_paths = process_file(out_text_file_path)
58
- out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"
59
- pdf_text_image_paths[0].save(out_text_image_file_path, "PDF" ,resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])
60
-
61
- out_file_paths.append(out_text_image_file_path)
62
-
63
- out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."
64
-
65
- else:
66
- out_message = "No redaction method selected"
67
- print(out_message)
68
- return out_message, out_file_paths
69
-
70
- toc = time.perf_counter()
71
- out_time = f"Time taken: {toc - tic:0.1f} seconds."
72
- print(out_time)
73
-
74
- out_message = out_message + "\n\n" + out_time
75
-
76
- return out_message, out_file_paths
77
 
78
  # Create the gradio interface
79
 
@@ -81,6 +18,9 @@ block = gr.Blocks(theme = gr.themes.Base())
81
 
82
  with block:
83
 
 
 
 
84
  gr.Markdown(
85
  """
86
  # Document redaction
@@ -106,6 +46,9 @@ with block:
106
  output_summary = gr.Textbox(label="Output summary")
107
  output_file = gr.File(label="Output file")
108
 
 
 
 
109
  with gr.Tab(label="Advanced options"):
110
  with gr.Accordion(label = "AWS data access", open = True):
111
  aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
@@ -118,7 +61,12 @@ with block:
118
  ### Loading AWS data ###
119
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
120
 
121
- redact_btn.click(fn = choose_and_run_redactor, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
 
 
 
 
 
122
  outputs=[output_summary, output_file], api_name="redact")
123
 
124
  # Simple run for HF spaces or local on your computer
 
1
+ from tools.file_redaction import choose_and_run_redactor
2
+ from tools.file_conversion import prepare_image_or_text_pdf, convert_text_pdf_to_img_pdf
 
3
  from tools.aws_functions import load_data_from_aws
 
4
  from typing import List
5
  import pandas as pd
6
  import gradio as gr
 
7
 
8
  #file_path = "examples/Lambeth_2030-Our_Future_Our_Lambeth_foreword.pdf" #"examples/skills-based-cv-example.pdf" # "examples/graduate-job-example-cover-letter.pdf" #
9
 
 
11
  full_entity_list = ["TITLES", "PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "STREETNAME", "UKPOSTCODE", 'CREDIT_CARD', 'CRYPTO', 'DATE_TIME', 'IBAN_CODE', 'IP_ADDRESS', 'NRP', 'LOCATION', 'MEDICAL_LICENSE', 'URL', 'UK_NHS']
12
  language = 'en'
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Create the gradio interface
16
 
 
18
 
19
  with block:
20
 
21
+ prepared_pdf_state = gr.State([])
22
+ output_image_files_state = gr.State([])
23
+
24
  gr.Markdown(
25
  """
26
  # Document redaction
 
46
  output_summary = gr.Textbox(label="Output summary")
47
  output_file = gr.File(label="Output file")
48
 
49
+ with gr.Row():
50
+ convert_text_pdf_to_img_btn = gr.Button(value="Convert pdf to image-based pdf to apply redactions", variant="secondary")
51
+
52
  with gr.Tab(label="Advanced options"):
53
  with gr.Accordion(label = "AWS data access", open = True):
54
  aws_password_box = gr.Textbox(label="Password for AWS data access (ask the Data team if you don't have this)")
 
61
  ### Loading AWS data ###
62
  load_aws_data_button.click(fn=load_data_from_aws, inputs=[in_aws_file, aws_password_box], outputs=[in_file, aws_log_box])
63
 
64
+ redact_btn.click(fn = prepare_image_or_text_pdf, inputs=[in_file, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
65
+ outputs=[output_summary, prepared_pdf_state], api_name="prepare").\
66
+ then(fn = choose_and_run_redactor, inputs=[in_file, prepared_pdf_state, in_redact_language, in_redact_entities, in_redaction_method, in_allow_list],
67
+ outputs=[output_summary, output_file], api_name="redact")
68
+
69
+ convert_text_pdf_to_img_btn.click(fn = convert_text_pdf_to_img_pdf, inputs=[in_file, output_file],
70
  outputs=[output_summary, output_file], api_name="redact")
71
 
72
  # Simple run for HF spaces or local on your computer
tools/aws_functions.py CHANGED
@@ -6,10 +6,10 @@ import os
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
- bucket_name = 'doc-redaction-data'
10
 
11
  try:
12
- session = boto3.Session(profile_name="default")
13
  except Exception as e:
14
  print(e)
15
 
@@ -24,7 +24,7 @@ except Exception as e:
24
 
25
 
26
  def get_assumed_role_info():
27
- sts = boto3.client('sts')
28
  response = sts.get_caller_identity()
29
 
30
  # Extract ARN of the assumed role
 
6
 
7
  PandasDataFrame = Type[pd.DataFrame]
8
 
9
+ bucket_name = os.environ['DOCUMENT_REDACTION_BUCKET']
10
 
11
  try:
12
+ session = boto3.Session() # profile_name="default"
13
  except Exception as e:
14
  print(e)
15
 
 
24
 
25
 
26
  def get_assumed_role_info():
27
+ sts = boto3.client('sts', region_name='us-west-2')
28
  response = sts.get_caller_identity()
29
 
30
  # Extract ARN of the assumed role
tools/file_conversion.py CHANGED
@@ -1,7 +1,9 @@
1
  from pdf2image import convert_from_path, pdfinfo_from_path
 
2
  from PIL import Image
3
  import os
4
  from gradio import Progress
 
5
 
6
  def is_pdf_or_image(filename):
7
  """
@@ -13,7 +15,7 @@ def is_pdf_or_image(filename):
13
  Returns:
14
  bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
15
  """
16
- if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".png"):
17
  output = True
18
  else:
19
  output = False
@@ -34,7 +36,7 @@ def is_pdf(filename):
34
  # %%
35
  ## Convert pdf to image if necessary
36
 
37
- def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
38
 
39
  # Get the number of pages in the PDF
40
  page_count = pdfinfo_from_path(pdf_path)['Pages']
@@ -54,25 +56,14 @@ def convert_pdf_to_images(pdf_path, progress=Progress(track_tqdm=True)):
54
  if not image:
55
  break
56
 
57
- # # Convert PDF to a list of images
58
- # images = convert_from_path(pdf_path)
59
-
60
- # images = []
61
-
62
  images.extend(image)
63
 
64
- # Save each image as a separate file - deprecated
65
- #image_paths = []
66
- # for i, image in enumerate(images):
67
- # page_path = f"processing/page_{i+1}.png"
68
- # image.save(page_path, "PNG")
69
- # image_paths.append(page_path)
70
-
71
  print("PDF has been converted to images.")
72
 
73
  return images
74
 
75
- # %%
 
76
  def process_file(file_path):
77
  # Get the file extension
78
  file_extension = os.path.splitext(file_path)[1].lower()
@@ -95,3 +86,55 @@ def process_file(file_path):
95
 
96
  return out_path
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pdf2image import convert_from_path, pdfinfo_from_path
2
+ from tools.helper_functions import get_file_path_end
3
  from PIL import Image
4
  import os
5
  from gradio import Progress
6
+ from typing import List
7
 
8
  def is_pdf_or_image(filename):
9
  """
 
15
  Returns:
16
  bool: True if the file name ends with ".pdf", ".jpg", or ".png", False otherwise.
17
  """
18
+ if filename.lower().endswith(".pdf") or filename.lower().endswith(".jpg") or filename.lower().endswith(".jpeg") or filename.lower().endswith(".png"):
19
  output = True
20
  else:
21
  output = False
 
36
  # %%
37
  ## Convert pdf to image if necessary
38
 
39
+ def convert_pdf_to_images(pdf_path:str, progress=Progress(track_tqdm=True)):
40
 
41
  # Get the number of pages in the PDF
42
  page_count = pdfinfo_from_path(pdf_path)['Pages']
 
56
  if not image:
57
  break
58
 
 
 
 
 
 
59
  images.extend(image)
60
 
 
 
 
 
 
 
 
61
  print("PDF has been converted to images.")
62
 
63
  return images
64
 
65
+
66
+ # %% Function to take in a file path, decide if it is an image or pdf, then process appropriately.
67
  def process_file(file_path):
68
  # Get the file extension
69
  file_extension = os.path.splitext(file_path)[1].lower()
 
86
 
87
  return out_path
88
 
89
def prepare_image_or_text_pdf(file_path:str, language:str, in_redact_method:str, in_allow_list:List[List[str]]=None, progress=Progress(track_tqdm=True)):
    """
    Validate the uploaded file for the chosen redaction method and prepare it for redaction.

    For "Image analysis" the file is handed to process_file and its result is returned.
    For "Text analysis" the original path is returned unchanged.

    Args:
        file_path (str): Path of the uploaded file.
        language (str): Not used during preparation; kept in the signature for callers.
        in_redact_method (str): Either "Image analysis" or "Text analysis".
        in_allow_list (List[List[str]]): Not used during preparation; may be None.
        progress: Gradio progress tracker.

    Returns:
        tuple: (message, prepared_output). prepared_output is the result of process_file
        for image analysis, the input path for text analysis, an empty list when no file
        or no known method was given, and None when the file type does not suit the method.
    """
    # NOTE(review): app.py wires this handler with five inputs (file, language, entities,
    # method, allow list) while this signature has no entities parameter, so the positional
    # arguments look misaligned - confirm the wiring against this signature.

    out_message = ''
    out_file_paths = []

    if not file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    if in_redact_method == "Image analysis":
        # Image analysis accepts PDFs as well as image files
        if not is_pdf_or_image(file_path):
            return "Please upload a PDF file or image file (JPG, PNG) for image analysis.", None

        out_file_path = process_file(file_path)

    elif in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", None

        # Text-based PDFs need no conversion at this stage
        out_file_path = file_path

    else:
        # Without this branch out_file_path would be unbound at the final return
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    return out_message, out_file_path
117
+
118
+
119
def convert_text_pdf_to_img_pdf(in_file_path:str, out_text_file_path:List[str]):
    """
    Convert an annotated text-based PDF into an image-based PDF so the redactions are genuine.

    Args:
        in_file_path (str): Path of the originally uploaded file; its name is used to
            build the output file name.
        out_text_file_path (List[str]): Path(s) of the redacted text PDF. A single path
            string is also accepted; when a list is given, its first entry is converted.

    Returns:
        tuple: (message, list of output file paths). The list is empty when there was
        nothing to convert.
    """
    out_file_paths = []

    if not in_file_path:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    # The app passes the current output-file value, which the redaction step fills with a
    # list of paths, whereas process_file works on a single path.
    # NOTE(review): assumes the entries are plain path strings - confirm for the Gradio version in use.
    if isinstance(out_text_file_path, (list, tuple)):
        text_pdf_path = out_text_file_path[0] if out_text_file_path else None
    else:
        text_pdf_path = out_text_file_path

    if not text_pdf_path:
        out_message = "No redacted text PDF found to convert"
        print(out_message)
        return out_message, out_file_paths

    file_path_without_ext = get_file_path_end(in_file_path)

    # Convert annotated text pdf back to image to give genuine redactions
    print("Creating image version of results")
    pdf_text_image_paths = process_file(text_pdf_path)
    out_text_image_file_path = "output/" + file_path_without_ext + "_result_as_text_back_to_img.pdf"

    # The first page object writes the PDF; the remaining pages are appended to it
    pdf_text_image_paths[0].save(out_text_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_text_image_paths[1:])

    out_file_paths.append(out_text_image_file_path)

    out_message = "Image-based PDF successfully redacted and saved to text-based annotated file, and image-based file."

    return out_message, out_file_paths
135
+
136
+
137
+
138
+
139
+
140
+
tools/file_redaction.py CHANGED
@@ -7,8 +7,68 @@ from tools.file_conversion import process_file
7
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
  from pikepdf import Pdf, Dictionary, Name
9
  from gradio import Progress
 
10
 
11
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
  def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
14
  '''
@@ -42,7 +102,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
42
  image_analyser = ImageAnalyzerEngine(nlp_analyser)
43
  engine = ImageRedactorEngine(image_analyser)
44
 
45
-
46
  if language == 'en':
47
  ocr_lang = 'eng'
48
  else: ocr_lang = language
@@ -62,26 +121,6 @@ def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[st
62
  )
63
 
64
  images.append(redacted_image)
65
-
66
- # multiple inputs (variant 2)
67
- # with open("name.pdf","wb") as f:
68
- # f.write(img2pdf.convert(["test1.jpg", "test2.png"]))
69
-
70
- # # Create page from image
71
- # pdf.add_blank_page(page_size=(redacted_image.width, redacted_image.height))
72
- # page = pdf.pages[-1]
73
- # page.add_image(redacted_image, 0, 0)
74
-
75
- # %%
76
- # Get descriptive output of results for checks - not necessary except for debugging
77
- # bboxes = image_analyser.analyze(image)
78
-
79
- # # %%
80
- # check_df = pd.DataFrame(bboxes)[0].astype(str).str.split(",",expand=True).replace(".*: ", "", regex=True)
81
-
82
- # check_df.columns = ["type", "start", "end", "score", "left", "top", "width", "height"]
83
-
84
- # check_df.to_csv("check_df.csv")
85
 
86
  return images
87
 
 
7
  from pdfminer.layout import LTTextContainer, LTChar, LTTextLine, LTAnno
8
  from pikepdf import Pdf, Dictionary, Name
9
  from gradio import Progress
10
+ import time
11
 
12
  from tools.load_spacy_model_custom_recognisers import nlp_analyser, score_threshold
13
+ from tools.helper_functions import get_file_path_end
14
+ from tools.file_conversion import process_file, is_pdf, is_pdf_or_image
15
+ import gradio as gr
16
+
17
def choose_and_run_redactor(file_path:str, image_paths:List[str], language:str, chosen_redact_entities:List[str], in_redact_method:str, in_allow_list:List[List[str]]=None, progress=gr.Progress(track_tqdm=True)):
    '''
    Run the selected redaction method on a file and save the result under "output/".

    Parameters:
        file_path: Path of the uploaded file.
        image_paths: Output of the preparation step.
            NOTE(review): currently unused - redact_image_pdf is given file_path directly.
        language: Language code passed on to the redaction functions.
        chosen_redact_entities: Entity types to redact.
        in_redact_method: Either "Image analysis" or "Text analysis".
        in_allow_list: Nested list of terms to exclude from redaction; may be None.
        progress: Gradio progress tracker.

    Returns:
        A tuple (message, output file paths). The second element is None when the file
        type does not suit text analysis, and an empty list when no file or no known
        method was selected.
    '''

    tic = time.perf_counter()

    out_message = ''
    out_file_paths = []

    # The default of None cannot be iterated, so treat it as an empty allow list
    if in_allow_list is None:
        in_allow_list_flat = []
    else:
        in_allow_list_flat = [item for sublist in in_allow_list for item in sublist]

    if file_path:
        file_path_without_ext = get_file_path_end(file_path)
    else:
        out_message = "No file selected"
        print(out_message)
        return out_message, out_file_paths

    if in_redact_method == "Image analysis":
        # File-type validation for image analysis is done in the preparation step
        # (prepare_image_or_text_pdf), so it is not repeated here.
        pdf_images = redact_image_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_image_file_path = "output/" + file_path_without_ext + "_result_as_img.pdf"

        # The first page image writes the PDF; the remaining pages are appended to it
        pdf_images[0].save(out_image_file_path, "PDF", resolution=100.0, save_all=True, append_images=pdf_images[1:])

        out_file_paths.append(out_image_file_path)
        out_message = "Image-based PDF successfully redacted and saved to file."

    elif in_redact_method == "Text analysis":
        if not is_pdf(file_path):
            return "Please upload a PDF file for text analysis.", None

        # Analyse text-based pdf
        pdf_text = redact_text_pdf(file_path, language, chosen_redact_entities, in_allow_list_flat)
        out_text_file_path = "output/" + file_path_without_ext + "_result_as_text.pdf"
        pdf_text.save(out_text_file_path)

        out_file_paths.append(out_text_file_path)

        # Report success here too; otherwise the summary would show only the timing
        out_message = "Text-based PDF successfully redacted and saved to text-based annotated file."

    else:
        out_message = "No redaction method selected"
        print(out_message)
        return out_message, out_file_paths

    toc = time.perf_counter()
    out_time = f"Time taken: {toc - tic:0.1f} seconds."
    print(out_time)

    out_message = out_message + "\n\n" + out_time

    return out_message, out_file_paths
71
+
72
 
73
  def redact_image_pdf(file_path:str, language:str, chosen_redact_entities:List[str], allow_list:List[str]=None, progress=Progress(track_tqdm=True)):
74
  '''
 
102
  image_analyser = ImageAnalyzerEngine(nlp_analyser)
103
  engine = ImageRedactorEngine(image_analyser)
104
 
 
105
  if language == 'en':
106
  ocr_lang = 'eng'
107
  else: ocr_lang = language
 
121
  )
122
 
123
  images.append(redacted_image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
  return images
126