seanpedrickcase committed on
Commit
97097ff
·
1 Parent(s): 10f46e9

More checks on ocr outputs in redaction functions

Browse files
Files changed (1) hide show
  1. tools/file_redaction.py +16 -8
tools/file_redaction.py CHANGED
@@ -406,7 +406,7 @@ def choose_and_run_redactor(file_paths:List[str],
406
  progress(0.5, desc="Extracting text and redacting document")
407
 
408
  all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
409
- all_line_level_ocr_results_df = pd.DataFrame()
410
 
411
  # Run through file loop, redact each file at a time
412
  for file in file_paths_loop:
@@ -1198,7 +1198,7 @@ def redact_image_pdf(file_path:str,
1198
  current_loop_page:int=0,
1199
  page_break_return:bool=False,
1200
  annotations_all_pages:List=[],
1201
- all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
1202
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
1203
  pymupdf_doc:Document = [],
1204
  pii_identification_method:str="Local",
@@ -1891,8 +1891,8 @@ def redact_text_pdf(
1891
  current_loop_page: int = 0, # Current page being processed in the loop
1892
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1893
  annotations_all_pages: List[dict] = [], # List of annotations across all pages
1894
- all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
1895
- all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
1896
  pymupdf_doc: List = [], # List of PyMuPDF documents
1897
  pii_identification_method: str = "Local",
1898
  comprehend_query_number:int = 0,
@@ -2008,8 +2008,8 @@ def redact_text_pdf(
2008
 
2009
  characters = []
2010
  pikepdf_redaction_annotations_on_page = []
2011
- page_decision_process_table = pd.DataFrame()
2012
- page_text_ocr_outputs = pd.DataFrame()
2013
 
2014
  for n, text_container in enumerate(page_layout):
2015
  characters = []
@@ -2142,7 +2142,12 @@ def redact_text_pdf(
2142
 
2143
  # Write all page outputs
2144
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
 
 
 
2145
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
 
 
2146
 
2147
  # Convert decision table to relative coordinates
2148
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
@@ -2154,8 +2159,11 @@ def redact_text_pdf(
2154
  # Convert line-level OCR results to relative coordinates
2155
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
2156
 
 
 
2157
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2158
- all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2159
- all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
 
2160
 
2161
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
 
406
  progress(0.5, desc="Extracting text and redacting document")
407
 
408
  all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
409
+ all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
410
 
411
  # Run through file loop, redact each file at a time
412
  for file in file_paths_loop:
 
1198
  current_loop_page:int=0,
1199
  page_break_return:bool=False,
1200
  annotations_all_pages:List=[],
1201
+ all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]),
1202
  all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
1203
  pymupdf_doc:Document = [],
1204
  pii_identification_method:str="Local",
 
1891
  current_loop_page: int = 0, # Current page being processed in the loop
1892
  page_break_return: bool = False, # Flag to indicate if a page break should be returned
1893
  annotations_all_pages: List[dict] = [], # List of annotations across all pages
1894
+ all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
1895
+ all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
1896
  pymupdf_doc: List = [], # List of PyMuPDF documents
1897
  pii_identification_method: str = "Local",
1898
  comprehend_query_number:int = 0,
 
2008
 
2009
  characters = []
2010
  pikepdf_redaction_annotations_on_page = []
2011
+ page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
2012
+ page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
2013
 
2014
  for n, text_container in enumerate(page_layout):
2015
  characters = []
 
2142
 
2143
  # Write all page outputs
2144
  all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
2145
+
2146
+ #print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
2147
+
2148
  all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
2149
+
2150
+ #print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
2151
 
2152
  # Convert decision table to relative coordinates
2153
  all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
 
2159
  # Convert line-level OCR results to relative coordinates
2160
  all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
2161
 
2162
+ #print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
2163
+
2164
  # Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
2165
+ if not all_line_level_ocr_results_df.empty:
2166
+ all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
2167
+ all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
2168
 
2169
  return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number