Commit
·
97097ff
1
Parent(s):
10f46e9
More checks on OCR outputs in redaction functions
Browse files- tools/file_redaction.py +16 -8
tools/file_redaction.py
CHANGED
@@ -406,7 +406,7 @@ def choose_and_run_redactor(file_paths:List[str],
|
|
406 |
progress(0.5, desc="Extracting text and redacting document")
|
407 |
|
408 |
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
409 |
-
all_line_level_ocr_results_df = pd.DataFrame()
|
410 |
|
411 |
# Run through file loop, redact each file at a time
|
412 |
for file in file_paths_loop:
|
@@ -1198,7 +1198,7 @@ def redact_image_pdf(file_path:str,
|
|
1198 |
current_loop_page:int=0,
|
1199 |
page_break_return:bool=False,
|
1200 |
annotations_all_pages:List=[],
|
1201 |
-
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(),
|
1202 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1203 |
pymupdf_doc:Document = [],
|
1204 |
pii_identification_method:str="Local",
|
@@ -1891,8 +1891,8 @@ def redact_text_pdf(
|
|
1891 |
current_loop_page: int = 0, # Current page being processed in the loop
|
1892 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1893 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1894 |
-
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(), # DataFrame for OCR results
|
1895 |
-
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax",
|
1896 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1897 |
pii_identification_method: str = "Local",
|
1898 |
comprehend_query_number:int = 0,
|
@@ -2008,8 +2008,8 @@ def redact_text_pdf(
|
|
2008 |
|
2009 |
characters = []
|
2010 |
pikepdf_redaction_annotations_on_page = []
|
2011 |
-
page_decision_process_table = pd.DataFrame()
|
2012 |
-
page_text_ocr_outputs = pd.DataFrame()
|
2013 |
|
2014 |
for n, text_container in enumerate(page_layout):
|
2015 |
characters = []
|
@@ -2142,7 +2142,12 @@ def redact_text_pdf(
|
|
2142 |
|
2143 |
# Write all page outputs
|
2144 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
|
|
|
|
|
|
2145 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
|
|
|
|
2146 |
|
2147 |
# Convert decision table to relative coordinates
|
2148 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
@@ -2154,8 +2159,11 @@ def redact_text_pdf(
|
|
2154 |
# Convert line-level OCR results to relative coordinates
|
2155 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
2156 |
|
|
|
|
|
2157 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2158 |
-
|
2159 |
-
|
|
|
2160 |
|
2161 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|
|
|
406 |
progress(0.5, desc="Extracting text and redacting document")
|
407 |
|
408 |
all_pages_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"])
|
409 |
+
all_line_level_ocr_results_df = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
410 |
|
411 |
# Run through file loop, redact each file at a time
|
412 |
for file in file_paths_loop:
|
|
|
1198 |
current_loop_page:int=0,
|
1199 |
page_break_return:bool=False,
|
1200 |
annotations_all_pages:List=[],
|
1201 |
+
all_line_level_ocr_results_df:pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]),
|
1202 |
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "boundingBox", "text", "start","end","score", "id"]),
|
1203 |
pymupdf_doc:Document = [],
|
1204 |
pii_identification_method:str="Local",
|
|
|
1891 |
current_loop_page: int = 0, # Current page being processed in the loop
|
1892 |
page_break_return: bool = False, # Flag to indicate if a page break should be returned
|
1893 |
annotations_all_pages: List[dict] = [], # List of annotations across all pages
|
1894 |
+
all_line_level_ocr_results_df: pd.DataFrame = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"]), # DataFrame for OCR results
|
1895 |
+
all_pages_decision_process_table:pd.DataFrame = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"]), # DataFrame for decision process table
|
1896 |
pymupdf_doc: List = [], # List of PyMuPDF documents
|
1897 |
pii_identification_method: str = "Local",
|
1898 |
comprehend_query_number:int = 0,
|
|
|
2008 |
|
2009 |
characters = []
|
2010 |
pikepdf_redaction_annotations_on_page = []
|
2011 |
+
page_decision_process_table = pd.DataFrame(columns=["image_path", "page", "label", "xmin", "xmax", "ymin", "ymax", "text", "id"])
|
2012 |
+
page_text_ocr_outputs = pd.DataFrame(columns=["page", "text", "left", "top", "width", "height"])
|
2013 |
|
2014 |
for n, text_container in enumerate(page_layout):
|
2015 |
characters = []
|
|
|
2142 |
|
2143 |
# Write all page outputs
|
2144 |
all_pages_decision_process_table = pd.concat(all_pages_decision_process_table_list)
|
2145 |
+
|
2146 |
+
#print("all_line_level_ocr_results_df_list:", all_line_level_ocr_results_df_list)
|
2147 |
+
|
2148 |
all_line_level_ocr_results_df = pd.concat(all_line_level_ocr_results_df_list)
|
2149 |
+
|
2150 |
+
#print("all_line_level_ocr_results_df after concat:", all_line_level_ocr_results_df)
|
2151 |
|
2152 |
# Convert decision table to relative coordinates
|
2153 |
all_pages_decision_process_table = divide_coordinates_by_page_sizes(all_pages_decision_process_table, page_sizes_df, xmin="xmin", xmax="xmax", ymin="ymin", ymax="ymax")
|
|
|
2159 |
# Convert line-level OCR results to relative coordinates
|
2160 |
all_line_level_ocr_results_df = divide_coordinates_by_page_sizes(all_line_level_ocr_results_df, page_sizes_df, xmin="left", xmax="width", ymin="top", ymax="height")
|
2161 |
|
2162 |
+
#print("all_line_level_ocr_results_df:", all_line_level_ocr_results_df)
|
2163 |
+
|
2164 |
# Coordinates need to be reversed for ymin and ymax to match with image annotator objects downstream
|
2165 |
+
if not all_line_level_ocr_results_df.empty:
|
2166 |
+
all_line_level_ocr_results_df['top'] = all_line_level_ocr_results_df['top'].astype(float)
|
2167 |
+
all_line_level_ocr_results_df['top'] = 1 - all_line_level_ocr_results_df['top']
|
2168 |
|
2169 |
return pymupdf_doc, all_pages_decision_process_table, all_line_level_ocr_results_df, annotations_all_pages, current_loop_page, page_break_return, comprehend_query_number
|