"""Gradio playground for experimenting with pdfplumber's table extraction.

Upload a PDF, pick a page and table, tweak the table-finder settings, and
see the annotated page image, the extracted rows, and a copy-pasteable
pdfplumber snippet that reproduces the result.
"""

import gradio as gr
import pdfplumber
import textwrap
import pprint
import json
import os
from pathlib import Path


def _coerce_setting(value):
    """Turn a raw UI value into the type pdfplumber should receive.

    Textboxes deliver strings, so numeric text is converted to ``int`` (or
    ``float`` when it is not a whole number). Blank strings become ``None``.
    Non-string values (lists parsed from JSON, numbers) pass through as-is.
    """
    if not isinstance(value, str):
        return value
    text = value.strip()
    if not text:
        return None
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    # Not numeric: a strategy name such as "lines" or "text".
    return text


def _parse_explicit_lines(raw, setting_name):
    """Parse the JSON typed into an ``explicit_*_lines`` textbox.

    Returns ``None`` for an empty box. Raises ``gr.Error`` when the text is
    not valid JSON so the user sees what to fix.
    """
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        raise gr.Error(
            f"{setting_name} must be valid JSON, e.g. [10, 20.5] ({err})"
        ) from err


def _clean_table_settings(raw_settings) -> dict:
    """Drop unset entries and coerce the rest to usable types."""
    cleaned = {}
    for key, raw_value in raw_settings.items():
        value = _coerce_setting(raw_value)
        # Unset textboxes arrive as "" or None; an empty JSON list is
        # treated the same way so pdfplumber falls back to its defaults.
        if value is None or value == "" or value == []:
            continue
        cleaned[key] = value
    return cleaned


def _resolve_crop_box(
    page_width, page_height, crop_x0, crop_top, crop_x1, crop_bottom
):
    """Return ``(x0, top, x1, bottom)`` with blanks replaced by page edges.

    Negative ``crop_x1`` / ``crop_bottom`` values are measured back from the
    page's right / bottom edge. Raises ``gr.Error`` for non-integer input.
    """
    try:
        x0 = int(crop_x0) if crop_x0 else 0
        top = int(crop_top) if crop_top else 0
        x1 = int(crop_x1) if crop_x1 else page_width
        bottom = int(crop_bottom) if crop_bottom else page_height
    except ValueError as err:
        raise gr.Error(f"Crop values must be whole numbers ({err})") from err

    # Allow negative numbers for the far edges.
    if bottom < 0:
        bottom = page_height + bottom
    if x1 < 0:
        x1 = page_width + x1
    return x0, top, x1, bottom


def _build_notes(
    base_filename,
    page_count,
    page_num,
    page_width,
    page_height,
    crop_box,
    is_cropped,
    table_count,
    table_settings,
    table_num,
) -> str:
    """Build the Markdown summary and the reproducible pdfplumber snippet."""
    crop_x0, crop_top, crop_x1, crop_bottom = crop_box
    summary = textwrap.dedent(
        f"""\
        - **Filename:** {base_filename}
        - **Pages:** {page_count}
        - **Page num {page_num}:**
            - **Full dimensions:** {page_width} x {page_height}
            - **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}
            - **Tables found:** {table_count}
        """
    )

    code_lines = [
        "import pdfplumber",
        "",
        f'pdf = pdfplumber.open("{base_filename}")',
        f"page = pdf.pages[{page_num}]",
    ]
    if is_cropped:
        code_lines.append(
            f"page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
        )
    code_lines += [
        f"table_settings = {pprint.pformat(table_settings, indent=4)}",
        "tables = page.extract_tables(table_settings)",
        f"table = tables[{table_num}]",
    ]
    return summary + "\n```python\n" + "\n".join(code_lines) + "\n```"


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Extract a table from one PDF page and describe how it was found.

    Args:
        file_obj: The uploaded PDF, either a path string or an object whose
            ``name`` attribute is the path.
        page_num: Zero-based page index.
        table_num: Zero-based index of the table to return from that page.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Blank
            values default to the page edges; negative ``crop_x1`` and
            ``crop_bottom`` count back from the right / bottom edge.
        explicit_vertical_lines, explicit_horizontal_lines: JSON text that
            is parsed and passed to pdfplumber under the same setting name.
        keep_blank_chars: Accepted so the UI wiring stays stable, but not
            forwarded to pdfplumber.
        Remaining keyword arguments: passed to pdfplumber as table settings
            under the same names; blank values are omitted and numeric text
            is converted to numbers.

    Returns:
        A list ``[notes, visual, table]``: Markdown notes with a code
        snippet, the annotated image produced by ``debug_tablefinder``, and
        the selected table's rows (``None`` when ``table_num`` does not
        match a table found on the page).

    Raises:
        gr.Error: If no file was provided, the page index is out of range,
            or the crop / explicit-lines input cannot be parsed.
    """
    if file_obj is None:
        raise gr.Error("Please upload a PDF first.")

    table_settings = _clean_table_settings(
        {
            "vertical_strategy": vertical_strategy,
            "horizontal_strategy": horizontal_strategy,
            "explicit_vertical_lines": _parse_explicit_lines(
                explicit_vertical_lines, "explicit_vertical_lines"
            ),
            "explicit_horizontal_lines": _parse_explicit_lines(
                explicit_horizontal_lines, "explicit_horizontal_lines"
            ),
            "snap_tolerance": snap_tolerance,
            "snap_x_tolerance": snap_x_tolerance,
            "snap_y_tolerance": snap_y_tolerance,
            "join_tolerance": join_tolerance,
            "join_x_tolerance": join_x_tolerance,
            "join_y_tolerance": join_y_tolerance,
            "text_tolerance": text_tolerance,
            "text_x_tolerance": text_x_tolerance,
            "text_y_tolerance": text_y_tolerance,
            "intersection_tolerance": intersection_tolerance,
            "intersection_x_tolerance": intersection_x_tolerance,
            "intersection_y_tolerance": intersection_y_tolerance,
            "edge_min_length": edge_min_length,
            "min_words_vertical": min_words_vertical,
            "min_words_horizontal": min_words_horizontal,
            # NOTE(review): keep_blank_chars is intentionally not forwarded;
            # confirm the setting name pdfplumber expects before wiring it.
        }
    )

    # A cleared Number field arrives as None; fall back to the first item.
    page_num = int(page_num or 0)
    table_num = int(table_num or 0)

    # Accept both a plain path and an object exposing the path as ``.name``.
    pdf_path = getattr(file_obj, "name", file_obj)
    base_filename = Path(pdf_path).name

    with pdfplumber.open(pdf_path) as pdf:
        page_count = len(pdf.pages)
        try:
            page = pdf.pages[page_num]
        except IndexError as err:
            raise gr.Error(
                f"Page {page_num} does not exist: the PDF has {page_count} "
                "page(s) and the first page is 0."
            ) from err

        page_width = int(page.width)
        page_height = int(page.height)
        crop_box = _resolve_crop_box(
            page_width, page_height, crop_x0, crop_top, crop_x1, crop_bottom
        )

        # Only crop if we need to!
        is_cropped = crop_box != (0, 0, page_width, page_height)
        if is_cropped:
            page = page.crop(crop_box)

        tables = page.extract_tables(table_settings)
        # Honour the requested table index so the result matches the
        # generated snippet; an index with no matching table yields None.
        table = tables[table_num] if 0 <= table_num < len(tables) else None

        visual = page.to_image().debug_tablefinder(table_settings).annotated

        notes = _build_notes(
            base_filename=base_filename,
            page_count=page_count,
            page_num=page_num,
            page_width=page_width,
            page_height=page_height,
            crop_box=crop_box,
            is_cropped=is_cropped,
            table_count=len(tables),
            table_settings=table_settings,
            table_num=table_num,
        )

        return [notes, visual, table]


def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Run ``table_debugger`` with only the inputs the examples provide.

    Every other setting keeps ``table_debugger``'s default.
    """
    return table_debugger(
        file_obj,
        page_num=page_num,
        table_num=table_num,
        vertical_strategy=vertical_strategy,
        horizontal_strategy=horizontal_strategy,
        snap_y_tolerance=snap_y_tolerance,
        intersection_x_tolerance=intersection_x_tolerance,
        crop_bottom=crop_bottom,
    )


# --- Components -----------------------------------------------------------
# Everything is created with render=False and placed explicitly inside the
# Blocks layout below via .render().

# Outputs
notes = gr.Markdown(render=False)
output_image = gr.Image(render=False)
data_table = gr.Dataframe(
    height=250, render=False, type="array", label="Found data"
)

# Crop inputs
crop_top = gr.Text(
    label="Crop (top)", placeholder="top", container=False, render=False
)
crop_x0 = gr.Text(
    label="Crop (x0)", placeholder="left", container=False, render=False
)
crop_x1 = gr.Text(
    label="Crop (x1)",
    placeholder="right (from page left)",
    container=False,
    render=False,
)
crop_bottom = gr.Text(
    label="Crop (bottom)",
    placeholder="bottom (from page top)",
    container=False,
    render=False,
)

# Table-finder strategies
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)

# Tolerances
snap_tolerance = gr.Textbox(
    label="Snap tolerance", placeholder="3", render=False
)
snap_x_tolerance = gr.Textbox(
    label="Snap tolerance (x)", placeholder="3", render=False
)
snap_y_tolerance = gr.Textbox(
    label="Snap tolerance (y)", placeholder="3", render=False
)
join_tolerance = gr.Textbox(
    label="Join tolerance", placeholder="3", render=False
)
join_x_tolerance = gr.Textbox(
    label="Join tolerance (x)", placeholder="3", render=False
)
join_y_tolerance = gr.Textbox(
    label="Join tolerance (y)", placeholder="3", render=False
)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(
    label="Text tolerance (x)", placeholder="1", render=False
)
text_y_tolerance = gr.Textbox(
    label="Text tolerance (y)", placeholder="1", render=False
)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)

# Miscellaneous settings
edge_min_length = gr.Textbox(
    label="edge_min_length", placeholder="3", render=False
)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(
    label="Keep blank chars?", value=False, render=False
)

# File and selection inputs
file = gr.File(
    label="PDF", type="filepath", file_types=[".pdf"], render=False
)
page_num = gr.Number(
    label="Page number",
    value=0,
    info="It's an index: first is 0!",
    render=False,
)
table_num = gr.Number(
    label="Table number",
    value=0,
    info="It's an index: first is 0!",
    render=False,
)

# --- Examples -------------------------------------------------------------
# Column order matches the `inputs` list given to gr.Examples below.
example_dir = Path(os.path.dirname(__file__)) / "examples"
examples = [
    [str(example_dir / "players.pdf"), 0, 0, "text", "text", None, None, None],
    [
        str(example_dir / "museums.pdf"),
        2,
        0,
        "lines",
        "lines",
        None,
        None,
        None,
    ],
    [
        str(example_dir / "background-checks.pdf"),
        0,
        0,
        "text",
        "text",
        5,
        15,
        487,
    ],
]

INTRO_MARKDOWN = """
# pdfplumber table extraction playground

[pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**

YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
"""

# --- Layout ---------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown(INTRO_MARKDOWN)

    with gr.Row():
        # Left column: inputs, run button, and the generated notes.
        with gr.Column(scale=2):
            file.render()
            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()
                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()
            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()

            btn = gr.Button(value="Run")
            # The input order must match table_debugger's positional
            # parameters.
            btn.click(
                table_debugger,
                inputs=[
                    file,
                    page_num,
                    table_num,
                    crop_x0,
                    crop_top,
                    crop_x1,
                    crop_bottom,
                    vertical_strategy,
                    horizontal_strategy,
                    explicit_vertical_lines,
                    explicit_horizontal_lines,
                    snap_tolerance,
                    snap_x_tolerance,
                    snap_y_tolerance,
                    join_tolerance,
                    join_x_tolerance,
                    join_y_tolerance,
                    text_tolerance,
                    text_x_tolerance,
                    text_y_tolerance,
                    intersection_tolerance,
                    intersection_x_tolerance,
                    intersection_y_tolerance,
                    edge_min_length,
                    min_words_vertical,
                    min_words_horizontal,
                    keep_blank_chars,
                ],
                outputs=[notes, output_image, data_table],
            )
            notes.render()

        # Right column: extracted rows and the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()

    gr.Examples(
        examples=examples,
        inputs=[
            file,
            page_num,
            table_num,
            vertical_strategy,
            horizontal_strategy,
            snap_y_tolerance,
            intersection_x_tolerance,
            crop_bottom,
        ],
        outputs=[notes, output_image, data_table],
        fn=demo_subset,
        run_on_click=True,
    )

    gr.Markdown("## Additional options")
    with gr.Row():
        with gr.Column():
            with gr.Group():
                snap_tolerance.render()
                with gr.Row():
                    snap_x_tolerance.render()
                    snap_y_tolerance.render()
                join_tolerance.render()
                with gr.Row():
                    join_x_tolerance.render()
                    join_y_tolerance.render()
                text_tolerance.render()
                with gr.Row():
                    text_x_tolerance.render()
                    text_y_tolerance.render()
                intersection_tolerance.render()
                with gr.Row():
                    intersection_x_tolerance.render()
                    intersection_y_tolerance.render()
        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()

if __name__ == "__main__":
    demo.launch()