import gradio as gr
import pdfplumber
import textwrap
import pprint
import json
import os
from pathlib import Path


def _coerce_setting(value):
    """Return *value* as ``int`` or ``float`` if it is a numeric string, else unchanged."""
    if not isinstance(value, str):
        # Parsed JSON lists (explicit lines) and already-numeric values pass through.
        return value
    for convert in (int, float):
        try:
            return convert(value)
        except ValueError:
            continue
    # Non-numeric text, e.g. a strategy name such as "lines" or "text".
    return value


def table_debugger(
    file_obj,
    page_num=0,
    table_num=0,
    crop_x0=None,
    crop_top=None,
    crop_x1=None,
    crop_bottom=None,
    vertical_strategy=None,
    horizontal_strategy=None,
    explicit_vertical_lines=None,
    explicit_horizontal_lines=None,
    snap_tolerance=None,
    snap_x_tolerance=None,
    snap_y_tolerance=None,
    join_tolerance=None,
    join_x_tolerance=None,
    join_y_tolerance=None,
    text_tolerance=None,
    text_x_tolerance=None,
    text_y_tolerance=None,
    intersection_tolerance=None,
    intersection_x_tolerance=None,
    intersection_y_tolerance=None,
    edge_min_length=None,
    min_words_vertical=None,
    min_words_horizontal=None,
    keep_blank_chars=None,
):
    """Extract tables from one PDF page and describe how to reproduce the call.

    Args:
        file_obj: The uploaded PDF, given either as a path string or as an
            object exposing the path through a ``name`` attribute.
        page_num: Zero-based index of the page to inspect.
        table_num: Zero-based index of the table to return from that page.
        crop_x0, crop_top, crop_x1, crop_bottom: Optional crop box. Blank
            values fall back to the page edges; negative ``crop_x1`` /
            ``crop_bottom`` are measured back from the right / bottom edge.
        vertical_strategy, horizontal_strategy: Passed through to pdfplumber
            as table settings.
        explicit_vertical_lines, explicit_horizontal_lines: JSON-encoded
            values (e.g. ``"[10, 200]"``) decoded with ``json.loads``.
        snap_tolerance ... min_words_horizontal: Table settings, usually
            supplied as text; blank values are omitted and numeric strings
            are converted to ``int`` or ``float``.
        keep_blank_chars: Accepted so the UI can pass it, but not currently
            forwarded to pdfplumber.

    Returns:
        A three-item list ``[notes, visual, table]``: Markdown notes with a
        reproducible code snippet, the annotated debug image, and the rows of
        the selected table (``None`` when that table does not exist).

    Raises:
        ValueError: If no file was provided.
    """
    raw_settings = {
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "explicit_vertical_lines": json.loads(explicit_vertical_lines)
        if explicit_vertical_lines
        else None,
        "explicit_horizontal_lines": json.loads(explicit_horizontal_lines)
        if explicit_horizontal_lines
        else None,
        "snap_tolerance": snap_tolerance,
        "snap_x_tolerance": snap_x_tolerance,
        "snap_y_tolerance": snap_y_tolerance,
        "join_tolerance": join_tolerance,
        "join_x_tolerance": join_x_tolerance,
        "join_y_tolerance": join_y_tolerance,
        "text_tolerance": text_tolerance,
        "text_x_tolerance": text_x_tolerance,
        "text_y_tolerance": text_y_tolerance,
        "intersection_tolerance": intersection_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "intersection_y_tolerance": intersection_y_tolerance,
        "edge_min_length": edge_min_length,
        "min_words_vertical": min_words_vertical,
        "min_words_horizontal": min_words_horizontal,
    }

    # Drop anything the user left blank so pdfplumber applies its own
    # defaults, and turn numeric text into real numbers.
    table_settings = {}
    for key, value in raw_settings.items():
        if isinstance(value, str):
            value = value.strip()
        if value is None or value == "" or value == []:
            continue
        table_settings[key] = _coerce_setting(value)

    table_num = int(table_num or 0)
    page_num = int(page_num or 0)

    if file_obj is None:
        raise ValueError("No PDF file was provided")
    # Support both a plain path string and an object carrying the path in `.name`.
    pdf_path = getattr(file_obj, "name", file_obj)

    with pdfplumber.open(pdf_path) as pdf:
        # Read everything needed from the PDF while it is still open.
        page_count = len(pdf.pages)
        page = pdf.pages[page_num]
        page_width = int(page.width)
        page_height = int(page.height)

        crop_x0 = int(crop_x0) if crop_x0 else 0
        crop_top = int(crop_top) if crop_top else 0
        crop_x1 = int(crop_x1) if crop_x1 else page_width
        crop_bottom = int(crop_bottom) if crop_bottom else page_height

        # Allow negative numbers: they count back from the far edge.
        if crop_bottom < 0:
            crop_bottom = page_height + crop_bottom
        if crop_x1 < 0:
            crop_x1 = page_width + crop_x1

        is_cropped = (crop_x0, crop_top, crop_x1, crop_bottom) != (
            0,
            0,
            page_width,
            page_height,
        )

        # Only crop if we need to!
        if is_cropped:
            page = page.crop((crop_x0, crop_top, crop_x1, crop_bottom))

        tables = page.extract_tables(table_settings)
        visual = page.to_image().debug_tablefinder(table_settings).annotated

    # Honour the requested table index; None when the page has no such table.
    table = tables[table_num] if 0 <= table_num < len(tables) else None

    base_filename = os.path.basename(pdf_path)

    notes = f"""
    - **Filename:** {base_filename}
    - **Pages:** {page_count}
    - **Page num {page_num}:**
        - **Full dimensions:** {page_width} x {page_height}
        - **Crop:** {crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}
        - **Tables found:** {len(tables)}

    ```python
    import pdfplumber
    pdf = pdfplumber.open("{base_filename}")
    page = pdf.pages[{page_num}]"""

    if is_cropped:
        notes += (
            f"\n    page = page.crop(({crop_x0}, {crop_top}, {crop_x1}, {crop_bottom}))"
        )

    notes += f"""\n
    table_settings = {pprint.pformat(table_settings, indent=8).strip()}
    tables = page.extract_tables(table_settings)
    table = tables[{table_num}]
    ```"""

    # Dedent BEFORE stripping: stripping first removes the first line's indent,
    # which leaves dedent with no common margin to remove.
    notes = textwrap.dedent(notes).strip()

    return [notes, visual, table]


def demo_subset(
    file_obj,
    page_num,
    table_num,
    vertical_strategy,
    horizontal_strategy,
    snap_y_tolerance,
    intersection_x_tolerance,
    crop_bottom,
):
    """Call ``table_debugger`` with only the eight settings given here.

    The positional order of the parameters is this function's interface;
    each one is forwarded by keyword, and every other ``table_debugger``
    option keeps its default.
    """
    forwarded = {
        "page_num": page_num,
        "table_num": table_num,
        "vertical_strategy": vertical_strategy,
        "horizontal_strategy": horizontal_strategy,
        "snap_y_tolerance": snap_y_tolerance,
        "intersection_x_tolerance": intersection_x_tolerance,
        "crop_bottom": crop_bottom,
    }
    return table_debugger(file_obj, **forwarded)


# --- Output components -------------------------------------------------------
# All components in this section are created at module level and placed into
# the page later via explicit `.render()` calls inside the `gr.Blocks` layout.
notes = gr.Markdown()
output_image = gr.Image()
data_table = gr.Dataframe(height=250, render=False, type='array', label='Found data')

# --- Crop inputs (free text; blank means "use the page edge") ----------------
crop_top = gr.Text(label="Crop (top)", placeholder="top", container=False, render=False)
crop_x0 = gr.Text(label="Crop (x0)", placeholder="left", container=False, render=False)
crop_x1 = gr.Text(
    label="Crop (x1)", placeholder="right (from page left)", container=False, render=False
)
crop_bottom = gr.Text(
    label="Crop (bottom)", placeholder="bottom (from page top)", container=False, render=False
)

# --- Table-finding strategies -------------------------------------------------
vertical_strategy = gr.Dropdown(
    label="Vertical Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)
horizontal_strategy = gr.Dropdown(
    label="Horizontal Strategy",
    choices=["lines", "lines_strict", "text", "explicit"],
    render=False,
    value="lines",
)

# --- Additional table settings (text boxes; placeholders are hints only) ------
explicit_vertical_lines = gr.Textbox(
    label="explicit_vertical_lines", render=False, placeholder="[]"
)
explicit_horizontal_lines = gr.Textbox(
    label="explicit_horizontal_lines", render=False, placeholder="[]"
)
snap_tolerance = gr.Textbox(label="Snap tolerance", placeholder="3", render=False)
snap_x_tolerance = gr.Textbox(label="Snap tolerance (x)", placeholder="3", render=False)
snap_y_tolerance = gr.Textbox(label="Snap tolerance (y)", placeholder="3", render=False)
join_tolerance = gr.Textbox(label="Join tolerance", placeholder="3", render=False)
join_x_tolerance = gr.Textbox(label="Join tolerance (x)", placeholder="3", render=False)
# Label previously read "(x)", duplicating the field above.
join_y_tolerance = gr.Textbox(label="Join tolerance (y)", placeholder="3", render=False)
text_tolerance = gr.Textbox(
    label="Text tolerance", placeholder="1", render=False, value=None
)
text_x_tolerance = gr.Textbox(label="Text tolerance (x)", placeholder="1", render=False)
text_y_tolerance = gr.Textbox(label="Text tolerance (y)", placeholder="1", render=False)
intersection_tolerance = gr.Textbox(
    label="Intersection tolerance", placeholder="1", render=False
)
intersection_x_tolerance = gr.Textbox(
    label="Intersection tolerance (x)", placeholder="1", render=False
)
intersection_y_tolerance = gr.Textbox(
    label="Intersection tolerance (y)", placeholder="1", render=False
)
edge_min_length = gr.Textbox(label="edge_min_length", placeholder="3", render=False)
min_words_vertical = gr.Textbox(
    label="min_words_vertical", placeholder="3", render=False
)
min_words_horizontal = gr.Textbox(
    label="min_words_horizontal", placeholder="1", render=False
)
keep_blank_chars = gr.Checkbox(label="Keep blank chars?", value=False)

# --- File and page/table selection --------------------------------------------
# File extensions are written with a leading dot (".pdf"), the documented form
# for `file_types`.
file = gr.File(label="PDF", type="filepath", file_types=[".pdf"], render=False)
page_num = gr.Number(
    label="Page number", value=0, info="It's an index: first is 0!", render=False
)
table_num = gr.Number(
    label="Table number", value=0, info="It's an index: first is 0!", render=False
)

# Sample PDFs live in an "examples" directory next to this file.
example_dir = Path(os.path.dirname(__file__)) / "examples"

# One row per example: the PDF path followed by page number, table number,
# vertical strategy, horizontal strategy, snap tolerance (y),
# intersection tolerance (x) and crop bottom. None leaves a field blank.
examples = [
    [str(example_dir / pdf_name), *settings]
    for pdf_name, *settings in (
        ("players.pdf", 0, 0, "text", "text", None, None, None),
        ("museums.pdf", 2, 0, "lines", "lines", None, None, None),
        ("background-checks.pdf", 0, 0, "text", "text", 5, 15, 487),
    )
]

# Page layout. The components were created earlier at module level; calling
# `.render()` here places each one at that point in the layout, so the order
# of the statements below determines the order on the page.
with gr.Blocks() as demo:
    # Intro text shown at the top of the page.
    gr.Markdown(
        """
# pdfplumber table extraction playground

[pdfplumber](https://github.com/jsvine/pdfplumber/) is a delightful library for processing PDFs, including table extraction. **Scroll down for examples and lots more settings!**

YouTube is full of [pdfplumber tutorials](https://www.youtube.com/results?search_query=pdfplumber), but for the notebook-lovers I recommend [this](https://github.com/jsvine/nicar-2023-pdfplumber-workshop) or [this](https://github.com/jsvine/lede-2023/tree/main/pdf-parsing/).
"""
    )

    with gr.Row():
        # Left column: file upload, main settings, crop box, Run button, notes.
        with gr.Column(scale=2):
            file.render()
            with gr.Accordion("Table details", open=True):
                with gr.Group():
                    with gr.Row():
                        page_num.render()
                        table_num.render()

                    with gr.Row():
                        vertical_strategy.render()
                        horizontal_strategy.render()

            with gr.Accordion("Crop", open=True):
                with gr.Group():
                    crop_top.render()
                    with gr.Row():
                        crop_x0.render()
                        crop_x1.render()
                    crop_bottom.render()

            btn = gr.Button(value="Run")
            # The inputs are passed positionally, so this list must stay in
            # the same order as the parameters of `table_debugger`.
            btn.click(
                table_debugger,
                inputs=[
                    file,
                    page_num,
                    table_num,
                    crop_x0,
                    crop_top,
                    crop_x1,
                    crop_bottom,
                    vertical_strategy,
                    horizontal_strategy,
                    explicit_vertical_lines,
                    explicit_horizontal_lines,
                    snap_tolerance,
                    snap_x_tolerance,
                    snap_y_tolerance,
                    join_tolerance,
                    join_x_tolerance,
                    join_y_tolerance,
                    text_tolerance,
                    text_x_tolerance,
                    text_y_tolerance,
                    intersection_tolerance,
                    intersection_x_tolerance,
                    intersection_y_tolerance,
                    edge_min_length,
                    min_words_vertical,
                    min_words_horizontal,
                    keep_blank_chars,
                ],
                outputs=[notes, output_image, data_table],
            )

            notes.render()

        # Right column: extracted table data above the annotated page image.
        with gr.Column(scale=3):
            data_table.render()
            output_image.render()

    # Example gallery. Each example row supplies only the eight inputs listed
    # here, in the parameter order of `demo_subset`.
    gr.Examples(
        examples=examples,
        inputs=[
            file,
            page_num,
            table_num,
            vertical_strategy,
            horizontal_strategy,
            snap_y_tolerance,
            intersection_x_tolerance,
            crop_bottom,
        ],
        outputs=[notes, output_image, data_table],
        fn=demo_subset,
        run_on_click=True,
    )

    # Remaining settings, split into two columns below the examples.
    gr.Markdown("## Additional options")
    with gr.Row():
        # Tolerance settings: each general value followed by its x/y pair.
        with gr.Column():
            with gr.Group():
                snap_tolerance.render()
                with gr.Row():
                    snap_x_tolerance.render()
                    snap_y_tolerance.render()
                join_tolerance.render()
                with gr.Row():
                    join_x_tolerance.render()
                    join_y_tolerance.render()
                text_tolerance.render()
                with gr.Row():
                    text_x_tolerance.render()
                    text_y_tolerance.render()
                intersection_tolerance.render()
                with gr.Row():
                    intersection_x_tolerance.render()
                    intersection_y_tolerance.render()

        # Explicit lines, edge/word minimums and the blank-chars checkbox.
        with gr.Column():
            with gr.Group():
                explicit_vertical_lines.render()
                explicit_horizontal_lines.render()
                edge_min_length.render()
                with gr.Row():
                    min_words_vertical.render()
                    min_words_horizontal.render()
                keep_blank_chars.render()

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()