# Source: Hugging Face Space reducto/rd_table_bench (status: Running; file size: 3,639 bytes).

from huggingface_hub import hf_hub_download
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import os
import zipfile
import shutil

# Use the wide page layout for the whole app.
st.set_page_config(layout="wide")

# Fetch the benchmark archive from the Hugging Face Hub (dataset repo
# "reducto/rd-tablebench"). `results` is read as a module-level name by
# unzip_dataset() and is later rebound to the scores CSV path.
with st.spinner("Downloading dataset"):
    results = hf_hub_download(
        repo_id="reducto/rd-tablebench",
        filename="rd-tablebench.zip",
        repo_type="dataset",
    )


def unzip_dataset():
    """Extract the downloaded archive once and return the dataset root path.

    The archive path is read from the module-level ``results`` name. If the
    ``unzipped_dataset`` directory already exists, extraction is skipped.

    Returns:
        The relative path ``"unzipped_dataset/rd-tablebench"``.

    Raises:
        Whatever ``zipfile.ZipFile`` / ``extractall`` raises (for example
        ``zipfile.BadZipFile`` or ``OSError``). The partially written target
        directory is removed before the exception propagates.
    """
    target_dir = "unzipped_dataset"
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
        try:
            with st.spinner("Unzipping dataset"):
                with zipfile.ZipFile(results, "r") as zip_ref:
                    zip_ref.extractall(target_dir)
        except BaseException:
            # The existence of target_dir is the "already extracted" marker,
            # so a failed or interrupted extraction must not leave it behind;
            # otherwise every later run would skip extraction and see a
            # broken dataset.
            shutil.rmtree(target_dir, ignore_errors=True)
            raise
    return "unzipped_dataset/rd-tablebench"


# Manual reset: delete the extracted copy and rerun so that the next pass
# through the script extracts the archive again.
if st.button("Redo Unzip"):
    if os.path.exists("unzipped_dataset"):
        shutil.rmtree("unzipped_dataset")
        st.rerun()


dataset = unzip_dataset()

# NOTE: from here on `results` is the scores CSV path, not the archive path.
results = f"{dataset}/providers/scores.csv"

# Explicit runtime check instead of `assert`, which is stripped when Python
# runs with -O. The "Redo Unzip" button above is already rendered at this
# point, so the user can recover from a bad extraction.
if not os.path.exists(results):
    st.error(f"Scores file not found: {results}")
    st.stop()

# Inject page-wide CSS for <table>/<td>/<th> elements: collapsed 1px borders,
# left-aligned cells with padding, normal font weight, and preserved
# whitespace inside tables.
st.html("""
<style>
table {
  font-family: arial, sans-serif;
  border-collapse: collapse;
  white-space: pre;
}

td, th {
  border: 1px solid #dddddd;
  text-align: left;
  padding: 8px;
  font-weight: normal;
}

</style>
""")


# Load the scores table; the viewer pages through its rows by position.
df = pd.read_csv(results)

# Position of the row being shown, kept in session state across reruns.
if "current_index" not in st.session_state:
    st.session_state.current_index = 0

last_index = len(df) - 1
prev_col, jump_col, next_col = st.columns([2, 5, 2])

# Left column: step back one row, stopping at the first one.
with prev_col:
    st.html("<br/>")  # spacer above the button (presumably for alignment)
    go_back = st.button("⬅️ Previous", use_container_width=True)
    if go_back and st.session_state.current_index > 0:
        st.session_state.current_index -= 1
        st.rerun()

# Middle column: numeric index box plus a "Go" button that jumps to it.
with jump_col:
    requested_index = st.number_input(
        "Index",
        label_visibility="hidden",
        min_value=0,
        max_value=last_index,
        value=st.session_state.current_index,
        step=1,
    )

    jump = st.button("Go", use_container_width=True)
    if jump:
        st.session_state.current_index = int(requested_index)
        st.rerun()

# Right column: step forward one row, stopping at the last one.
with next_col:
    st.html("<br/>")  # spacer above the button (presumably for alignment)
    go_forward = st.button("Next ➡️", use_container_width=True)
    if go_forward and st.session_state.current_index < last_index:
        st.session_state.current_index += 1
        st.rerun()


col1, col2 = st.columns([1, 2])

# Providers to compare; each is expected to have a "<name>_score" column in
# the scores table and an output folder under "<dataset>/providers/<name>/".
providers = [
    "reducto",
    "azure",
    "textract",
    "gcloud",
    "unstructured",
    "gpt4o",
    "chunkr",
]

with col1:
    row = df.iloc[st.session_state.current_index]

    # pandas reads missing CSV cells as NaN, which an `is not None` test does
    # not catch; pd.isna covers both NaN and None. Missing scores plot as 0.
    scores = [
        0 if pd.isna(row[f"{p}_score"]) else row[f"{p}_score"] for p in providers
    ]

    fig, ax = plt.subplots(figsize=(6, 10))
    # barh draws its first entry at the bottom, so reverse both lists to show
    # the providers top-to-bottom in the order listed above.
    bars = ax.barh(providers[::-1], scores[::-1])

    # Customize plot
    ax.set_title("Provider Scores Comparison")
    ax.set_ylabel("Providers")
    ax.set_xlabel("Scores")
    # Axis runs to 1.1 instead of 1.0 (presumably to leave room for the
    # value labels at the end of the bars).
    ax.set_xlim(0, 1.1)

    # Print each bar's value just past its right end.
    for bar in bars:
        width = bar.get_width()
        ax.text(
            width,
            bar.get_y() + bar.get_height() / 2.0,
            f"{width:.3f}",
            ha="left",
            va="center",
        )

    fig.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates alive until it is closed; without
    # this, each Streamlit rerun would leak one more figure.
    plt.close(fig)
with col2:
    # The page image path is the row's pdf_path with ".pdf" replaced by ".jpg",
    # under "<dataset>/_images/".
    # NOTE(review): `use_column_width` is reported as deprecated in newer
    # Streamlit releases in favour of `use_container_width` - confirm against
    # the pinned Streamlit version before changing.
    image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
    st.image(image_path, use_column_width=True)

# Raw record for the selected row.
st.write(row)

# Ground-truth table. st.html is handed a file path here (Streamlit is
# expected to load the file's contents - confirm for the pinned version).
st.subheader("Groundtruth")
html_name = row['pdf_path'].replace('.pdf', '.html')
st.html(f"{dataset}/groundtruth/{html_name}")

# One collapsible section per provider, showing its HTML output for this row
# or an error when no output file exists.
st.subheader("Provider Outputs")
for p in providers:
    with st.expander(p):
        provider_html = f"{dataset}/providers/{p}/{html_name}"
        if not os.path.exists(provider_html):
            st.error(f"{p} failed to produce a table output for this image")
        else:
            st.html(provider_html)