Spaces:

reducto
/

rd_table_bench

Running

App Files Files Community

raunakdoesdev committed on Nov 5, 2024

Commit

ba455aa

1 Parent(s): f14efd0

finished

Browse files

Files changed (5) hide show

README.md +3 -3
_📄_README.py +0 -0
app.py +0 -4
pages/1_🔍_Explorer.py +136 -0
requirements.txt +2 -0

README.md CHANGED Viewed

@@ -1,11 +1,11 @@
 ---
-title: Rd Table Bench
-emoji: 😻
 colorFrom: green
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.39.0
-app_file: app.py
 pinned: false
 license: agpl-3.0
 short_description: Reducto's SOTA human annotated table benchmark.

 ---
+title: RD TableBench
+emoji: 📊
 colorFrom: green
 colorTo: gray
 sdk: streamlit
 sdk_version: 1.39.0
+app_file: _📄_README.py
 pinned: false
 license: agpl-3.0
 short_description: Reducto's SOTA human annotated table benchmark.

_📄_README.py ADDED Viewed

File without changes

app.py DELETED Viewed

@@ -1,4 +0,0 @@
-import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

pages/1_🔍_Explorer.py ADDED Viewed

	@@ -0,0 +1,136 @@

+from huggingface_hub import snapshot_download, hf_hub_download
+import streamlit as st
+import pandas as pd
+import matplotlib.pyplot as plt
+import os
+# Use the wide page layout; this is the first Streamlit command on the page.
+st.set_page_config(layout="wide")
+# Fetch providers/scores.csv from the reducto/rd-tablebench dataset repo.
+# `results` is what gets handed to pd.read_csv further down.
+results = hf_hub_download(
+    repo_id="reducto/rd-tablebench",
+    filename="providers/scores.csv",
+    repo_type="dataset",
+)
+# Emit a <style> block with rules for table, td and th elements (font,
+# collapsed borders, cell padding, and `white-space: pre`).
+st.html("""
+<style>
+table {
+  font-family: arial, sans-serif;
+  border-collapse: collapse;
+  white-space: pre;
+}
+td, th {
+  border: 1px solid #dddddd;
+  text-align: left;
+  padding: 8px;
+  font-weight: normal;
+}
+</style>
+""")
+@st.cache_resource(show_spinner="Loading dataset (can take ~3 min)")
+def load_dataset_1():
+    """Download a snapshot of the reducto/rd-tablebench dataset repo.
+
+    Returns whatever ``snapshot_download`` returns; the rest of the script
+    interpolates it into file paths as the dataset's root directory. The call
+    is wrapped in ``st.cache_resource`` and shows a spinner message while it
+    runs.
+    """
+    return snapshot_download(repo_id="reducto/rd-tablebench", repo_type="dataset")
+# Root of the downloaded dataset snapshot; used below to build file paths.
+dataset = load_dataset_1()
+# One row per benchmark document; the script reads "<provider>_score" and
+# "pdf_path" columns from it.
+df = pd.read_csv(results)
+# Index of the row currently shown; starts at 0 when not yet in session state.
+if "current_index" not in st.session_state:
+    st.session_state.current_index = 0
+# Navigation row: Previous | index input + Go | Next (relative widths 2:5:2).
+col1, col2, col3 = st.columns([2, 5, 2])
+# Previous button in col1
+with col1:
+    # Spacer line; presumably there to line the button up with the number
+    # input in the middle column (TODO confirm visually).
+    st.html("<br/>")
+    if st.button("⬅️ Previous", use_container_width=True):
+        # Only step back (and rerun) when not already on the first row.
+        if st.session_state.current_index > 0:
+            st.session_state.current_index -= 1
+            st.rerun()
+# Search box and Go button in col2
+with col2:
+    index_input = st.number_input(
+        "Index",
+        label_visibility="hidden",
+        min_value=0,
+        max_value=len(df) - 1,
+        value=st.session_state.current_index,
+        step=1,
+    )
+    # The typed index only takes effect when "Go" is pressed.
+    if st.button("Go", use_container_width=True):
+        st.session_state.current_index = int(index_input)
+        st.rerun()
+# Next button in col3
+with col3:
+    st.html("<br/>")
+    if st.button("Next ➡️", use_container_width=True):
+        # Only step forward (and rerun) when not already on the last row.
+        if st.session_state.current_index < len(df) - 1:
+            st.session_state.current_index += 1
+            st.rerun()
+col1, col2 = st.columns([1, 2])
+providers = [
+    "reducto",
+    "azure",
+    "textract",
+    "gcloud",
+    "unstructured",
+    "gpt4o",
+    "chunkr",
+]
+with col1:
+    row = df.iloc[st.session_state.current_index]
+    # Extract scores
+    scores = [
+        row[f"{p}_score"] if row[f"{p}_score"] is not None else 0 for p in providers
+    ]
+    fig, ax = plt.subplots(figsize=(6, 10))
+    bars = ax.barh(providers[::-1], scores[::-1])
+    # Customize plot
+    ax.set_title("Provider Scores Comparison")
+    ax.set_ylabel("Providers")
+    ax.set_xlabel("Scores")
+    ax.set_xlim(0, 1.1)
+    for bar in bars:
+        width = bar.get_width()
+        ax.text(
+            width,
+            bar.get_y() + bar.get_height() / 2.0,
+            f"{width:.3f}",
+            ha="left",
+            va="center",
+        )
+    plt.tight_layout()
+    st.pyplot(fig)
+with col2:
+    image_path = f"{dataset}/_images/{row['pdf_path'].replace('.pdf', '.jpg')}"
+    st.image(image_path, use_column_width=True)
+st.subheader("Groundtruth")
+# Passes a file path string, not markup, to st.html.
+# NOTE(review): relies on st.html loading HTML from a path; confirm for the
+# pinned Streamlit version. No existence check is done for this file.
+st.html(f"{dataset}/groundtruth/{row['pdf_path'].replace('.pdf', '.html')}")
+st.subheader("Provider Outputs")
+# One collapsible section per provider, in the order of the providers list.
+for p in providers:
+    with st.expander(p):
+        # Provider output mirrors the PDF's relative path with a .html suffix.
+        provider_html = (
+            f"{dataset}/providers/{p}/{row['pdf_path'].replace('.pdf', '.html')}"
+        )
+        # A missing file is reported as a provider failure for this image.
+        if os.path.exists(provider_html):
+            st.html(provider_html)
+        else:
+            st.error(f"{p} failed to produce a table output for this image")

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ pandas
2	+ huggingface-hub