raunakdoesdev committed on
Commit
ba455aa
·
1 Parent(s): f14efd0
Files changed (5) hide show
  1. README.md +3 -3
  2. _📄_README.py +0 -0
  3. app.py +0 -4
  4. pages/1_🔍_Explorer.py +136 -0
  5. requirements.txt +2 -0
README.md CHANGED
@@ -1,11 +1,11 @@
1
  ---
2
- title: Rd Table Bench
3
- emoji: 😻
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
- app_file: app.py
9
  pinned: false
10
  license: agpl-3.0
11
  short_description: Reducto's SOTA human annotated table benchmark.
 
1
  ---
2
+ title: RD TableBench
3
+ emoji: 📊
4
  colorFrom: green
5
  colorTo: gray
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
+ app_file: _📄_README.py
9
  pinned: false
10
  license: agpl-3.0
11
  short_description: Reducto's SOTA human annotated table benchmark.
_📄_README.py ADDED
File without changes
app.py DELETED
@@ -1,4 +0,0 @@
1
- import streamlit as st
2
-
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
pages/1_πŸ”_Explorer.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import snapshot_download, hf_hub_download
2
+ import streamlit as st
3
+ import pandas as pd
4
+ import matplotlib.pyplot as plt
5
+ import os
6
+
7
# Page-level setup: layout, the score table download, and table styling.
st.set_page_config(layout="wide")

# Fetch only the per-document scores CSV from the dataset repo. The value
# returned here is what gets passed to pd.read_csv further down.
results = hf_hub_download(
    repo_type="dataset",
    repo_id="reducto/rd-tablebench",
    filename="providers/scores.csv",
)

# Stylesheet injected into the page; presumably it is meant to style the
# HTML tables that are rendered through st.html later in this script.
_TABLE_STYLE = """
<style>
table {
font-family: arial, sans-serif;
border-collapse: collapse;
white-space: pre;
}

td, th {
border: 1px solid #dddddd;
text-align: left;
padding: 8px;
font-weight: normal;
}

</style>
"""
st.html(_TABLE_STYLE)
32
+
33
+
34
@st.cache_resource(show_spinner="Loading dataset (can take ~3 min)")
def load_dataset_1():
    """Download a snapshot of the ``reducto/rd-tablebench`` dataset repo.

    Returns:
        The value returned by ``snapshot_download``; the rest of the script
        uses it as the directory prefix for image and HTML file paths.
    """
    snapshot_dir = snapshot_download(
        repo_type="dataset",
        repo_id="reducto/rd-tablebench",
    )
    return snapshot_dir
37
+
38
+
39
# Navigation state and controls: which row of the score table is on display.
dataset = load_dataset_1()

df = pd.read_csv(results)

# `current_index` is the position (into `df`) of the example being shown.
# It lives in session state so it survives Streamlit's script reruns.
if "current_index" not in st.session_state:
    st.session_state.current_index = 0

col1, col2, col3 = st.columns([2, 5, 2])

# NOTE(review): the original indentation was lost in extraction; st.rerun()
# is assumed to sit inside the bounds check of each arrow button.

# Previous button in col1
with col1:
    # Spacer; presumably lines the button up with the number input, whose
    # (hidden) label still takes vertical space.
    st.html("<br/>")
    # The label emoji was mis-encoded ("⬅️"); restored to the left arrow.
    if st.button("⬅️ Previous", use_container_width=True):
        if st.session_state.current_index > 0:
            st.session_state.current_index -= 1
            st.rerun()

# Search box and Go button in col2
with col2:
    index_input = st.number_input(
        "Index",
        label_visibility="hidden",
        min_value=0,
        max_value=len(df) - 1,
        value=st.session_state.current_index,
        step=1,
    )

    if st.button("Go", use_container_width=True):
        st.session_state.current_index = int(index_input)
        st.rerun()

# Next button in col3
with col3:
    st.html("<br/>")
    # The label emoji was mis-encoded ("➑️"); restored to the right arrow.
    if st.button("Next ➡️", use_container_width=True):
        if st.session_state.current_index < len(df) - 1:
            st.session_state.current_index += 1
            st.rerun()
77
+
78
+
79
# Two-column body: score chart on the left (col1); col2 is filled below.
col1, col2 = st.columns([1, 2])

# Providers compared for every example. Each one has a `<name>_score`
# column in the scores CSV and a `providers/<name>/` folder in the dataset.
providers = [
    "reducto",
    "azure",
    "textract",
    "gcloud",
    "unstructured",
    "gpt4o",
    "chunkr",
]

with col1:
    row = df.iloc[st.session_state.current_index]

    # Extract scores. pd.read_csv represents an empty cell as NaN, not None,
    # so an `is not None` test never fires; pd.isna catches both and lets a
    # missing score be plotted as 0.
    scores = [
        0 if pd.isna(row[f"{p}_score"]) else row[f"{p}_score"] for p in providers
    ]

    fig, ax = plt.subplots(figsize=(6, 10))
    # barh stacks categories bottom-to-top, so reverse both lists to show
    # the providers top-to-bottom in their declared order.
    bars = ax.barh(providers[::-1], scores[::-1])

    # Customize plot
    ax.set_title("Provider Scores Comparison")
    ax.set_ylabel("Providers")
    ax.set_xlabel("Scores")
    # Presumably scores lie in [0, 1]; the extra 0.1 leaves room for labels.
    ax.set_xlim(0, 1.1)

    # Print each bar's value just past its right-hand end.
    for bar in bars:
        width = bar.get_width()
        ax.text(
            width,
            bar.get_y() + bar.get_height() / 2.0,
            f"{width:.3f}",
            ha="left",
            va="center",
        )

    plt.tight_layout()
    st.pyplot(fig)
    # pyplot keeps every figure it creates until it is closed; without this
    # a new figure would pile up on each script rerun.
    plt.close(fig)
120
# Files for one example share the PDF's relative path, differing only in
# folder and extension; derive both variants once.
pdf_rel_path = row["pdf_path"]
jpg_rel_path = pdf_rel_path.replace(".pdf", ".jpg")
html_rel_path = pdf_rel_path.replace(".pdf", ".html")

with col2:
    image_path = f"{dataset}/_images/{jpg_rel_path}"
    st.image(image_path, use_column_width=True)

# NOTE(review): the original indentation was lost in extraction. The two
# sections below are assumed to render at full page width rather than
# inside col2; confirm against the real file.
st.subheader("Groundtruth")
groundtruth_html = f"{dataset}/groundtruth/{html_rel_path}"
st.html(groundtruth_html)

st.subheader("Provider Outputs")
for provider in providers:
    with st.expander(provider):
        provider_html = f"{dataset}/providers/{provider}/{html_rel_path}"
        # A provider with no output file for this example gets an error
        # message instead of a table.
        if not os.path.exists(provider_html):
            st.error(f"{provider} failed to produce a table output for this image")
        else:
            st.html(provider_html)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ pandas
2
+ huggingface-hub