add content from extract-leaderboard-gradio
- .gitignore +3 -0
- app.py +10 -4
- extractors/model.py +282 -0
- extractors/model_runner.py +36 -0
- leaderboard/vote.py +11 -0
- postprocessors/postprocessor.py +40 -0
- preprocessors/preprocessor.py +43 -0
- requirements.txt +11 -0
- tabs/arena_sxs.py +113 -0
- tabs/leaderboard.py +7 -0
- utils/pdf_utils.py +22 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
+.env
+__pycache__
+poetry.lock
app.py CHANGED
@@ -1,7 +1,13 @@
 import gradio as gr
+from tabs.arena_sxs import arena_sxs
+from tabs.leaderboard import leaderboard
 
+# Create the Gradio interface with tabs
+with gr.Blocks() as demo:
+    with gr.Tab("Parser Arena"):
+        arena_sxs()
+    with gr.Tab("Leaderboard"):
+        leaderboard()
 
+# Launch the app
-demo.launch()
+demo.launch(share=True)
extractors/model.py ADDED
@@ -0,0 +1,282 @@
+import base64
+import json
+import os
+import requests
+
+import anthropic
+import openai
+from dotenv import load_dotenv
+from pathlib import Path
+from llama_parse import LlamaParse
+from llama_index.core import SimpleDirectoryReader
+from unstructured.partition.auto import partition
+from preprocessors.preprocessor import PdfPreprocessor
+from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor
+
+load_dotenv()
+
+
+class Model:
+    BASE_URL: str | None = None
+    API_KEY: str | None = None
+    MODEL: str | None = None
+
+    def __init_subclass__(cls) -> None:
+        """Initialize subclass."""
+        super().__init_subclass__()
+
+    def __init__(self):
+        """Init self."""
+
+    def extract(self, file_path: str) -> str:
+        """Extract a file's content as markdown.
+
+        Args:
+            file_path: path to the file to extract
+
+        Returns:
+            str: output markdown
+        """
+        raise NotImplementedError("Model extract method is not implemented")
+
+
+class AnyParserModel(Model):
+    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
+    API_KEY = os.getenv("ANYPARSER_RT_API_KEY")
+
+    def extract(self, file_path: str) -> str:
+        """Extract data in real time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+
+        Returns:
+            str: The extracted data, or an error message.
+        """
+        file_extension = Path(file_path).suffix.lower().lstrip(".")
+
+        # Check if the file exists
+        if not Path(file_path).is_file():
+            return "Error: File does not exist"
+
+        if file_extension in ["pdf", "docx"]:
+            # Encode the file content in base64
+            with open(file_path, "rb") as file:
+                encoded_file = base64.b64encode(file.read()).decode("utf-8")
+        else:
+            return "Error: Unsupported file type"
+
+        # Create the JSON payload
+        payload = {
+            "file_content": encoded_file,
+            "file_type": file_extension,
+        }
+
+        # Set the headers
+        headers = {
+            "Content-Type": "application/json",
+            "x-api-key": self.API_KEY,
+        }
+
+        # Send the POST request
+        response = requests.post(
+            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
+        )
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            try:
+                response_data = response.json()
+                markdown_text = "\n".join(response_data["markdown"])
+                return markdown_text
+            except json.JSONDecodeError:
+                return f"Error: Invalid JSON response: {response.text}"
+        else:
+            return f"Error: {response.status_code}: {response.text}"
+
+
+class LlamaParseModel(Model):
+    BASE_URL = None
+    API_KEY = os.getenv("LLAMA_CLOUD_API_KEY")
+
+    def __init__(self):
+        """Init."""
+        super().__init__()
+        if not self.API_KEY:
+            raise ValueError(
+                "The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable."
+            )
+
+    def extract(self, file_path: str) -> str:
+        """Extract data in real time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+
+        Returns:
+            str: The extracted data.
+        """
+        try:
+            parser = LlamaParse(
+                result_type="markdown",
+                num_workers=4,
+                verbose=True,
+                language="en",
+            )
+
+            file_extractor = {".pdf": parser}
+            documents = SimpleDirectoryReader(
+                input_files=[file_path], file_extractor=file_extractor
+            ).load_data()
+
+            markdown = "\n\n".join([doc.text for doc in documents])
+
+            return markdown
+        except Exception as e:
+            print(f"Error processing input: {str(e)}")
+            return f"Error processing with LlamaParse: {str(e)}"
+
+
+class UnstructuredModel(Model):
+    BASE_URL = None
+    API_KEY = None
+
+    def __init__(self):
+        """Init."""
+        super().__init__()
+
+    def extract(self, file_path: str) -> str:
+        """Extract data in real time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+
+        Returns:
+            str: The extracted data.
+        """
+        try:
+            elements = partition(file_path)
+            parsed_text = "\n".join(str(element) for element in elements)
+            markdown = parsed_text if parsed_text else "No content parsed"
+            return markdown
+        except Exception as e:
+            return f"Error processing UnstructuredModel: {str(e)}"
+
+
+class GPTModel(Model):
+    BASE_URL = None
+    API_KEY = os.getenv("OPENAI_API_KEY")
+    MODEL = "gpt-4o-mini"
+    REQUIRES_OPENAI = True
+
+    def __init__(self):
+        """Init."""
+        super().__init__()
+        if not self.API_KEY:
+            raise ValueError(
+                "The API key is required. Please set the OPENAI_API_KEY environment variable."
+            )
+        self._client = openai.OpenAI(api_key=self.API_KEY)
+
+    def extract(self, file_path: str) -> str:
+        """Extract data in real time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+
+        Returns:
+            str: The extracted data.
+        """
+        pdf_preprocessor = PdfPreprocessor()
+        gpt_postprocessor = GPTPostprocessor()
+        file_contents = pdf_preprocessor.run(file_path)
+
+        # One image part per PDF page, passed as a base64 data URL
+        contents = []
+        for content in file_contents:
+            contents.append(
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{content}",
+                    },
+                }
+            )
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Convert this image to markdown"},
+                    *contents,
+                ],
+            }
+        ]
+
+        response = self._client.chat.completions.create(
+            model=self.MODEL,
+            messages=messages,
+        )
+
+        return gpt_postprocessor.run(response.choices[0].message.content)
+
+
+class ClaudeModel(Model):
+    BASE_URL = "http://103.114.163.134:3000/v1/"
+    API_KEY = os.getenv("ANTHROPIC_API_KEY")
+    MODEL = "claude-3-5-sonnet-20240620"
+    REQUIRES_OPENAI = True
+
+    def __init__(self):
+        """Init."""
+        super().__init__()
+        if not self.API_KEY:
+            raise ValueError(
+                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
+            )
+        self._client = anthropic.Anthropic(
+            api_key=self.API_KEY,
+        )
+
+    def extract(self, file_path: str) -> str:
+        """Extract data in real time.
+
+        Args:
+            file_path (str): The path to the file to be parsed.
+
+        Returns:
+            str: The extracted data.
+        """
+        prompt = "Convert this image to markdown."
+        pdf_preprocessor = PdfPreprocessor()
+        claude_postprocessor = ClaudePostprocessor()
+        file_contents = pdf_preprocessor.run(file_path)
+
+        # One image part per PDF page, passed as raw base64
+        contents = []
+        for content in file_contents:
+            contents.append(
+                {
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": content,
+                    },
+                }
+            )
+
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": prompt},
+                    *contents,
+                ],
+            }
+        ]
+
+        try:
+            response = self._client.messages.create(
+                model=self.MODEL, max_tokens=1024, messages=messages
+            )
+            print(response.content[0].text)
+            return claude_postprocessor.run(response.content[0].text)
+        except Exception as e:
+            return f"Error processing ClaudeModel: {str(e)}"
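Every extractor implements the same `extract(file_path) -> str` contract, so callers can swap parsers freely. A minimal usage sketch, assuming `LLAMA_CLOUD_API_KEY` is set in the environment and `sample.pdf` is a hypothetical local file:

```python
# Illustrative sketch, not part of the commit.
# Assumes LLAMA_CLOUD_API_KEY is set; "sample.pdf" is a hypothetical path.
from extractors.model import LlamaParseModel

model = LlamaParseModel()              # raises ValueError if the key is missing
markdown = model.extract("sample.pdf")
print(markdown[:500])                  # preview the first 500 characters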
extractors/model_runner.py ADDED
@@ -0,0 +1,36 @@
+import concurrent.futures
+
+from extractors.model import (
+    AnyParserModel,
+    ClaudeModel,
+    GPTModel,
+    LlamaParseModel,
+    UnstructuredModel,
+)
+
+ap_rt = AnyParserModel()
+lp = LlamaParseModel()
+un = UnstructuredModel()
+gpt = GPTModel()
+claude = ClaudeModel()
+
+model_function_map = {
+    "AnyParser": ap_rt.extract,
+    "LlamaParse": lp.extract,
+    "Unstructured": un.extract,
+    "GPT-4o-mini": gpt.extract,
+    "Claude-3.5-Sonnet": claude.extract,
+}
+
+models = list(model_function_map)
+
+
+def run_extract(model, file_path):
+    print("Running extract: model", model, "file_path", file_path)
+    extractor = model_function_map[model]
+    markdown = extractor(file_path)
+    return markdown
+
+
+def run_extract_parallel(model_a, model_b, pdf):
+    with concurrent.futures.ThreadPoolExecutor() as executor:
+        # Submit tasks to the executor for parallel execution
+        future_a = executor.submit(run_extract, model_a, pdf)
+        future_b = executor.submit(run_extract, model_b, pdf)
+
+        # Get the results as they complete
+        result_a = future_a.result()
+        result_b = future_b.result()
+
+    return result_a, result_b
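`run_extract_parallel` fans the two chosen extractors out on a thread pool, which suits these I/O-bound API calls. An illustrative invocation, assuming the relevant API keys are set and `sample.pdf` is a hypothetical path:

```python
# Illustrative sketch: compare two parsers on one file.
# Model names must be keys of model_function_map; "sample.pdf" is hypothetical.
from extractors.model_runner import run_extract_parallel

md_a, md_b = run_extract_parallel("LlamaParse", "Unstructured", "sample.pdf")
print(len(md_a), len(md_b))
```

Threads (rather than processes) are enough here because each extractor spends its time waiting on network or disk I/O.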
leaderboard/vote.py ADDED
@@ -0,0 +1,11 @@
+from enum import Enum
+
+
+class Vote(Enum):
+    GOOD = "GOOD"
+    BAD = "BAD"
+    NEUTRAL = "NEUTRAL"
+
+
+def vote_for_model(model_a: str, model_a_vote: Vote, model_b: str, model_b_vote: Vote):
+    print(f"Voting for {model_a}: {model_a_vote} | Voting for {model_b}: {model_b_vote}")
+    # Return empty strings so the caller can clear both output panes after a vote
+    return "", ""
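For now `vote_for_model` only logs the outcome and returns two empty strings, which the arena tab wires back to the output panes to clear them. An illustrative call:

```python
# Illustrative sketch: record an "A is better" outcome.
from leaderboard.vote import Vote, vote_for_model

output_a, output_b = vote_for_model("LlamaParse", Vote.GOOD, "Unstructured", Vote.NEUTRAL)
assert (output_a, output_b) == ("", "")  # both output panes are cleared
```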
postprocessors/postprocessor.py ADDED
@@ -0,0 +1,40 @@
+class Postprocessor:
+    """Postprocessor."""
+
+    def run(self, text: str) -> str:
+        """Postprocess."""
+        raise NotImplementedError("Postprocess method is not implemented")
+
+
+class ClaudePostprocessor(Postprocessor):
+    """Claude Postprocessor."""
+
+    def run(self, text: str) -> str:
+        """Clean the response from the Claude model.
+
+        Args:
+            text (str): The response from the Claude model.
+
+        Returns:
+            str: The cleaned response.
+        """
+        # Remove the ```markdown and ``` fences at the beginning and end of the text
+        text = text.replace("```markdown", "").replace("```", "")
+        # Remove any leading or trailing whitespace
+        text = text.strip()
+        return text
+
+
+class GPTPostprocessor(Postprocessor):
+    """GPT Postprocessor."""
+
+    def run(self, text: str) -> str:
+        """Clean the response from the GPT model.
+
+        Args:
+            text (str): The response from the GPT model.
+
+        Returns:
+            str: The cleaned response.
+        """
+        # Remove the ```markdown and ``` fences at the beginning and end of the text
+        text = text.replace("```markdown", "").replace("```", "")
+        # Remove any leading or trailing whitespace
+        text = text.strip()
+        return text
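Both postprocessors strip the Markdown code fence that chat models often wrap around their answer. A quick illustrative check:

```python
# Illustrative sketch: strip the fence a model wraps around its markdown.
from postprocessors.postprocessor import GPTPostprocessor

fence = "`" * 3  # built this way only to avoid a literal fence in this example
raw = f"{fence}markdown\n# Title\n\nSome text\n{fence}"
print(GPTPostprocessor().run(raw))  # -> "# Title\n\nSome text"
```

Note that `str.replace` removes every fence in the text, including any that belong to code blocks inside the document body, not just the outer wrapper.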
preprocessors/preprocessor.py ADDED
@@ -0,0 +1,43 @@
+import base64
+import io
+from typing import Any
+
+import pdf2image
+
+
+class Preprocessor:
+    """Preprocessor."""
+
+    def run(self, file_path: str) -> Any:
+        """Preprocess."""
+        raise NotImplementedError("Preprocess method is not implemented")
+
+
+# Convert PDF pages to images
+class PdfPreprocessor(Preprocessor):
+    """PDF Preprocessor."""
+
+    def run(self, file_path: str) -> list[str]:
+        # Process all pages and return a list of base64-encoded images
+        images = pdf2image.convert_from_path(file_path)
+        image_list = []
+
+        for image in images:
+            # Convert image to base64
+            with io.BytesIO() as buffer:
+                image.save(buffer, format="JPEG")
+                image_content = buffer.getvalue()
+
+            file_content = base64.b64encode(image_content).decode("utf-8")
+            image_list.append(file_content)
+
+        return image_list
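`PdfPreprocessor.run` converts every page of the PDF into a base64-encoded JPEG string, which the GPT and Claude extractors attach as image parts. Note that `pdf2image` requires the poppler system package at runtime. An illustrative call, with `sample.pdf` as a hypothetical path:

```python
# Illustrative sketch: one base64 JPEG per PDF page (pdf2image needs poppler installed).
from preprocessors.preprocessor import PdfPreprocessor

pages = PdfPreprocessor().run("sample.pdf")  # hypothetical path
print(f"{len(pages)} page(s); first page is {len(pages[0])} base64 chars")
```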
requirements.txt ADDED
@@ -0,0 +1,11 @@
+gradio==4.41.0
+pillow==10.4.0
+pymupdf==1.24.9
+python-dotenv==1.0.1
+llama-index-core==0.10.68.post1
+llama-parse==0.4.9
+llama-index-readers-file==0.1.33
+unstructured[pdf]==0.15.7
+openai==1.42.0
+pdf2image==1.17.0
+anthropic==0.34.1
tabs/arena_sxs.py ADDED
@@ -0,0 +1,113 @@
+import gradio as gr
+
+from utils.pdf_utils import load_pdf, update_page
+from extractors.model_runner import models, run_extract_parallel
+from leaderboard.vote import vote_for_model, Vote
+
+
+def update_dropdowns(model_a_choice, model_b_choice):
+    # Keep model B's choices distinct from model A's selection
+    options_b = [m for m in models if m != model_a_choice]
+    if model_a_choice == model_b_choice:
+        return gr.update(choices=options_b, value=options_b[0] if options_b else None)
+    else:
+        return gr.update(choices=options_b, value=model_b_choice)
+
+
+def sync_models(model_a_choice, model_b_choice):
+    updated_dropdown_b = update_dropdowns(model_a_choice, model_b_choice)
+    return updated_dropdown_b
+
+
+def dummy_function_a(model_a_choice):
+    return f"Model A selected: {model_a_choice}"
+
+
+def dummy_function_b(model_b_choice):
+    return f"Model B selected: {model_b_choice}"
+
+
+def update_button(file):
+    return gr.update(interactive=bool(file))
+
+
+def update_vote_button(output):
+    is_active = bool(output)
+    return [gr.update(interactive=is_active) for _ in range(4)]
+
+
+def arena_sxs():
+    with gr.Blocks() as arena_block:
+        gr.Markdown("# Rules")
+        gr.Markdown("- Upload a PDF file to extract with two chosen models (e.g., Llama, Unstructured, ChatGPT, Claude) and vote for the better one!")
+        gr.Markdown("- You can upload multiple files until you identify a winner.")
+
+        gr.Markdown("## 1. Upload a file.")
+        gr.Markdown("Only PDF files supported.")
+        with gr.Row():
+            with gr.Column(scale=2):
+                pdf = gr.File(type="filepath", label="Upload PDF", file_types=[".pdf"])
+                pdf_image = gr.Image(label="PDF Page")
+                page_info = gr.Textbox(label="")
+                current_page = gr.State(value=0)
+                with gr.Row():
+                    prev_button = gr.Button("Previous")
+                    next_button = gr.Button("Next")
+
+        gr.Markdown("---")
+        gr.Markdown("## 2. Choose two models to compare")
+
+        with gr.Blocks():
+            with gr.Row():
+                model_a = gr.Dropdown(choices=models, value=models[0], label="")
+                model_b = gr.Dropdown(choices=[m for m in models if m != models[0]], value=models[1], label="")
+
+            with gr.Row():
+                output_a = gr.Markdown(height=400)
+                output_b = gr.Markdown(height=400)
+
+        with gr.Row():
+            extract_button = gr.Button("Extract", interactive=False)
+
+        with gr.Row():
+            vote_model_a_button = gr.Button("👈 A is better", interactive=False)
+            vote_model_b_button = gr.Button("👉 B is better", interactive=False)
+            vote_tie_button = gr.Button("🤝 Tie", interactive=False)
+            vote_bad_button = gr.Button("👎 Both are bad", interactive=False)
+
+        # Change handlers
+        pdf.change(load_pdf, inputs=[pdf], outputs=[pdf_image, page_info, current_page])
+        pdf.change(fn=update_button, inputs=pdf, outputs=extract_button)
+        model_a.change(sync_models, inputs=[model_a, model_b], outputs=model_b)
+        model_a.change(dummy_function_a, inputs=model_a)
+        model_b.change(dummy_function_b, inputs=model_b)
+        output_a.change(fn=update_vote_button, inputs=output_a, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
+        output_b.change(fn=update_vote_button, inputs=output_b, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
+
+        # Button handlers
+        prev_button.click(
+            fn=lambda file_path, page: update_page(file_path, page, -1),
+            inputs=[pdf, current_page],
+            outputs=[pdf_image, page_info, current_page],
+        )
+
+        next_button.click(
+            fn=lambda file_path, page: update_page(file_path, page, 1),
+            inputs=[pdf, current_page],
+            outputs=[pdf_image, page_info, current_page],
+        )
+
+        extract_button.click(
+            fn=run_extract_parallel,
+            inputs=[model_a, model_b, pdf],
+            outputs=[output_a, output_b],
+        )
+
+        def clear_outputs():
+            return "", ""
+
+        extract_button.click(
+            fn=clear_outputs,
+            outputs=[output_a, output_b],
+        )
+
+        vote_model_a_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.GOOD, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
+        vote_model_b_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.GOOD), inputs=[model_a, model_b], outputs=[output_a, output_b])
+        vote_tie_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
+        vote_bad_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.BAD, model_b, Vote.BAD), inputs=[model_a, model_b], outputs=[output_a, output_b])
+
+    return arena_block
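Because `arena_sxs` returns the `gr.Blocks` it builds, the tab can also be served on its own while iterating on it, without going through `app.py`. A sketch:

```python
# Illustrative sketch: launch only the arena tab during development.
from tabs.arena_sxs import arena_sxs

demo = arena_sxs()
demo.launch()
```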
tabs/leaderboard.py ADDED
@@ -0,0 +1,7 @@
+import gradio as gr
+
+
+def leaderboard():
+    with gr.Blocks() as leaderboard_block:
+        gr.Markdown("# Leaderboard")
+        gr.Markdown("## ✨ Coming Soon")
+    return leaderboard_block
utils/pdf_utils.py ADDED
@@ -0,0 +1,22 @@
+import fitz  # PyMuPDF
+from PIL import Image
+
+
+def update_page(file_path, page_num, direction):
+    if page_num is None:
+        page_num = 0
+    new_page_num = page_num + direction
+    img, actual_page_num, total_pages = get_pdf_page(file_path, new_page_num)
+    return img, f"Page {actual_page_num + 1} of {total_pages}", actual_page_num
+
+
+def get_pdf_page(file_path, page_num):
+    doc = fitz.open(file_path)
+    page_count = len(doc)
+    page_num = max(0, min(page_num, page_count - 1))  # Ensure page_num is within bounds
+    page = doc.load_page(page_num)
+    pix = page.get_pixmap()
+    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+    return img, page_num, page_count
+
+
+def load_pdf(file_path):
+    img, page_num, total_pages = get_pdf_page(file_path, 0)
+    return img, f"Page {page_num + 1} of {total_pages}", page_num
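The three helpers cooperate: `load_pdf` renders page 0, and the Previous/Next buttons call `update_page`, which clamps the requested page into bounds via `get_pdf_page`. An illustrative walk through a document, with `sample.pdf` as a hypothetical path:

```python
# Illustrative sketch: page through a PDF the way the arena tab does.
from utils.pdf_utils import load_pdf, update_page

img, info, page = load_pdf("sample.pdf")               # first page
print(info)                                            # e.g. "Page 1 of N"
img, info, page = update_page("sample.pdf", page, 1)   # next page, clamped in bounds
print(info)
```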