Spaces:
Runtime error
Runtime error
add content from extract-leaderboard-gradio
Browse files- .gitignore +3 -0
- app.py +10 -4
- extractors/model.py +282 -0
- extractors/model_runner.py +36 -0
- leaderboard/vote.py +11 -0
- postprocessors/postprocessor.py +40 -0
- preprocessors/preprocessor.py +43 -0
- requirements.txt +11 -0
- tabs/arena_sxs.py +113 -0
- tabs/leaderboard.py +7 -0
- utils/pdf_utils.py +22 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__
|
| 3 |
+
poetry.lock
|
app.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
from tabs.arena_sxs import arena_sxs
from tabs.leaderboard import leaderboard

# Create the Gradio interface with tabs: one for side-by-side parser
# comparison, one for the (placeholder) leaderboard.
with gr.Blocks() as demo:
    with gr.Tab("Parser Arena"):
        arena_sxs()
    with gr.Tab("Leaderboard"):
        leaderboard()

# Launch the app
# NOTE(review): share=True opens a public tunnel link; on Hugging Face
# Spaces this is unnecessary (Spaces already serves the app) -- confirm.
demo.launch(share=True)
|
extractors/model.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
|
| 6 |
+
import anthropic
|
| 7 |
+
import openai
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from llama_parse import LlamaParse
|
| 11 |
+
from llama_index.core import SimpleDirectoryReader
|
| 12 |
+
from unstructured.partition.auto import partition
|
| 13 |
+
from preprocessors.preprocessor import PdfPreprocessor
|
| 14 |
+
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor
|
| 15 |
+
|
| 16 |
+
load_dotenv()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class Model:
|
| 20 |
+
BASE_URL: str | None = None
|
| 21 |
+
API_KEY: str | None = None
|
| 22 |
+
MODEL: str | None = None
|
| 23 |
+
|
| 24 |
+
def __init_subclass__(cls) -> None:
|
| 25 |
+
"""Initialize subclass."""
|
| 26 |
+
super().__init_subclass__()
|
| 27 |
+
|
| 28 |
+
def __init__(self):
|
| 29 |
+
"""Init self"""
|
| 30 |
+
|
| 31 |
+
def extract(self, file_path: str) -> str:
|
| 32 |
+
"""Extract model.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
file_path: path to file to extract
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
str: output markdown
|
| 39 |
+
"""
|
| 40 |
+
raise NotImplementedError("Model extract method is not implemented")
|
| 41 |
+
|
| 42 |
+
class AnyParserModel(Model):
    """Extractor backed by the hosted AnyParser real-time API."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an "Error: ..." message.
        """
        file_extension = Path(file_path).suffix.lower().lstrip(".")

        # Fail fast on missing files.  Fix: error paths previously returned
        # (message, detail) tuples while the success path returned a plain
        # string; a single string keeps the declared return type honest and
        # renders correctly in the Markdown output pane.
        if not Path(file_path).is_file():
            return "Error: File does not exist"

        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"

        # The API expects the raw file content base64-encoded inside JSON.
        with open(file_path, "rb") as file:
            encoded_file = base64.b64encode(file.read()).decode("utf-8")

        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }

        response = requests.post(
            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
        )

        if response.status_code != 200:
            return f"Error: {response.status_code} - Response: {response.text}"

        try:
            response_data = response.json()
            # The service returns one markdown fragment per page.
            return "\n".join(response_data["markdown"])
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response - Response: {response.text}"
|
| 99 |
+
|
| 100 |
+
class LlamaParseModel(Model):
    """Extractor backed by the LlamaParse cloud service."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            # Route .pdf files through LlamaParse; SimpleDirectoryReader
            # performs the actual loading.
            pdf_parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": pdf_parser},
            )
            docs = reader.load_data()
            return "\n\n".join(doc.text for doc in docs)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"
|
| 136 |
+
|
| 137 |
+
class UnstructuredModel(Model):
    """Extractor backed by the local `unstructured` partitioner."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Init."""
        super().__init__()

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            pieces = partition(file_path)
            joined = "\n".join(str(piece) for piece in pieces)
            if not joined:
                return "No content parsed"
            return joined
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"
|
| 164 |
+
|
| 165 |
+
class GPTModel(Model):
    """Extractor that renders PDF pages to images and asks GPT for markdown."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the OPENAI_API_KEY environment variable."
            )
        self._client = openai.OpenAI(api_key=self.API_KEY)

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error message on failure.
        """
        pdf_preprocessor = PdfPreprocessor()
        gpt_postprocessor = GPTPostprocessor()
        # One base64-encoded JPEG per PDF page.
        file_contents = pdf_preprocessor.run(file_path)
        contents = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{content}",
                },
            }
            for content in file_contents
        ]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Convert this image to markdown"},
                    *contents,
                ],
            }
        ]

        # Fix: wrap the API call so a transient failure surfaces as an
        # error string, matching LlamaParseModel/ClaudeModel behavior
        # instead of crashing the Gradio handler.
        try:
            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
            )
            return gpt_postprocessor.run(response.choices[0].message.content)
        except Exception as e:
            return f"Error processing GPTModel: {str(e)}"
|
| 221 |
+
|
| 222 |
+
class ClaudeModel(Model):
    """Extractor that renders PDF pages to images and asks Claude for markdown."""

    # NOTE(review): this hard-coded third-party IP endpoint is never used by
    # the anthropic client below -- confirm it is intentional; otherwise it
    # should be removed or moved to configuration.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
            )
        self._client = anthropic.Anthropic(
            api_key=self.API_KEY,
        )

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error message on failure.
        """
        prompt = "Convert this image to markdown."
        pdf_preprocessor = PdfPreprocessor()
        claude_postprocessor = ClaudePostprocessor()
        # One base64-encoded JPEG per PDF page.
        file_contents = pdf_preprocessor.run(file_path)

        contents = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": content,
                },
            }
            for content in file_contents
        ]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    *contents,
                ],
            }
        ]

        try:
            # Fix: use the MODEL class constant instead of repeating the
            # model-name literal (previously duplicated and drift-prone);
            # also dropped a stray debug print of the full response text.
            response = self._client.messages.create(
                model=self.MODEL, max_tokens=1024, messages=messages
            )
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"
|
| 282 |
+
|
extractors/model_runner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import concurrent.futures
from extractors.model import AnyParserModel, LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel

# NOTE(review): all extractors are instantiated eagerly at import time, so a
# single missing API key (LlamaParse/GPT/Claude raise in __init__) aborts the
# whole app at startup -- confirm this fail-fast behavior is intended.
ap_rt = AnyParserModel()
lp = LlamaParseModel()
un = UnstructuredModel()
gpt = GPTModel()
claude = ClaudeModel()

# Display name -> bound extract method; the keys double as the dropdown
# labels shown in the arena UI.
model_function_map = {
    "AnyParser": ap_rt.extract,
    "LlamaParse": lp.extract,
    "Unstructured": un.extract,
    "GPT-4o-mini": gpt.extract,
    "Claude-3.5-Sonnet": claude.extract,
}

# Dropdown choices, in the insertion order of the map above.
models = [key for key in model_function_map]
|
| 19 |
+
|
| 20 |
+
def run_extract(model, file_path):
    """Run one extractor, selected by display name, and return its markdown.

    Args:
        model: A key of ``model_function_map``.
        file_path: Path of the document to extract.
    """
    print('Running extract: model', model, 'file_path', file_path)
    extract_fn = model_function_map[model]
    return extract_fn(file_path)
|
| 25 |
+
|
| 26 |
+
def run_extract_parallel(model_a, model_b, pdf):
    """Extract the same PDF with two models concurrently.

    Args:
        model_a: Display name of the first extractor.
        model_b: Display name of the second extractor.
        pdf: Path of the PDF to process.

    Returns:
        tuple: Markdown from ``model_a`` and ``model_b``, in that order.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Fan out both extractions, then block on each result in order.
        pending = [pool.submit(run_extract, name, pdf) for name in (model_a, model_b)]
        return pending[0].result(), pending[1].result()
|
leaderboard/vote.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
class Vote(Enum):
    """Per-model outcome of a side-by-side comparison vote."""

    GOOD = "GOOD"        # the voter preferred this model's output
    BAD = "BAD"          # the voter judged this model's output bad
    NEUTRAL = "NEUTRAL"  # no preference recorded for this model
|
| 7 |
+
|
| 8 |
+
def vote_for_model(model_a: str, model_a_vote: Vote, model_b: str, model_b_vote: Vote):
    """Record a vote for each model (placeholder: currently only logs).

    Returns:
        tuple: Two empty strings, used to clear both output panes.
    """
    message = f"Voting for {model_a}: {model_a_vote} | Voting for {model_b}: {model_b_vote}"
    print(message)
    return "", ""
|
postprocessors/postprocessor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def _strip_wrapping_fence(text: str) -> str:
    """Remove a single wrapping ```markdown ... ``` code fence, if present.

    Unlike a blanket ``str.replace`` (the previous implementation), this
    leaves code fences that appear *inside* the document intact, so embedded
    code blocks in the extracted markdown are not corrupted.
    """
    text = text.strip()
    if text.startswith("```markdown"):
        text = text[len("```markdown"):]
    elif text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    # Remove any leading or trailing whitespace left by the fence lines.
    return text.strip()


class Postprocessor:
    """Postprocessor: cleans raw LLM output into plain markdown."""

    def run(self, text: str) -> str:
        """Postprocess ``text``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Postprocess method is not implemented")


class ClaudePostprocessor(Postprocessor):
    """Claude Postprocessor."""

    def run(self, text: str) -> str:
        """Clean the response from the Claude model.

        Args:
            text (str): The response from the Claude model.

        Returns:
            str: The cleaned response.
        """
        # Fix: strip only the wrapping fence; the old code removed every
        # ``` occurrence in the text, destroying embedded code blocks.
        return _strip_wrapping_fence(text)


class GPTPostprocessor(Postprocessor):
    """GPT Postprocessor."""

    def run(self, text: str) -> str:
        """Clean the response from the GPT model.

        Args:
            text (str): The response from the GPT model.

        Returns:
            str: The cleaned response.
        """
        # Same fence-stripping logic as ClaudePostprocessor (deduplicated).
        return _strip_wrapping_fence(text)
|
preprocessors/preprocessor.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import pdf2image
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Preprocessor:
    """Preprocessor: turns an input file into model-ready content."""

    def run(self, file_path: str) -> Any:
        """Preprocess ``file_path``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Preprocess method is not implemented")
|
| 15 |
+
|
| 16 |
+
# Convert PDF pages to base64-encoded JPEG images.
class PdfPreprocessor(Preprocessor):
    """PDF Preprocessor.

    Renders every page of a PDF to a JPEG and returns each page as a
    base64-encoded string.
    """

    def run(self, file_path: str) -> list[str]:
        """Convert each PDF page to a base64-encoded JPEG.

        Args:
            file_path: Path to the PDF file.

        Returns:
            list[str]: One base64 JPEG string per page, in page order.
            (The previous ``-> str`` annotation was wrong: a list was
            always returned.)
        """
        # Fix: the file was previously rasterized twice, and the first
        # page's encoding was computed and then discarded; one pass over
        # all pages is sufficient.
        images = pdf2image.convert_from_path(file_path)
        image_list = []
        for image in images:
            with io.BytesIO() as buffer:
                image.save(buffer, format="JPEG")
                encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
            image_list.append(encoded)
        return image_list
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.41.0
|
| 2 |
+
pillow==10.4.0
|
| 3 |
+
pymupdf==1.24.9
|
| 4 |
+
python-dotenv==1.0.1
|
| 5 |
+
llama-index-core==0.10.68.post1
|
| 6 |
+
llama-parse==0.4.9
|
| 7 |
+
llama-index-readers-file==0.1.33
|
| 8 |
+
unstructured[pdf]==0.15.7
|
| 9 |
+
openai==1.42.0
|
| 10 |
+
pdf2image==1.17.0
|
| 11 |
+
anthropic==0.34.1
|
tabs/arena_sxs.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from utils.pdf_utils import load_pdf, update_page
|
| 3 |
+
from extractors.model_runner import models, run_extract_parallel
|
| 4 |
+
from leaderboard.vote import vote_for_model, Vote
|
| 5 |
+
|
| 6 |
+
def update_dropdowns(model_a_choice, model_b_choice):
    """Keep model B's dropdown free of model A's current selection.

    Args:
        model_a_choice: Currently selected model for side A.
        model_b_choice: Currently selected model for side B.

    Returns:
        A ``gr.update`` for dropdown B with refreshed choices; B's value is
        reset to the first remaining option only when it collides with A.
    """
    # Fix: both branches previously recomputed the identical choice list;
    # compute it once -- only the selected value differs.
    options_b = [m for m in models if m != model_a_choice]
    if model_a_choice == model_b_choice:
        new_value = options_b[0] if options_b else None
    else:
        new_value = model_b_choice
    return gr.update(choices=options_b, value=new_value)
|
| 13 |
+
|
| 14 |
+
def sync_models(model_a_choice, model_b_choice):
    """Refresh dropdown B after dropdown A changes (thin wrapper)."""
    return update_dropdowns(model_a_choice, model_b_choice)
|
| 17 |
+
|
| 18 |
+
def dummy_function_a(model_a_choice):
    """Return a human-readable note of model A's current selection."""
    message = f"Model A selected: {model_a_choice}"
    return message
|
| 20 |
+
|
| 21 |
+
def dummy_function_b(model_b_choice):
    """Return a human-readable note of model B's current selection."""
    message = f"Model B selected: {model_b_choice}"
    return message
|
| 23 |
+
|
| 24 |
+
def update_button(file):
    """Enable the Extract button only once a file has been uploaded."""
    has_file = bool(file)
    return gr.update(interactive=has_file)
|
| 26 |
+
|
| 27 |
+
def update_vote_button(output):
    """Toggle all four vote buttons on/off based on whether output exists."""
    return [gr.update(interactive=bool(output)) for _ in range(4)]
|
| 30 |
+
|
| 31 |
+
def arena_sxs():
    """Build the side-by-side parser comparison tab.

    Lays out PDF upload/preview, two model dropdowns, extraction output
    panes, and the voting buttons, then wires up every event handler.

    Returns:
        gr.Blocks: The assembled arena UI block.
    """
    with gr.Blocks() as arena_block:
        gr.Markdown("# Rules")
        gr.Markdown("- Upload a PDF file to extract with two chosen models (e.g., Llama, Unstructured, ChatGPT, Claude) and vote for the better one!")
        gr.Markdown("- You can upload multiple files until you identify a winner.")

        gr.Markdown("## 1. Upload a file.")
        gr.Markdown("Only PDF files supported.")
        with gr.Row():
            with gr.Column(scale=2):
                pdf = gr.File(type="filepath", label="Upload PDF", file_types=[".pdf"])
                pdf_image = gr.Image(label="PDF Page")
                page_info = gr.Textbox(label="")
                # Zero-based page index of the preview, held in session state.
                current_page = gr.State(value=0)
                with gr.Row():
                    prev_button = gr.Button("Previous")
                    next_button = gr.Button("Next")

        gr.Markdown("---")
        gr.Markdown("## 2. Choose two models to compare")

        with gr.Blocks():
            with gr.Row():
                # Dropdown B starts without A's initial choice so the sides differ.
                model_a = gr.Dropdown(choices=models, value=models[0], label="")
                model_b = gr.Dropdown(choices=[m for m in models if m != models[0]], value=models[1], label="")

            with gr.Row():
                output_a = gr.Markdown(height=400)
                output_b = gr.Markdown(height=400)

            with gr.Row():
                # Disabled until a PDF is uploaded (see update_button below).
                extract_button = gr.Button("Extract", interactive=False)

            with gr.Row():
                # Disabled until an extraction result arrives (update_vote_button).
                vote_model_a_button = gr.Button("π A is better", interactive=False)
                vote_model_b_button = gr.Button("π B is better", interactive=False)
                vote_tie_button = gr.Button("π€ Tie", interactive=False)
                vote_bad_button = gr.Button("π Both are bad", interactive=False)

        # Change handlers
        pdf.change(load_pdf, inputs=[pdf], outputs=[pdf_image, page_info, current_page])
        pdf.change(fn=update_button, inputs=pdf, outputs=extract_button)
        model_a.change(sync_models, inputs=[model_a, model_b], outputs=model_b)
        model_a.change(dummy_function_a, inputs=model_a)
        model_b.change(dummy_function_b, inputs=model_b)
        # Voting unlocks as soon as either output pane receives content.
        output_a.change(fn=update_vote_button, inputs=output_a, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
        output_b.change(fn=update_vote_button, inputs=output_b, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])

        # Button Handlers
        prev_button.click(
            fn=lambda file_path, page: update_page(file_path, page, -1),
            inputs=[pdf, current_page],
            outputs=[pdf_image, page_info, current_page],
        )

        next_button.click(
            fn=lambda file_path, page: update_page(file_path, page, 1),
            inputs=[pdf, current_page],
            outputs=[pdf_image, page_info, current_page],
        )

        extract_button.click(
            fn=run_extract_parallel,
            inputs=[model_a, model_b, pdf],
            outputs=[output_a, output_b],
        )

        def clear_outputs():
            # Blank both panes while a new extraction runs.
            return "", ""

        # NOTE(review): two independent click handlers on the same button
        # run without a guaranteed order -- confirm the clear is intended
        # to race with the extraction above.
        extract_button.click(
            fn=clear_outputs,
            outputs=[output_a, output_b],
        )

        vote_model_a_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.GOOD, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_model_b_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.GOOD), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_tie_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_bad_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.BAD, model_b, Vote.BAD), inputs=[model_a, model_b], outputs=[output_a, output_b])

    return arena_block
|
tabs/leaderboard.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def leaderboard():
    """Render the (placeholder) leaderboard tab."""
    with gr.Blocks() as board:
        gr.Markdown("# Leaderboard")
        gr.Markdown("## β¨ Coming Soon")
    return board
|
utils/pdf_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
from PIL import Image
|
| 3 |
+
|
| 4 |
+
def update_page(file_path, page_num, direction):
    """Move the PDF preview by ``direction`` pages and return the new view.

    Args:
        file_path: Path to the PDF.
        page_num: Current zero-based page index (``None`` is treated as 0).
        direction: Page delta, typically -1 or +1.

    Returns:
        tuple: (page image, "Page X of Y" label, new zero-based page index).
    """
    current = page_num if page_num is not None else 0
    img, shown, total = get_pdf_page(file_path, current + direction)
    return img, f"Page {shown + 1} of {total}", shown
|
| 10 |
+
|
| 11 |
+
def get_pdf_page(file_path, page_num):
    """Render one PDF page to a PIL image, clamping the page number.

    Args:
        file_path: Path to the PDF.
        page_num: Requested zero-based page index; clamped into range.

    Returns:
        tuple: (PIL.Image of the page, actual zero-based page index,
        total page count).
    """
    # Fix: close the document when done -- the previous version opened a
    # new fitz document on every render and never closed it, leaking a
    # file handle per page turn.
    with fitz.open(file_path) as doc:
        page_count = len(doc)
        page_num = max(0, min(page_num, page_count - 1))  # Ensure page_num is within bounds
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img, page_num, page_count
|
| 19 |
+
|
| 20 |
+
def load_pdf(file_path):
    """Load the first page of a PDF for the preview pane.

    Returns:
        tuple: (page image, "Page X of Y" label, zero-based page index).
    """
    img, page_num, total_pages = get_pdf_page(file_path, 0)
    label = f"Page {page_num + 1} of {total_pages}"
    return img, label, page_num
|