Spaces:
Runtime error
Runtime error
add content from extract-leaderboard-gradio
Browse files- .gitignore +3 -0
- app.py +10 -4
- extractors/model.py +282 -0
- extractors/model_runner.py +36 -0
- leaderboard/vote.py +11 -0
- postprocessors/postprocessor.py +40 -0
- preprocessors/preprocessor.py +43 -0
- requirements.txt +11 -0
- tabs/arena_sxs.py +113 -0
- tabs/leaderboard.py +7 -0
- utils/pdf_utils.py +22 -0
.gitignore
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
__pycache__
|
| 3 |
+
poetry.lock
|
app.py
CHANGED
|
@@ -1,7 +1,13 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
| 7 |
-
demo.launch()
|
|
|
|
| 1 |
import gradio as gr
from tabs.arena_sxs import arena_sxs
from tabs.leaderboard import leaderboard

# Create the Gradio interface with tabs: one for side-by-side parser
# comparison, one for the (placeholder) leaderboard.
with gr.Blocks() as demo:
    with gr.Tab("Parser Arena"):
        arena_sxs()
    with gr.Tab("Leaderboard"):
        leaderboard()

# Launch the app
# NOTE(review): share=True opens a public tunnel link; on Hugging Face
# Spaces this is unnecessary (Spaces already serves the app) -- confirm.
demo.launch(share=True)
|
extractors/model.py
ADDED
|
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
import requests
|
| 5 |
+
|
| 6 |
+
import anthropic
|
| 7 |
+
import openai
|
| 8 |
+
from dotenv import load_dotenv
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from llama_parse import LlamaParse
|
| 11 |
+
from llama_index.core import SimpleDirectoryReader
|
| 12 |
+
from unstructured.partition.auto import partition
|
| 13 |
+
from preprocessors.preprocessor import PdfPreprocessor
|
| 14 |
+
from postprocessors.postprocessor import ClaudePostprocessor, GPTPostprocessor
|
| 15 |
+
|
| 16 |
+
load_dotenv()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class Model:
|
| 20 |
+
BASE_URL: str | None = None
|
| 21 |
+
API_KEY: str | None = None
|
| 22 |
+
MODEL: str | None = None
|
| 23 |
+
|
| 24 |
+
def __init_subclass__(cls) -> None:
|
| 25 |
+
"""Initialize subclass."""
|
| 26 |
+
super().__init_subclass__()
|
| 27 |
+
|
| 28 |
+
def __init__(self):
|
| 29 |
+
"""Init self"""
|
| 30 |
+
|
| 31 |
+
def extract(self, file_path: str) -> str:
|
| 32 |
+
"""Extract model.
|
| 33 |
+
|
| 34 |
+
Args:
|
| 35 |
+
file_path: path to file to extract
|
| 36 |
+
|
| 37 |
+
Returns:
|
| 38 |
+
str: output markdown
|
| 39 |
+
"""
|
| 40 |
+
raise NotImplementedError("Model extract method is not implemented")
|
| 41 |
+
|
| 42 |
+
class AnyParserModel(Model):
    """Extractor backed by the hosted AnyParser real-time API."""

    BASE_URL = "https://k7u1c342dc.execute-api.us-west-2.amazonaws.com/v1/extract"
    API_KEY = os.getenv('ANYPARSER_RT_API_KEY')

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted markdown, or an "Error: ..." message.
        """
        file_extension = Path(file_path).suffix.lower().lstrip(".")

        # Fail fast on missing files.  Fix: error paths previously returned
        # (message, detail) tuples while the success path returned a plain
        # string; a single string keeps the declared return type honest and
        # renders correctly in the Markdown output pane.
        if not Path(file_path).is_file():
            return "Error: File does not exist"

        if file_extension not in ("pdf", "docx"):
            return "Error: Unsupported file type"

        # The API expects the raw file content base64-encoded inside JSON.
        with open(file_path, "rb") as file:
            encoded_file = base64.b64encode(file.read()).decode("utf-8")

        payload = {
            "file_content": encoded_file,
            "file_type": file_extension,
        }
        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.API_KEY,
        }

        response = requests.post(
            self.BASE_URL, headers=headers, data=json.dumps(payload), timeout=30
        )

        if response.status_code != 200:
            return f"Error: {response.status_code} - Response: {response.text}"

        try:
            response_data = response.json()
            # The service returns one markdown fragment per page.
            return "\n".join(response_data["markdown"])
        except json.JSONDecodeError:
            return f"Error: Invalid JSON response - Response: {response.text}"
|
| 99 |
+
|
| 100 |
+
class LlamaParseModel(Model):
    """Extractor backed by the LlamaParse cloud service."""

    BASE_URL = None
    API_KEY = os.getenv('LLAMA_CLOUD_API_KEY')

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError("The API key is required. Please set the LLAMA_CLOUD_API_KEY environment variable.")

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            # Route .pdf files through LlamaParse; SimpleDirectoryReader
            # performs the actual loading.
            pdf_parser = LlamaParse(
                result_type="markdown",
                num_workers=4,
                verbose=True,
                language="en",
            )
            reader = SimpleDirectoryReader(
                input_files=[file_path],
                file_extractor={".pdf": pdf_parser},
            )
            docs = reader.load_data()
            return "\n\n".join(doc.text for doc in docs)
        except Exception as e:
            print(f"Error processing input: {str(e)}")
            return f"Error processing with LlamaParse: {str(e)}"
|
| 136 |
+
|
| 137 |
+
class UnstructuredModel(Model):
    """Extractor backed by the local `unstructured` partitioner."""

    BASE_URL = None
    API_KEY = None

    def __init__(self):
        """Init."""
        super().__init__()

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data.
        """
        try:
            pieces = partition(file_path)
            joined = "\n".join(str(piece) for piece in pieces)
            if not joined:
                return "No content parsed"
            return joined
        except Exception as e:
            return f"Error processing UnstructuredModel: {str(e)}"
|
| 164 |
+
|
| 165 |
+
class GPTModel(Model):
    """Extractor that renders PDF pages to images and asks GPT for markdown."""

    BASE_URL = None
    API_KEY = os.getenv("OPENAI_API_KEY")
    MODEL = "gpt-4o-mini"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the OPENAI_API_KEY environment variable."
            )
        self._client = openai.OpenAI(api_key=self.API_KEY)

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error message on failure.
        """
        pdf_preprocessor = PdfPreprocessor()
        gpt_postprocessor = GPTPostprocessor()
        # One base64-encoded JPEG per PDF page.
        file_contents = pdf_preprocessor.run(file_path)
        contents = [
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/jpeg;base64,{content}",
                },
            }
            for content in file_contents
        ]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Convert this image to markdown"},
                    *contents,
                ],
            }
        ]

        # Fix: wrap the API call so a transient failure surfaces as an
        # error string, matching LlamaParseModel/ClaudeModel behavior
        # instead of crashing the Gradio handler.
        try:
            response = self._client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
            )
            return gpt_postprocessor.run(response.choices[0].message.content)
        except Exception as e:
            return f"Error processing GPTModel: {str(e)}"
|
| 221 |
+
|
| 222 |
+
class ClaudeModel(Model):
    """Extractor that renders PDF pages to images and asks Claude for markdown."""

    # NOTE(review): this hard-coded third-party IP endpoint is never used by
    # the anthropic client below -- confirm it is intentional; otherwise it
    # should be removed or moved to configuration.
    BASE_URL = "http://103.114.163.134:3000/v1/"
    API_KEY = os.getenv("ANTHROPIC_API_KEY")
    MODEL = "claude-3-5-sonnet-20240620"
    REQUIRES_OPENAI = True

    def __init__(self):
        """Init."""
        super().__init__()
        if not self.API_KEY:
            raise ValueError(
                "The API key is required. Please set the ANTHROPIC_API_KEY environment variable."
            )
        self._client = anthropic.Anthropic(
            api_key=self.API_KEY,
        )

    def extract(self, file_path: str) -> str:
        """Extract data in real-time.

        Args:
            file_path (str): The path to the file to be parsed.

        Returns:
            str: The extracted data, or an error message on failure.
        """
        prompt = "Convert this image to markdown."
        pdf_preprocessor = PdfPreprocessor()
        claude_postprocessor = ClaudePostprocessor()
        # One base64-encoded JPEG per PDF page.
        file_contents = pdf_preprocessor.run(file_path)

        contents = [
            {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/jpeg",
                    "data": content,
                },
            }
            for content in file_contents
        ]

        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    *contents,
                ],
            }
        ]

        try:
            # Fix: use the MODEL class constant instead of repeating the
            # model-name literal (previously duplicated and drift-prone);
            # also dropped a stray debug print of the full response text.
            response = self._client.messages.create(
                model=self.MODEL, max_tokens=1024, messages=messages
            )
            return claude_postprocessor.run(response.content[0].text)
        except Exception as e:
            return f"Error processing ClaudeModel: {str(e)}"
|
| 282 |
+
|
extractors/model_runner.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import concurrent.futures
from extractors.model import AnyParserModel, LlamaParseModel, UnstructuredModel, GPTModel, ClaudeModel

# NOTE(review): all extractors are instantiated eagerly at import time, so a
# single missing API key (LlamaParse/GPT/Claude raise in __init__) aborts the
# whole app at startup -- confirm this fail-fast behavior is intended.
ap_rt = AnyParserModel()
lp = LlamaParseModel()
un = UnstructuredModel()
gpt = GPTModel()
claude = ClaudeModel()

# Display name -> bound extract method; the keys double as the dropdown
# labels shown in the arena UI.
model_function_map = {
    "AnyParser": ap_rt.extract,
    "LlamaParse": lp.extract,
    "Unstructured": un.extract,
    "GPT-4o-mini": gpt.extract,
    "Claude-3.5-Sonnet": claude.extract,
}

# Dropdown choices, in the insertion order of the map above.
models = [key for key in model_function_map]
|
| 19 |
+
|
| 20 |
+
def run_extract(model, file_path):
    """Run one extractor, selected by display name, and return its markdown.

    Args:
        model: A key of ``model_function_map``.
        file_path: Path of the document to extract.
    """
    print('Running extract: model', model, 'file_path', file_path)
    extract_fn = model_function_map[model]
    return extract_fn(file_path)
|
| 25 |
+
|
| 26 |
+
def run_extract_parallel(model_a, model_b, pdf):
    """Extract the same PDF with two models concurrently.

    Args:
        model_a: Display name of the first extractor.
        model_b: Display name of the second extractor.
        pdf: Path of the PDF to process.

    Returns:
        tuple: Markdown from ``model_a`` and ``model_b``, in that order.
    """
    with concurrent.futures.ThreadPoolExecutor() as pool:
        # Fan out both extractions, then block on each result in order.
        pending = [pool.submit(run_extract, name, pdf) for name in (model_a, model_b)]
        return pending[0].result(), pending[1].result()
|
leaderboard/vote.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from enum import Enum
|
| 2 |
+
|
| 3 |
+
class Vote(Enum):
    """Per-model outcome of a side-by-side comparison vote."""

    GOOD = "GOOD"        # the voter preferred this model's output
    BAD = "BAD"          # the voter judged this model's output bad
    NEUTRAL = "NEUTRAL"  # no preference recorded for this model
|
| 7 |
+
|
| 8 |
+
def vote_for_model(model_a: str, model_a_vote: Vote, model_b: str, model_b_vote: Vote):
    """Record a vote for each model (placeholder: currently only logs).

    Returns:
        tuple: Two empty strings, used to clear both output panes.
    """
    message = f"Voting for {model_a}: {model_a_vote} | Voting for {model_b}: {model_b_vote}"
    print(message)
    return "", ""
|
postprocessors/postprocessor.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def _strip_wrapping_fence(text: str) -> str:
    """Remove a single wrapping ```markdown ... ``` code fence, if present.

    Unlike a blanket ``str.replace`` (the previous implementation), this
    leaves code fences that appear *inside* the document intact, so embedded
    code blocks in the extracted markdown are not corrupted.
    """
    text = text.strip()
    if text.startswith("```markdown"):
        text = text[len("```markdown"):]
    elif text.startswith("```"):
        text = text[3:]
    if text.endswith("```"):
        text = text[:-3]
    # Remove any leading or trailing whitespace left by the fence lines.
    return text.strip()


class Postprocessor:
    """Postprocessor: cleans raw LLM output into plain markdown."""

    def run(self, text: str) -> str:
        """Postprocess ``text``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Postprocess method is not implemented")


class ClaudePostprocessor(Postprocessor):
    """Claude Postprocessor."""

    def run(self, text: str) -> str:
        """Clean the response from the Claude model.

        Args:
            text (str): The response from the Claude model.

        Returns:
            str: The cleaned response.
        """
        # Fix: strip only the wrapping fence; the old code removed every
        # ``` occurrence in the text, destroying embedded code blocks.
        return _strip_wrapping_fence(text)


class GPTPostprocessor(Postprocessor):
    """GPT Postprocessor."""

    def run(self, text: str) -> str:
        """Clean the response from the GPT model.

        Args:
            text (str): The response from the GPT model.

        Returns:
            str: The cleaned response.
        """
        # Same fence-stripping logic as ClaudePostprocessor (deduplicated).
        return _strip_wrapping_fence(text)
|
preprocessors/preprocessor.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import io
|
| 5 |
+
import pdf2image
|
| 6 |
+
from typing import Any
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Preprocessor:
    """Preprocessor: turns an input file into model-ready content."""

    def run(self, file_path: str) -> Any:
        """Preprocess ``file_path``; subclasses must override.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError("Preprocess method is not implemented")
|
| 15 |
+
|
| 16 |
+
# Convert PDF pages to base64-encoded JPEG images.
class PdfPreprocessor(Preprocessor):
    """PDF Preprocessor.

    Renders every page of a PDF to a JPEG and returns each page as a
    base64-encoded string.
    """

    def run(self, file_path: str) -> list[str]:
        """Convert each PDF page to a base64-encoded JPEG.

        Args:
            file_path: Path to the PDF file.

        Returns:
            list[str]: One base64 JPEG string per page, in page order.
            (The previous ``-> str`` annotation was wrong: a list was
            always returned.)
        """
        # Fix: the file was previously rasterized twice, and the first
        # page's encoding was computed and then discarded; one pass over
        # all pages is sufficient.
        images = pdf2image.convert_from_path(file_path)
        image_list = []
        for image in images:
            with io.BytesIO() as buffer:
                image.save(buffer, format="JPEG")
                encoded = base64.b64encode(buffer.getvalue()).decode("utf-8")
            image_list.append(encoded)
        return image_list
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.41.0
|
| 2 |
+
pillow==10.4.0
|
| 3 |
+
pymupdf==1.24.9
|
| 4 |
+
python-dotenv==1.0.1
|
| 5 |
+
llama-index-core==0.10.68.post1
|
| 6 |
+
llama-parse==0.4.9
|
| 7 |
+
llama-index-readers-file==0.1.33
|
| 8 |
+
unstructured[pdf]==0.15.7
|
| 9 |
+
openai==1.42.0
|
| 10 |
+
pdf2image==1.17.0
|
| 11 |
+
anthropic==0.34.1
|
tabs/arena_sxs.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
from utils.pdf_utils import load_pdf, update_page
|
| 3 |
+
from extractors.model_runner import models, run_extract_parallel
|
| 4 |
+
from leaderboard.vote import vote_for_model, Vote
|
| 5 |
+
|
| 6 |
+
def update_dropdowns(model_a_choice, model_b_choice):
    """Keep model B's dropdown free of model A's current selection.

    Args:
        model_a_choice: Currently selected model for side A.
        model_b_choice: Currently selected model for side B.

    Returns:
        A ``gr.update`` for dropdown B with refreshed choices; B's value is
        reset to the first remaining option only when it collides with A.
    """
    # Fix: both branches previously recomputed the identical choice list;
    # compute it once -- only the selected value differs.
    options_b = [m for m in models if m != model_a_choice]
    if model_a_choice == model_b_choice:
        new_value = options_b[0] if options_b else None
    else:
        new_value = model_b_choice
    return gr.update(choices=options_b, value=new_value)
|
| 13 |
+
|
| 14 |
+
def sync_models(model_a_choice, model_b_choice):
    """Refresh dropdown B after dropdown A changes (thin wrapper)."""
    return update_dropdowns(model_a_choice, model_b_choice)
|
| 17 |
+
|
| 18 |
+
def dummy_function_a(model_a_choice):
    """Return a human-readable note of model A's current selection."""
    message = f"Model A selected: {model_a_choice}"
    return message
|
| 20 |
+
|
| 21 |
+
def dummy_function_b(model_b_choice):
    """Return a human-readable note of model B's current selection."""
    message = f"Model B selected: {model_b_choice}"
    return message
|
| 23 |
+
|
| 24 |
+
def update_button(file):
    """Enable the Extract button only once a file has been uploaded."""
    has_file = bool(file)
    return gr.update(interactive=has_file)
|
| 26 |
+
|
| 27 |
+
def update_vote_button(output):
    """Toggle all four vote buttons on/off based on whether output exists."""
    return [gr.update(interactive=bool(output)) for _ in range(4)]
|
| 30 |
+
|
| 31 |
+
def arena_sxs():
    """Build the side-by-side parser comparison tab.

    Lays out PDF upload/preview, two model dropdowns, extraction output
    panes, and the voting buttons, then wires up every event handler.

    Returns:
        gr.Blocks: The assembled arena UI block.
    """
    with gr.Blocks() as arena_block:
        gr.Markdown("# Rules")
        gr.Markdown("- Upload a PDF file to extract with two chosen models (e.g., Llama, Unstructured, ChatGPT, Claude) and vote for the better one!")
        gr.Markdown("- You can upload multiple files until you identify a winner.")

        gr.Markdown("## 1. Upload a file.")
        gr.Markdown("Only PDF files supported.")
        with gr.Row():
            with gr.Column(scale=2):
                pdf = gr.File(type="filepath", label="Upload PDF", file_types=[".pdf"])
                pdf_image = gr.Image(label="PDF Page")
                page_info = gr.Textbox(label="")
                # Zero-based page index of the preview, held in session state.
                current_page = gr.State(value=0)
                with gr.Row():
                    prev_button = gr.Button("Previous")
                    next_button = gr.Button("Next")

        gr.Markdown("---")
        gr.Markdown("## 2. Choose two models to compare")

        with gr.Blocks():
            with gr.Row():
                # Dropdown B starts without A's initial choice so the sides differ.
                model_a = gr.Dropdown(choices=models, value=models[0], label="")
                model_b = gr.Dropdown(choices=[m for m in models if m != models[0]], value=models[1], label="")

            with gr.Row():
                output_a = gr.Markdown(height=400)
                output_b = gr.Markdown(height=400)

            with gr.Row():
                # Disabled until a PDF is uploaded (see update_button below).
                extract_button = gr.Button("Extract", interactive=False)

            with gr.Row():
                # Disabled until an extraction result arrives (update_vote_button).
                vote_model_a_button = gr.Button("π A is better", interactive=False)
                vote_model_b_button = gr.Button("π B is better", interactive=False)
                vote_tie_button = gr.Button("π€ Tie", interactive=False)
                vote_bad_button = gr.Button("π Both are bad", interactive=False)

        # Change handlers
        pdf.change(load_pdf, inputs=[pdf], outputs=[pdf_image, page_info, current_page])
        pdf.change(fn=update_button, inputs=pdf, outputs=extract_button)
        model_a.change(sync_models, inputs=[model_a, model_b], outputs=model_b)
        model_a.change(dummy_function_a, inputs=model_a)
        model_b.change(dummy_function_b, inputs=model_b)
        # Voting unlocks as soon as either output pane receives content.
        output_a.change(fn=update_vote_button, inputs=output_a, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])
        output_b.change(fn=update_vote_button, inputs=output_b, outputs=[vote_model_a_button, vote_model_b_button, vote_tie_button, vote_bad_button])

        # Button Handlers
        prev_button.click(
            fn=lambda file_path, page: update_page(file_path, page, -1),
            inputs=[pdf, current_page],
            outputs=[pdf_image, page_info, current_page],
        )

        next_button.click(
            fn=lambda file_path, page: update_page(file_path, page, 1),
            inputs=[pdf, current_page],
            outputs=[pdf_image, page_info, current_page],
        )

        extract_button.click(
            fn=run_extract_parallel,
            inputs=[model_a, model_b, pdf],
            outputs=[output_a, output_b],
        )

        def clear_outputs():
            # Blank both panes while a new extraction runs.
            return "", ""

        # NOTE(review): two independent click handlers on the same button
        # run without a guaranteed order -- confirm the clear is intended
        # to race with the extraction above.
        extract_button.click(
            fn=clear_outputs,
            outputs=[output_a, output_b],
        )

        vote_model_a_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.GOOD, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_model_b_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.GOOD), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_tie_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.NEUTRAL, model_b, Vote.NEUTRAL), inputs=[model_a, model_b], outputs=[output_a, output_b])
        vote_bad_button.click(fn=lambda model_a, model_b: vote_for_model(model_a, Vote.BAD, model_b, Vote.BAD), inputs=[model_a, model_b], outputs=[output_a, output_b])

    return arena_block
|
tabs/leaderboard.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
|
| 3 |
+
def leaderboard():
    """Render the (placeholder) leaderboard tab."""
    with gr.Blocks() as board:
        gr.Markdown("# Leaderboard")
        gr.Markdown("## β¨ Coming Soon")
    return board
|
utils/pdf_utils.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
from PIL import Image
|
| 3 |
+
|
| 4 |
+
def update_page(file_path, page_num, direction):
    """Move the PDF preview by ``direction`` pages and return the new view.

    Args:
        file_path: Path to the PDF.
        page_num: Current zero-based page index (``None`` is treated as 0).
        direction: Page delta, typically -1 or +1.

    Returns:
        tuple: (page image, "Page X of Y" label, new zero-based page index).
    """
    current = page_num if page_num is not None else 0
    img, shown, total = get_pdf_page(file_path, current + direction)
    return img, f"Page {shown + 1} of {total}", shown
|
| 10 |
+
|
| 11 |
+
def get_pdf_page(file_path, page_num):
    """Render one PDF page to a PIL image, clamping the page number.

    Args:
        file_path: Path to the PDF.
        page_num: Requested zero-based page index; clamped into range.

    Returns:
        tuple: (PIL.Image of the page, actual zero-based page index,
        total page count).
    """
    # Fix: close the document when done -- the previous version opened a
    # new fitz document on every render and never closed it, leaking a
    # file handle per page turn.
    with fitz.open(file_path) as doc:
        page_count = len(doc)
        page_num = max(0, min(page_num, page_count - 1))  # Ensure page_num is within bounds
        page = doc.load_page(page_num)
        pix = page.get_pixmap()
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return img, page_num, page_count
|
| 19 |
+
|
| 20 |
+
def load_pdf(file_path):
    """Load the first page of a PDF for the preview pane.

    Returns:
        tuple: (page image, "Page X of Y" label, zero-based page index).
    """
    img, page_num, total_pages = get_pdf_page(file_path, 0)
    label = f"Page {page_num + 1} of {total_pages}"
    return img, label, page_num
|