|
import gradio as gr
import os
import torch

from llama_parse import LlamaParse
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core.schema import Document, ImageDocument
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# Pre-built example indexes shipped with the demo, keyed by display name.
example_indexes = {
    "ICONIQ 2024": "./iconiq_report_index",
    "Uber 10k 2021": "./uber_index",
}
DEFAULT_INDEX = "ICONIQ 2024"

# Prefer CUDA, then Apple Silicon (MPS), falling back to CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

# Multi-modal (screenshot) embedder. float16 halves memory for this ~2B-parameter model.
image_embed_model = HuggingFaceEmbedding(
    model_name="llamaindex/vdr-2b-multi-v1",
    device=device,
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
    model_kwargs={"torch_dtype": torch.float16},
    embed_batch_size=2,
)
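# NOTE: the Hugging Face token is read from the environment and is only
# required if the model repository is gated for your account.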
|
|
|
# Text-only baseline embedder used for the pure-text retrieval comparison.
text_embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en",
    device=device,
    trust_remote_code=True,
    token=os.getenv("HUGGINGFACE_TOKEN"),
    embed_batch_size=1,
)
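# NOTE: a MultiModalVectorStoreIndex keeps separate text and image vector
# stores, so this text embedder never sees the image nodes (and vice versa).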
|
|
|
class IndexManager:
    """Holds the active index outside of gr.State.

    Gradio deep-copies objects stored in gr.State on each interaction,
    which is wasteful for a large index, so we keep the index in a
    module-level manager instead.
    """

    def __init__(self):
        self.current_index = None
        self.load_index(example_indexes[DEFAULT_INDEX])

    def load_index(self, index_path):
        # Rebuild the index from a persisted storage directory on disk.
        storage_context = StorageContext.from_defaults(persist_dir=index_path)
        self.current_index = load_index_from_storage(
            storage_context,
            embed_model=text_embed_model,
            image_embed_model=image_embed_model,
        )
        return f"Loaded index: {index_path}"

    def set_index(self, index):
        self.current_index = index

    def get_index(self):
        return self.current_index

index_manager = IndexManager() |
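# Instantiating the manager eagerly loads the default index at startup.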
|
|
|
def load_index(index_path: str) -> MultiModalVectorStoreIndex:
    """Load a persisted index into the manager and return it."""
    index_manager.load_index(index_path)
    return index_manager.get_index()
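
# Example (assumes the persisted index directory exists on disk):
#   index = load_index("./uber_index")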
|
|
|
def create_index(file, llama_parse_key, progress=gr.Progress()):
    """Parse an uploaded PDF with LlamaParse and build a multi-modal index."""
    if not file or not llama_parse_key:
        # This handler has a single output component, so return one status string.
        return "Please provide both a file and a LlamaParse API key"

    try:
        progress(0, desc="Initializing LlamaParse...")
        parser = LlamaParse(
            api_key=llama_parse_key,
            take_screenshot=True,
        )

        progress(0.2, desc="Processing document with LlamaParse...")
        md_json_obj = parser.get_json_result(file.name)[0]

        progress(0.4, desc="Downloading and processing images...")
        # file.name is the full path of the uploaded temp file.
        image_dicts = parser.get_images(
            [md_json_obj],
            download_path=f"{file.name}_images",
        )

        progress(0.6, desc="Creating text documents...")
        # Concatenate the per-page markdown into a single text document.
        text = ""
        for page in md_json_obj["pages"]:
            text += page["md"] + "\n\n"
        text_docs = [Document(text=text.strip())]

        progress(0.8, desc="Creating image documents...")
        # Each image dict from LlamaParse carries at least a "name" and a "path".
        image_docs = [
            ImageDocument(text=image_dict["name"], image_path=image_dict["path"])
            for image_dict in image_dicts
        ]

        progress(0.9, desc="Creating final index...")
        index = MultiModalVectorStoreIndex.from_documents(
            text_docs + image_docs,
            embed_model=text_embed_model,
            image_embed_model=image_embed_model,
        )

        progress(1.0, desc="Complete!")
        index_manager.set_index(index)
        return "Index created successfully!"

    except Exception as e:
        return f"Error creating index: {str(e)}"
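
# Programmatic smoke test (hypothetical: assumes a local "report.pdf" and a
# LlamaParse key in the LLAMA_CLOUD_API_KEY environment variable):
#   class _Upload:  # minimal stand-in for the object gr.File passes in
#       name = "report.pdf"
#   print(create_index(_Upload(), os.environ["LLAMA_CLOUD_API_KEY"]))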
|
|
|
def run_search(query, text_top_k, image_top_k):
    """Run text-to-text and text-to-image retrieval against the active index."""
    index = index_manager.get_index()
    if not index:
        return "Please create or select an index first.", [], []

    retriever = index.as_retriever(
        similarity_top_k=text_top_k,
        image_similarity_top_k=image_top_k,
    )

    # Retrieve with both modalities for a side-by-side comparison.
    image_nodes = retriever.text_to_image_retrieve(query)
    text_nodes = retriever.text_retrieve(query)

    text_results = [{"text": node.text, "score": f"{node.score:.3f}"} for node in text_nodes]

    # The gallery expects (image path, caption) tuples; skip nodes whose
    # screenshot file is missing on disk.
    image_results = []
    for node in image_nodes:
        if hasattr(node.node, "image_path") and os.path.exists(node.node.image_path):
            image_results.append((
                node.node.image_path,
                f"Similarity: {node.score:.3f}",
            ))

    return "Search completed!", text_results, image_results
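
# Manual check (assumes the default ICONIQ index directory is present):
#   status, texts, images = run_search("What is the Executive Summary?", 2, 2)
#   print(status, len(texts), len(images))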
|
|
|
|
|
with gr.Blocks() as demo:
    gr.Markdown("# Multi-Modal Retrieval with LlamaIndex and llamaindex/vdr-2b-multi-v1")
    gr.Markdown("""
This demo shows how to use the new `llamaindex/vdr-2b-multi-v1` model for multi-modal document search.

Using this model, we can index page screenshots and perform text-to-image retrieval.

For comparison, the demo also runs pure text retrieval using the `BAAI/bge-small-en` model. Is this a fair comparison? Not really,
but it is the easiest to run in a limited Hugging Face Space, and it shows the strengths of screenshot-based retrieval.

The two pre-made indexes are:
- [ICONIQ 2024](https://cdn.prod.website-files.com/65e1d7fb19a3e64b5c36fb38/66eb856e019e59758ef73759_ICONIQ%20Analytics%20%2B%20Insights%20-%20State%20of%20AI%20Sep24.pdf): A report on the 2024 State of AI.
- [Uber 10k 2021](https://github.com/run-llama/llama_index/blob/main/docs/docs/examples/data/10k/uber_2021.pdf): Uber's 2021 10-K filing.
"""
    )

    with gr.Row():
        with gr.Column():
            with gr.Tab("Use Existing Index"):
                existing_index_dropdown = gr.Dropdown(
                    choices=list(example_indexes.keys()),
                    label="Select Pre-made Index",
                    value=DEFAULT_INDEX,
                )

            with gr.Tab("Create New Index"):
                gr.Markdown("""
To create a new index, enter your LlamaParse API key and upload a PDF.

You can get a free API key by signing up [here](https://cloud.llamaindex.ai).

Creating a new index takes a few minutes, depending on the size of the document.
"""
                )
                file_upload = gr.File(label="Upload PDF")
                llama_parse_key = gr.Textbox(
                    label="LlamaParse API Key",
                    type="password",
                )
                create_btn = gr.Button("Create Index")
                create_status = gr.Textbox(label="Status", interactive=False)

            query_input = gr.Textbox(label="Search Query", value="What is the Executive Summary?")
            with gr.Row():
                text_top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=2,
                    step=1,
                    label="Text Top-K",
                )
                image_top_k = gr.Slider(
                    minimum=1,
                    maximum=10,
                    value=2,
                    step=1,
                    label="Image Top-K",
                )
            search_btn = gr.Button("Search")

        with gr.Column():
            status_output = gr.Textbox(label="Search Status")
            image_output = gr.Gallery(
                label="Retrieved Images",
                show_label=True,
                elem_id="gallery",
            )
            text_output = gr.JSON(
                label="Retrieved Text with Similarity Scores",
                elem_id="text_results",
            )

    def load_existing_index(index_name, progress=gr.Progress()):
        if index_name:
            try:
                progress(0, desc="Loading index...")
                result = index_manager.load_index(example_indexes[index_name])
                progress(1.0, desc="Index loaded!")
                return result, None
            except Exception as e:
                return f"Error loading index: {str(e)}", None
        return "No index selected", None

    existing_index_dropdown.change(
        fn=load_existing_index,
        inputs=[existing_index_dropdown],
        outputs=[create_status, query_input],
        api_name=False,
        show_progress=True,
    )

    create_btn.click(
        fn=create_index,
        inputs=[file_upload, llama_parse_key],
        outputs=[create_status],
        api_name=False,
        show_progress=True,
    )

    search_btn.click(
        fn=run_search,
        inputs=[query_input, text_top_k, image_top_k],
        outputs=[status_output, text_output, image_output],
        api_name=False,
    )

    gr.Markdown("""
This demo was built with [LlamaIndex](https://docs.llamaindex.ai) and [LlamaParse](https://cloud.llamaindex.ai). To see more multi-modal demos, check out the [LlamaParse examples](https://github.com/run-llama/llama_parse/tree/main/examples/multimodal).
"""
    )

if __name__ == "__main__":
    demo.launch()
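    # When running locally, demo.launch(share=True) would also expose a
    # temporary public link (standard Gradio option).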
|
|