p3nguknight commited on
Commit
ce8881a
·
1 Parent(s): c5aa334
Files changed (1) hide show
  1. app.py +34 -26
app.py CHANGED
@@ -22,7 +22,7 @@ PIXTAL_MODEL_ID = "mistral-community--pixtral-12b-240910"
22
  PIXTRAL_MODEL_SNAPSHOT = "95758896fcf4691ec9674f29ec90d1441d9d26d2"
23
  PIXTRAL_MODEL_PATH = (
24
  pathlib.Path().home()
25
- / f".cache/huggingface/hub/models--{PIXTAL_MODEL_ID}/{PIXTRAL_MODEL_SNAPSHOT}"
26
  )
27
 
28
 
@@ -30,13 +30,13 @@ COLPALI_GEMMA_MODEL_ID = "vidore--colpaligemma-3b-pt-448-base"
30
  COLPALI_GEMMA_MODEL_SNAPSHOT = "12c59eb7e23bc4c26876f7be7c17760d5d3a1ffa"
31
  COLPALI_GEMMA_MODEL_PATH = (
32
  pathlib.Path().home()
33
- / f".cache/huggingface/hub/models--{COLPALI_GEMMA_MODEL_ID}/{COLPALI_GEMMA_MODEL_SNAPSHOT}"
34
  )
35
  COLPALI_MODEL_ID = "vidore--colpali-v1.2"
36
  COLPALI_MODEL_SNAPSHOT = "2d54d5d3684a4f5ceeefbef95df0c94159fd6a45"
37
  COLPALI_MODEL_PATH = (
38
  pathlib.Path().home()
39
- / f".cache/huggingface/hub/models--{COLPALI_MODEL_ID}/{COLPALI_MODEL_SNAPSHOT}"
40
  )
41
 
42
 
@@ -46,11 +46,15 @@ def image_to_base64(image_path):
46
  return f"data:image/jpeg;base64,{encoded_string}"
47
 
48
 
49
- @spaces.GPU
50
- def model_inference(
51
  images,
52
  text,
53
  ):
 
 
 
 
54
  tokenizer = MistralTokenizer.from_file(f"{PIXTRAL_MODEL_PATH}/tekken.json")
55
  model = Transformer.from_folder(PIXTRAL_MODEL_PATH)
56
 
@@ -80,8 +84,13 @@ def model_inference(
80
  return result
81
 
82
 
83
- @spaces.GPU
84
- def search(query: str, ds, images, k):
 
 
 
 
 
85
  model = ColPali.from_pretrained(
86
  COLPALI_GEMMA_MODEL_PATH,
87
  torch_dtype=torch.bfloat16,
@@ -101,11 +110,11 @@ def search(query: str, ds, images, k):
101
  embeddings_query = model(**batch_query)
102
  qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
103
 
104
- scores = processor.score(qs, ds)
105
- top_k_indices = scores.argsort(axis=1)[0][-k:]
106
  results = []
107
  for idx in top_k_indices:
108
- results.append((images[idx]), f"Page {idx}")
109
  del model
110
  del processor
111
  torch.cuda.empty_cache()
@@ -127,7 +136,7 @@ def convert_files(files):
127
  return images
128
 
129
 
130
- @spaces.GPU
131
  def index_gpu(images, ds):
132
  model = ColPali.from_pretrained(
133
  COLPALI_GEMMA_MODEL_PATH,
@@ -173,8 +182,8 @@ css = """
173
  max-width: 600px;
174
  }
175
  """
176
- file = gr.File(file_types=["pdf"], file_count="multiple", label="pdfs")
177
- query = gr.Textbox(placeholder="Enter your query here", label="query")
178
 
179
  with gr.Blocks(
180
  title="Document Question Answering with ColPali & Pixtral",
@@ -201,32 +210,31 @@ with gr.Blocks(
201
  img_chunk = gr.State(value=[])
202
 
203
  with gr.Column(scale=3):
204
- gr.Markdown("## Search with ColPali")
205
  query.render()
206
  k = gr.Slider(
207
- minimum=1, maximum=4, step=1, label="Number of results", value=1
 
 
 
 
208
  )
209
- search_button = gr.Button("πŸ” Run", variant="primary")
210
 
211
  # Define the actions
212
 
213
  output_gallery = gr.Gallery(
214
- label="Retrieved Documents", height=600, show_label=True
215
  )
 
216
 
217
  convert_button.click(
218
  index, inputs=[file, embeds], outputs=[message, embeds, imgs]
219
  )
220
- search_button.click(
221
- search, inputs=[query, embeds, imgs, k], outputs=[output_gallery]
222
- )
223
-
224
- gr.Markdown("## Get your answer with Pixtral")
225
- answer_button = gr.Button("Run", variant="primary")
226
- output = gr.Markdown(label="Output")
227
  answer_button.click(
228
- model_inference, inputs=[output_gallery, query], outputs=output
229
- )
 
230
 
231
  if __name__ == "__main__":
232
  demo.queue(max_size=10).launch()
 
22
  PIXTRAL_MODEL_SNAPSHOT = "95758896fcf4691ec9674f29ec90d1441d9d26d2"
23
  PIXTRAL_MODEL_PATH = (
24
  pathlib.Path().home()
25
+ / f".cache/huggingface/hub/models--{PIXTAL_MODEL_ID}/snapshots/{PIXTRAL_MODEL_SNAPSHOT}"
26
  )
27
 
28
 
 
30
  COLPALI_GEMMA_MODEL_SNAPSHOT = "12c59eb7e23bc4c26876f7be7c17760d5d3a1ffa"
31
  COLPALI_GEMMA_MODEL_PATH = (
32
  pathlib.Path().home()
33
+ / f".cache/huggingface/hub/models--{COLPALI_GEMMA_MODEL_ID}/snapshots/{COLPALI_GEMMA_MODEL_SNAPSHOT}"
34
  )
35
  COLPALI_MODEL_ID = "vidore--colpali-v1.2"
36
  COLPALI_MODEL_SNAPSHOT = "2d54d5d3684a4f5ceeefbef95df0c94159fd6a45"
37
  COLPALI_MODEL_PATH = (
38
  pathlib.Path().home()
39
+ / f".cache/huggingface/hub/models--{COLPALI_MODEL_ID}/snapshots/{COLPALI_MODEL_SNAPSHOT}"
40
  )
41
 
42
 
 
46
  return f"data:image/jpeg;base64,{encoded_string}"
47
 
48
 
49
+ @spaces.GPU(duration=30)
50
+ def pixtral_inference(
51
  images,
52
  text,
53
  ):
54
+ if len(images) == 0:
55
+ raise gr.Error("No images for generation")
56
+ if text == "":
57
+ raise gr.Error("No query for generation")
58
  tokenizer = MistralTokenizer.from_file(f"{PIXTRAL_MODEL_PATH}/tekken.json")
59
  model = Transformer.from_folder(PIXTRAL_MODEL_PATH)
60
 
 
84
  return result
85
 
86
 
87
+ @spaces.GPU(duration=30)
88
+ def retrieve(query: str, ds, images, k):
89
+ if len(images) == 0:
90
+ raise gr.Error("No docs/images for retrieval")
91
+ if query == "":
92
+ raise gr.Error("No query for retrieval")
93
+
94
  model = ColPali.from_pretrained(
95
  COLPALI_GEMMA_MODEL_PATH,
96
  torch_dtype=torch.bfloat16,
 
110
  embeddings_query = model(**batch_query)
111
  qs.extend(list(torch.unbind(embeddings_query.to("cpu"))))
112
 
113
+ scores = processor.score(qs, ds).numpy()
114
+ top_k_indices = scores.argsort(axis=1)[0][-k:][::-1]
115
  results = []
116
  for idx in top_k_indices:
117
+ results.append((images[idx], f"Page {idx}, Score {scores[0][idx]:.2f}"))
118
  del model
119
  del processor
120
  torch.cuda.empty_cache()
 
136
  return images
137
 
138
 
139
+ @spaces.GPU(duration=30)
140
  def index_gpu(images, ds):
141
  model = ColPali.from_pretrained(
142
  COLPALI_GEMMA_MODEL_PATH,
 
182
  max-width: 600px;
183
  }
184
  """
185
+ file = gr.File(file_types=["pdf"], file_count="multiple", label="Pdfs")
186
+ query = gr.Textbox("", placeholder="Enter your query here", label="Query")
187
 
188
  with gr.Blocks(
189
  title="Document Question Answering with ColPali & Pixtral",
 
210
  img_chunk = gr.State(value=[])
211
 
212
  with gr.Column(scale=3):
213
+ gr.Markdown("## Retrieve with ColPali and Answer with Pixtral")
214
  query.render()
215
  k = gr.Slider(
216
+ minimum=1,
217
+ maximum=4,
218
+ step=1,
219
+ label="Number of docs to retrieve",
220
+ value=1,
221
  )
222
+ answer_button = gr.Button("πŸƒ Run", variant="primary")
223
 
224
  # Define the actions
225
 
226
  output_gallery = gr.Gallery(
227
+ label="Retrieved docs", height=400, show_label=True, interactive=False
228
  )
229
+ output = gr.Textbox(label="Answer", lines=2, interactive=False)
230
 
231
  convert_button.click(
232
  index, inputs=[file, embeds], outputs=[message, embeds, imgs]
233
  )
 
 
 
 
 
 
 
234
  answer_button.click(
235
+ retrieve, inputs=[query, embeds, imgs, k], outputs=[output_gallery]
236
+ ).then(pixtral_inference, inputs=[output_gallery, query], outputs=[output])
237
+
238
 
239
  if __name__ == "__main__":
240
  demo.queue(max_size=10).launch()