Salma Mayorquin committed
Commit 0945ad6 · Parent(s): a6e329b
initial commit
Files changed:
- README.md +3 -3
- app.py +83 -0
- examples/warehouse_1.jpg +0 -0
- examples/warehouse_2.jpg +0 -0
- requirements.txt +28 -0
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 title: SpaceLLaVA
-emoji:
-colorFrom:
-colorTo:
+emoji: 🛸
+colorFrom: blue
+colorTo: purple
 sdk: gradio
 sdk_version: 4.20.1
 app_file: app.py
app.py ADDED
@@ -0,0 +1,83 @@
+import io
+import base64
+import numpy as np
+import torch
+import matplotlib
+import matplotlib.cm
+import gradio as gr
+from PIL import Image
+
+from llama_cpp import Llama
+from llama_cpp.llama_chat_format import Llava15ChatHandler
+
+# Converts an image input (PIL Image or file path) into a base64 data URI
+def image_to_base64_data_uri(image_input):
+    if isinstance(image_input, str):
+        with open(image_input, "rb") as img_file:
+            base64_data = base64.b64encode(img_file.read()).decode('utf-8')
+    elif isinstance(image_input, Image.Image):
+        buffer = io.BytesIO()
+        image_input.save(buffer, format="PNG")
+        base64_data = base64.b64encode(buffer.getvalue()).decode('utf-8')
+    else:
+        raise ValueError("Unsupported input type. Input must be a file path or a PIL.Image.Image instance.")
+    return f"data:image/png;base64,{base64_data}"
+
+class Llava:
+    def __init__(self, mmproj="model/mmproj-model-f16.gguf", model_path="model/ggml-model-q4_0.gguf", gpu=False):
+        chat_handler = Llava15ChatHandler(clip_model_path=mmproj, verbose=True)
+        n_gpu_layers = 0
+        if gpu:
+            n_gpu_layers = -1
+        self.llm = Llama(model_path=model_path, chat_handler=chat_handler, n_ctx=2048, logits_all=True, n_gpu_layers=n_gpu_layers)
+
+    def run_inference(self, image, prompt):
+        data_uri = image_to_base64_data_uri(image)
+        res = self.llm.create_chat_completion(
+            messages=[
+                {"role": "system", "content": "You are an assistant who perfectly describes images."},
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "image_url", "image_url": {"url": data_uri}},
+                        {"type": "text", "text": prompt}
+                    ]
+                }
+            ]
+        )
+        return res["choices"][0]["message"]["content"]
+
+# Initialize the model
+llm_model = Llava()
+
+title_and_links_markdown = """
+# 🛸SpaceLLaVA🌋: A spatial reasoning multi-modal model
+This space hosts our initial release of LLaVA 1.5 LoRA tuned for spatial reasoning using data generated with [VQASynth](https://github.com/remyxai/VQASynth).
+Upload an image and ask a question.
+
+[Model](https://huggingface.co/remyxai/SpaceLLaVA) | [Code](https://github.com/remyxai/VQASynth) | [Paper](https://spatial-vlm.github.io)
+"""
+
+def predict(image, prompt):
+    result = llm_model.run_inference(image, prompt)
+    return result
+
+image_input = gr.Image(type="pil", label="Input Image")
+text_input = gr.Textbox(label="Prompt")
+
+examples = [
+    ["examples/warehouse_1.jpg", "Is the man wearing gray pants to the left of the pile of boxes on a pallet?"],
+    ["examples/warehouse_2.jpg", "Is the forklift taller than the shelves of boxes?"],
+]
+
+# Initialize the interface; examples must be passed to the constructor to be displayed
+iface = gr.Interface(
+    fn=predict,
+    inputs=[image_input, text_input],
+    outputs="text",
+    title="Llava Model Inference",
+    description=title_and_links_markdown,
+    examples=examples,
+)
+
+iface.launch()
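app.py expects the quantized SpaceLLaVA weights under model/, which are not included in this commit. A minimal sketch of one way to fetch them with huggingface_hub before launching the app, assuming the GGUF files are published under these names in the linked model repo (the filenames are assumptions, not confirmed by this diff):

# Hypothetical setup step: download the GGUF weights app.py expects into model/.
# The repo_id comes from the Model link above; the filenames are assumptions.
from huggingface_hub import hf_hub_download

for filename in ["ggml-model-q4_0.gguf", "mmproj-model-f16.gguf"]:
    hf_hub_download(
        repo_id="remyxai/SpaceLLaVA",
        filename=filename,
        local_dir="model",  # matches the default paths in Llava.__init__
    )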
examples/warehouse_1.jpg ADDED
examples/warehouse_2.jpg ADDED
requirements.txt ADDED
@@ -0,0 +1,28 @@
+pip
+einops
+fastapi
+gradio==3.35.2
+markdown2[all]
+numpy
+requests
+sentencepiece
+tokenizers>=0.12.1
+torch==2.0.1
+torchvision==0.15.2
+uvicorn
+wandb
+shortuuid
+pillow
+httpx==0.24.0
+deepspeed==0.9.5
+peft==0.4.0
+transformers==4.31.0
+accelerate==0.21.0
+bitsandbytes==0.41.0
+scikit-learn==1.2.2
+sentencepiece==0.1.99
+einops==0.6.1
+einops-exts==0.0.4
+llama-cpp-python==0.2.55
+timm==0.6.13
+gradio_client==0.2.9
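Once the Space is running, it can also be queried programmatically with the pinned gradio_client package. A rough sketch, assuming the Space id is remyxai/SpaceLLaVA and that the default /predict route is exposed (both assumptions); with gradio_client==0.2.9 an image input is passed as a file path string:

# Hypothetical client-side call to the deployed Space; the Space id and
# api_name are assumptions, and the client version must match the Space's
# Gradio release.
from gradio_client import Client

client = Client("remyxai/SpaceLLaVA")
result = client.predict(
    "examples/warehouse_1.jpg",                           # image input (local path or URL)
    "Is the forklift taller than the shelves of boxes?",  # prompt
    api_name="/predict",
)
print(result)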