Spaces:
Runtime error
Runtime error
from __future__ import annotations | |
import spaces | |
import gradio as gr | |
from threading import Thread | |
from transformers import TextIteratorStreamer | |
import hashlib | |
import os | |
from transformers import AutoModel, AutoProcessor | |
import torch | |
model = AutoModel.from_pretrained("visheratin/MC-LLaVA-3b", torch_dtype=torch.float16, trust_remote_code=True).to("cuda") | |
processor = AutoProcessor.from_pretrained("visheratin/MC-LLaVA-3b", trust_remote_code=True) | |
if torch.cuda.is_available(): | |
DEVICE = "cuda" | |
DTYPE = torch.float16 | |
else: | |
DEVICE = "cpu" | |
DTYPE = torch.float32 | |
def cached_vision_process(image, max_crops, num_tokens): | |
image_hash = hashlib.sha256(image.tobytes()).hexdigest() | |
cache_path = f"visual_cache/{image_hash}-{max_crops}-{num_tokens}.pt" | |
if os.path.exists(cache_path): | |
return torch.load(cache_path).to(DEVICE, dtype=DTYPE) | |
else: | |
processor_outputs = processor.image_processor([image], max_crops) | |
pixel_values = processor_outputs["pixel_values"] | |
pixel_values = [ | |
value.to(model.device).to(model.dtype) for value in pixel_values | |
] | |
coords = processor_outputs["coords"] | |
coords = [value.to(model.device).to(model.dtype) for value in coords] | |
image_outputs = model.vision_model(pixel_values, coords, num_tokens) | |
image_features = model.multi_modal_projector(image_outputs) | |
os.makedirs("visual_cache", exist_ok=True) | |
torch.save(image_features, cache_path) | |
return image_features.to(DEVICE, dtype=DTYPE) | |
def answer_question(image, question, max_crops, num_tokens): | |
prompt = f"""<|im_start|>user | |
<image> | |
{question}<|im_end|> | |
<|im_start|>assistant | |
""" | |
streamer = TextIteratorStreamer(processor.tokenizer, skip_special_tokens=True) | |
inputs = processor(prompt, [image], model, max_crops=max_crops, num_tokens=num_tokens) | |
generation_kwargs = { | |
"input_ids": inputs["input_ids"], | |
"attention_mask": inputs["attention_mask"], | |
"image_features": cached_vision_process(image, max_crops, num_tokens), | |
"streamer": streamer, | |
"max_length": 1000, | |
"use_cache": True, | |
"eos_token_id": processor.tokenizer.eos_token_id, | |
"pad_token_id": processor.tokenizer.eos_token_id, | |
} | |
thread = Thread(target=model.generate, kwargs=generation_kwargs) | |
thread.start() | |
buffer = "" | |
for new_text in streamer: | |
buffer += new_text | |
if len(buffer) > 1: | |
yield buffer | |
with gr.Blocks() as demo: | |
gr.HTML("<h1 class='gradio-heading'><center>MC-LLaVA 3B</center></h1>") | |
gr.HTML( | |
"<center><p class='gradio-sub-heading'>MC-LLaVA 3B is a model that can answer questions about small details in high-resolution images. Check out the <a href='https://huggingface.co/visheratin/MC-LLaVA-3b'>model card</a> for more details. If you have any questions or ideas hot to make the model better, <a href='https://x.com/visheratin'>let me know</a></p></center>" | |
) | |
with gr.Group(): | |
with gr.Row(): | |
prompt = gr.Textbox( | |
label="Question", placeholder="e.g. What is this?", scale=4 | |
) | |
submit = gr.Button( | |
"Submit", | |
scale=1, | |
) | |
with gr.Row(): | |
max_crops = gr.Slider(minimum=0, maximum=200, step=5, value=0, label="Max crops") | |
num_tokens = gr.Slider(minimum=728, maximum=2184, step=10, value=728, label="Number of image tokens") | |
with gr.Row(): | |
img = gr.Image(type="pil", label="Upload or Drag an Image") | |
output = gr.TextArea(label="Answer") | |
submit.click(answer_question, [img, prompt, max_crops, num_tokens], output) | |
prompt.submit(answer_question, [img, prompt, max_crops, num_tokens], output) | |
demo.queue().launch(debug=True) |