moondream2-batch-processing

Sleeping

File size: 3,036 Bytes

6a8ca1f
 
 
 
3068721
 
 
 
6a8ca1f
3068721
 
6a8ca1f
 
3068721
6a8ca1f
 
3068721
 
 
 
6a8ca1f
 
3068721
04fc1f1
ee5e19e
134e8f7
aae971d
 
3b88725
3068721
db2ea29
07b2bd0
3068721
 
07b2bd0
3b88725
07b2bd0
471f9af
3f71d24
680cfd1
3f71d24
 
aae971d
07b2bd0
3f71d24
aae971d
07b2bd0
134e8f7
6a8ca1f
 
ee5e19e
471f9af
07b2bd0
471f9af
07b2bd0
6a8ca1f
1635aec
69cfbe8
e9ecb71
69cfbe8
6a8ca1f
1635aec
 
 
 
07b2bd0
6a8ca1f
07b2bd0

import spaces
import torch
import re
import gradio as gr
from threading import Thread
from transformers import TextIteratorStreamer, AutoTokenizer, AutoModelForCausalLM
from PIL import ImageDraw
from torchvision.transforms.v2 import Resize

import os
import subprocess
import sys

# Best-effort install of flash-attn at startup (the return code is deliberately
# not checked, matching the original behaviour).
# The child environment must EXTEND os.environ: passing a bare dict as `env`
# replaces the whole environment, dropping PATH, HOME, proxy settings, etc.
# FLASH_ATTENTION_SKIP_CUDA_BUILD presumably tells the flash-attn build to skip
# compiling its CUDA extension -- confirm against the flash-attn docs.
subprocess.run(
    # Invoke pip through the running interpreter so the package lands in the
    # same environment this script imports from; no shell is needed.
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Checkpoint identifier and pinned revision shared by the tokenizer and model.
model_id = "vikhyatk/moondream2"
revision = "2024-08-26"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# Load the model in bfloat16 on the CUDA device with flash-attention enabled,
# then switch it to evaluation mode (Module.eval() returns the module itself).
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
).eval()

@spaces.GPU
def answer_questions(image_tuples, prompt_text):
    """Ask every prompt about every uploaded image and format the answers.

    Args:
        image_tuples: Gallery value; a sequence whose items carry the image
            as their first element (presumably ``(PIL image, caption)``
            pairs -- confirm against the Gradio version in use). ``None``
            or an empty sequence is treated as "no images"; items whose
            image is ``None`` are skipped.
        prompt_text: Comma-separated prompts. Surrounding whitespace is
            stripped and empty entries (e.g. from a trailing comma) are
            ignored.

    Returns:
        A ``(markdown, table)`` pair. ``markdown`` lists, per prompt, the
        answer for each image. ``table`` is ``{'headers': ..., 'data': ...}``
        with one column per prompt and one row per image, so every row has
        exactly ``len(headers)`` cells.
    """
    prompts = [
        cleaned
        for cleaned in (part.strip() for part in (prompt_text or "").split(','))
        if cleaned
    ]

    # Remember each image's 1-based gallery position so the "imageN" labels
    # still point at the right upload after empty slots are skipped.
    # The RGB conversion is done once here instead of once per prompt.
    positioned_images = [
        (position, item[0].convert("RGB"))
        for position, item in enumerate(image_tuples or [], start=1)
        if item[0] is not None
    ]

    if not prompts or not positioned_images:
        # Nothing to ask or nothing to look at: skip inference entirely.
        return (
            "Please provide at least one image and at least one prompt.",
            {'headers': prompts, 'data': []},
        )

    rgb_images = [image for _, image in positioned_images]

    # One batched model call per prompt. batch_answer is assumed to return one
    # answer per image, in input order -- the formatting below relies on that.
    # Calling it directly (rather than in a start-then-join thread) lets a
    # model failure surface as its real exception.
    answers = [
        moondream.batch_answer(
            images=rgb_images,
            prompts=[prompt] * len(rgb_images),
            tokenizer=tokenizer,
        )
        for prompt in prompts
    ]

    sections = []
    for prompt, prompt_answers in zip(prompts, answers):
        sections.append(f"### Q: {prompt}\n")
        sections.extend(
            f"**image{position} A:** \n {answer_text} \n"
            for (position, _), answer_text in zip(positioned_images, prompt_answers)
        )
    Q_and_A = "".join(sections)

    # `answers` is prompt-major; transpose it so each table row is one image
    # and each column lines up with its prompt header.
    table_rows = [list(row) for row in zip(*answers)]
    result = {'headers': prompts, 'data': table_rows}
    print("result\n{}\n\nQ_and_A\n{}\n\n".format(result, Q_and_A))
    return Q_and_A, result

# Page layout: instructions, image gallery, prompt box, submit button, then the
# two outputs (markdown Q&A and a tabular view) fed by answer_questions.
with gr.Blocks() as demo:
    gr.Markdown("# moondream2 unofficial batch processing demo")
    gr.Markdown("1. Select images\n2. Enter one or more prompts separated by commas. Ex: Describe this image, What is in this image?\n\n")
    gr.Markdown("**Currently each image will be sent as a batch with the prompts thus asking each prompt on each image**")
    # The model in this file is loaded on CUDA with flash-attention, so the
    # note no longer claims a CPU tier.
    gr.Markdown("*Inference runs on GPU hardware; large batches of images or prompts may still take a while to process*")
    gr.Markdown("A tiny vision language model. [moondream2](https://huggingface.co/vikhyatk/moondream2)")
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil", preview=True, columns=4)
    with gr.Row():
        # Placeholder matches the actual behaviour: every prompt is applied to
        # every image, not one prompt per image.
        prompt = gr.Textbox(label="Input Prompts", placeholder="Enter one or more prompts separated by commas; every prompt is asked about every image. Ex: Describe this image, What is in this image?", lines=8)
    with gr.Row():
        submit = gr.Button("Submit")
    with gr.Row():
        output = gr.Markdown(label="Questions and Answers", line_breaks=True)
    with gr.Row():
        output2 = gr.Dataframe(label="Structured Dataframe", type="array", wrap=True)
    # answer_questions returns (markdown, table) in this order.
    submit.click(answer_questions, inputs=[img, prompt], outputs=[output, output2])

# Enable request queuing and start the server.
demo.queue().launch()