Csplk's picture
Update app.py
8a8a62b verified
raw
history blame
1.94 kB
import spaces
import torch
import re
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
from PIL import Image
# Run in half precision on a CUDA device when one is available; otherwise
# fall back to full precision on the CPU.
device, dtype = (
    ("cuda", torch.float16)
    if torch.cuda.is_available()
    else ("cpu", torch.float32)
)
# Checkpoint identifier and the revision of it to load; the same pair is used
# for both the tokenizer and the model so they stay in sync.
model_id = "vikhyatk/moondream2"
revision = "2024-04-02"

tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)

# trust_remote_code=True executes model code shipped with the checkpoint.
# NOTE(review): confirm that `revision` is an immutable reference so the
# executed code cannot change underneath this app.
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    trust_remote_code=True,
    revision=revision,
    torch_dtype=dtype,
)
moondream = moondream.to(device=device)

# Inference only: switch the module to evaluation mode.
moondream.eval()
@spaces.GPU(duration=10)
def answer_questions(image_tuples, prompt_text):
    """Answer comma-separated prompts about the uploaded images.

    Prompts and images are handed to ``moondream.batch_answer`` as two
    parallel lists, and the answers are formatted as a Q/A transcript.

    Args:
        image_tuples: Value of the gallery input. Each entry is indexed with
            ``[0]`` to obtain the image; entries whose image is ``None`` are
            skipped. ``None`` (no upload) is treated as an empty gallery.
        prompt_text: Prompts separated by commas. Surrounding whitespace is
            stripped from each prompt. ``None`` is treated as an empty string.

    Returns:
        One "Q: <prompt>" line and one "A: <answer>" line per prompt/answer
        pair, each pair followed by a blank line.

    Raises:
        gr.Error: If no image was uploaded or no non-empty prompt was given.
    """
    print(f"prompt_text:\n{prompt_text}\n")
    print(f"image_tuples:\n{image_tuples}\n")

    # The comma is the prompt delimiter, so a single prompt cannot contain one.
    prompts = [p.strip() for p in (prompt_text or "").split(",")]
    # An empty gallery arrives as None rather than an empty list; iterating it
    # directly would raise TypeError before the model is ever called.
    images = [item[0] for item in (image_tuples or []) if item[0] is not None]
    print(f"image_embeds:\n{images}\n")
    print(f"split prompts:\n{prompts}\n")

    # Fail with a message the UI can display instead of an opaque traceback.
    if not images:
        raise gr.Error("Please upload at least one image.")
    if not any(prompts):
        raise gr.Error("Please enter at least one prompt.")

    # NOTE(review): images and prompts are passed as parallel lists; how
    # batch_answer treats lists of different lengths is not visible here.
    # The zip() below stops at the shorter of prompts/answers -- confirm that
    # silently dropping the remainder is the intended behavior.
    answers = moondream.batch_answer(
        images=images,
        prompts=prompts,
        tokenizer=tokenizer,
    )

    entries = []
    for question, answer in zip(prompts, answers):
        print(f"Q: {question}")
        print(f"A: {answer}")
        print()
        entries.append(f"Q: {question}\nA: {answer}\n\n")
    return "".join(entries)
# Build the UI: a gallery for image upload and a textbox for prompts side by
# side, then a submit button and a text area that shows the model's answers.
with gr.Blocks() as demo:
    # The heading emoji (U+1F314, waxing gibbous moon) is written as an escape
    # so it cannot be corrupted by a wrong file encoding again.
    gr.Markdown(
        "# \U0001F314 moondream2\n"
        "A tiny vision language model. "
        "[GitHub](https://github.com/vikhyatk/moondream)"
    )
    with gr.Row():
        img = gr.Gallery(label="Upload Images", type="pil")
        prompt = gr.Textbox(
            label="Input Prompts",
            placeholder=(
                "Enter prompts separated by commas. "
                "Ex: Describe this image, What is in this image?"
            ),
            lines=2,
        )
    submit = gr.Button("Submit")
    output = gr.TextArea(label="Responses", lines=4)
    # Clicking submit sends the gallery and prompt values to answer_questions
    # and writes the returned transcript into the output area.
    submit.click(answer_questions, [img, prompt], output)

# Enable request queuing, then start the app.
demo.queue().launch()