import subprocess
import os

# Install FlashAttention before the model is loaded. Skipping the CUDA build
# pulls the prebuilt wheel, which is much faster on Spaces hardware. Merging
# os.environ keeps PATH intact (passing a bare dict would replace the whole
# environment and can break the pip invocation).
subprocess.run(
    'pip install flash-attn --no-build-isolation',
    env={**os.environ, 'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"},
    shell=True,
)

import spaces
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM

# Initialize the tokenizer and model at a pinned revision so the remote code
# and the weights stay in sync.
model_id = "vikhyatk/moondream2"
revision = "2024-04-02"
tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
moondream = AutoModelForCausalLM.from_pretrained(
    model_id,
    revision=revision,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map={"": "cuda"},
    attn_implementation="flash_attention_2",
)
moondream.eval()


@spaces.GPU(duration=10)
def chatbot_response(img, text_input):
    if img is None:
        return "Please upload an image first."
    # Encode the image once, then answer the question against the embedding
    # via the model's remote-code API. (The previous version computed the
    # embedding but never passed it to generate(), so the image was ignored.)
    image_embeds = moondream.encode_image(img)
    return moondream.answer_question(image_embeds, text_input, tokenizer)


# Set up the Gradio interface
with gr.Blocks(theme=gr.themes.Monochrome()) as demo:
    gr.Markdown("# AskMoondream Chatbot")
    with gr.Row():
        img = gr.Image(type="pil", label="Upload an Image")
        text_input = gr.Textbox(
            label="Ask a question or describe an image",
            placeholder="Type here...",
        )
    with gr.Row():
        submit = gr.Button("Submit")
        response = gr.TextArea(
            label="Response",
            placeholder="Moondream's response will appear here...",
        )

    # Run the model on button click or when the user presses Enter
    submit.click(chatbot_response, inputs=[img, text_input], outputs=response)
    text_input.submit(chatbot_response, inputs=[img, text_input], outputs=response)

# Launch the demo
demo.queue().launch()