import gradio as gr from lavis.models import load_model_and_preprocess import torch device = torch.device("cuda") if torch.cuda.is_available() else "cpu" model_name = "blip2_t5_instruct" model_type = "flant5xl" model, vis_processors, _ = load_model_and_preprocess( name=model_name, model_type=model_type, is_eval=True, device=device ) def infer(image, prompt, min_len, max_len, beam_size, len_penalty, repetition_penalty, top_p, decoding_method): use_nucleus_sampling = decoding_method == "Nucleus sampling" image = vis_processors["eval"](image).unsqueeze(0).to(device) samples = { "image": image, "prompt": prompt, } output = model.generate( samples, length_penalty=float(len_penalty), repetition_penalty=float(repetition_penalty), num_beams=beam_size, max_length=max_len, min_length=min_len, top_p=top_p, use_nucleus_sampling=use_nucleus_sampling ) return output[0] theme = gr.themes.Monochrome( primary_hue="indigo", secondary_hue="blue", neutral_hue="slate", radius_size=gr.themes.sizes.radius_sm, font=[gr.themes.GoogleFont("Open Sans"), "ui-sans-serif", "system-ui", "sans-serif"], ) css = ".generating {visibility: hidden}" examples = [ ["banff.jpg", "Can you tell me about this image in detail", 1, 200, 5, 1, 3, 0.9, "Beam search"] ] with gr.Blocks(theme=theme, analytics_enabled=False,css=css) as demo: gr.Markdown("## InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning") gr.Markdown( """ Unofficial demo for InstructBLIP. InstructBLIP is a new vision-language instruction-tuning framework by Salesforce that uses BLIP-2 models, achieving state-of-the-art zero-shot generalization performance on a wide range of vision-language tasks. The demo is based on the official Github implementation """ ) gr.HTML("
You can duplicate this Space to run it privately without a queue for shorter queue times :