"""Gradio chat demo for the ``q-future/co-instruct-preview`` model.

The page shows two optional image inputs and a chat box. Each user message is
combined with the uploaded image(s) and the previous turns into one prompt
that is sent to the model.
"""

import gradio as gr
import requests
import torch
from PIL import Image
from transformers import AutoModelForCausalLM

# Loaded once at import time and shared by every chat request.
# NOTE(review): trust_remote_code=True runs Python code shipped in the model
# repository; `model.chat` and `model.tokenizer` used below presumably come
# from that remote code -- confirm against the model card.
model = AutoModelForCausalLM.from_pretrained(
    "q-future/co-instruct-preview",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    attn_implementation="eager",
    device_map={"": "cuda:0"},
)

# Text that opens the conversation and carries one `<|image|>` placeholder per
# supplied image. It takes the place of the plain "USER:" marker on the very
# first user turn.
_SINGLE_IMAGE_PREFIX = "USER: The image: <|image|> "
_TWO_IMAGE_PREFIX = "USER: The first image: <|image|>\nThe second image: <|image|>"


def _build_prompt(message, history, image_count):
    """Return the full prompt: image prefix, every past turn, then *message*.

    Args:
        message: The new user message.
        history: Past turns as ``(user_text, assistant_text)`` pairs, oldest
            first; may be empty or ``None``.
        image_count: Number of images being sent (1 or 2); selects the prefix.
    """
    prefix = _TWO_IMAGE_PREFIX if image_count == 2 else _SINGLE_IMAGE_PREFIX
    # Every past turn is replayed exactly once, in order.
    turns = [f"{user_text} ASSISTANT:{reply}" for user_text, reply in history or []]
    # The trailing "ASSISTANT:" is left open for the model to complete.
    turns.append(f"{message} ASSISTANT:")
    return prefix + "USER:".join(turns)


def chat(message, history, image_1, image_2):
    """Answer *message* about the uploaded image(s), given the chat history.

    Args:
        message: The new user message.
        history: Previous turns as supplied by ``gr.ChatInterface``; indexed
            as ``(user_text, assistant_text)`` pairs (assumes the pair-style
            history format -- confirm for the installed Gradio version).
        image_1: First image, or ``None`` if not uploaded.
        image_2: Second image, or ``None`` if not uploaded.

    Returns:
        The decoded model output following the last ``"ASSISTANT:"`` marker.

    Raises:
        gr.Error: If neither image has been uploaded.
    """
    print(history)

    # Use whichever images are present. If only one slot is filled (either
    # one), it is sent with the single-image prompt.
    images = [image for image in (image_1, image_2) if image is not None]
    if not images:
        raise gr.Error("Please upload at least one image before sending a message.")

    prompt = _build_prompt(message, history, len(images))
    print(prompt)

    output_ids = model.chat(prompt, images, max_new_tokens=150)
    # Token ids are clamped into [0, 100000] before decoding (presumably to
    # guard against ids the tokenizer cannot decode -- confirm).
    decoded = model.tokenizer.batch_decode(output_ids.clamp(0, 100000))
    # Keep only the text after the final "ASSISTANT:" marker.
    return decoded[0].split("ASSISTANT:")[-1]


# Banner rendered at the top of the page (Markdown).
title_markdown = """
*Preview Version (v1)! Now we support two images as inputs! Try it now!*

Q-Instruct (mPLUG-Owl-2)

Q-Instruct: Improving Low-level Visual Abilities for Multi-modality Foundation Models

Please find our more accurate visual scoring demo on [OneScorer]!
"""

with gr.Blocks(title="img") as demo:
    gr.Markdown(title_markdown)
    with gr.Row():
        input_img_1 = gr.Image(type='pil', label="Image 1 (The first image)")
        input_img_2 = gr.Image(type='pil', label="Image 2 (The second image)")
    # The two image components are passed to `chat` as extra arguments after
    # (message, history).
    gr.ChatInterface(fn=chat, additional_inputs=[input_img_1, input_img_2])

demo.launch(share=True)