Spaces:

gizemsarsinlar
/

Phi-3.5-Artwork-Analysis

Runtime error

File size: 2,842 Bytes

72bc02d
 
 
 
 
3d9d048
 
72bc02d
 
 
 
 
acaccf4
 
 
 
 
 
 
2b49fb1
acaccf4
 
 
 
 
 
72bc02d
9f684be
acaccf4
 
72bc02d
 
 
3d9d048
 
 
 
 
 
72bc02d
acaccf4
 
 
 
 
72bc02d
 
9f684be
 
 
 
 
 
 
72bc02d
9f684be
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c703df1

import gradio as gr
import spaces
from transformers import AutoModelForCausalLM, AutoProcessor
import torch
from PIL import Image
import subprocess
# Install flash-attn at startup with FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE set
# for the installer.
#
# The variable is merged into a copy of the current environment: passing a
# bare dict as `env=` would *replace* the child's environment, dropping PATH,
# HOME and any virtualenv settings. `sys.executable -m pip` installs into the
# interpreter that is running this app, and the argument list avoids a shell.
# The install stays best-effort: a failing pip does not abort startup.
import os
import sys

subprocess.run(
    [sys.executable, "-m", "pip", "install", "flash-attn", "--no-build-isolation"],
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=False,
)

# Chat-template fragments used to assemble the prompt string for the model.
user_prompt = "<|user|>\n"
assistant_prompt = "<|assistant|>\n"
prompt_suffix = "<|end|>\n"

# Hugging Face model id: default for `run_example` and the only dropdown choice.
model_name = "microsoft/Phi-3.5-vision-instruct"

# Loaded (model, processor) pairs keyed by model id, so a checkpoint is
# instantiated at most once per process instead of on every request.
_MODEL_CACHE = {}


def get_model_and_processor(model_id):
    """Return the ``(model, processor)`` pair for *model_id*, loading on first use.

    The first call for a given id loads the model in bfloat16, moves it to
    CUDA, switches it to eval mode, and loads the matching processor. Later
    calls with the same id return the cached pair.

    Args:
        model_id: Identifier passed to ``from_pretrained``.

    Returns:
        A ``(model, processor)`` tuple.

    Note:
        ``trust_remote_code=True`` lets the model repository's own Python
        code run during loading; only use ids you trust.
    """
    # NOTE(review): the cache only helps while the calling process stays
    # alive; confirm how the `spaces.GPU` worker lifecycle interacts with it.
    cached = _MODEL_CACHE.get(model_id)
    if cached is None:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).cuda().eval()
        processor = AutoProcessor.from_pretrained(
            model_id,
            trust_remote_code=True,
        )
        cached = (model, processor)
        _MODEL_CACHE[model_id] = cached
    return cached

# NOTE(review): confirm that `memory` is a keyword `spaces.GPU` accepts.
@spaces.GPU(memory=30)
def run_example(image, text_input=None, model_id=model_name):
    """Generate the model's answer to a question about an image.

    Args:
        image: Image data accepted by ``PIL.Image.fromarray`` (presumably the
            numpy array produced by the ``gr.Image`` input; confirm).
        text_input: The user's question. ``None`` is treated as an empty
            question rather than being formatted into the prompt as "None".
        model_id: Model identifier passed to ``get_model_and_processor``.

    Returns:
        The decoded text of the newly generated tokens (prompt excluded).

    Raises:
        gr.Error: If no image was provided.
    """
    # Fail with a readable UI message instead of an opaque error from
    # Image.fromarray(None) when the user submits without an image.
    if image is None:
        raise gr.Error("Please provide an image before submitting.")
    question = "" if text_input is None else text_input

    model, processor = get_model_and_processor(model_id)
    prompt = f"{user_prompt}<|image_1|>\n{question}{prompt_suffix}{assistant_prompt}"
    image = Image.fromarray(image).convert("RGB")

    inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(
        **inputs,
        max_new_tokens=1000,
        eos_token_id=processor.tokenizer.eos_token_id
    )
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(
        generate_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    return response

# Styling for the element with id "output": fixed height, scrollable, bordered.
css = """
  #output {
    height: 500px;
    overflow: auto;
    border: 1px solid #ccc;
  }
"""

# Create the Gradio interface
with gr.Blocks(css=css) as demo:
    gr.Markdown("## Phi-3.5 Vision Instruct Demo with Example Inputs")

    with gr.Tab(label="Phi-3.5 Input"):
        with gr.Row():
            # Left column: inputs (image, model choice, question, submit).
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                model_selector = gr.Dropdown(
                    choices=[model_name],
                    label="Model",
                    value=model_name
                )
                text_input = gr.Textbox(label="Question")
                submit_btn = gr.Button(value="Submit")
            # Right column: the model's answer.
            with gr.Column():
                # elem_id ties this component to the `#output` rule in `css`;
                # without it the rule matches no element.
                output_text = gr.Textbox(label="Output Text", elem_id="output")

        # Each example is [image path, question].
        # NOTE(review): the image files are referenced by relative path;
        # verify they exist next to this script.
        examples = [
            ["image1.jpeg", "What does this painting tell us explain in detail?"],
            ["image2.jpg", "What does this painting tell us explain in detail?"],
            ["image3.jpg", "Describe the scene in this picture."]
        ]

        gr.Examples(
            examples=examples,
            inputs=[input_img, text_input],
            examples_per_page=3
        )

        # Submit runs inference and writes the result into the output box.
        submit_btn.click(run_example, [input_img, text_input, model_selector], [output_text])

# Enable request queuing on the app, then start serving it.
demo.queue().launch()