Pheye / app.py
miguelcarv's picture
added spaces
c40944a
raw
history blame
3.82 kB
import gradio as gr
from huggingface_hub import InferenceClient
import json
from pheye_builder import create_model_and_transforms
from huggingface_hub import hf_hub_download
import torch
from PIL import Image
import os
import requests
import spaces
def get_config(hf_model_path):
    """Download and parse the ``config.json`` of a Hugging Face Hub repository.

    Args:
        hf_model_path: Repository identifier, passed to ``hf_hub_download``
            as its first argument.

    Returns:
        The object ``json.load`` produces from the downloaded file.
    """
    config_path = hf_hub_download(hf_model_path, "config.json")
    # JSON is UTF-8 by specification; pin the encoding so parsing does not
    # depend on the locale default of the machine running the app.
    with open(config_path, encoding="utf-8") as f:
        return json.load(f)
def get_model_path(hf_model_path):
    """Return the path ``hf_hub_download`` reports for ``checkpoint.pt``.

    Args:
        hf_model_path: Repository identifier, passed to ``hf_hub_download``
            as its first argument.

    Returns:
        Whatever ``hf_hub_download`` returns for the ``checkpoint.pt`` file.
    """
    checkpoint_file = hf_hub_download(hf_model_path, "checkpoint.pt")
    return checkpoint_file
# Hub repository from which both config.json and checkpoint.pt are fetched.
HF_MODEL = "miguelcarv/Pheye-x2-672"

config = get_config(HF_MODEL)
print("Got config")


def _build_model(cfg):
    """Create the model and tokenizer described by *cfg*.

    Reads the keys ``encoder``, ``decoder``, ``tokenizer``,
    ``cross_interval``, ``level``, ``reduce``, ``from_layer``,
    ``encoder_dtype``, ``decoder_dtype``, ``other_params_dtype`` and
    ``first_level`` from *cfg*.

    Returns:
        The ``(model, tokenizer)`` pair returned by
        ``create_model_and_transforms``, with the first-level adapter added
        to the vision encoder when ``cfg["first_level"]`` is truthy.
    """
    # NOTE(review): the three dtype entries are evaluated with eval(), which
    # executes whatever string the downloaded config contains. Presumably
    # they look like "torch.float16" -- confirm, and consider resolving them
    # via getattr(torch, ...) instead.
    net, tok = create_model_and_transforms(
        clip_vision_encoder_path=cfg["encoder"],
        lang_decoder_path=cfg["decoder"],
        tokenizer_path=cfg["tokenizer"],
        cross_attn_every_n_layers=cfg["cross_interval"],
        level=cfg["level"],
        reduce_factor=cfg["reduce"],
        from_layer=cfg["from_layer"],
        encoder_dtype=eval(cfg["encoder_dtype"]),
        decoder_dtype=eval(cfg["decoder_dtype"]),
        dtype=eval(cfg["other_params_dtype"]),
    )
    if cfg["first_level"]:
        net.vision_encoder.add_first_level_adapter()
    return net, tok


model, tokenizer = _build_model(config)
print("Created model")

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

model_path = get_model_path(HF_MODEL)
# The checkpoint is mapped to the CPU first and the whole model is moved to
# DEVICE afterwards.
# NOTE(review): torch.load unpickles the file; consider weights_only=True if
# the checkpoint contains only tensors -- TODO confirm.
model.load_state_dict(torch.load(model_path, map_location="cpu"))
model = model.to(DEVICE)
print("Loaded model")
# Fixed preamble placed in front of every user instruction.
SYSTEM_PROMPT = "You are an AI visual assistant and you are seeing a single image. You will receive an instruction regarding that image. Your goal is to follow the instruction as faithfully as you can."


def _fetch_example_image(url, timeout=30):
    """Download *url* and return its contents as an RGB ``PIL.Image``.

    Args:
        url: Address of the image to download.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The downloaded image converted to mode ``"RGB"``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # The timeout keeps an unresponsive host from blocking start-up forever,
    # and the context manager releases the connection once the image has
    # been decoded.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Surface the HTTP status instead of an image-decoding failure when
        # the server sends back an error page.
        response.raise_for_status()
        # convert() returns a new image, so the pixel data has been read
        # from the stream before the response is closed.
        return Image.open(response.raw).convert("RGB")


# Example images shown in the interface; both are fetched at import time.
whiteboard = _fetch_example_image("https://c1.staticflickr.com/7/6168/6207108414_a8833f410e_o.jpg")
taxi_image = _fetch_example_image("https://llava.hliu.cc/file=/nobackup/haotian/tmp/gradio/ca10383cc943e99941ecffdc4d34c51afb2da472/extreme_ironing.jpg")
@spaces.GPU
def generate_answer(img, question, max_new_tokens, num_beams):
    """Answer *question* about *img* using the module-level model.

    Args:
        img: Image forwarded to ``model.generate`` as the only element of
            ``vision_x``.
        question: Instruction text inserted into the prompt template.
        max_new_tokens: Forwarded unchanged to ``model.generate``.
        num_beams: Forwarded unchanged to ``model.generate``.

    Returns:
        The decoded text that follows the last ``"Output:"`` marker, with
        leading whitespace removed.

    Raises:
        gr.Error: If *img* is ``None``.
    """
    # Gradio is expected to pass None when the image input is left empty;
    # report that clearly instead of failing somewhere inside the model.
    if img is None:
        raise gr.Error("Please provide an image before asking a question.")

    prompt = [f"{SYSTEM_PROMPT}\n\nInstruction: {question}\nOutput:"]
    inputs = tokenizer(prompt, padding='longest', return_tensors='pt')

    print("Generating a response with the following parameters:")
    print(f"""Question: {question}\nMax New Tokens: {max_new_tokens}\nNum Beams: {num_beams}""")

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            vision_x=[img],
            lang_x=inputs.input_ids.to(DEVICE),
            device=DEVICE,
            max_new_tokens=max_new_tokens,
            num_beams=num_beams,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
            attention_mask=inputs.attention_mask.to(DEVICE),
        )

    # Keep only what comes after the last "Output:" marker in the decoded
    # text, which drops the prompt from the returned answer.
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    answer = decoded.split("Output:")[-1].lstrip()
    return answer
# Assemble the Gradio interface. Components are created in the order
# image, question, the two sliders, then the answer box.
_input_components = [
    gr.Image(type="pil", label="Image"),
    gr.Textbox(label="Question"),
    gr.Slider(minimum=5, maximum=500, step=1, value=50, label="Max New Tokens"),
    gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Num Beams"),
]
_answer_box = gr.Textbox(label="Answer")

# Each example row supplies one value per input component, in order.
_example_rows = [
    [taxi_image, "What is unusual about this image?", 500, 3],
    [whiteboard, "What is the main topic of the whiteboard?", 500, 3],
]

iface = gr.Interface(
    fn=generate_answer,
    inputs=_input_components,
    outputs=_answer_box,
    title="<h1 style='text-align: center; display: block;'>Pheye-x2 672x672 pixels</h1>",
    examples=_example_rows,
)

if __name__ == "__main__":
    # Start the app only when the file is run as a script.
    iface.launch()