Spaces:

miguelcarv
/

Pheye

Runtime error

App Files Files Community

Pheye / app.py

miguelcarv

added spaces

c40944a 8 months ago

raw

history blame

3.82 kB

	import gradio as gr
	from huggingface_hub import InferenceClient
	import json
	from pheye_builder import create_model_and_transforms
	from huggingface_hub import hf_hub_download
	import torch
	from PIL import Image
	import os
	import requests
	import spaces

	def get_config(hf_model_path: str) -> dict:
	    """Download and parse the ``config.json`` of a Hugging Face Hub repository.

	    Args:
	        hf_model_path: Repository id handed to ``hf_hub_download``.

	    Returns:
	        The decoded JSON content of ``config.json``. Callers in this file
	        index it with string keys, so a JSON object is expected.
	    """
	    config_path = hf_hub_download(hf_model_path, "config.json")

	    # JSON is UTF-8 by specification; do not rely on the locale's default
	    # encoding when reading it.
	    with open(config_path, "r", encoding="utf-8") as f:
	        return json.load(f)


	def get_model_path(hf_model_path: str) -> str:
	    """Download ``checkpoint.pt`` from a Hub repository and return its local path.

	    Args:
	        hf_model_path: Repository id handed to ``hf_hub_download``.

	    Returns:
	        Whatever ``hf_hub_download`` returns for ``checkpoint.pt`` (the path
	        of the locally cached file).
	    """
	    return hf_hub_download(hf_model_path, "checkpoint.pt")


	# Hub repository that provides both config.json and checkpoint.pt.
	HF_MODEL = "miguelcarv/Pheye-x2-672"
	config = get_config(HF_MODEL)

	print("Got config")

	# NOTE(review): the three *_dtype values are passed through eval(), which
	# executes arbitrary Python read from a downloaded file. Presumably they are
	# strings such as "torch.float16" -- confirm, then consider replacing eval()
	# with an explicit lookup (e.g. getattr(torch, name)) restricted to dtypes.
	model, tokenizer = create_model_and_transforms(
	    clip_vision_encoder_path=config["encoder"],
	    lang_decoder_path=config["decoder"],
	    tokenizer_path=config["tokenizer"],
	    cross_attn_every_n_layers=config["cross_interval"],
	    level=config["level"],
	    reduce_factor=config["reduce"],
	    from_layer=config["from_layer"],
	    encoder_dtype=eval(config["encoder_dtype"]),
	    decoder_dtype=eval(config["decoder_dtype"]),
	    dtype=eval(config["other_params_dtype"])
	)

	# The adapter is only added when the config asks for it; this happens before
	# the checkpoint is loaded below.
	if config["first_level"]:
	    model.vision_encoder.add_first_level_adapter()

	print("Created model")

	DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
	model_path = get_model_path(HF_MODEL)
	# Weights are materialised on CPU first and moved to DEVICE afterwards.
	# NOTE(review): torch.load() unpickles the downloaded file; if the installed
	# torch version supports it, weights_only=True would restrict what can be
	# deserialised -- confirm before changing.
	model.load_state_dict(torch.load(model_path, map_location="cpu"))
	model = model.to(DEVICE)

	print("Loaded model")

	# Instruction prefix placed in front of every user question.
	SYSTEM_PROMPT = "You are an AI visual assistant and you are seeing a single image. You will receive an instruction regarding that image. Your goal is to follow the instruction as faithfully as you can."


	def _load_rgb_image(url, timeout=30):
	    """Fetch *url* over HTTP and return its content as an RGB PIL image.

	    Args:
	        url: Address of the image to download.
	        timeout: Seconds to wait for the server before giving up, so an
	            unresponsive host cannot block application start-up forever.

	    Returns:
	        The downloaded image converted to mode ``RGB``.

	    Raises:
	        requests.RequestException: On connection problems, timeouts, or a
	            non-success HTTP status.
	    """
	    # The context manager releases the streamed connection. convert()
	    # returns a new, fully decoded image, so closing afterwards is safe.
	    with requests.get(url, stream=True, timeout=timeout) as response:
	        # Fail with the HTTP error instead of an opaque image-decoding error.
	        response.raise_for_status()
	        return Image.open(response.raw).convert('RGB')


	# Example images shown in the UI; downloaded once at import time.
	whiteboard = _load_rgb_image("https://c1.staticflickr.com/7/6168/6207108414_a8833f410e_o.jpg")
	taxi_image = _load_rgb_image("https://llava.hliu.cc/file=/nobackup/haotian/tmp/gradio/ca10383cc943e99941ecffdc4d34c51afb2da472/extreme_ironing.jpg")

	@spaces.GPU
	def generate_answer(img, question, max_new_tokens, num_beams):
	    """Answer *question* about *img* with the module-level model.

	    Args:
	        img: Image supplied by the ``gr.Image(type="pil")`` input.
	        question: Instruction text inserted into the prompt.
	        max_new_tokens: Generation length limit forwarded to ``model.generate``.
	        num_beams: Beam count forwarded to ``model.generate``.

	    Returns:
	        The decoded first sequence, reduced to the text after the last
	        ``"Output:"`` marker with leading whitespace removed.

	    Raises:
	        gr.Error: If no image was provided.
	    """
	    # Submitting the form without an image passes None; report that to the
	    # user instead of failing somewhere inside the model.
	    if img is None:
	        raise gr.Error("Please provide an image.")

	    # Slider values may arrive as floats depending on the Gradio version;
	    # the generation arguments are expected to be integers.
	    max_new_tokens = int(max_new_tokens)
	    num_beams = int(num_beams)

	    image = [img]
	    prompt = [f"{SYSTEM_PROMPT}\n\nInstruction: {question}\nOutput:"]
	    inputs = tokenizer(prompt, padding='longest', return_tensors='pt')
	    print("Generating a response with the following parameters:")
	    print(f"""Question: {question}\nMax New Tokens: {max_new_tokens}\nNum Beams: {num_beams}""")

	    model.eval()
	    with torch.no_grad():
	        outputs = model.generate(
	            vision_x=image,
	            lang_x=inputs.input_ids.to(DEVICE),
	            device=DEVICE,
	            max_new_tokens=max_new_tokens,
	            num_beams=num_beams,
	            eos_token_id=tokenizer.eos_token_id,
	            pad_token_id=tokenizer.pad_token_id,
	            attention_mask=inputs.attention_mask.to(DEVICE),
	        )

	    # Keep only what follows the final "Output:" marker of the decoded text.
	    answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0].split("Output:")[-1].lstrip()

	    return answer


	# Assemble the Gradio UI: one image, one question box and two generation
	# sliders in, one answer box out.
	_input_components = [
	    gr.Image(type="pil", label="Image"),
	    gr.Textbox(label="Question"),
	    gr.Slider(minimum=5, maximum=500, step=1, value=50, label="Max New Tokens"),
	    gr.Slider(minimum=1, maximum=5, step=1, value=3, label="Num Beams"),
	]
	_answer_box = gr.Textbox(label="Answer")
	# Each example row follows the order of the input components above.
	_example_rows = [
	    [taxi_image, "What is unusual about this image?", 500, 3],
	    [whiteboard, "What is the main topic of the whiteboard?", 500, 3],
	]

	iface = gr.Interface(
	    fn=generate_answer,
	    inputs=_input_components,
	    outputs=_answer_box,
	    title="<h1 style='text-align: center; display: block;'>Pheye-x2 672x672 pixels</h1>",
	    examples=_example_rows,
	)




	if __name__ == "__main__":
	    # Launch the Gradio app only when this file is run as a script.
	    iface.launch()