import gradio as gr
from inference import inference_and_run
import spaces
import os
import re
import shutil
# Display name shown in the Gradio UI header below.
model_name = 'Ferret-UI'
# Absolute directory containing this script; presumably used for locating
# bundled assets elsewhere in the file — not referenced in this chunk.
cur_dir = os.path.dirname(os.path.abspath(__file__))
@spaces.GPU()
def inference_with_gradio(chatbot, image, prompt, model_path, box=None, temperature=0.2, top_p=0.7, max_new_tokens=512):
    """Run Ferret-UI inference on an uploaded image and append the exchange to the chat.

    Args:
        chatbot: Current Gradio chat history (list of (user, assistant) tuples) or None.
        image: Filesystem path of the uploaded image (Gradio temp file).
        prompt: User's text prompt.
        model_path: Model identifier; selects the conversation template and checkpoint.
        box: Optional bounding box for referring/grounding tasks.
        temperature: Sampling temperature passed to generation.
        top_p: Nucleus-sampling threshold passed to generation.
        max_new_tokens: Generation length cap.

    Returns:
        A new chat-history list with the (prompt, response) pair appended.
    """
    # Stage the uploaded image in the working directory, where inference_and_run
    # resolves image_path relative to image_dir.
    dir_path = "./"
    filename = os.path.basename(image)
    image_path = os.path.join(dir_path, filename)
    # shutil.copy raises SameFileError if source and destination are the same
    # file; skip the copy when the upload is already in the working directory.
    if os.path.abspath(image) != os.path.abspath(image_path):
        shutil.copy(image, image_path)
    print("filename path: ", filename)

    # Pick the conversation template matching the model family.
    if "gemma" in model_path.lower():
        conv_mode = "ferret_gemma_instruct"
    else:
        conv_mode = "ferret_llama_3"

    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        # Was hard-coded to "jadechoghari/Ferret-UI-Gemma2b", silently ignoring
        # the model_path argument that already drives conv_mode selection above.
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # inference_and_run may return a list/tuple of outputs; keep the first one.
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0])

    # Append the new (user, assistant) pair without mutating the caller's list.
    new_history = chatbot.copy() if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history
def submit_chat(chatbot, text_input):
    """Gradio submit handler: pass the chat history through and clear the text box.

    The history itself is updated by inference_with_gradio; this callback
    only empties the input field after the user submits.

    Args:
        chatbot: Current chat history (returned unchanged).
        text_input: Submitted text (discarded; the textbox is reset to "").

    Returns:
        (chatbot, "") — unchanged history and an empty string for the textbox.
    """
    # Removed: unused local `response = ''` and a commented-out append that
    # would have duplicated the message pair added by inference_with_gradio.
    return chatbot, ''
def clear_chat():
    """Reset all UI controls to their initial state.

    Returns:
        Empty chat history, no image, empty prompt and model-path fields,
        and the default temperature (0.2), top_p (0.7), and
        max_new_tokens (512) values.
    """
    default_temperature = 0.2
    default_top_p = 0.7
    default_max_new_tokens = 512
    return ([], None, "", "", default_temperature, default_top_p, default_max_new_tokens)
html = f"""
{model_name}
📱 Grounded Mobile UI Understanding with Multimodal LLMs.
A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀