# ferret-demo / app.py
# Last commit ee59173 by jadechoghari: "remove users images and update readme"
import gradio as gr
from inference import inference_and_run
import spaces
import os
import shutil
from PIL import Image
from gradio_image_prompter import ImagePrompter
# Display name used for the page title and the header banner.
model_name = "Ferret-UI"
# Absolute path of the directory that contains this script.
cur_dir = os.path.split(os.path.abspath(__file__))[0]
def _extract_box(image_data):
    """Return the first drawn prompt as an "x1, y1, x2, y2" string, or None.

    Only the first entry of ``image_data["points"]`` is used. Values at
    indices 0, 1, 3 and 4 are taken as the box corners; the remaining
    positions are ignored (presumably label markers emitted by the
    ImagePrompter component -- confirm against its documentation).
    """
    prompts = image_data.get("points")
    if not prompts:
        return None
    first = prompts[0]
    # A malformed (too short) entry is treated as "no box" instead of
    # failing with an IndexError.
    if len(first) < 5:
        return None
    return f"{first[0]}, {first[1]}, {first[3]}, {first[4]}"


# `spaces.GPU` comes from the Hugging Face `spaces` package and wraps the
# handler so that it runs with a GPU attached.
@spaces.GPU()
def inference_with_gradio(chatbot, image_data, prompt, model_path, temperature=0.2, top_p=0.7, max_new_tokens=512):
    """Run one Ferret-UI turn and return the updated chat history.

    Args:
        chatbot: Current history as a list of ``(prompt, answer)`` pairs,
            or a falsy value when the conversation is empty.
        image_data: Value of the image prompter: a dict with an ``"image"``
            entry (array accepted by ``Image.fromarray``) and an optional
            ``"points"`` list describing the drawn region.
        prompt: Text of the user's question.
        model_path: Model identifier; a path containing "gemma"
            (case-insensitive) selects the Gemma conversation template,
            anything else selects the Llama-3 one.
        temperature: Forwarded unchanged to ``inference_and_run``.
        top_p: Forwarded unchanged to ``inference_and_run``.
        max_new_tokens: Forwarded unchanged to ``inference_and_run``.

    Returns:
        A new list: the previous history plus ``(prompt, answer)``.

    Raises:
        gr.Error: If no image was provided.
    """
    # The component value can be missing entirely, or be a dict whose image
    # slot is empty; both mean "nothing uploaded".
    if image_data is None or image_data.get("image") is None:
        raise gr.Error("Please upload an image and draw a bounding box if needed.")

    box = _extract_box(image_data)

    # Convert the array to a PIL image and write it where the inference
    # helper will look for it.
    # NOTE(review): every request writes to the same ./temp_image.png, so
    # concurrent requests can overwrite each other's upload. A per-request
    # temporary file would be safer, but needs confirming how
    # inference_and_run resolves image_dir / image_path first.
    filename = "temp_image.png"
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    Image.fromarray(image_data["image"]).save(image_path)

    conv_mode = "ferret_gemma_instruct" if "gemma" in model_path.lower() else "ferret_llama_3"

    print("the box: ", box)

    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # The helper may hand back a sequence; show its first element, or an
    # empty answer when the sequence is empty.
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0]) if inference_text else ""

    # Build a fresh list so the caller's history object is never mutated.
    new_history = list(chatbot) if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history
def submit_chat(chatbot, text_input):
    """Pass the chat history through unchanged and blank the prompt box.

    ``text_input`` is accepted because it is wired as an event input, but
    its value is not used.
    """
    cleared_prompt = ''
    return chatbot, cleared_prompt
def clear_chat():
    """Return reset values for the chat, image, prompt and the three sliders.

    The tuple order is: history, image, prompt text, temperature, top-p,
    max new tokens.
    """
    empty_history = []
    no_image = None
    blank_prompt = ""
    return empty_history, no_image, blank_prompt, 0.2, 0.7, 512
# Header banner shown at the top of the page. The escape \uf8ff below is the
# private-use code point that Apple platforms render as the Apple logo; it and
# the emoji were previously stored as mis-decoded (mojibake) byte sequences.
html = f"""
<div style="text-align: center; padding: 20px;">
<div style="display: inline-block; background-color: #f5f5f7; padding: 20px; border-radius: 20px; box-shadow: 0px 6px 20px rgba(0, 0, 0, 0.1);">
<div style="display: flex; align-items: center;">
<img src='https://github.com/apple/ml-ferret/blob/main/ferretui/figs/ferretui_icon.png?raw=true' alt='Ferret-UI'
style='width: 80px; height: 80px; border-radius: 20px; box-shadow: 0px 8px 16px rgba(0, 0, 0, 0.2);'/>
<div style="margin-left: 15px;">
<h1 style="font-size: 2.8em; font-family: -apple-system, BlinkMacSystemFont, sans-serif; color: #1D1D1F;
font-weight: bold; margin-bottom: 0;">\uf8ff {model_name}</h1>
<p style="font-size: 1.2em; color: #6e6e73; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 5px;">
📱 Grounded Mobile UI Understanding with Multimodal LLMs.<br>
A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
</p>
<a href='https://huggingface.co/jadechoghari/Ferret-UI-Gemma2b' style='text-decoration: none;'>
<button style="background-color: #007aff; color: white; font-size: 1.2em; padding: 10px 20px; border-radius: 10px; border: none; margin-top: 10px; box-shadow: 0px 4px 12px rgba(0, 122, 255, 0.4); cursor: pointer;">
🤗 Try on Hugging Face
</button>
</a>
</div>
</div>
</div>
<p style="font-size: 1.2em; color: #86868B; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 30px;">
We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀
</p>
</div>
"""
# Delimiter pairs handed to the chatbot for LaTeX rendering: one inline form
# ("display": False) and two block-level environments ("display": True).
latex_delimiters_set = [
    {"left": "\\(", "right": "\\)", "display": False},
    {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
    {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
]
# Page layout: banner on top, then a row with the inputs and generation
# settings in the narrower column (scale 3) and the conversation plus its
# action buttons in the wider column (scale 7).
with gr.Blocks(title=model_name) as demo:
    gr.HTML(html)

    with gr.Row():
        with gr.Column(scale=3):
            # Image input that also captures a user-drawn bounding box.
            image_input = ImagePrompter(label="Upload Image & Draw Bounding Box")
            text_input = gr.Textbox(
                lines=2,
                placeholder="Enter your prompt here...",
                label="Prompt",
            )
            model_dropdown = gr.Dropdown(
                choices=[
                    "jadechoghari/Ferret-UI-Gemma2b",
                    "jadechoghari/Ferret-UI-Llama8b",
                ],
                label="Model Path",
                value="jadechoghari/Ferret-UI-Gemma2b",
            )
            temperature_input = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                step=0.1,
                value=0.2,
                label="Temperature",
            )
            top_p_input = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.7,
                label="Top P",
            )
            max_new_tokens_input = gr.Slider(
                minimum=1,
                maximum=1024,
                step=1,
                value=512,
                label="Max New Tokens",
            )
            gr.Examples(
                examples=[
                    [{"image": "appstore_reminders.png"}, "Describe the contents inside the box"],
                    [{"image": "appstore_reminders.png"}, "What is the text shown inside the highlighted area"],
                ],
                inputs=[image_input, text_input],
                label="Try these examples",
            )

        with gr.Column(scale=7):
            chatbot = gr.Chatbot(
                label="Chat with Ferret-UI",
                height=400,
                show_copy_button=True,
                latex_delimiters=latex_delimiters_set,
                type="tuples",
            )
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Pressing "Send" and submitting the prompt box trigger the same chain:
    # run inference into the chatbot, then clear the prompt box.
    send_click_event = send_btn.click(
        fn=inference_with_gradio,
        inputs=[chatbot, image_input, text_input, model_dropdown, temperature_input, top_p_input, max_new_tokens_input],
        outputs=chatbot,
    ).then(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    )

    submit_event = text_input.submit(
        fn=inference_with_gradio,
        inputs=[chatbot, image_input, text_input, model_dropdown, temperature_input, top_p_input, max_new_tokens_input],
        outputs=chatbot,
    ).then(
        fn=submit_chat,
        inputs=[chatbot, text_input],
        outputs=[chatbot, text_input],
    )

    # "Clear" resets the conversation, the inputs and the three sliders.
    clear_btn.click(
        fn=clear_chat,
        outputs=[chatbot, image_input, text_input, temperature_input, top_p_input, max_new_tokens_input],
    )

demo.launch()