# Spaces: Running on Zero
import gradio as gr | |
from inference import inference_and_run | |
import spaces | |
import os | |
import shutil | |
from PIL import Image | |
from gradio_image_prompter import ImagePrompter | |
# Display name of the demo; used as the page title and in the HTML banner.
model_name = "Ferret-UI"

# Absolute path of the directory that contains this script.
cur_dir = os.path.split(os.path.abspath(__file__))[0]
def inference_with_gradio(
    chatbot,
    image_data,
    prompt,
    model_path,
    temperature=0.2,
    top_p=0.7,
    max_new_tokens=512,
):
    """Run one inference turn and append the answer to the chat history.

    Args:
        chatbot: Current chat history, a list of ``(prompt, reply)`` tuples.
            A falsy value is treated as an empty history.
        image_data: Value of the image prompter component: a dict with an
            ``"image"`` entry and, optionally, a ``"points"`` list.
        prompt: Prompt text sent to the model.
        model_path: Model identifier. If it contains ``"gemma"``
            (case-insensitive) the Gemma conversation template is selected,
            otherwise the Llama-3 one.
        temperature: Forwarded unchanged to ``inference_and_run``.
        top_p: Forwarded unchanged to ``inference_and_run``.
        max_new_tokens: Forwarded unchanged to ``inference_and_run``.

    Returns:
        A new history list with ``(prompt, reply)`` appended. The list passed
        in as ``chatbot`` is not mutated.

    Raises:
        gr.Error: If no image data was supplied, or the supplied data holds
            no image.
    """
    upload_hint = "Please upload an image and draw a bounding box if needed."
    if image_data is None:
        raise gr.Error(upload_hint)

    # The component can hand over a dict without an actual image; report that
    # to the user instead of failing later inside Image.fromarray.
    image = image_data.get("image")
    if image is None:
        raise gr.Error(upload_hint)

    # Only the first drawn entry is used. Elements 0, 1 and 3, 4 are taken as
    # x1, y1, x2, y2; element 2 is skipped (presumably a label marker in the
    # image prompter's point format -- confirm against its documentation).
    box = None
    drawn = image_data.get("points")
    if drawn:
        points = drawn[0]
        if len(points) >= 5:  # guard against short entries before indexing
            box = f"{points[0]}, {points[1]}, {points[3]}, {points[4]}"

    # Assumes `image` is a numpy array, as Image.fromarray requires.
    pil_image = Image.fromarray(image)

    # NOTE(review): this fixed path is shared by every request, so concurrent
    # calls overwrite each other's image. It is kept as-is because the way
    # inference_and_run resolves image_path / image_dir is not visible here.
    filename = "temp_image.png"
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    pil_image.save(image_path)

    if "gemma" in model_path.lower():
        conv_mode = "ferret_gemma_instruct"
    else:
        conv_mode = "ferret_llama_3"

    print("the box: ", box)

    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # A sequence result is reduced to its first element; an empty sequence
    # becomes an empty reply rather than raising IndexError.
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0]) if inference_text else ""

    new_history = list(chatbot) if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history
def submit_chat(chatbot, text_input):
    """Return the chat history untouched, paired with an empty string.

    Chained after each inference call with outputs ``[chatbot, text_input]``,
    so the empty string replaces the prompt box content. ``text_input`` is
    accepted to match the event's inputs but is not read.
    """
    cleared_prompt = ''
    return chatbot, cleared_prompt
def clear_chat():
    """Return the reset values for the Clear button's six outputs.

    In order: empty chat history, no image, empty prompt, temperature 0.2,
    top-p 0.7 and max new tokens 512.
    """
    empty_history = []
    no_image = None
    blank_prompt = ""
    return (empty_history, no_image, blank_prompt, 0.2, 0.7, 512)
# Symbols shown in the banner. They are written as escapes so this file stays
# pure ASCII and the characters cannot be garbled by a wrong-encoding round
# trip. The earlier text held mojibake (UTF-8 bytes decoded as ISO-8859-7);
# the first three symbols are recovered from those bytes.
_APPLE_LOGO = "\uf8ff"  # private-use code point U+F8FF
_PHONE_EMOJI = "\U0001F4F1"  # mobile phone
_HUGGING_FACE_EMOJI = "\U0001F917"  # hugging face
# NOTE(review): the bytes of the last symbol were lost; a rocket is a best
# guess -- confirm against the intended banner text.
_ROCKET_EMOJI = "\U0001F680"

# Banner rendered at the top of the page via gr.HTML.
html = f"""
<div style="text-align: center; padding: 20px;">
  <div style="display: inline-block; background-color: #f5f5f7; padding: 20px; border-radius: 20px; box-shadow: 0px 6px 20px rgba(0, 0, 0, 0.1);">
    <div style="display: flex; align-items: center;">
      <img src='https://github.com/apple/ml-ferret/blob/main/ferretui/figs/ferretui_icon.png?raw=true' alt='Ferret-UI'
           style='width: 80px; height: 80px; border-radius: 20px; box-shadow: 0px 8px 16px rgba(0, 0, 0, 0.2);'/>
      <div style="margin-left: 15px;">
        <h1 style="font-size: 2.8em; font-family: -apple-system, BlinkMacSystemFont, sans-serif; color: #1D1D1F;
                   font-weight: bold; margin-bottom: 0;">{_APPLE_LOGO} {model_name}</h1>
        <p style="font-size: 1.2em; color: #6e6e73; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 5px;">
          {_PHONE_EMOJI} Grounded Mobile UI Understanding with Multimodal LLMs.<br>
          A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.
        </p>
        <a href='https://huggingface.co/jadechoghari/Ferret-UI-Gemma2b' style='text-decoration: none;'>
          <button style="background-color: #007aff; color: white; font-size: 1.2em; padding: 10px 20px; border-radius: 10px; border: none; margin-top: 10px; box-shadow: 0px 4px 12px rgba(0, 122, 255, 0.4); cursor: pointer;">
            {_HUGGING_FACE_EMOJI} Try on Hugging Face
          </button>
        </a>
      </div>
    </div>
  </div>
  <p style="font-size: 1.2em; color: #86868B; font-family: -apple-system, BlinkMacSystemFont, sans-serif; margin-top: 30px;">
    We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. {_ROCKET_EMOJI}
  </p>
</div>
"""
# (left, right, display) triples for the LaTeX delimiters handed to the
# chatbot component's `latex_delimiters` argument.
_LATEX_DELIMITER_SPECS = (
    ("\\(", "\\)", False),
    ("\\begin{equation}", "\\end{equation}", True),
    ("\\begin{align}", "\\end{align}", True),
)

latex_delimiters_set = [
    {"left": left, "right": right, "display": display}
    for left, right, display in _LATEX_DELIMITER_SPECS
]
# Page layout: banner on top, then an input column next to a chat column.
with gr.Blocks(title=model_name) as demo:
    gr.HTML(html)

    with gr.Row():
        # Input column: image with optional box, prompt, model and sampling
        # settings, followed by clickable examples.
        with gr.Column(scale=3):
            image_input = ImagePrompter(label="Upload Image & Draw Bounding Box")
            text_input = gr.Textbox(
                lines=2,
                placeholder="Enter your prompt here...",
                label="Prompt",
            )

            model_choices = [
                "jadechoghari/Ferret-UI-Gemma2b",
                "jadechoghari/Ferret-UI-Llama8b",
            ]
            model_dropdown = gr.Dropdown(
                choices=model_choices,
                label="Model Path",
                value=model_choices[0],
            )

            temperature_input = gr.Slider(
                minimum=0.1, maximum=2.0, step=0.1, value=0.2, label="Temperature"
            )
            top_p_input = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.05, value=0.7, label="Top P"
            )
            max_new_tokens_input = gr.Slider(
                minimum=1, maximum=1024, step=1, value=512, label="Max New Tokens"
            )

            # Both examples use the same image with a different prompt.
            example_prompts = (
                "Describe the contents inside the box",
                "What is the text shown inside the highlighted area",
            )
            gr.Examples(
                examples=[
                    [{"image": "appstore_reminders.png"}, example_prompt]
                    for example_prompt in example_prompts
                ],
                inputs=[image_input, text_input],
                label="Try these examples",
            )

        # Chat column: conversation view with the Send / Clear buttons.
        with gr.Column(scale=7):
            chatbot = gr.Chatbot(
                label="Chat with Ferret-UI",
                height=400,
                show_copy_button=True,
                latex_delimiters=latex_delimiters_set,
                type="tuples",
            )
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # The Send button and pressing Enter in the prompt box trigger the same
    # two steps: run inference, then clear the prompt box.
    inference_inputs = [
        chatbot,
        image_input,
        text_input,
        model_dropdown,
        temperature_input,
        top_p_input,
        max_new_tokens_input,
    ]
    follow_up_io = [chatbot, text_input]

    send_click_event = send_btn.click(
        inference_with_gradio,
        inference_inputs,
        chatbot,
    ).then(
        submit_chat,
        follow_up_io,
        follow_up_io,
    )

    submit_event = text_input.submit(
        inference_with_gradio,
        inference_inputs,
        chatbot,
    ).then(
        submit_chat,
        follow_up_io,
        follow_up_io,
    )

    # Clear resets the history, image, prompt and the three sliders.
    clear_btn.click(
        clear_chat,
        outputs=[
            chatbot,
            image_input,
            text_input,
            temperature_input,
            top_p_input,
            max_new_tokens_input,
        ],
    )

demo.launch()