"""Gradio demo app for Ferret-UI: chat about a mobile UI screenshot."""

import os
import re
import shutil
from typing import Optional

import gradio as gr
import spaces

from inference import inference_and_run

model_name = 'Ferret-UI'
cur_dir = os.path.dirname(os.path.abspath(__file__))

# Single source of truth for the defaults shared by the inference function,
# the UI widgets and the "Clear" handler.
DEFAULT_MODEL_PATH = "jadechoghari/Ferret-UI-Gemma2b"
DEFAULT_TEMPERATURE = 0.2
DEFAULT_TOP_P = 0.7
DEFAULT_MAX_NEW_TOKENS = 512


@spaces.GPU()
def inference_with_gradio(
    chatbot: Optional[list],
    image: Optional[str],
    prompt: str,
    model_path: str,
    box: Optional[str] = None,
    temperature: float = DEFAULT_TEMPERATURE,
    top_p: float = DEFAULT_TOP_P,
    max_new_tokens: int = DEFAULT_MAX_NEW_TOKENS,
) -> list:
    """Run Ferret-UI on the uploaded image and append the reply to the chat.

    Args:
        chatbot: Current chat history as a list of (prompt, reply) pairs;
            may be None or empty.
        image: File path of the uploaded image (the image widget is
            configured with ``type="filepath"``).
        prompt: User prompt forwarded to the model.
        model_path: Model identifier chosen in the dropdown. It selects the
            conversation template and is forwarded to ``inference_and_run``.
        box: Optional bounding-box text; blank input is treated as None.
        temperature: Sampling temperature forwarded to ``inference_and_run``.
        top_p: Nucleus-sampling value forwarded to ``inference_and_run``.
        max_new_tokens: Generation length limit forwarded to
            ``inference_and_run``.

    Returns:
        A new history list with ``(prompt, reply)`` appended; the input
        history is not mutated.

    Raises:
        gr.Error: If no image was provided.
    """
    if not image:
        # Without this guard os.path.basename(None) fails with an opaque
        # TypeError; gr.Error surfaces a readable message in the UI instead.
        raise gr.Error("Please upload an image before sending a prompt.")

    # inference_and_run is called with a bare file name plus its directory,
    # so the upload is first copied into the current working directory.
    filename = os.path.basename(image)
    dir_path = "./"
    image_path = os.path.join(dir_path, filename)
    try:
        shutil.copy(image, image_path)
    except shutil.SameFileError:
        # The image already lives in the working directory; nothing to copy.
        pass
    print("filename path: ", filename)

    if "gemma" in model_path.lower():
        conv_mode = "ferret_gemma_instruct"
    else:
        conv_mode = "ferret_llama_3"

    # An untouched textbox yields "" rather than None; normalise blank input
    # so "no box" is always represented the same way.
    if isinstance(box, str):
        box = box.strip() or None

    inference_text = inference_and_run(
        image_path=filename,
        image_dir=dir_path,
        prompt=prompt,
        # Use the selected model: it must agree with the conv_mode derived
        # from the same model_path above.
        model_path=model_path,
        conv_mode=conv_mode,
        temperature=temperature,
        top_p=top_p,
        box=box,
        max_new_tokens=max_new_tokens,
    )

    # A list/tuple result is reduced to its first element (empty -> "").
    if isinstance(inference_text, (list, tuple)):
        inference_text = str(inference_text[0]) if inference_text else ""

    # Build a fresh history so the caller's list is left untouched.
    new_history = list(chatbot) if chatbot else []
    new_history.append((prompt, inference_text))
    return new_history


def submit_chat(chatbot, text_input):
    """Return the chat history unchanged and an empty string for the prompt box."""
    return chatbot, ''


def clear_chat():
    """Return reset values for the chat, image, prompt, box and the sliders.

    The order matches the ``outputs`` list wired to the "Clear" button.
    """
    return (
        [],
        None,
        "",
        "",
        DEFAULT_TEMPERATURE,
        DEFAULT_TOP_P,
        DEFAULT_MAX_NEW_TOKENS,
    )


html = f"""

 {model_name}

📱 Grounded Mobile UI Understanding with Multimodal LLMs.
A new MLLM tailored for enhanced understanding of mobile UI screens, equipped with referring, grounding, and reasoning capabilities.

We release two Ferret-UI checkpoints, built on gemma-2b and Llama-3-8B models respectively, for public exploration. 🚀

"""

latex_delimiters_set = [
    {
        "left": "\\(",
        "right": "\\)",
        "display": False,
    },
    {
        "left": "\\begin{equation}",
        "right": "\\end{equation}",
        "display": True,
    },
    {
        "left": "\\begin{align}",
        "right": "\\end{align}",
        "display": True,
    },
]

# Set up UI components. They are created here and placed into the layout
# below via .render().
image_input = gr.Image(label="Upload Image", type="filepath", height=350)
text_input = gr.Textbox(
    lines=2,
    placeholder="Enter your prompt here...",
    label="Prompt",
)
model_dropdown = gr.Dropdown(
    choices=[
        "jadechoghari/Ferret-UI-Gemma2b",
        "jadechoghari/Ferret-UI-Llama8b",
    ],
    label="Model Path",
    value=DEFAULT_MODEL_PATH,
)
bounding_box_input = gr.Textbox(
    placeholder="Optional bounding box (x1, y1, x2, y2)",
    label="Bounding Box (optional)",
)

# Sliders for temperature, top_p, and max_new_tokens
temperature_input = gr.Slider(
    minimum=0.1,
    maximum=2.0,
    step=0.1,
    value=DEFAULT_TEMPERATURE,
    label="Temperature",
)
top_p_input = gr.Slider(
    minimum=0.0,
    maximum=1.0,
    step=0.05,
    value=DEFAULT_TOP_P,
    label="Top P",
)
max_new_tokens_input = gr.Slider(
    minimum=1,
    maximum=1024,
    step=1,
    value=DEFAULT_MAX_NEW_TOKENS,
    label="Max New Tokens",
)

chatbot = gr.Chatbot(
    label="Chat with Ferret-UI",
    height=400,
    show_copy_button=True,
    latex_delimiters=latex_delimiters_set,
    type="tuples",
)

# NOTE(review): the layout nesting below was reconstructed from a
# whitespace-collapsed source -- confirm that the examples belong in the left
# column and the button row under the chatbot.
with gr.Blocks(title=model_name, theme=gr.themes.Ocean()) as demo:
    gr.HTML(html)
    with gr.Row():
        with gr.Column(scale=3):
            image_input.render()
            text_input.render()
            model_dropdown.render()
            bounding_box_input.render()
            temperature_input.render()
            top_p_input.render()
            max_new_tokens_input.render()
            gr.Examples(
                examples=[
                    ["appstore_reminders.png", "Describe the image in details", "jadechoghari/Ferret-UI-Gemma2b", None],
                    ["appstore_reminders.png", "What's inside the selected region?", "jadechoghari/Ferret-UI-Gemma2b", "189, 906, 404, 970"],
                    ["appstore_reminders.png", "Where is the Game Tab?", "jadechoghari/Ferret-UI-Gemma2b", None],
                ],
                inputs=[image_input, text_input, model_dropdown, bounding_box_input],
            )
        with gr.Column(scale=7):
            chatbot.render()
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear", variant="secondary")

    # Both the "Send" button and pressing Enter in the prompt box run the
    # same pipeline: inference first, then submit_chat to empty the prompt.
    inference_inputs = [
        chatbot,
        image_input,
        text_input,
        model_dropdown,
        bounding_box_input,
        temperature_input,
        top_p_input,
        max_new_tokens_input,
    ]

    send_click_event = send_btn.click(
        inference_with_gradio,
        inference_inputs,
        chatbot,
    ).then(submit_chat, [chatbot, text_input], [chatbot, text_input])

    submit_event = text_input.submit(
        inference_with_gradio,
        inference_inputs,
        chatbot,
    ).then(submit_chat, [chatbot, text_input], [chatbot, text_input])

    clear_btn.click(
        clear_chat,
        outputs=[
            chatbot,
            image_input,
            text_input,
            bounding_box_input,
            temperature_input,
            top_p_input,
            max_new_tokens_input,
        ],
    )

demo.launch()