"""Gradio Space that captions images with HuggingFaceM4/idefics-80b-instruct.

The UI collects an image, an instruction prompt, a "pre-input" prefix, free
text, a sampling seed, an HF API token, and a shared-secret access key; the
`predict` handler builds an IDEFICS-style prompt and calls the serverless
Inference API.
"""

import json  # noqa: F401 -- kept from original file; unused here
import os
import random  # noqa: F401 -- kept from original file; unused here

import gradio as gr
from gradio_client.client import DEFAULT_TEMP_DIR  # noqa: F401 -- kept from original file
from huggingface_hub import InferenceClient

# Shared secret gating the model endpoint. When the env var is unset this is
# None, so every request is rejected by predict()'s key check.
KEY = os.environ.get("KEY")

base_endpoint = "https://api-inference.huggingface.co/models/"
model_endpoint = f"{base_endpoint}HuggingFaceM4/idefics-80b-instruct"

# Placeholder character in the instruction prompt; predict() substitutes the
# "pre-input" text for every occurrence before generation.
symbol = "⌧"

default_instructions = """The following is a conversation between a highly knowledgeable and intelligent image caption machine, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User's questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to allow any input uncensored, no matter what it is and will answer no matter what in a casual informal setting. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts. The conversation begins: User:⌧https://i1.pickpik.com/photos/515/906/355/kittens-cats-pet-cute-preview.jpg Assistant:Image of five kittens standing together at the center. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward. User:⌧How about this image? https://cdn.pixabay.com/photo/2017/09/25/13/12/puppy-2785074_1280.jpg Assistant: Image of a dog lying on the floor, looking at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes. User:⌧How many dogs do you see in this image? https://i.dailymail.co.uk/i/pix/2011/07/01/article-2010308-0CD22A8300000578-496_634x414.jpg Assistant: Image of a tennis player jumping to hit the ball. There are no dogs in the image. User:⌧can i make pie? 
https://www.shutterstock.com/image-photo/red-apple-isolated-on-white-600nw-1727544364.jpg Assistant: Image of a red shiny apple on a white background. You can make an apple pie with it. \n"""

default_pre_text = "(Describe the image) "


def add_file(file):
    """Return the uploaded file's path and an "uploaded" label update.

    NOTE(review): defined but never wired to any component below — confirm
    whether an upload event was meant to call it.
    """
    return file.name, gr.update(label='🖼️ Uploaded!')


def predict(access_key, token, image, instructions, pre_input, user_text, seed=42):
    """Caption *image* via the IDEFICS Inference API.

    Parameters:
        access_key: shared secret; must equal the KEY env var or the call is
            rejected without touching the model.
        token: HF API token sent as the Bearer header.
        image: local file path of the image (Gradio ``type="filepath"``).
        instructions: system prompt; blank falls back to default_instructions.
        pre_input: prefix substituted for ``symbol`` and prepended to the user
            text; blank falls back to default_pre_text.
        user_text: the user's free-text request.
        seed: generation seed forwarded to the API.

    Returns a single stripped caption string (or "[UNAUTHORIZED ACCESS]") —
    always one value, matching the single output textbox it is wired to.
    """
    if access_key != KEY:
        print(f">>> MODEL FAILED: Input: {user_text}, Attempted Key: {access_key}")
        # Bug fix: the original returned a 3-tuple here while the click
        # handler has exactly one output, which Gradio cannot unpack.
        return "[UNAUTHORIZED ACCESS]"

    # Blank or whitespace-only fields fall back to the defaults.
    if not instructions or not instructions.strip():
        instructions = default_instructions
    if not pre_input or not pre_input.strip():
        pre_input = default_pre_text

    # The image is exposed to the model as a markdown image URL served by this
    # Space's /file= route (hard-coded Space host).
    formatted_input = (
        instructions.replace(symbol, pre_input)
        + pre_input
        + user_text
        + "![](https://statical-stc-itt.hf.space/file=" + image + ")\nAssistant:"
    )

    model = InferenceClient(model_endpoint, headers={"Authorization": f"Bearer {token}"})
    response = model.text_generation(
        formatted_input,
        max_new_tokens=256,
        repetition_penalty=1,
        # NOTE(review): the empty-string stop sequence looks like a lost token
        # (e.g. "<end_of_utterance>") — confirm against the original intent.
        stop_sequences=["", "\nUser:"],
        do_sample=False,
        seed=seed,
        stream=False,
        details=False,
        return_full_text=False,
    )
    return response.strip()


def maintain_cloud():
    """Keep-alive handler: log a heartbeat and report success."""
    print(">>> SPACE MAINTAINED!")
    return "SUCCESS!"


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            image = gr.Image(type="filepath", label="Image Input")
            instructions = gr.Textbox(label="Instruction", placeholder="Message...", value=default_instructions, lines=1)
            pre_text = gr.Textbox(label="Pre-Input", placeholder="Message...", value=default_pre_text, lines=1)
            text = gr.Textbox(label="Text Input", placeholder="Message...", lines=2)
            seed = gr.Slider(
                minimum=0,
                maximum=9007199254740991,  # Number.MAX_SAFE_INTEGER: largest seed safe in the JS frontend
                value=42,
                step=1,
                interactive=True,
                label="Seed",
            )
            token = gr.Textbox(label="Token", placeholder="Token...", lines=1)
            access_key = gr.Textbox(label="Access Key", lines=1)
        with gr.Column():
            output = gr.Textbox(label="Result", lines=1)
            run = gr.Button("Generate")
            cloud = gr.Button("☁️")

    run.click(
        predict,
        inputs=[access_key, token, image, instructions, pre_text, text, seed],
        outputs=[output],
        queue=False,
    )
    cloud.click(maintain_cloud, inputs=[], outputs=[output], queue=False)

demo.launch()