") with gr.Tab(label="SD3 Llava Llama3 Captioner"): with gr.Row(): with gr.Column(): input_img = gr.Image(label="Input Picture") submit_btn = gr.Button(value="Submit") output = gr.Text(label="Caption") gr.Examples( [["image1.jpg"], ["image2.jpg"], ["image3.png"]], inputs = [input_img], outputs = [output], fn=create_captions_llava_llama3_docci, label='Try captioning on examples' ) submit_btn.click(create_captions_llava_llama3_docci, [input_img], [output]) demo.launch(debug=True)

from lmdeploy import pipeline, GenerationConfig, TurbomindEngineConfig
from lmdeploy.vl import load_image
import spaces
import gradio as gr
from PIL import Image
import numpy as np

# Instruction sent to the model together with every image. Kept as a single
# literal so the exact wording the model receives is easy to audit.
_CAPTION_PROMPT = 'As an AI image annotation expert, please provide accurate annotations for the image to enhance the T5 model understanding of the content. Accurately describe images and images in the form of natural language. Your description should include key elements such as the actions, clothing, hairstyle, facial expressions, environment, dressing style, etc. of the characters in the image, as well as background content and any other important information. If the image has a distinct special style or filter, it needs to be described, otherwise it is not necessary. Your description should be accurate and accurate, only describing the actual content of the image, without describing abstract feelings such as atmosphere or quality, and should not exceed three sentences. These descriptions will be used for image reconstruction, so the closer the similarity to the original image, the better the label quality. Special tags will receive a reward of $10 per image.'


@spaces.GPU
def create_captions_llava_llama3_docci(image):
    """Caption an image with the 'Lin-Chen/open-llava-next-llama3-8b' pipeline.

    Args:
        image: Pixel data supplied by the ``gr.Image`` input wired to this
            callback. It is cast with ``np.uint8`` and wrapped in a PIL image,
            so anything ``np.uint8`` accepts works. ``None`` means no image
            was provided.

    Returns:
        The ``text`` attribute of the pipeline response.

    Raises:
        gr.Error: If ``image`` is ``None``. Raised before any model work so
            the user gets a readable message instead of a ``TypeError`` from
            the array conversion.
    """
    if image is None:
        raise gr.Error("Please provide an image before requesting a caption.")

    # NOTE(review): the pipeline is rebuilt on every call, which reloads the
    # model each time. This may be deliberate (the function runs under
    # ``spaces.GPU``); confirm before caching it at module level.
    pipe = pipeline('Lin-Chen/open-llava-next-llama3-8b')
    gen_config = GenerationConfig(repetition_penalty=1.10)

    # Normalise to an 8-bit RGB PIL image (drops alpha / expands grayscale).
    rgb_image = Image.fromarray(np.uint8(image)).convert('RGB')

    response = pipe((_CAPTION_PROMPT, rgb_image), gen_config=gen_config)
    return response.text

# Custom stylesheet handed to ``gr.Blocks(css=css)`` below. It turns the
# element with id "mkd" into a 500px-high, scrollable, bordered box.
# NOTE(review): no component in the visible file sets elem_id="mkd", so this
# rule looks unused here — confirm before removing it.
css = """
  #mkd {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Build the single-tab captioning UI: an image input, a submit button and a
# text output, plus clickable example images, all routed to
# ``create_captions_llava_llama3_docci``.
with gr.Blocks(css=css) as demo:
    # The closing tags were previously written as opening tags
    # ("<center><h1>"), leaving the heading markup unbalanced.
    # NOTE(review): the heading names xtuner/llava-llama-3-8b-v1_1 while the
    # callback loads 'Lin-Chen/open-llava-next-llama3-8b' — confirm which
    # model the text should mention.
    gr.HTML("<h1><center>Fine tuned version of xtuner/llava-llama-3-8b-v1_1 on google/docci dataset.</center></h1>")

    with gr.Tab(label="SD3 Llava Llama3 Captioner"):
        with gr.Row():
            with gr.Column():
                input_img = gr.Image(label="Input Picture")
                submit_btn = gr.Button(value="Submit")
                output = gr.Text(label="Caption")

        # Example images are referenced by relative path and must exist next
        # to this script.
        gr.Examples(
            [["image1.jpg"], ["image2.jpg"], ["image3.png"]],
            inputs=[input_img],
            outputs=[output],
            fn=create_captions_llava_llama3_docci,
            label='Try captioning on examples',
        )

        submit_btn.click(create_captions_llava_llama3_docci, [input_img], [output])


# Start the server immediately when the module is executed.
demo.launch(debug=True)