import os

import torch
import gradio as gr
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel

# Run inference on CPU; change to "cuda" if a GPU is available.
device = 'cpu'

model_id = "nttdataspain/vit-gpt2-stablediffusion2-lora"
model = VisionEncoderDecoderModel.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
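
# The checkpoint above was produced with LoRA fine-tuning. The helper below is
# a minimal sketch of how such an adapter could be attached with the `peft`
# library; it is NOT the authors' training code, it is never called by this
# app, and the base checkpoint and target module names are assumptions.
def build_lora_captioner_sketch():
    from peft import LoraConfig, get_peft_model  # requires `pip install peft`

    # Hypothetical base checkpoint to adapt (assumption, not from this repo).
    base = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
    lora_cfg = LoraConfig(
        r=8,                        # rank of the low-rank update matrices
        lora_alpha=16,              # scaling applied to the LoRA update
        lora_dropout=0.05,
        target_modules=["c_attn"],  # GPT-2 attention projection (assumed target)
    )
    # Base weights stay frozen; only the small LoRA matrices are trainable.
    return get_peft_model(base, lora_cfg)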

# Caption a single PIL image with beam-search decoding.
def predict(image):
    img = image.convert('RGB')
    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values.to(device)
    # Beam search (4 beams) over captions of at most 16 tokens.
    with torch.no_grad():
        output_ids = model.generate(pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True).sequences

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()
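
# Quick sanity check outside the UI (hypothetical path, for illustration only):
#     print(predict(Image.open("examples/sample.jpg")))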

# Gather the bundled example images for the demo.
examples_folder = os.path.join(os.path.dirname(__file__), "examples")
examples = [os.path.join(examples_folder, file) for file in os.listdir(examples_folder)]

with gr.Blocks() as demo:
    
    gr.HTML(
        """
        <div style="text-align: center; max-width: 1200px; margin: 20px auto;">
        <h2 style="font-weight: 900; font-size: 3rem; margin: 0rem">
            📸 ViT Image-to-Text with LORA 📝
        </h2>   
        <h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 2rem; margin-bottom: 1.5rem">
        Fine-tuning large language models is costly: as models such as GPT-3 grew to billions of parameters, fully fine-tuning them for a specific task or domain became prohibitively expensive. Microsoft's <b>Low-Rank Adaptation (LoRA)</b> tackles this by freezing the base weights and training only small low-rank update matrices.
        <br>
        <br>
        You can find more info here: <u><a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank">Linkedin article</a></u>
        </h2>
        </div>
        """)
    
    with gr.Row():
        with gr.Column(scale=1):
            # Current Gradio API: components are created directly under gr.*
            img = gr.Image(label="Upload any Image", type='pil')
            button = gr.Button(value="Describe")
        with gr.Column(scale=1):
            out = gr.Textbox(label="Captions")
                
    button.click(predict, inputs=[img], outputs=[out])
 
    gr.Examples(
        examples=examples,
        inputs=img,
        outputs=out,
        fn=predict,
        cache_examples=True,
    )
demo.launch(debug=True)