Spaces:
Runtime error
Runtime error
File size: 2,804 Bytes
355d287 a1fde91 2baef79 355d287 1e1d66d 84f3f84 355d287 f595b41 c8f3b13 f595b41 79fef48 751acdb f595b41 a1fde91 917196e 7a63258 176961e 9a282a8 176961e fcd7ed3 9f60554 fcd7ed3 87e930a 7b99df8 ff94223 9f60554 dff288f 176961e 7a63258 9a282a8 179fa33 54f8b7a 9a282a8 179fa33 54f8b7a 179fa33 ff94223 65c6075 7a63258 65c6075 01ce9b4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 |
import torch
import re
import gradio as gr
from PIL import Image
from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
import os

# Disable TensorFlow's oneDNN custom ops. The variable is set BEFORE the
# TensorFlow import so it is already present in the environment when the
# library initializes; setting it afterwards may come too late to apply.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

import tensorflow as tf  # noqa: E402  (import deliberately follows the env setup)
# Checkpoint identifier shared by the model, tokenizer and feature extractor.
model_id = "nttdataspain/vit-gpt2-stablediffusion2-lora"

# Device label; NOTE(review): not referenced anywhere else in the visible code.
device = 'cpu'

# Pre-processing / post-processing helpers loaded from the same checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Encoder-decoder model used by predict() to generate caption token ids.
model = VisionEncoderDecoderModel.from_pretrained(model_id)
# Predict function
def predict(image):
    """Generate a short text caption for an image.

    Args:
        image: A PIL image (the UI's image component is configured with
            ``type='pil'``), or ``None`` when the callback is triggered
            without an uploaded image.

    Returns:
        The first decoded caption with surrounding whitespace stripped, or an
        empty string when no image was provided.
    """
    # The upload is optional in the UI, so the callback may be invoked with
    # no image at all; return an empty caption instead of failing on None.
    if image is None:
        return ""

    img = image.convert('RGB')
    model.eval()
    pixel_values = feature_extractor(images=[img], return_tensors="pt").pixel_values

    # Inference only: no gradients are needed while generating token ids.
    with torch.no_grad():
        output_ids = model.generate(
            pixel_values,
            max_length=16,
            num_beams=4,
            return_dict_in_generate=True,
        ).sequences

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    # A single image was submitted, so only the first caption is relevant.
    return preds[0].strip()
# Standalone component handles. They are not referenced again in the visible
# file and are kept only so the module still exposes the same names.
# The top-level component classes are used because the legacy `gr.inputs` /
# `gr.outputs` namespaces are deprecated in Gradio 3 and absent from Gradio 4.
input = gr.Image(label="Upload any Image", type='pil')
output = gr.Textbox(label="Captions")

# Example images live in an "examples" folder next to this script.
examples_folder = os.path.join(os.path.dirname(__file__), "examples")
# os.listdir returns entries in arbitrary order; sort for a stable example list.
examples = [
    os.path.join(examples_folder, file)
    for file in sorted(os.listdir(examples_folder))
]
# Build the UI: an HTML header, an image upload with a "Describe" button on
# the left, the caption textbox on the right, and a gallery of example images.
with gr.Blocks() as demo:
    gr.HTML(
        """
<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
<h2 style="font-weight: 900; font-size: 3rem; margin: 0rem">
📸 ViT Image-to-Text with LORA 📝
</h2>
<h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 2rem; margin-bottom: 1.5rem">
In the field of large language models, the challenge of fine-tuning has long perplexed researchers. Microsoft, however, has unveiled an innovative solution called <b>Low-Rank Adaptation (LoRA)</b>. With the emergence of behemoth models like GPT-3 boasting billions of parameters, the cost of fine-tuning them for specific tasks or domains has become exorbitant.
<br>
<br>
You can find more info here: <u><a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank">Linkedin article</a></u>
</h2>
</div>
""")
    with gr.Row():
        with gr.Column(scale=1):
            # Top-level component classes are used here: the legacy
            # `gr.inputs` / `gr.outputs` namespaces are deprecated in
            # Gradio 3 and no longer exist in Gradio 4.
            img = gr.Image(label="Upload any Image", type='pil')
            button = gr.Button(value="Describe")
        with gr.Column(scale=1):
            out = gr.Textbox(label="Captions")

    # Clicking the button captions the uploaded image.
    button.click(predict, inputs=[img], outputs=[out])

    # cache_examples=True asks Gradio to pre-compute the example outputs
    # with `predict` rather than running it on every example click.
    gr.Examples(
        examples=examples,
        inputs=img,
        outputs=out,
        fn=predict,
        cache_examples=True,
    )

demo.launch(debug=True)