Spaces:

nttdataspain
/

Image-To-Text-Lora-ViT

Runtime error

App Files Files Community

Image-To-Text-Lora-ViT / app.py

D0k-tor

Update app.py

e667bf2 over 1 year ago

raw

history blame

3.22 kB

	import torch
	import re
	import gradio as gr
	from PIL import Image

	from transformers import AutoTokenizer, ViTFeatureExtractor, VisionEncoderDecoderModel
	import os

	# The oneDNN switch has to be in the environment before TensorFlow is loaded;
	# assigning it after `import tensorflow` does not affect the library that is
	# already initialised, so the assignment is done ahead of the import.
	os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

	import tensorflow as tf  # noqa: E402  (deliberately imported after the env tweak)

	# Target device for inference. The model is placed on it below, so this is
	# the single place to change if the app is moved to other hardware.
	device = 'cpu'

	# Identifier passed to every `from_pretrained` call below, so the model, the
	# tokenizer and the image preprocessing all come from the same checkpoint.
	model_id = "nttdataspain/vit-gpt2-coco-lora"

	# Load everything once at start-up so each request only pays for inference.
	model = VisionEncoderDecoderModel.from_pretrained(model_id).to(device)
	model.eval()  # inference only: make the evaluation mode explicit
	tokenizer = AutoTokenizer.from_pretrained(model_id)
	feature_extractor = ViTFeatureExtractor.from_pretrained(model_id)

	# Predict function
	def predict(image):
	    """Generate a caption for an image with the module-level model.

	    Args:
	        image: A PIL image, or ``None`` when the caller submitted no image
	            (the upload widget is optional).

	    Returns:
	        The first decoded caption with surrounding whitespace stripped, or
	        an empty string when ``image`` is ``None``.
	    """
	    # The upload widget is optional, so this can be called without an image.
	    # Return an empty caption instead of failing on ``None.convert``.
	    if image is None:
	        return ""

	    rgb_image = image.convert('RGB')
	    model.eval()
	    pixel_values = feature_extractor(images=[rgb_image], return_tensors="pt").pixel_values
	    # Keep the input tensor on the same device as the model weights.
	    pixel_values = pixel_values.to(model.device)

	    # No gradients are needed for generation.
	    with torch.no_grad():
	        generated = model.generate(
	            pixel_values,
	            max_length=16,
	            num_beams=4,
	            return_dict_in_generate=True,
	        )

	    captions = tokenizer.batch_decode(generated.sequences, skip_special_tokens=True)
	    # A single image was submitted, so only the first caption is relevant.
	    return captions[0].strip()

	# Stand-alone components kept from the original script. The Blocks layout
	# below builds its own widgets, so these two are not rendered anywhere.
	# They use the top-level component classes because the legacy `gr.inputs` /
	# `gr.outputs` namespaces are gone from current Gradio releases.
	# NOTE(review): `input` shadows the builtin; rename or drop these once it is
	# confirmed nothing else refers to the module-level names.
	input = gr.Image(label="Upload any Image", type='pil')
	output = gr.Textbox(label="Captions")

	# Every entry of the sibling "examples" directory becomes a clickable example.
	# Entries are sorted for a stable order; a missing directory means no examples
	# rather than a crash at start-up.
	examples_folder = os.path.join(os.path.dirname(__file__), "examples")
	if os.path.isdir(examples_folder):
	    examples = [
	        os.path.join(examples_folder, name)
	        for name in sorted(os.listdir(examples_folder))
	    ]
	else:
	    examples = []

	with gr.Blocks() as demo:

	    # Page header; the markup is kept verbatim.
	    gr.HTML(
	"""
	<div style="text-align: center; max-width: 1200px; margin: 20px auto;">
	<h1 style="font-weight: 900; font-size: 3rem; margin: 0rem">
	📸 ViT Image-to-Text with LORA 📝
	</h1>
	<h2 style="text-align: left; font-weight: 450; font-size: 1rem; margin-top: 2rem; margin-bottom: 1.5rem">
	In the field of large language models, the challenge of fine-tuning has long perplexed researchers. Microsoft, however, has unveiled an innovative solution called <b>Low-Rank Adaptation (LoRA)</b>. With the emergence of behemoth models like GPT-3 boasting billions of parameters, the cost of fine-tuning them for specific tasks or domains has become exorbitant.
	<br>
	<br>
	LoRA offers a groundbreaking approach by freezing the weights of pre-trained models and introducing trainable layers known as <b>rank-decomposition matrices in each transformer block</b>. This ingenious technique significantly reduces the number of trainable parameters and minimizes GPU memory requirements, as gradients no longer need to be computed for the majority of model weights.
	<br>
	<br>
	You can find more info here: <u><a href="https://www.linkedin.com/pulse/fine-tuning-image-to-text-algorithms-with-lora-daniel-puente-viejo" target="_blank">Linkedin article</a></u>
	</h2>
	</div>
	""")

	    # Two equal-width columns: upload + trigger on the left, caption on the right.
	    with gr.Row():
	        with gr.Column(scale=1):
	            img = gr.Image(label="Upload any Image", type='pil')
	            button = gr.Button(value="Describe")
	        with gr.Column(scale=1):
	            out = gr.Textbox(label="Captions")

	    button.click(predict, inputs=[img], outputs=[out])

	    # Only add the examples gallery when there is something to show.
	    if examples:
	        gr.Examples(
	            examples=examples,
	            inputs=img,
	            outputs=out,
	            fn=predict,
	            cache_examples=True,
	        )

	demo.launch(debug=True)