import gradio as gr
from sentence_transformers import SentenceTransformer
from transformers import AutoProcessor, AutoModelForVision2Seq
from PIL import Image
import torch
import numpy as np

# Load the vision-language model in full precision (float32) on CPU;
# "eager" attention avoids any flash-attention dependency
model = AutoModelForVision2Seq.from_pretrained(
    "./SmolVLM-500M-Instruct",
    torch_dtype=torch.float32,
    _attn_implementation="eager",
    device_map="cpu",
)
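# If a GPU were available, loading in half precision would be a common
# alternative (an untested assumption here, since this Space runs on CPU):
# model = AutoModelForVision2Seq.from_pretrained(
#     "./SmolVLM-500M-Instruct", torch_dtype=torch.float16, device_map="auto"
# )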
processor = AutoProcessor.from_pretrained("./SmolVLM-500M-Instruct")

def array_to_image(image_array):
    if image_array is None:
        raise ValueError("No image provided. Please upload an image before submitting.")
    # Convert the numpy array from gr.Image into an RGB PIL Image
    image = Image.fromarray(np.uint8(image_array)).convert("RGB")
    return image
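# Example (hypothetical input): a 64x64 array of zeros yields a black RGB image,
# i.e. array_to_image(np.zeros((64, 64, 3))) -> <PIL.Image mode=RGB size=64x64>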

# Load the sentence-embedding model once at startup rather than on every request
embedder = SentenceTransformer('./all-MiniLM-L6-v2')

def generate_embeddings(text):
    embeddings = embedder.encode(text)
    # Convert the numpy array to a plain list so gr.JSON can serialize it
    return embeddings.tolist()
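# all-MiniLM-L6-v2 produces 384-dimensional sentence embeddings, so
# len(generate_embeddings("some text")) == 384.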

def describe_image(image_array):
    image = array_to_image(image_array)
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Make a very detailed description of the image."},
            ],
        }
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
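    # apply_chat_template expands the {"type": "image"} entry into the model's
    # image placeholder, which the processor then pairs with the PIL image to
    # build input_ids and pixel values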
    # Inference: greedy generation of the description
    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=500,
            num_beams=1,  # disable beam search
            do_sample=False,  # disable sampling (deterministic output)
            # temperature=1.0
        )
    # Strip the prompt tokens from each generated sequence so that only the
    # newly generated answer is decoded
    trimmed_ids = [
        output[len(prompt_ids):]
        for prompt_ids, output in zip(inputs.input_ids, generated_ids)
    ]
    # batch_decode returns a list with one decoded string per batch item
    output_text = processor.batch_decode(
        trimmed_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )
    # Return the decoded description together with its sentence embedding
    description = output_text[0].strip()
    return description, generate_embeddings(description)

# Create the Gradio interface
iface = gr.Interface(
    fn=describe_image,
    inputs=gr.Image(),
    outputs=[gr.Textbox(label="Description"), gr.JSON(label="Embeddings")],
    title="Image Description with SmolVLM-500M-Instruct and Text Embeddings with all-MiniLM-L6-v2",
    description="Upload an image to get a detailed description (SmolVLM-500M-Instruct) and an embedding of that description (all-MiniLM-L6-v2).",
)

# Launch the app
# iface.launch(share=True)
iface.launch()
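
# --- Hypothetical usage sketch (not part of the app) ---
# With the app running locally on the default port, the endpoint can be called
# programmatically via gradio_client; "cat.png" is a placeholder path.
#
# from gradio_client import Client, handle_file
# client = Client("http://127.0.0.1:7860/")
# description, embeddings = client.predict(handle_file("cat.png"))
# print(description, len(embeddings))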