search / vision.py
chanicpanic's picture
Generate multiple alternatives for image search
67bc9b1
raw
history blame
1.46 kB
import math
import streamlit as st
from transformers import (
PreTrainedTokenizerFast,
VisionEncoderDecoderModel,
ViTImageProcessor,
)
# Identifier passed to every ``from_pretrained`` call in ``load_model`` and
# interpolated into the spinner messages.
MODEL_NAME = "grascii/gregg-vision-v0.2.1"
# Natural log of 0.5. In ``run_vision`` a non-first generated sequence is only
# returned when its ``sequences_scores`` entry is strictly greater than this.
MIN_LOG_PROB = math.log(0.5)
# Used as both ``num_beams`` and ``num_return_sequences`` for ``model.generate``.
NUM_BEAMS = 3
@st.cache_resource(show_spinner=f"Loading {MODEL_NAME}")
def load_model():
    """Load the model, tokenizer, and image processor for ``MODEL_NAME``.

    All three components are fetched with ``from_pretrained`` using the
    ``HF_TOKEN`` entry from the Streamlit secrets. The function is wrapped in
    ``st.cache_resource`` so Streamlit can reuse the loaded objects across
    calls; a spinner naming the model is shown while loading.

    Returns:
        A ``(model, tokenizer, processor)`` tuple.
    """
    hub_token = st.secrets.HF_TOKEN
    # Order matters only for readability of the returned tuple: the classes
    # are instantiated left to right, matching (model, tokenizer, processor).
    component_classes = (
        VisionEncoderDecoderModel,
        PreTrainedTokenizerFast,
        ViTImageProcessor,
    )
    model, tokenizer, processor = (
        component_cls.from_pretrained(MODEL_NAME, token=hub_token)
        for component_cls in component_classes
    )
    return model, tokenizer, processor
@st.cache_data(ttl=3600, show_spinner=f"Running {MODEL_NAME}")
def run_vision(image):
    """Generate candidate token sequences for ``image``.

    The image is run through the processor and the model's ``generate`` call
    with ``NUM_BEAMS`` beams, asking for ``NUM_BEAMS`` returned sequences.
    The function is wrapped in ``st.cache_data`` with a ``ttl`` of 3600.

    Args:
        image: The input handed directly to the image processor.
            NOTE(review): presumably a PIL image or array accepted by
            ``ViTImageProcessor`` -- confirm against the caller.

    Returns:
        A list of token lists. The first returned sequence is always
        included; each remaining sequence is included only when its
        ``sequences_scores`` value is greater than ``MIN_LOG_PROB``.
    """
    model, tokenizer, processor = load_model()

    encoded = processor(image, return_tensors="pt")
    generated = model.generate(
        encoded.pixel_values,
        max_new_tokens=12,
        num_beams=NUM_BEAMS,
        num_return_sequences=NUM_BEAMS,
        output_scores=True,
        return_dict_in_generate=True,
    )

    def to_tokens(token_ids):
        # Special tokens are dropped so only content tokens are returned.
        return tokenizer.convert_ids_to_tokens(token_ids, skip_special_tokens=True)

    # The first sequence is kept unconditionally, whatever its score.
    alternatives = [to_tokens(generated["sequences"][0])]

    # Remaining sequences are kept only if they clear the score threshold.
    runner_ups = zip(generated["sequences"][1:], generated["sequences_scores"][1:])
    for candidate_ids, candidate_score in runner_ups:
        if candidate_score > MIN_LOG_PROB:
            alternatives.append(to_tokens(candidate_ids))

    return alternatives