import math

import streamlit as st
from transformers import (
    PreTrainedTokenizerFast,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

MODEL_NAME = "grascii/gregg-vision-v0.2.1"
# Secondary beams are surfaced only if their beam score (a log probability)
# clears this threshold.
MIN_LOG_PROB = math.log(0.5)
NUM_BEAMS = 3


@st.cache_resource(show_spinner=f"Loading {MODEL_NAME}")
def load_model():
    """Load the model, tokenizer, and image processor once per server process."""
    model = VisionEncoderDecoderModel.from_pretrained(
        MODEL_NAME, token=st.secrets.HF_TOKEN
    )
    tokenizer = PreTrainedTokenizerFast.from_pretrained(
        MODEL_NAME,
        token=st.secrets.HF_TOKEN,
    )
    processor = ViTImageProcessor.from_pretrained(
        MODEL_NAME, token=st.secrets.HF_TOKEN
    )
    return model, tokenizer, processor


@st.cache_data(ttl=3600, show_spinner=f"Running {MODEL_NAME}")
def run_vision(image):
    """Run beam-search generation on an image and return candidate token sequences.

    The top beam is always returned; the remaining beams are included only
    when their sequence score exceeds MIN_LOG_PROB.
    """
    model, tokenizer, processor = load_model()
    pixel_values = processor(image, return_tensors="pt").pixel_values
    generated = model.generate(
        pixel_values,
        max_new_tokens=12,
        num_beams=NUM_BEAMS,
        num_return_sequences=NUM_BEAMS,
        output_scores=True,
        return_dict_in_generate=True,
    )
    return [
        tokenizer.convert_ids_to_tokens(
            generated["sequences"][0], skip_special_tokens=True
        )
    ] + [
        tokenizer.convert_ids_to_tokens(seq, skip_special_tokens=True)
        for seq, score in zip(
            generated["sequences"][1:], generated["sequences_scores"][1:]
        )
        if score > MIN_LOG_PROB
    ]
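
# A minimal usage sketch (hypothetical; the widget wiring below is not part of
# this module) showing how run_vision() might be called from the Streamlit UI:
# an uploaded image is decoded with PIL, passed through the model, and each
# candidate token sequence is displayed.
#
#     from PIL import Image
#
#     uploaded = st.file_uploader("Upload a shorthand image")
#     if uploaded is not None:
#         candidates = run_vision(Image.open(uploaded).convert("RGB"))
#         for tokens in candidates:
#             st.write(" ".join(tokens))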