import math

import streamlit as st
from transformers import (
    PreTrainedTokenizerFast,
    VisionEncoderDecoderModel,
    ViTImageProcessor,
)

# Hub identifier of the checkpoint used for the model, tokenizer and processor.
MODEL_NAME = "grascii/gregg-vision-v0.2.1"
# Threshold (log of 0.5) that a non-first beam's score must strictly exceed
# for that beam to be returned by run_vision.
MIN_LOG_PROB = math.log(0.5)
# Used both as the beam width and as the number of sequences requested back.
NUM_BEAMS = 3


@st.cache_resource(show_spinner=f"Loading {MODEL_NAME}")
def load_model():
    """Load the model, tokenizer and image processor for ``MODEL_NAME``.

    All three are fetched with ``from_pretrained`` using the ``HF_TOKEN``
    entry of the Streamlit secrets. The function is wrapped in
    ``st.cache_resource``.

    Returns:
        A ``(model, tokenizer, processor)`` tuple.
    """
    hf_token = st.secrets.HF_TOKEN

    vision_model = VisionEncoderDecoderModel.from_pretrained(
        MODEL_NAME, token=hf_token
    )
    fast_tokenizer = PreTrainedTokenizerFast.from_pretrained(
        MODEL_NAME, token=hf_token
    )
    image_processor = ViTImageProcessor.from_pretrained(
        MODEL_NAME, token=hf_token
    )
    return vision_model, fast_tokenizer, image_processor


@st.cache_data(ttl=3600, show_spinner=f"Running {MODEL_NAME}")
def run_vision(image):
    """Run beam-search generation on ``image`` and return token candidates.

    The first returned sequence is always kept. Each remaining sequence is
    kept only when its entry in ``sequences_scores`` is strictly greater
    than ``MIN_LOG_PROB``. The function is wrapped in ``st.cache_data``
    with ``ttl=3600``.

    Args:
        image: Input handed directly to the ``ViTImageProcessor``.
            NOTE(review): accepted image types are whatever the processor
            supports -- confirm against the caller.

    Returns:
        A list of token lists (special tokens skipped), one per kept
        sequence, in the order ``generate`` returned them.
    """
    model, tokenizer, processor = load_model()

    encoded = processor(image, return_tensors="pt")
    generated = model.generate(
        encoded.pixel_values,
        max_new_tokens=12,
        num_beams=NUM_BEAMS,
        num_return_sequences=NUM_BEAMS,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # The first sequence is returned unconditionally (presumably the
    # top-ranked beam -- TODO confirm ordering against the generate docs).
    candidates = [
        tokenizer.convert_ids_to_tokens(
            generated["sequences"][0], skip_special_tokens=True
        )
    ]

    # Remaining beams are only kept when their score clears the threshold.
    remaining = zip(
        generated["sequences"][1:], generated["sequences_scores"][1:]
    )
    for token_ids, beam_score in remaining:
        if beam_score > MIN_LOG_PROB:
            candidates.append(
                tokenizer.convert_ids_to_tokens(
                    token_ids, skip_special_tokens=True
                )
            )
    return candidates