import torch
import requests
from PIL import Image
from omegaconf import OmegaConf

from lavis.models import load_model, load_preprocess
from lavis.common.registry import registry

from generate import generate

# Download an example image.
url = "https://iliad.stanford.edu/pg-vlm/example_images/ceramic_bowl.jpg"
example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# Load the BLIP-2 / InstructBLIP (Flan-T5-XXL) architecture with the PG-VLM checkpoint.
vlm = load_model(
    name='blip2_t5_instruct',
    model_type='flant5xxl',
    checkpoint='pgvlm_weights.bin',  # path to the downloaded PG-VLM weights
    is_eval=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

# Optionally disable text input to the Q-Former (the prompt then conditions only the T5 language model).
vlm.qformer_text_input = False

# Build the image preprocessor from the model's default LAVIS config.
model_cls = registry.get_model_class('blip2_t5_instruct')
model_type = 'flant5xxl'
preprocess_cfg = OmegaConf.load(model_cls.default_config_path(model_type)).preprocess
vis_processors, _ = load_preprocess(preprocess_cfg)
processor = vis_processors["eval"]

# Assemble the prompt and a batch containing the single preprocessed image.
question_samples = {
    'prompt': 'Question: Classify this object as transparent, translucent, or opaque? Respond unknown if you are not sure. Short answer:',
    'image': torch.stack([processor(example_image)], dim=0).to(vlm.device)
}

# Generate three candidate answers and their scores.
answers, scores = generate(vlm, question_samples, length_penalty=0, repetition_penalty=1, num_captions=3)
print(answers, scores)
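
# The scores returned by generate() look like per-answer log-likelihoods (see the
# accompanying generate.py). Assuming that, a softmax over them gives a relative
# confidence distribution across the sampled answers. This is a sketch, not part
# of the original example:
probs = torch.softmax(torch.as_tensor(scores, dtype=torch.float), dim=0)
for answer, prob in zip(answers, probs):
    print(f"{answer}: {prob.item():.3f}")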