from PIL import Image import requests from transformers import Blip2Processor, Blip2ForConditionalGeneration import torch device = "cuda" if torch.cuda.is_available() else "cpu" processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") model = Blip2ForConditionalGeneration.from_pretrained( "Salesforce/blip2-opt-2.7b", device_map={"": 0}, torch_dtype=torch.float16 ) # doctest: +IGNORE_RESULT url = "http://images.cocodataset.org/val2017/000000039769.jpg" image = Image.open(requests.get(url, stream=True).raw) inputs = processor(images=image, return_tensors="pt").to(device, torch.float16) generated_ids = model.generate(**inputs) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() print(generated_text) prompt = "Question: how many cats are there? Answer:" inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16) generated_ids = model.generate(**inputs) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() print(generated_text)