"""Inference endpoint handler for BLIP visual question answering.

Expected payload:
    {"inputs": {"image": <base64-encoded image bytes>, "question": <str>}}
Returns:
    {"text": <model-generated answer>}
"""
from io import BytesIO
from typing import Any, Dict, List  # noqa: F401  (List kept from original public surface)
import base64
import json

import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor


class EndpointHandler():
    """Wraps the Salesforce BLIP VQA model for request/response inference."""

    def __init__(self, path: str = ""):
        # NOTE(review): `path` is accepted for endpoint-API compatibility but
        # unused here — the model is always pulled from the Hub.
        # Fall back to CPU so the handler also runs on machines without CUDA
        # (the original hard-coded "cuda" and crashed on CPU-only hosts).
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base"
        ).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """Answer a question about a base64-encoded image.

        Args:
            data: ``{"inputs": {"image": <base64 str>,
                                "text" or "question": <str>}}``.
                  A JSON-encoded string of the same shape is also accepted.

        Returns:
            ``{"text": <answer string>}``.

        Raises:
            KeyError: if ``"inputs"`` or ``"image"`` is missing — fail loudly
            rather than feeding the whole payload to the processor (the
            original used ``.pop(key, data)``, silently passing garbage).
        """
        # Tolerate a JSON string: the original __main__ passed json.dumps(...)
        # which made data['inputs'] raise TypeError on a str.
        if isinstance(data, str):
            data = json.loads(data)
        inputs = data["inputs"]
        raw_image = base64.b64decode(inputs["image"])
        image = Image.open(BytesIO(raw_image))
        # The original popped only "text" while the test payload sent
        # "question"; accept both keys for compatibility.
        question = inputs.get("text") or inputs.get("question")
        model_inputs = self.processor(image, question, return_tensors="pt").to(
            self.device
        )
        out = self.model.generate(**model_inputs)
        return {"text": self.processor.decode(out[0], skip_special_tokens=True)}


if __name__ == "__main__":
    my_handler = EndpointHandler(path='.')
    with open("/home/ubuntu/guoling/1.png", 'rb') as img:
        image_bytes = img.read()
    image_base64 = base64.b64encode(image_bytes).decode('utf-8')
    question = "are there any people in the picture?"
    # Pass a dict, mirroring how an inference endpoint invokes the handler
    # (the handler also accepts a JSON string, see __call__).
    test_payload = {"inputs": {'image': image_base64, 'question': question}}
    test_result = my_handler(test_payload)
    print(test_result)