|
import base64
import json
from io import BytesIO
from typing import Dict, List, Any

import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor
|
|
|
class EndpointHandler():
    """Inference-endpoint handler for BLIP visual question answering.

    Loads ``Salesforce/blip-vqa-base`` once at construction time and answers
    free-form questions about a base64-encoded image.
    """

    def __init__(self, path=""):
        """Load the BLIP processor and model.

        Args:
            path: Unused here; kept because the HF endpoint runtime passes
                the model directory to this parameter.
        """
        # Fall back to CPU so the handler also runs on machines without a GPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base"
        ).to(self.device)

    def __call__(self, data):
        """Answer a question about an image.

        Args:
            data: A dict (or a JSON string encoding one) of the form
                ``{"inputs": {"image": <base64 str>,
                              "question" | "text": <question str>}}``.

        Returns:
            ``{"text": <answer string>}``

        Raises:
            ValueError: If the image or the question is missing.
        """
        # Accept a raw JSON payload as well as an already-parsed dict.
        if isinstance(data, (str, bytes)):
            data = json.loads(data)

        inputs = data['inputs']
        image_b64 = inputs.get('image')
        if image_b64 is None:
            raise ValueError("payload must contain 'inputs.image' (base64 string)")
        # Callers have used both 'question' and 'text' as the prompt key;
        # accept either instead of silently falling back to the whole payload.
        question = inputs.get('question', inputs.get('text'))
        if question is None:
            raise ValueError("payload must contain 'inputs.question' or 'inputs.text'")

        image = Image.open(BytesIO(base64.b64decode(image_b64)))

        model_inputs = self.processor(image, question, return_tensors="pt").to(self.device)
        out = self.model.generate(**model_inputs)
        return {'text': self.processor.decode(out[0], skip_special_tokens=True)}
|
|
|
if __name__ == "__main__":
    # Local smoke test: encode a sample image and ask one question.
    my_handler = EndpointHandler(path='.')

    with open("/home/ubuntu/guoling/1.png", 'rb') as img:
        image_bytes = img.read()
    image_base64 = base64.b64encode(image_bytes).decode('utf-8')

    question = "are there any people in the picture?"
    # The handler indexes into the payload as a dict, so pass a dict rather
    # than a JSON string, and use the 'text' key the handler reads.
    test_payload = {"inputs": {'image': image_base64, 'text': question}}

    test_result = my_handler(test_payload)
    print(test_result)
|
|