"""Inference endpoint handler for BLIP visual question answering.

Expected payload:
    {"inputs": {"image": <base64-encoded image bytes>, "question": <str>}}
Returns:
    {"text": <model-generated answer>}
"""
from io import BytesIO
from typing import Any, Dict, List  # noqa: F401  (List kept from original public surface)
import base64
import json

import torch
from PIL import Image
from transformers import BlipForQuestionAnswering, BlipProcessor


class EndpointHandler():
    """Wraps the Salesforce BLIP VQA model for request/response inference."""

    def __init__(self, path: str = ""):
        # NOTE(review): `path` is accepted for endpoint-API compatibility but
        # unused here — the model is always pulled from the Hub.
        # Fall back to CPU so the handler also runs on machines without CUDA
        # (the original hard-coded "cuda" and crashed on CPU-only hosts).
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
        self.model = BlipForQuestionAnswering.from_pretrained(
            "Salesforce/blip-vqa-base"
        ).to(self.device)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """Answer a question about a base64-encoded image.

        Args:
            data: ``{"inputs": {"image": <base64 str>,
                                "text" or "question": <str>}}``.
                  A JSON-encoded string of the same shape is also accepted.

        Returns:
            ``{"text": <answer string>}``.

        Raises:
            KeyError: if ``"inputs"`` or ``"image"`` is missing — fail loudly
            rather than feeding the whole payload to the processor (the
            original used ``.pop(key, data)``, silently passing garbage).
        """
        # Tolerate a JSON string: the original __main__ passed json.dumps(...)
        # which made data['inputs'] raise TypeError on a str.
        if isinstance(data, str):
            data = json.loads(data)
        inputs = data["inputs"]
        raw_image = base64.b64decode(inputs["image"])
        image = Image.open(BytesIO(raw_image))
        # The original popped only "text" while the test payload sent
        # "question"; accept both keys for compatibility.
        question = inputs.get("text") or inputs.get("question")
        model_inputs = self.processor(image, question, return_tensors="pt").to(
            self.device
        )
        out = self.model.generate(**model_inputs)
        return {"text": self.processor.decode(out[0], skip_special_tokens=True)}


if __name__ == "__main__":
    my_handler = EndpointHandler(path='.')
    with open("/home/ubuntu/guoling/1.png", 'rb') as img:
        image_bytes = img.read()
    image_base64 = base64.b64encode(image_bytes).decode('utf-8')
    question = "are there any people in the picture?"
    # Pass a dict, mirroring how an inference endpoint invokes the handler
    # (the handler also accepts a JSON string, see __call__).
    test_payload = {"inputs": {'image': image_base64, 'question': question}}
    test_result = my_handler(test_payload)
    print(test_result)