File size: 2,100 Bytes

3cf4b90
a5df904
 
 
 
 
5c77eb1
a5df904
5c77eb1
 
 
a5df904
 
138ad6d
a5df904
 
5c77eb1
138ad6d
 
a5df904
 
 
 
 
 
 
 
 
 
 
 
 
5c77eb1
a5df904
3cf4b90
 
 
 
 
a5df904
3cf4b90
 
 
ae68696
0b0bb49
a5df904
 
 
 
 
 
 
 
 
ea09db5
 
138ad6d

import base64
from typing import Any, Dict
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
from io import BytesIO
import torch
import logging


# Emit DEBUG-level records so each request can be traced in the logs.
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


class EndpointHandler:
    """Callable handler that captions a base64-encoded image with a BLIP model.

    The model and its processor are loaded once at construction time and
    reused for every call.
    """

    # Hub repository that both the model weights and the processor come from.
    _MODEL_ID = "quadranttechnologies/qhub-blip-image-captioning-finetuned"

    def __init__(self, path=""):
        """Load the model and processor and put the model in inference mode.

        Args:
            path: Unused by this implementation; the model is always loaded
                from ``_MODEL_ID``.
        """
        logger.debug("Initializing model and processor.")
        self.model = BlipForConditionalGeneration.from_pretrained(self._MODEL_ID)
        self.processor = BlipProcessor.from_pretrained(self._MODEL_ID)
        # A single device move is enough; eval() switches off training-only
        # behavior such as dropout.
        self.model.to(device)
        self.model.eval()

    def __call__(self, data: Any) -> Dict[str, Any]:
        """Generate a description for the image contained in ``data``.

        Args:
            data: Request payload (a dict). ``data["inputs"]`` must be a
                mapping with an ``"image"`` entry holding the base64-encoded
                image bytes and, optionally, a ``"text"`` entry that is passed
                to the processor alongside the image (defaults to ``""``).
                ``data["parameters"]``, when present, is a mapping of extra
                keyword arguments forwarded to ``model.generate``.

        Returns:
            A dict ``{"description": [...]}`` whose value is the list of
            strings produced by ``processor.batch_decode``.

        Raises:
            KeyError: If ``data`` has no ``"inputs"`` entry.
            ValueError: If ``"image"`` is missing/empty or is not valid
                base64 (``binascii.Error`` is a ``ValueError`` subclass).
        """
        logger.debug("Received data keys: %s", data.keys())

        inputs = data["inputs"]
        image_base64 = inputs.get("image")
        if not image_base64:
            # Fail with a clear message instead of the opaque TypeError that
            # b64decode(None) would raise.
            raise ValueError(
                'Missing "image" in "inputs": expected a base64-encoded image.'
            )
        image_data = base64.b64decode(image_base64)

        # Convert image data to PIL Image
        image = Image.open(BytesIO(image_data))

        # Optional text input
        text = inputs.get("text", "")
        # Read (don't pop) so the caller's payload is not mutated; treat an
        # explicit None the same as "no parameters".
        parameters = data.get("parameters") or {}

        processed = self.processor(images=image, text=text, return_tensors="pt")
        # generate() needs every input tensor on the model's device, not just
        # the pixel values: the tokenized text tensors must be moved as well.
        model_inputs = {
            key: value.to(device) if isinstance(value, torch.Tensor) else value
            for key, value in processed.items()
        }
        # Caller-supplied parameters take precedence over processor outputs.
        generate_kwargs = {**model_inputs, **parameters}

        with torch.no_grad():
            out = self.model.generate(**generate_kwargs)
        description = self.processor.batch_decode(out, skip_special_tokens=True)

        return {"description": description}