import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
from typing import Any


class EndpointHandler:
    def __init__(self, path=""):
        # Load the pre-quantized 4-bit weights stored in the repository's
        # "4-bit" subdirectory and place them on the available GPU(s).
        self.model = AutoModelForSeq2SeqLM.from_pretrained(f"{path}/4-bit", device_map="auto")
        self.tokenizer = AutoTokenizer.from_pretrained(path)

    def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
        # Validate the request body before running inference.
        inputs = data.get("inputs")
        parameters = data.get("parameters")
        if inputs is None:
            raise ValueError("'inputs' is missing from the request body")
        if not isinstance(inputs, str):
            raise ValueError(f"Expected 'inputs' to be a str, but found {type(inputs)}")
        if parameters is not None and not isinstance(parameters, dict):
            raise ValueError(f"Expected 'parameters' to be a dict, but found {type(parameters)}")

        # Truncate the tokens to 1024 to prevent errors with BART and long text.
        tokens = self.tokenizer(
            inputs,
            max_length=1024,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=False,
        )
        # Ensure the input_ids and the model are both on the GPU to prevent errors.
        input_ids = tokens.input_ids.to("cuda")

        # Gradient calculation is not needed for inference.
        with torch.no_grad():
            if parameters is None:
                output = self.model.generate(input_ids)
            else:
                output = self.model.generate(input_ids, **parameters)

        generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
        return {"generated_text": generated_text}