remove handler.py
handler.py
DELETED
@@ -1,43 +0,0 @@
-import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
-from typing import Any
-
-class EndpointHandler():
-    def __init__(self, path=""):
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(f"{path}/4-bit", device_map="auto")
-        self.tokenizer = AutoTokenizer.from_pretrained(path)
-
-    def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
-        inputs = data.get("inputs")
-        parameters = data.get("parameters")
-
-        if inputs is None:
-            raise ValueError(f"'inputs' is missing from the request body")
-
-        if not isinstance(inputs, str):
-            raise ValueError(f"Expected 'inputs' to be a str, but found {type(inputs)}")
-
-        if parameters is not None and not isinstance(parameters, dict):
-            raise ValueError(f"Expected 'parameters' to be a dict, but found {type(parameters)}")
-
-        # Truncate the tokens to 1024 to prevent errors with BART and long text.
-        tokens = self.tokenizer(
-            inputs,
-            max_length=1024,
-            truncation=True,
-            return_tensors="pt",
-            return_attention_mask=False,
-        )
-
-        # Ensure the input_ids and the model are both on the GPU to prevent errors.
-        input_ids = tokens.input_ids.to("cuda")
-
-        # Gradient calculation is not needed for inference.
-        with torch.no_grad():
-            if parameters is None:
-                output = self.model.generate(input_ids)
-            else:
-                output = self.model.generate(input_ids, **parameters)
-
-        generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
-        return {"generated_text": generated_text}
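For context: a handler.py like the one removed above is the custom-inference entry point for a Hugging Face Inference Endpoint, which instantiates EndpointHandler with the repository path and calls it with the parsed request body. The sketch below shows how such a handler could be exercised locally before deployment; it is illustrative only, not part of this commit, and it assumes the repository (including its 4-bit/ weights subfolder) is checked out in the current directory and a CUDA GPU is available.

# Hypothetical local smoke test for the deleted handler (illustrative only).
# Assumes handler.py and the model files, including the 4-bit/ subfolder,
# sit in the current directory, and that a CUDA device is present.
from handler import EndpointHandler

handler = EndpointHandler(path=".")
response = handler({
    "inputs": "A long news article to summarize ...",
    "parameters": {"max_new_tokens": 128},  # forwarded straight to generate()
})
print(response["generated_text"])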