from typing import Any, Dict, List, Union

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NOTE: no `from handler import EndpointHandler` here. This module defines
# that class itself, so the import was shadowed at best and a circular
# self-import (ImportError) if this file is `handler.py`.

# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class EndpointHandler:
    """Callable wrapper around a seq2seq model and its tokenizer.

    The model and tokenizer are loaded once at construction time; each call
    tokenizes the input, runs ``generate`` and returns the first decoded
    sequence.
    """

    def __init__(self, path: str = "") -> None:
        """Load the tokenizer and model stored at ``path``.

        Args:
            path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        # Tokenizers are not torch modules and have no `.to()`; only the
        # model is moved to the target device.
        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(path).to(device)

    def __call__(self, data: Union[str, Dict[str, Any]]) -> Dict[str, str]:
        """Generate output text for ``data``.

        Args:
            data: The source text, or a request payload dict whose
                ``"inputs"`` entry holds the source text.

        Returns:
            A dict with a single ``"translation"`` key holding the first
            decoded sequence.
        """
        # Accept a `{"inputs": ...}` payload as well as a bare string.
        # `.get` (rather than `.pop`) leaves the caller's dict untouched.
        if isinstance(data, dict):
            data = data.get("inputs", data)

        # BatchEncoding.to() moves every tensor it holds in one call.
        encoded = self.tokenizer(data, return_tensors="pt").to(device)

        # inference_mode disables autograd tracking during generation.
        with torch.inference_mode():
            generated = self.model.generate(**encoded)

        decoded = self.tokenizer.batch_decode(generated, skip_special_tokens=True)
        return {"translation": decoded[0]}