booksouls
/

bart-large-cnn

@@ -4,19 +4,7 @@ from typing import Any
 class EndpointHandler():
     def __init__(self, path=""):
-        # bitsandbytes quantization is only supported on CUDA devices.
-        bits_and_bytes_config = BitsAndBytesConfig(
-            load_in_4bit=True,
-            bnb_4bit_compute_dtype=torch.bfloat16,
-        )
-        quantization_config = bits_and_bytes_config if torch.cuda.is_available() else None
-        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(
-            path,
-            quantization_config=quantization_config,
-            device_map="auto",
-        )
         self.tokenizer = AutoTokenizer.from_pretrained(path)
     def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
@@ -41,8 +29,8 @@ class EndpointHandler():
             return_attention_mask=False,
         )
-        # Ensure the input_ids and the model are on the same device to prevent errors.
-        input_ids = tokens.input_ids.to(self.device)
         # Gradient calculation is not needed for inference.
         with torch.no_grad():
@@ -53,4 +41,3 @@ class EndpointHandler():
         generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
         return {"generated_text": generated_text}

 class EndpointHandler():
     def __init__(self, path=""):
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(f"{path}/4-bit", device_map="auto")
         self.tokenizer = AutoTokenizer.from_pretrained(path)
     def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
             return_attention_mask=False,
         )
+        # Ensure the input_ids and the model are both on the GPU to prevent errors.
+        input_ids = tokens.input_ids.to("cuda")
         # Gradient calculation is not needed for inference.
         with torch.no_grad():
         generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
         return {"generated_text": generated_text}