add 4-bit quantization to handler.py
handler.py (+16 -6)
@@ -1,14 +1,24 @@
 import torch
-from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
 from typing import Any
 
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
 class EndpointHandler():
     def __init__(self, path=""):
-        self.model = AutoModelForSeq2SeqLM.from_pretrained(path).to(device)
+        # bitsandbytes quantization is only supported on CUDA devices.
+        bits_and_bytes_config = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+        )
+        quantization_config = bits_and_bytes_config if torch.cuda.is_available() else None
+
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.model = AutoModelForSeq2SeqLM.from_pretrained(
+            path,
+            quantization_config=quantization_config,
+            device_map="auto",
+        )
         self.tokenizer = AutoTokenizer.from_pretrained(path)
-
+
     def __call__(self, data: dict[str, Any]) -> dict[str, Any]:
         inputs = data.get("inputs")
         parameters = data.get("parameters")
@@ -32,7 +42,7 @@ class EndpointHandler():
         )
 
         # Ensure the input_ids and the model are on the same device to prevent errors.
-        input_ids = tokens.input_ids.to(device)
+        input_ids = tokens.input_ids.to(self.device)
 
         # Gradient calculation is not needed for inference.
         with torch.no_grad():
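For context, a minimal sketch of how the updated handler could be exercised locally. It is an illustration, not part of the commit: the checkpoint name and the generation parameters below are assumptions, and it presumes __call__ forwards "parameters" to the generate step, as the surrounding handler code suggests.

# Hypothetical smoke test for the quantized handler.
# Assumes handler.py sits next to this script and that "google/flan-t5-base"
# (any seq2seq checkpoint would do) is reachable; both names are illustrative.
from handler import EndpointHandler

handler = EndpointHandler(path="google/flan-t5-base")
response = handler({
    "inputs": "Translate to German: The weather is nice today.",
    "parameters": {"max_new_tokens": 64},
})
print(response)

Note the fallback in the diff: on machines without CUDA, quantization_config becomes None, so the handler loads full-precision weights on CPU instead of failing, while device_map="auto" handles model placement on GPU machines.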