snorkelai
/

RedPajama-7B-Chat-Curated

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

viethoangtranduong committed on Jun 7, 2023

Commit

a679b46

·

1 Parent(s): 958469b

Create handler.py

Files changed (1) hide show

handler.py +38 -0

handler.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import torch
+from typing import  Dict, List, Any
+from transformers import AutoTokenizer, AutoModelForCausalLM
+# Module-level limits for the endpoint.
+# NOTE(review): neither constant is referenced by the handler code visible in
+# this diff (generation is requested with max_new_tokens of 512) -- confirm
+# whether they are still needed or should be wired into the handler.
+MAX_TOKENS_IN_BATCH = 4_000
+DEFAULT_MAX_NEW_TOKENS = 10
+class EndpointHandler():
+    def __init__(self, path: str = ""):
+        assert torch.cuda.device_count() >= 4, f"Only found access to {torch.cuda.device_count()} GPUs"
+        self.tokenizer = AutoTokenizer.from_pretrained(path)
+        self.model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.float16)
+        self.model = self.model.to('cuda:0')
+        self.model.parallelize()
+    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        """
+        Args:
+            data (:obj:):
+                includes the input data and the parameters for the inference.
+        Return:
+            A :obj:`list`:. The list contains the answer and scores of the inference inputs
+        """
+        prompts = [f"<human>: {prompt}\n<bot>:" for prompt in data["inputs"]]
+        inputs = tokenizer(prompts, padding=True, return_tensors='pt').to(model.device)
+        input_length = inputs.input_ids.shape[1]
+        outputs = model.generate(
+            **inputs, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.7, top_k=50
+         )
+        output_strs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
+        return output_strs