import torch
from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    """Custom handler following the Hugging Face Inference Endpoints interface."""

    def __init__(self, path="ajayarora1235/Rap-Nemo-0"):
        # Activate 4-bit precision base model loading
        use_4bit = True
        # Compute dtype for 4-bit base models
        bnb_4bit_compute_dtype = "float16"
        # Quantization type (fp4 or nf4)
        bnb_4bit_quant_type = "nf4"
        # Activate nested quantization for 4-bit base models (double quantization)
        use_nested_quant = False

        nf4_config = BitsAndBytesConfig(
            load_in_4bit=use_4bit,
            bnb_4bit_quant_type=bnb_4bit_quant_type,
            bnb_4bit_use_double_quant=use_nested_quant,
            bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
        )

        # device_map="auto" lets accelerate place the quantized weights on the
        # available GPU(s) instead of leaving them on CPU.
        self.model = AutoModelForCausalLM.from_pretrained(
            path,
            quantization_config=nf4_config,
            device_map="auto",
        )
        self.model.config.use_cache = False
        self.model.config.pretraining_tp = 1

        self.tokenizer = AutoTokenizer.from_pretrained(path)
        self.tokenizer.pad_token = self.tokenizer.unk_token
        self.tokenizer.padding_side = "right"

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        input_text = data["inputs"]
        kwargs = data.get("kwargs", {})

        # Tokenize the input text and move it to the model's device so it
        # matches where the quantized weights were placed.
        input_tokens = self.tokenizer.encode(input_text, return_tensors="pt").to(self.model.device)

        # Generate output tokens
        with torch.no_grad():
            output_tokens = self.model.generate(
                input_tokens, max_new_tokens=500, do_sample=True, **kwargs
            )

        # Decode output tokens
        output_text = self.tokenizer.decode(output_tokens[0])

        return [{"output": output_text}]


# Example usage
if __name__ == "__main__":
    handler = EndpointHandler()
    input_data = {"inputs": "Write a verse in the style of Lupe Fiasco about falling in love with Chipotle."}
    output_data = handler(input_data)
    print(output_data)
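
    # A second, hypothetical request illustrating how the optional "kwargs" key is
    # forwarded to model.generate(); temperature and top_p are standard transformers
    # sampling parameters, and the values below are illustrative assumptions, not
    # tuned settings for this model.
    sampled_data = handler({
        "inputs": "Write a verse in the style of Lupe Fiasco about falling in love with Chipotle.",
        "kwargs": {"temperature": 0.8, "top_p": 0.95},
    })
    print(sampled_data)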