import warnings

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load MPT-7B in bfloat16; trust_remote_code is required because MPT ships
# custom model code on the Hugging Face Hub.
torch_dtype = torch.bfloat16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "mosaicml/mpt-7b"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch_dtype,
    trust_remote_code=True,
    use_auth_token=None,
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    use_auth_token=None,
)

model.eval()
model.to(device=device, dtype=torch_dtype)

# MPT's tokenizer does not define a pad token, so fall back to the EOS token
# and pad on the left, as is standard for decoder-only generation.
if tokenizer.pad_token_id is None:
    warnings.warn(
        "pad_token_id is not set for the tokenizer. Using eos_token_id as pad_token_id."
    )
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Shared generation settings: nucleus sampling (top_k=0 disables top-k
# filtering) with a mild repetition penalty.
gkw = {
    "temperature": 0.5,
    "top_p": 0.92,
    "top_k": 0,
    "max_new_tokens": 512,
    "use_cache": True,
    "do_sample": True,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "repetition_penalty": 1.1,  # 1.0 means no penalty, > 1.0 penalizes; 1.2 in the CTRL paper
}


def mpt_7b(s):
    """Generate a completion for prompt `s` and return only the new text."""
    input_ids = tokenizer(s, return_tensors="pt").input_ids
    input_ids = input_ids.to(model.device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, **gkw)
    # generate() returns the prompt tokens followed by the completion, so
    # slice the output_ids tensor to keep only the newly generated tokens.
    new_tokens = output_ids[0, input_ids.shape[1]:]
    output_text = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return output_text
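

# A hedged sketch of batched generation (an assumption, not part of the
# original script): this is what the pad-token / padding_side="left" setup
# above enables. The tokenizer left-pads shorter prompts so all sequences
# end at the same position, and the attention mask tells generate() to
# ignore the padding.
def mpt_7b_batch(prompts):
    """Generate completions for a list of prompts, returning only new text."""
    enc = tokenizer(prompts, return_tensors="pt", padding=True)
    enc = enc.to(model.device)
    with torch.no_grad():
        output_ids = model.generate(
            enc.input_ids, attention_mask=enc.attention_mask, **gkw
        )
    # Every row shares the same (left-padded) prompt length, so slice it off.
    new_tokens = output_ids[:, enc.input_ids.shape[1]:]
    return tokenizer.batch_decode(new_tokens, skip_special_tokens=True)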
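

# A minimal usage sketch (an assumption, not in the original; the prompt
# string is illustrative). With do_sample=True the output varies between
# runs, so seed the RNG if you need reproducibility.
if __name__ == "__main__":
    torch.manual_seed(0)  # optional: make sampling reproducible
    prompt = "Here is a short explanation of attention in transformers:"
    print(mpt_7b(prompt))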