import sys
from collections import namedtuple

import click
import torch
from peft import PeftModel
from transformers import (
    AutoModel,
    AutoTokenizer,
    BloomForCausalLM,
    BloomTokenizerFast,
    GenerationConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
)

from utils import generate_prompt
def decide_model(args, device_map):
    """Pick the tokenizer/model classes for the requested model type and load them."""
    ModelClass = namedtuple("ModelClass", ("tokenizer", "model"))
    _MODEL_CLASSES = {
        "llama": ModelClass(**{
            "tokenizer": LlamaTokenizer,
            "model": LlamaForCausalLM,
        }),
        "chatglm": ModelClass(**{
            "tokenizer": AutoTokenizer,  # ChatGLMTokenizer
            "model": AutoModel,  # ChatGLMForConditionalGeneration
        }),
        "bloom": ModelClass(**{
            "tokenizer": BloomTokenizerFast,
            "model": BloomForCausalLM,
        }),
        "Auto": ModelClass(**{
            "tokenizer": AutoTokenizer,
            "model": AutoModel,
        }),
    }
    # Fall back to the generic Auto classes for unknown model types.
    model_type = args.model_type if args.model_type in ["llama", "bloom", "chatglm"] else "Auto"

    if model_type == "chatglm":
        # ChatGLM ships its own modeling code, so remote code must be trusted.
        tokenizer = _MODEL_CLASSES[model_type].tokenizer.from_pretrained(
            args.base_model,
            trust_remote_code=True,
        )
        # todo: ChatGLMForConditionalGeneration revision
        model = _MODEL_CLASSES[model_type].model.from_pretrained(
            args.base_model,
            trust_remote_code=True,
            device_map=device_map,
        )
    else:
        tokenizer = _MODEL_CLASSES[model_type].tokenizer.from_pretrained(args.base_model)
        model = _MODEL_CLASSES[model_type].model.from_pretrained(
            args.base_model,
            load_in_8bit=False,
            torch_dtype=torch.float16,
            device_map=device_map,
        )

    if model_type == "llama":
        tokenizer.pad_token_id = 0
        tokenizer.padding_side = "left"  # Allow batched inference

    # Attach the fine-tuned LoRA weights on top of the base model.
    if device_map == "auto":
        model = PeftModel.from_pretrained(
            model,
            args.finetuned_weights,
            torch_dtype=torch.float16,
        )
    else:
        model = PeftModel.from_pretrained(
            model,
            args.finetuned_weights,
            device_map=device_map,
        )
    return tokenizer, model
class ModelServe:
    def __init__(
        self,
        load_8bit: bool = True,
        model_type: str = "llama",
        base_model: str = "linhvu/decapoda-research-llama-7b-hf",
        finetuned_weights: str = "llama-7b-hf_alpaca-en-zh",
    ):
        # Wrap the constructor arguments in a namedtuple so decide_model can
        # read them via attribute access (locals() also captures `self`).
        args = locals()
        namedtupler = namedtuple("args", tuple(args.keys()))
        local_args = namedtupler(**args)

        if torch.cuda.is_available():
            self.device = "cuda:0"
            self.device_map = "auto"
            # self.max_memory = {i: "12GB" for i in range(torch.cuda.device_count())}
            # self.max_memory.update({"cpu": "30GB"})
        else:
            self.device = "cpu"
            self.device_map = {"": self.device}

        self.tokenizer, self.model = decide_model(args=local_args, device_map=self.device_map)

        # unwind broken decapoda-research config
        self.model.config.pad_token_id = self.tokenizer.pad_token_id = 0  # unk
        self.model.config.bos_token_id = 1
        self.model.config.eos_token_id = 2

        if not load_8bit:
            self.model.half()  # seems to fix bugs for some users.

        self.model.eval()
        if torch.__version__ >= "2" and sys.platform != "win32":
            self.model = torch.compile(self.model)
    def generate(
        self,
        instruction: str,
        input: str,
        temperature: float = 0.7,
        top_p: float = 0.75,
        top_k: int = 40,
        num_beams: int = 4,
        max_new_tokens: int = 1024,
        **kwargs,
    ):
        prompt = generate_prompt(instruction, input)
        print(f"Prompt: {prompt}")
        inputs = self.tokenizer(prompt, return_tensors="pt")
        input_ids = inputs["input_ids"].to(self.device)
        generation_config = GenerationConfig(
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            num_beams=num_beams,
            **kwargs,
        )
        print("generating...")
        with torch.no_grad():
            generation_output = self.model.generate(
                input_ids=input_ids,
                generation_config=generation_config,
                return_dict_in_generate=True,
                output_scores=True,
                max_new_tokens=max_new_tokens,
            )
        s = generation_output.sequences[0]
        output = self.tokenizer.decode(s)
        print(f"Output: {output}")
        # Keep only the text after the "### 回覆:" ("### Response:") marker
        # emitted by the prompt template.
        return output.split("### 回覆:")[1].strip()
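

# --- Usage sketch (not part of the original Space code) ---------------------
# A minimal example of how ModelServe defined above might be driven. The
# `__main__` guard and the instruction/input strings are illustrative
# assumptions; the Space's real entry point (presumably a click command, given
# the import above) is not shown in this section.
if __name__ == "__main__":
    server = ModelServe(
        load_8bit=False,
        model_type="llama",
        base_model="linhvu/decapoda-research-llama-7b-hf",
        finetuned_weights="llama-7b-hf_alpaca-en-zh",
    )
    reply = server.generate(
        instruction="Translate the following sentence into English.",
        input="今天天氣很好。",
    )
    print(reply)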