"""Helper utilities for batched LLM inference with vLLM."""

import os
import pathlib

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def get_infer_kwargs(args) -> dict:
    """Build generation kwargs for llm_inference from parsed CLI args, with defaults."""
    temperature = args.temperature if args.temperature is not None else 1.0
    max_new_tokens = args.max_new_tokens if args.max_new_tokens is not None else 1024
    model_type = args.model_type if args.model_type is not None else "chat_model"

    kwargs = {
        "temperature": temperature,
        "max_tokens": max_new_tokens,
        "model_type": model_type,
    }
    return kwargs


def load_tokenizer_and_template(model_name_or_path, template=None):
    """Load the tokenizer; if it lacks a chat template, attach the bundled one."""
    tokenizer = AutoTokenizer.from_pretrained(
        model_name_or_path, trust_remote_code=True
    )

    # Only fall back to a bundled Jinja template when the tokenizer ships none.
    if tokenizer.chat_template is None and template is not None:
        chatml_jinja_path = (
            pathlib.Path(os.path.dirname(os.path.abspath(__file__)))
            / f"templates/template_{template}.jinja"
        )
        assert chatml_jinja_path.exists(), f"Chat template not found: {chatml_jinja_path}"
        with open(chatml_jinja_path, "r") as f:
            tokenizer.chat_template = f.read()

    return tokenizer


def load_model(model_name_or_path, max_model_len=None, gpus_num=1):
    """Create a vLLM engine, sharded across `gpus_num` GPUs via tensor parallelism."""
    llm_args = {
        "model": model_name_or_path,
        "gpu_memory_utilization": 0.95,
        "trust_remote_code": True,
        "tensor_parallel_size": gpus_num,
        "dtype": "half",
    }

    if max_model_len:
        llm_args["max_model_len"] = max_model_len

    llm = LLM(**llm_args)
    return llm


def generate_outputs(messages_batch, llm_model, tokenizer, generate_args):
    """Generate completions for a batch of chat conversations.

    Example inputs:
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        messages_batch = [messages]
        generate_args = {
            "temperature": 1.0,          # sampling temperature
            "max_tokens": 1024,          # maximum number of generated tokens
            "model_type": "chat_model",  # "chat_model" or "base_model"
        }
    """
    # "model_type" is not a SamplingParams field, so remove it before constructing one.
    model_type = generate_args.pop("model_type", "chat_model")
    sampling_params = SamplingParams(**generate_args)

    prompt_batch = []
    for messages in messages_batch:
        if model_type == "base_model":
            # Base models have no chat template: concatenate the raw message contents.
            messages_content = [msg["content"] for msg in messages]
            prompt = "\n".join(messages_content)
        else:
            # Chat models: render the messages with the tokenizer's chat template.
            prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
        prompt_batch.append(prompt)

    outputs = llm_model.generate(prompt_batch, sampling_params)

    outputs_batch = []
    for output in outputs:
        prompt_output = output.prompt
        generated_text = output.outputs[0].text
        outputs_batch.append(
            {"input_prompt": prompt_output, "output_text": generated_text}
        )

    return outputs_batch
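

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative only): wires the helpers above together
# behind a hypothetical argparse CLI. The flag names, defaults, and the example
# prompt below are assumptions, not part of the original module.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Batch inference with vLLM")
    parser.add_argument("--model_name_or_path", required=True)
    parser.add_argument("--template", default=None)
    parser.add_argument("--temperature", type=float, default=None)
    parser.add_argument("--max_new_tokens", type=int, default=None)
    parser.add_argument("--model_type", default=None)
    parser.add_argument("--max_model_len", type=int, default=None)
    parser.add_argument("--gpus_num", type=int, default=1)
    args = parser.parse_args()

    generate_args = get_infer_kwargs(args)
    tokenizer = load_tokenizer_and_template(args.model_name_or_path, args.template)
    llm = load_model(args.model_name_or_path, args.max_model_len, args.gpus_num)

    messages_batch = [
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Briefly introduce the vLLM inference engine."},
        ]
    ]
    results = generate_outputs(messages_batch, llm, tokenizer, generate_args)
    for result in results:
        print(result["output_text"])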