import argparse

from inference import generate_outputs, load_model, load_tokenizer_and_template
from reject_eval.run_eval import eval_outputs, format_inputs, load_json

def main(args):
    temperature = args.temperature
    model_path = args.model_path
    max_new_tokens = args.max_new_tokens
    max_model_len = args.max_model_len
    test_path = args.test_path
    template = args.template
    gpus_num = args.gpus_num
    model_type = args.model_type

    # Load the model and tokenizer
    llm_model = load_model(model_path, max_model_len, gpus_num)
    tokenizer = load_tokenizer_and_template(model_path, template)
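    # `load_model` and `load_tokenizer_and_template` live in the repo-local
    # `inference` module. A plausible reading (an assumption, not verified
    # against that module) is that load_model wraps a vLLM engine, roughly
    # vllm.LLM(model=model_path, max_model_len=max_model_len,
    # tensor_parallel_size=gpus_num), while load_tokenizer_and_template loads
    # the HF tokenizer and attaches the chat template named by --template when
    # the model config does not ship one.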
    # Generation parameters
    generate_args = {
        "temperature": temperature,
        "max_tokens": max_new_tokens,
        "model_type": model_type,
    }
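    # Sketch of how these settings plausibly reach the engine (assumption:
    # generate_outputs builds vllm.SamplingParams from this dict, with
    # "model_type" only steering prompt formatting, since it is not a
    # sampling knob):
    #
    #   from vllm import SamplingParams
    #   sampling_params = SamplingParams(
    #       temperature=generate_args["temperature"],
    #       max_tokens=generate_args["max_tokens"],
    #   )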
    # Inference & evaluation
    test_datas = load_json(test_path)
    format_message_datas = format_inputs(test_datas)
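    # Assumed data shapes (hypothetical, not verified against
    # reject_eval.run_eval): load_json likely returns a list of test records,
    # and format_inputs likely wraps each query into the chat "messages"
    # layout, roughly [{"role": "user", "content": record["query"]}], so the
    # tokenizer's chat template can render it into a prompt.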
    # First-round outputs. Because the eval LLM's instruction-following
    # ability may be weak and it may not answer in the required format, a
    # correction round is run after this first pass.
    model_outputs_tmp = generate_outputs(
        format_message_datas, llm_model, tokenizer, generate_args
    )
    # save_json("./model_outputs.json", model_outputs_tmp)
    eval_outputs(model_outputs_tmp, test_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Reject evaluation")
    parser.add_argument(
        "--gpus_num", type=int, default=1, help="Number of GPUs to use"
    )
    parser.add_argument(
        "--temperature", type=float, default=0.01, help="Sampling temperature"
    )
    parser.add_argument(
        "--model_path", type=str, required=True, help="Path to the model"
    )
    parser.add_argument(
        "--model_type",
        choices=["base_model", "chat_model"],
        default="chat_model",
        help="Base model or chat model",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=1024,
        help="Maximum number of new output tokens",
    )
    parser.add_argument(
        "--max_model_len", type=int, default=8192, help="Maximum model context length"
    )
    parser.add_argument(
        "--template",
        type=str,
        choices=[None, "llama3", "baichuan", "chatglm"],
        default=None,
        help="Chat template; must be specified if not present in the model config",
    )
    parser.add_argument(
        "--test_path",
        type=str,
        default="table_related_benchmarks/evalset/reject_test/test_query.json",
        help="Test file path",
    )
    parser.add_argument(
        "--save_path",
        type=str,
        default="output/result_reject.json",
        help="Path where LLM output samples are saved",
    )
    args = parser.parse_args()
    main(args)
# Example model paths:
#   /home/dev/weights/CodeQwen1.5-7B-Chat
#   /data0/pretrained-models/checkpoints/qwen2/checkpoint-1200
"""
python table_related_benchmarks/run_reject_eval.py --model_path /data4/sft_output/qwen2-base-0817
"""