File size: 3,082 Bytes
2a26d3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
import argparse

from inference import generate_outputs, load_model, load_tokenizer_and_template
from reject_eval.run_eval import (eval_outputs,
                                                           format_inputs,
                                                           load_json)


def main(args):
    """Run reject-evaluation inference and score the generated outputs.

    Loads the model and tokenizer, formats the test set read from
    ``args.test_path`` into model inputs, generates a single round of
    outputs and passes them to ``eval_outputs`` together with the test path.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``temperature``, ``model_path``, ``max_new_tokens``,
            ``max_model_len``, ``test_path``, ``template``, ``gpus_num``
            and ``model_type``.
    """
    # Read every option before any loading starts, so a missing attribute
    # fails before the model is loaded.
    sampling_temperature = args.temperature
    checkpoint_path = args.model_path
    new_token_budget = args.max_new_tokens
    context_length = args.max_model_len
    eval_set_path = args.test_path
    chat_template = args.template
    gpu_count = args.gpus_num
    model_kind = args.model_type

    # Load the model and the tokenizer.
    llm_model = load_model(checkpoint_path, context_length, gpu_count)
    tokenizer = load_tokenizer_and_template(checkpoint_path, chat_template)

    # Inference parameters handed to generate_outputs.
    generate_args = dict(
        temperature=sampling_temperature,
        max_tokens=new_token_budget,
        model_type=model_kind,
    )

    # Inference and evaluation.
    raw_samples = load_json(eval_set_path)
    prompts = format_inputs(raw_samples)

    # First-round output. The original note says the evaluated LLM may follow
    # instructions poorly and not answer in the required format, so a
    # correction round is added after the first one.
    # NOTE(review): only this single generation pass is present in this
    # function; confirm where (or whether) the correction round happens.
    first_round_outputs = generate_outputs(
        prompts, llm_model, tokenizer, generate_args
    )
    eval_outputs(first_round_outputs, eval_set_path)


if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them and hand the
    # resulting namespace to main().
    parser = argparse.ArgumentParser(description="eval reject")
    parser.add_argument(
        "--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
    )
    parser.add_argument(
        "--temperature", type=float, default=0.01, help="Temperature setting"
    )
    parser.add_argument(
        "--model_path", type=str, required=True, help="Path to the model"
    )
    parser.add_argument(
        "--model_type",
        choices=["base_model", "chat_model"],
        default="chat_model",
        help="Base model or Chat model",
    )
    parser.add_argument(
        "--max_new_tokens",
        type=int,
        default=1024,
        help="Maximum number of output new tokens",
    )
    parser.add_argument(
        "--max_model_len", type=int, default=8192, help="Max model length"
    )
    parser.add_argument(
        "--template",
        type=str,
        # None is not listed as a choice: a command-line value is always a
        # string, so the None object could never match and would only show up
        # as a misleading "None" entry in the usage text. Omitting the flag
        # still yields None through the default below.
        choices=["llama3", "baichuan", "chatglm"],
        default=None,
        help="The template must be specified if not present in the config file",
    )
    parser.add_argument(
        "--test_path",
        type=str,
        default="table_related_benchmarks/evalset/reject_test/test_query.json",
        help="Test File Path",
    )
    # NOTE(review): main() in this file does not read args.save_path; the
    # option is accepted but currently has no effect here.
    parser.add_argument(
        "--save_path",
        type=str,
        default="output/result_reject.json",
        help="LLM output samples save path",
    )

    args = parser.parse_args()
    main(args)


# Example model paths: /home/dev/weights/CodeQwen1.5-7B-Chat, /data0/pretrained-models/checkpoints/qwen2/checkpoint-1200
# Example invocation:
"""
python table_related_benchmarks/run_reject_eval.py --model_path /data4/sft_output/qwen2-base-0817
"""