Update README.md
README.md CHANGED
@@ -62,6 +62,56 @@ We apply tailored prompts for coding and math tasks:
{question} + "\n\nPresent the answer in LaTex format: \\boxed{Your answer}"
```

```python
import os
from tqdm import tqdm
import torch
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

os.environ["NCCL_IGNORE_DISABLED_P2P"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "true"

def generate(question_list, model_path):
    llm = LLM(
        model=model_path,
        trust_remote_code=True,
        tensor_parallel_size=torch.cuda.device_count(),
        gpu_memory_utilization=0.90,
    )
    sampling_params = SamplingParams(max_tokens=8192,
                                     temperature=0.0,
                                     n=1)
    outputs = llm.generate(question_list, sampling_params, use_tqdm=True)
    completions = [[output.text for output in output_item.outputs] for output_item in outputs]
    return completions

def make_conv_hf(question, tokenizer):
    # for math problem
    content = question + "\n\nPresent the answer in LaTex format: \\boxed{Your answer}"
    # for code problem
    # content = question + "\n\nWrite Python code to solve the problem. Present the code in \n```python\nYour code\n```\nat the end."
    msg = [
        {"role": "user", "content": content}
    ]
    chat = tokenizer.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
    return chat

def run():
    model_path = "PRIME-RL/Eurus-2-7B-PRIME"
    all_problems = [
        "which number is larger? 9.11 or 9.9?"
    ]
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    completions = generate([make_conv_hf(problem_data, tokenizer) for problem_data in all_problems], model_path)
    print(completions)
    # [['[ASSESS]\n\n# The problem asks us to compare two decimal numbers, 9.11 and 9.9, to determine which one is larger.\n# We need to compare the whole parts and the decimal parts of the numbers.\n\nNext action: [ADVANCE]\n\n# Compare the whole parts of the numbers: both 9.11 and 9.9 have the same whole part, which is 9.\n# Compare the decimal parts of the numbers: 0.11 (from 9.11) is less than 0.9 (from 9.9).\n\nNext action: [ADVANCE]\n\n# Since the whole parts are the same and the decimal part of 9.9 is greater than the decimal part of 9.11, we can conclude that 9.9 is larger than 9.11.\n\nNext action: [OUTPUT]\n\nThe final answer is $\\boxed{9.9}$.\n\n']]

if __name__ == "__main__":
    run()
```
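
For math problems, the final answer in each completion is wrapped in `\boxed{...}` (see the sample output above). The snippet below is a minimal, illustrative sketch of one way to pull that answer out of the generated text; the `extract_boxed_answer` helper is not part of this repository and assumes the boxed answer contains no nested braces.

```python
import re

def extract_boxed_answer(completion: str) -> str | None:
    """Return the content of the last \\boxed{...} in a completion.

    Illustrative helper only (not part of the PRIME codebase); it assumes
    the boxed answer itself contains no nested curly braces.
    """
    matches = re.findall(r"\\boxed\{([^{}]*)\}", completion)
    return matches[-1] if matches else None

# With the script above, e.g.:
#   answers = [samples and extract_boxed_answer(samples[0]) for samples in completions]
sample = "The final answer is $\\boxed{9.9}$.\n\n"
print(extract_boxed_answer(sample))  # -> 9.9
```
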
## Evaluation

Through PRIME, we achieve substantial improvements on key reasoning benchmarks over the SFT version of the model: a **16.7%** improvement on average, and over **20%** on AMC & AIME competitions. Our final model, Eurus-2-7B-PRIME, based on Qwen-2.5-Math-7B-Base, surpasses its instruct version on 5 key reasoning benchmarks.