qianxiao1111
committed on
Commit
•
6a74973
1
Parent(s):
5da0640
Revert "eval (#6)"
Browse files
This reverts commit 7f272e4a20ca97427179a37281c7a2a62e203e89.
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +0 -4
- added_tokens.json +24 -3
- config.json +28 -3
- evaluation/.gitignore +0 -190
- evaluation/README.md +0 -181
- evaluation/general_benchmarks/HumanEval/README.md +0 -74
- evaluation/general_benchmarks/HumanEval/data/humaneval-cpp +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-java +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-php +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-sh +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/data/humaneval-ts +0 -0
- evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl +0 -3
- evaluation/general_benchmarks/HumanEval/eval.sh +0 -4
- evaluation/general_benchmarks/HumanEval/eval_base_vllm.py +0 -162
- evaluation/general_benchmarks/HumanEval/eval_instruct.py +0 -168
- evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py +0 -225
- evaluation/general_benchmarks/HumanEval/eval_pal.py +0 -62
- evaluation/general_benchmarks/HumanEval/human_eval/__init__.py +0 -0
- evaluation/general_benchmarks/HumanEval/human_eval/data.py +0 -48
- evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py +0 -32
- evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py +0 -351
- evaluation/general_benchmarks/HumanEval/human_eval/execution.py +0 -817
- evaluation/general_benchmarks/HumanEval/humaneval.py +0 -217
- evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar +0 -0
- evaluation/general_benchmarks/HumanEval/test_config.yaml +0 -15
- evaluation/general_benchmarks/HumanEval/utils/dataset.py +0 -72
- evaluation/general_benchmarks/HumanEval/utils/utils.py +0 -161
- evaluation/general_benchmarks/MATH/LICENSE +0 -21
- evaluation/general_benchmarks/MATH/README.md +0 -52
- evaluation/general_benchmarks/MATH/data/aime24/test.jsonl +0 -3
.gitattributes
CHANGED
@@ -47,7 +47,3 @@ rng_state_2.pth filter=lfs diff=lfs merge=lfs -text
|
|
47 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
48 |
rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
|
49 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
50 |
-
*.csv filter=lfs diff=lfs merge=lfs -text
|
51 |
-
*.json filter=lfs diff=lfs merge=lfs -text
|
52 |
-
*.jsonl filter=lfs diff=lfs merge=lfs -text
|
53 |
-
*.sqlite filter=lfs diff=lfs merge=lfs -text
|
|
|
47 |
model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
48 |
rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
|
49 |
model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
added_tokens.json
CHANGED
@@ -1,3 +1,24 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"</tool_call>": 151658,
|
3 |
+
"<tool_call>": 151657,
|
4 |
+
"<|box_end|>": 151649,
|
5 |
+
"<|box_start|>": 151648,
|
6 |
+
"<|endoftext|>": 151643,
|
7 |
+
"<|file_sep|>": 151664,
|
8 |
+
"<|fim_middle|>": 151660,
|
9 |
+
"<|fim_pad|>": 151662,
|
10 |
+
"<|fim_prefix|>": 151659,
|
11 |
+
"<|fim_suffix|>": 151661,
|
12 |
+
"<|im_end|>": 151645,
|
13 |
+
"<|im_start|>": 151644,
|
14 |
+
"<|image_pad|>": 151655,
|
15 |
+
"<|object_ref_end|>": 151647,
|
16 |
+
"<|object_ref_start|>": 151646,
|
17 |
+
"<|quad_end|>": 151651,
|
18 |
+
"<|quad_start|>": 151650,
|
19 |
+
"<|repo_name|>": 151663,
|
20 |
+
"<|video_pad|>": 151656,
|
21 |
+
"<|vision_end|>": 151653,
|
22 |
+
"<|vision_pad|>": 151654,
|
23 |
+
"<|vision_start|>": 151652
|
24 |
+
}
|
config.json
CHANGED
@@ -1,3 +1,28 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
|
3 |
+
"architectures": [
|
4 |
+
"Qwen2ForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 151643,
|
8 |
+
"eos_token_id": 151645,
|
9 |
+
"hidden_act": "silu",
|
10 |
+
"hidden_size": 3584,
|
11 |
+
"initializer_range": 0.02,
|
12 |
+
"intermediate_size": 18944,
|
13 |
+
"max_position_embeddings": 32768,
|
14 |
+
"max_window_layers": 28,
|
15 |
+
"model_type": "qwen2",
|
16 |
+
"num_attention_heads": 28,
|
17 |
+
"num_hidden_layers": 28,
|
18 |
+
"num_key_value_heads": 4,
|
19 |
+
"rms_norm_eps": 1e-06,
|
20 |
+
"rope_theta": 1000000.0,
|
21 |
+
"sliding_window": null,
|
22 |
+
"tie_word_embeddings": false,
|
23 |
+
"torch_dtype": "bfloat16",
|
24 |
+
"transformers_version": "4.44.2",
|
25 |
+
"use_cache": false,
|
26 |
+
"use_sliding_window": false,
|
27 |
+
"vocab_size": 152064
|
28 |
+
}
|
evaluation/.gitignore
DELETED
@@ -1,190 +0,0 @@
|
|
1 |
-
|
2 |
-
# Byte-compiled / optimized / DLL files
|
3 |
-
__pycache__/
|
4 |
-
*.py[cod]
|
5 |
-
*$py.class
|
6 |
-
*.sql
|
7 |
-
*.sqlite
|
8 |
-
*.splite
|
9 |
-
*.desc
|
10 |
-
*.txt
|
11 |
-
*.DS_Store
|
12 |
-
.DS_Store
|
13 |
-
!eval_retriever/data/*.json
|
14 |
-
!eval_retriever/preds/*.json
|
15 |
-
!reject_eval/*.json
|
16 |
-
!evalset/*/*.json
|
17 |
-
!evalset/*.json
|
18 |
-
|
19 |
-
# C extensions
|
20 |
-
*.so
|
21 |
-
|
22 |
-
# Distribution / packaging
|
23 |
-
.Python
|
24 |
-
build/
|
25 |
-
develop-eggs/
|
26 |
-
dist/
|
27 |
-
downloads/
|
28 |
-
eggs/
|
29 |
-
.eggs/
|
30 |
-
lib/
|
31 |
-
lib64/
|
32 |
-
parts/
|
33 |
-
sdist/
|
34 |
-
var/
|
35 |
-
wheels/
|
36 |
-
share/python-wheels/
|
37 |
-
*.egg-info/
|
38 |
-
.installed.cfg
|
39 |
-
*.egg
|
40 |
-
MANIFEST
|
41 |
-
|
42 |
-
# PyInstaller
|
43 |
-
# Usually these files are written by a python script from a template
|
44 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
45 |
-
*.manifest
|
46 |
-
*.spec
|
47 |
-
|
48 |
-
# Installer logs
|
49 |
-
pip-log.txt
|
50 |
-
pip-delete-this-directory.txt
|
51 |
-
|
52 |
-
# Unit test / coverage reports
|
53 |
-
htmlcov/
|
54 |
-
.tox/
|
55 |
-
.nox/
|
56 |
-
.coverage
|
57 |
-
.coverage.*
|
58 |
-
.cache
|
59 |
-
nosetests.xml
|
60 |
-
coverage.xml
|
61 |
-
*.cover
|
62 |
-
*.py,cover
|
63 |
-
.hypothesis/
|
64 |
-
.pytest_cache/
|
65 |
-
cover/
|
66 |
-
|
67 |
-
# Translations
|
68 |
-
*.mo
|
69 |
-
*.pot
|
70 |
-
|
71 |
-
# Django stuff:
|
72 |
-
*.log
|
73 |
-
local_settings.py
|
74 |
-
db.sqlite3
|
75 |
-
db.sqlite3-journal
|
76 |
-
|
77 |
-
# Flask stuff:
|
78 |
-
instance/
|
79 |
-
.webassets-cache
|
80 |
-
|
81 |
-
# Scrapy stuff:
|
82 |
-
.scrapy
|
83 |
-
|
84 |
-
# Sphinx documentation
|
85 |
-
docs/_build/
|
86 |
-
|
87 |
-
# PyBuilder
|
88 |
-
.pybuilder/
|
89 |
-
target/
|
90 |
-
|
91 |
-
# Jupyter Notebook
|
92 |
-
.ipynb_checkpoints
|
93 |
-
|
94 |
-
# IPython
|
95 |
-
profile_default/
|
96 |
-
ipython_config.py
|
97 |
-
|
98 |
-
# pyenv
|
99 |
-
# For a library or package, you might want to ignore these files since the code is
|
100 |
-
# intended to run in multiple environments; otherwise, check them in:
|
101 |
-
# .python-version
|
102 |
-
|
103 |
-
# pipenv
|
104 |
-
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
105 |
-
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
106 |
-
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
107 |
-
# install all needed dependencies.
|
108 |
-
#Pipfile.lock
|
109 |
-
|
110 |
-
# poetry
|
111 |
-
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
112 |
-
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
113 |
-
# commonly ignored for libraries.
|
114 |
-
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
115 |
-
#poetry.lock
|
116 |
-
|
117 |
-
# pdm
|
118 |
-
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
119 |
-
#pdm.lock
|
120 |
-
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
121 |
-
# in version control.
|
122 |
-
# https://pdm.fming.dev/#use-with-ide
|
123 |
-
.pdm.toml
|
124 |
-
|
125 |
-
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
126 |
-
__pypackages__/
|
127 |
-
|
128 |
-
# Celery stuff
|
129 |
-
celerybeat-schedule
|
130 |
-
celerybeat.pid
|
131 |
-
|
132 |
-
# SageMath parsed files
|
133 |
-
*.sage.py
|
134 |
-
|
135 |
-
# Environments
|
136 |
-
.env
|
137 |
-
.venv
|
138 |
-
env/
|
139 |
-
venv/
|
140 |
-
ENV/
|
141 |
-
env.bak/
|
142 |
-
venv.bak/
|
143 |
-
|
144 |
-
# Spyder project settings
|
145 |
-
.spyderproject
|
146 |
-
.spyproject
|
147 |
-
|
148 |
-
# Rope project settings
|
149 |
-
.ropeproject
|
150 |
-
|
151 |
-
# mkdocs documentation
|
152 |
-
/site
|
153 |
-
|
154 |
-
# mypy
|
155 |
-
.mypy_cache/
|
156 |
-
.dmypy.json
|
157 |
-
dmypy.json
|
158 |
-
|
159 |
-
# Pyre type checker
|
160 |
-
.pyre/
|
161 |
-
|
162 |
-
# pytype static type analyzer
|
163 |
-
.pytype/
|
164 |
-
|
165 |
-
# Cython debug symbols
|
166 |
-
cython_debug/
|
167 |
-
|
168 |
-
# PyCharm
|
169 |
-
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
170 |
-
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
171 |
-
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
172 |
-
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
173 |
-
#.idea/
|
174 |
-
|
175 |
-
output/
|
176 |
-
images/
|
177 |
-
.vscode
|
178 |
-
vllm_encoder/
|
179 |
-
|
180 |
-
!table_related_benchmarks/evalset/bird_data/*.sql
|
181 |
-
!table_related_benchmarks/evalset/spider_data/*.sql
|
182 |
-
|
183 |
-
table_related_benchmarks/evalset/spider_data/test_database/*
|
184 |
-
table_related_benchmarks/evalset/bird_data/dev_databases/*
|
185 |
-
table_related_benchmarks/evalset/spider_data/dev_database/*
|
186 |
-
|
187 |
-
|
188 |
-
!table_related_benchmarks/evalset/spider_data/test_database/README.md
|
189 |
-
!table_related_benchmarks/evalset/bird_data/dev_databases/README.md
|
190 |
-
!table_related_benchmarks/evalset/spider_data/dev_database/README.md
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/README.md
DELETED
@@ -1,181 +0,0 @@
|
|
1 |
-
# Benchmarks evaluations for tablegpt
|
2 |
-
|
3 |
-
<p align="center">
|
4 |
-
<a href="#-About">🔥About</a> •
|
5 |
-
<a href="#-Usage">💻Usage</a> •
|
6 |
-
</p>
|
7 |
-
|
8 |
-
## About
|
9 |
-
|
10 |
-
</div>
|
11 |
-
|
12 |
-
This is a repo opened for evaluation on different table-related benchmarks for tablegpt.
|
13 |
-
|
14 |
-
Given the complexity of table QA tasks and the uncertainty of input instructions, we provide evaluation datasets and scripts for 7 capabilities:
|
15 |
-
|
16 |
-
- ✨Code correction based on tables
|
17 |
-
- ✨Refusal of ambiguous questions
|
18 |
-
- ✨Table & field recall in multi-table scenarios
|
19 |
-
- ✨Table QA output code executable
|
20 |
-
- ✨Table-Bench.
|
21 |
-
- ✨Text2Sql.
|
22 |
-
- ✨TableInstruct, which includes a series of table-related evaluation benchmarks.
|
23 |
-
|
24 |
-
In addition, we have integrated other general abilities benchmarks like HumanEval, MBPP and MMLU/CMMLU.
|
25 |
-
We have built an inference method based on the local model path using vLLM as the backend, and defined a set of example prompt templates for the above benchmarks.
|
26 |
-
|
27 |
-
## Usage
|
28 |
-
|
29 |
-
</div>
|
30 |
-
</details>
|
31 |
-
|
32 |
-
⏬ To use this framework, please first install the repository from GitHub:
|
33 |
-
|
34 |
-
```shell
|
35 |
-
git clone https://github.com/tablegpt/tablegpt-eval
|
36 |
-
cd tablegpt-eval
|
37 |
-
pip install -r requirements.txt
|
38 |
-
```
|
39 |
-
|
40 |
-
</div>
|
41 |
-
</details>
|
42 |
-
|
43 |
-
[!Tips]
|
44 |
-
1. You can run all the benchmarks with the default params by running command `bash run_benchmarks.sh`.
|
45 |
-
2. If you want more configuration options for running parameters, refer to the corresponding Python script for each benchmark.
|
46 |
-
3. Download the .db files before running text2sql evaluation scripts. For download URLs, refer to `table_related_benchmarks/evalset/bird_data/dev_databases/README.md` (Bird dev), `table_related_benchmarks/evalset/spider_data/dev_database/README.md` (Spider dev), and `table_related_benchmarks/evalset/spider_data/test_database/README.md` (Spider test).
|
47 |
-
|
48 |
-
|
49 |
-
### Code correction eval
|
50 |
-
|
51 |
-
We provide a non-executable eval dataset based on the Python language. Eval dataset path:
|
52 |
-
|
53 |
-
```python
|
54 |
-
table_related_benchmarks/evalset/code_correction_test/correction_set.json
|
55 |
-
```
|
56 |
-
|
57 |
-
We use the ***executable_pass_rate*** and ***absolute_match_rate*** of the corrected code in pass-1 to evaluate the model's code correction ability. You can perform code-correction evaluation by running the following Python command:
|
58 |
-
|
59 |
-
```bash
|
60 |
-
python table_related_benchmarks/run_code_correction_eval.py \
|
61 |
-
--model_path <EVAL MODEL PATH> \
|
62 |
-
--template <CHAT_TEMPLATE_NAME, support [llama3, baichuan, chatglm, None], default None> \
|
63 |
-
--eval_results_save_path <PATH TO SAVE THE EVAL RESULTS> \
|
64 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
|
65 |
-
--temperature <ONE OF THE INFERENCE PARAMETER>
|
66 |
-
```
|
67 |
-
|
68 |
-
### Ambiguous reject eval
|
69 |
-
|
70 |
-
We provide 298 table-based queries, with a ratio of about 1:3 between queries marked as ambiguous (to be rejected) and queries that should be accepted and correctly answered. Dataset path:
|
71 |
-
|
72 |
-
```python
|
73 |
-
# test queries
|
74 |
-
evalset/reject_test/test_query.json
|
75 |
-
# queries with ground truth
|
76 |
-
evalset/reject_test/ground_truth.json
|
77 |
-
```
|
78 |
-
|
79 |
-
We use **accuracy**, **recall**, and **F1 score** as metrics to evaluate the LLM's ability in this task. You can perform reject evaluation by running the following Python command:
|
80 |
-
|
81 |
-
```bash
|
82 |
-
python table_related_benchmarks/run_reject_eval.py \
|
83 |
-
--model_path <EVAL MODEL PATH> \
|
84 |
-
--save_path <LLM OUTPUT CONTENT SAVE PATH> \
|
85 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
|
86 |
-
--temperature <ONE OF THE INFERENCE PARAMETER>
|
87 |
-
```
|
88 |
-
|
89 |
-
### Table&Fields recall eval
|
90 |
-
|
91 |
-
The provided eval dataset path:
|
92 |
-
|
93 |
-
```python
|
94 |
-
table_related_benchmarks/evalset/retrieval_test/recall_set.json
|
95 |
-
```
|
96 |
-
|
97 |
-
We use a series of evaluation metrics such as **recall**, **precision**, **Jaccard similarity**, and **Hamming loss** to assess the LLM's performance in table and field retrieval tasks. You can perform recall evaluation by running the following Python command:
|
98 |
-
|
99 |
-
```bash
|
100 |
-
python table_related_benchmarks/run_recall_eval.py \
|
101 |
-
--model_path <EVAL MODEL PATH> \
|
102 |
-
--temperature <TEMPERATURE> \
|
103 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
104 |
-
```
|
105 |
-
|
106 |
-
### Table QA executable
|
107 |
-
|
108 |
-
We provide 2178 table-based queries. Eval dataset path:
|
109 |
-
|
110 |
-
```python
|
111 |
-
table_related_benchmarks/evalset/table_qa_execuate_test/tableqa_samples_with_paths.jsonl
|
112 |
-
```
|
113 |
-
|
114 |
-
We use the ***executable_pass_rate*** at pass-1 to evaluate the model's tableQA code generation ability. You can perform tableQA evaluation by running the following Python command:
|
115 |
-
|
116 |
-
```bash
|
117 |
-
python table_related_benchmarks/run_tableqa_execution_eval.py \
|
118 |
-
--model_path <EVAL MODEL PATH> \
|
119 |
-
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
120 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
121 |
-
```
|
122 |
-
|
123 |
-
### TableBench evaluation
|
124 |
-
|
125 |
-
The provided eval dataset path:
|
126 |
-
|
127 |
-
```python
|
128 |
-
table_related_benchmarks/evalset/TableBench
|
129 |
-
```
|
130 |
-
|
131 |
-
In the evaluation of TableBench, ROUGE-L is used to assess general QA questions, while pass@1 is used as the evaluation metric for visualization-type samples. You can perform TableBench evaluation by running the following command:
|
132 |
-
|
133 |
-
```bash
|
134 |
-
python table_related_benchmarks/run_table_bench_eval.py \
|
135 |
-
--model_path <EVAL MODEL PATH> \
|
136 |
-
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
137 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
138 |
-
```
|
139 |
-
|
140 |
-
### TableInstruct
|
141 |
-
|
142 |
-
The provided eval dataset path:
|
143 |
-
|
144 |
-
```python
|
145 |
-
table_related_benchmarks/evalset/TableInstruct
|
146 |
-
```
|
147 |
-
|
148 |
-
You can perform TableInstruct evaluation by the following command:
|
149 |
-
|
150 |
-
```bash
|
151 |
-
python table_related_benchmarks/run_table_instruct_eval.py \
|
152 |
-
--model_path <EVAL MODEL PATH> \
|
153 |
-
--temperature <ONE OF THE INFERENCE PARAMETER> \
|
154 |
-
--gpus_num <NUMBER OF GPU TO RUN INFERENCE>
|
155 |
-
```
|
156 |
-
|
157 |
-
### Text2Sql
|
158 |
-
```bash
|
159 |
-
python table_related_benchmarks/run_text2sql_eval.py --model_path <EVAL MODEL PATH>
|
160 |
-
```
|
161 |
-
|
162 |
-
### HumanEval
|
163 |
-
Perform HumanEval evaluation by the following command:
|
164 |
-
|
165 |
-
```bash
|
166 |
-
python general_benchmarks/HumanEval/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
|
167 |
-
```
|
168 |
-
|
169 |
-
### MBPP
|
170 |
-
Perform MBPP evaluation by the following command:
|
171 |
-
|
172 |
-
```bash
|
173 |
-
python general_benchmarks/MBPP/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
|
174 |
-
```
|
175 |
-
|
176 |
-
### MMLU & CMMLU
|
177 |
-
|
178 |
-
```bash
|
179 |
-
python general_benchmarks/MMLU/evaluator.py --task <mmlu or cmmlu> --lang <en or zh> --model_path <EVAL MODEL PATH>
|
180 |
-
```
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/README.md
DELETED
@@ -1,74 +0,0 @@
|
|
1 |
-
## 1. Introduction
|
2 |
-
|
3 |
-
We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely-used benchmarks: **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval), [HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
## 2. Setup
|
8 |
-
|
9 |
-
```
|
10 |
-
pip install accelerate
|
11 |
-
pip install attrdict
|
12 |
-
pip install transformers
|
13 |
-
pip install pytorch
|
14 |
-
```
|
15 |
-
|
16 |
-
|
17 |
-
## 3. Evaluation
|
18 |
-
|
19 |
-
We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.
|
20 |
-
|
21 |
-
Additionally, for various programming languages, the execution path may differ. Please ensure you update the appropriate paths in the **human_eval/execution.py** file accordingly.
|
22 |
-
|
23 |
-
```bash
|
24 |
-
MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
|
25 |
-
DATASET_ROOT="data/"
|
26 |
-
LANGUAGE="python"
|
27 |
-
python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
|
28 |
-
```
|
29 |
-
|
30 |
-
To evaluate the instruction-based model, please follow the script below:
|
31 |
-
```bash
|
32 |
-
LANG="python"
|
33 |
-
OUPUT_DIR="output"
|
34 |
-
MODEL="deepseek-coder-33b-instruct"
|
35 |
-
|
36 |
-
CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
|
37 |
-
--model "deepseek-ai/$MODEL" \
|
38 |
-
--output_path "$OUPUT_DIR/${LANG}.$MODEL.jsonl" \
|
39 |
-
--language $LANG \
|
40 |
-
--temp_dir $OUPUT_DIR
|
41 |
-
```
|
42 |
-
|
43 |
-
## 4. Experimental Results
|
44 |
-
|
45 |
-
We report experimental results here for 8 main-stream programming languages, **python**, **c++**, **java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we utilize this repository to obtain the performance of the models on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.
|
46 |
-
|
47 |
-
|
48 |
-
#### (1) Multilingual Base Models
|
49 |
-
|
50 |
-
| Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
|
51 |
-
|-------------------|------|--------|-------|------|------|------|------|------|------|------|
|
52 |
-
| code-cushman-001 | 12B | 33.5% | 31.9% | 30.6%| 28.9%| 31.3%| 22.1%| 11.7%| - | - |
|
53 |
-
| CodeShell | 7B | 35.4% | 32.9% | 34.2%| 31.7%| 30.2%| 38.0%| 7.0% | 33.5%| 30.4%|
|
54 |
-
| CodeGeeX2 | 6B | 36.0% | 29.2% | 25.9%| 23.6%| 20.8%| 29.7%| 6.3% | 24.8%| 24.5%|
|
55 |
-
| StarCoderBase | 16B | 31.7% | 31.1% | 28.5%| 25.4%| 34.0%| 34.8%| 8.9% | 29.8%| 28.0%|
|
56 |
-
| CodeLLama | 7B | 31.7% | 29.8% | 34.2%| 23.6%| 36.5%| 36.7%| 12.0%| 29.2%| 29.2%|
|
57 |
-
| CodeLLama | 13B | 36.0% | 37.9% | 38.0%| 34.2%| 45.2%| 43.0%| 16.5%| 32.3%| 35.4%|
|
58 |
-
| CodeLLama | 34B | 48.2% | 44.7% | 44.9%| 41.0%| 42.1%| 48.7%| 15.8%| 42.2%| 41.0%|
|
59 |
-
| | | | | | | | | | | |
|
60 |
-
| DeepSeek-Coder-Base| 1.3B | 34.8% | 31.1% | 32.3%| 24.2%| 28.9%| 36.7%| 10.1%| 28.6%| 28.3%|
|
61 |
-
| DeepSeek-Coder-Base| 5.7B | 48.7% | 45.3% | 41.1%| 39.7%| 44.7%| 41.1%| 27.8%| 42.2%| 41.3%|
|
62 |
-
| DeepSeek-Coder-Base| 6.7B | 49.4% | 50.3% | 43.0%| 38.5%| 49.7%| 50.0%| 28.5%| 48.4%| 44.7%|
|
63 |
-
| DeepSeek-Coder-Base|33B | **56.1%** | **58.4%** | **51.9%**| **44.1%**| **52.8%**| **51.3%**| **32.3%**| **55.3%**| **50.3%**|
|
64 |
-
|
65 |
-
#### (2) Instruction-Tuned Models
|
66 |
-
| Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
|
67 |
-
|---------------------|------|--------|-------|------|------|------|------|------|------|------|
|
68 |
-
| GPT-3.5-Turbo | - | 76.2% | 63.4% | 69.2%| 60.9%| 69.1%| 70.8%| 42.4%| 67.1%| 64.9%|
|
69 |
-
| GPT-4 | - | **84.1%** | **76.4%** | **81.6%**| **77.2%**| **77.4%**| **79.1%**| **58.2%**| **78.0%**| **76.5%**|
|
70 |
-
| | | | | | | | | | | |
|
71 |
-
| DeepSeek-Coder-Instruct | 1.3B | 65.2% | 45.3% | 51.9% | 45.3% | 59.7% |55.1% | 12.7% | 52.2% | 48.4% |
|
72 |
-
| DeepSeek-Coder-Instruct | 6.7B | 78.9% | 63.4% | 68.4% | 68.9%| 67.2%| 72.8%| 36.7%| 72.7%| 66.1%|
|
73 |
-
| DeepSeek-Coder-Instruct | 33B | **79.3%** | **68.9%** | **73.4%** | **72.7%**| **67.9%**| **74.1%**| **43.0%**| **73.9%**| **69.2%**|
|
74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:8717eabdf202137158c84506144b0fb1e73d5ecccbe5363ec79009ca014df629
|
3 |
-
size 388688
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d281b53b24e0f44cb76f1b1a8702b1ca668ff2a29c7621276ee8b658f5c124c6
|
3 |
-
size 448701
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:1c3fe18be10addc2d0b96311f4db192ae3232e08628d17768d889d6ab87be224
|
3 |
-
size 452021
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:226938c015f90d713a3e30d8f174f4a6d2c88820cf50512379a16890dda70332
|
3 |
-
size 289365
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:7f5dfe3ff001049b1221e9d21a8119b5cbc38eb87c97fefc5d57fa7adc1df888
|
3 |
-
size 432325
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-java
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b21b015763452ec4f9f4e0e6425148ec331ace4da1232c8b4d441186185f6265
|
3 |
-
size 454059
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:ef21bd920889c9c1ab0e87a825cc26bd895e7a715f569f8ba7de577f870b6815
|
3 |
-
size 268754
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:634e2eee8d6e22de07c121b972207683dd96be76256bf44bdfc1a3386b739287
|
3 |
-
size 297853
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:d2e91f3603f6aee63db2c2d5754d165397603a8c9bd6130842af7988b27a96fc
|
3 |
-
size 298314
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-php
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:bebd86875b1d8e65a8b7e692ed7bdf64612b44aa388825ea4dab40c7047c786b
|
3 |
-
size 388096
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:949048bff3eaea7ae47cd5759042c088a7c227d53c74fe95a80728fd5aefbf77
|
3 |
-
size 437506
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:0eae07adadbb00d51962fdb78b9e2a26bfa8ade85dc54eb57cae9bffed2f5c54
|
3 |
-
size 342974
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:c2a3268ea54d2bfb8ce65bb2c808437ac0d9934c6caf99fcf75d6b6a4fb3f911
|
3 |
-
size 311904
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4204395b835019e32d55513325a0fad01c6d382fd1eb97b516f0458d00058302
|
3 |
-
size 312806
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:083325daf55daf431ae6d6d17cf017afb18a6ec790d4c841b7c2b4752c5807ff
|
3 |
-
size 312006
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e33f562838e973b7e6b8d76dbc1fb84b076d7426629b4cdc624f12678778d2fa
|
3 |
-
size 306470
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:482b72a16613a563755d642e37792da68471399789b069fba5b1249e831445f3
|
3 |
-
size 384243
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-sh
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:2f702106e9ce4aa7385de568d9248d6c57c90382d782b162cb9e072fbd01ccf8
|
3 |
-
size 274180
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:e4418d6f532e4298969e727690cda80dc88b94e4abf836f8ff79ac18a737eaaa
|
3 |
-
size 344436
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-ts
DELETED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:4b1af1050c9f226335d54a8c80553c39454a525cace3e67bdcfcc9092ba02637
|
3 |
-
size 304732
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/eval.sh
DELETED
@@ -1,4 +0,0 @@
|
|
1 |
-
MODEL_NAME_OR_PATH="/data3/models/DeepSeek/deepseek-coder-6.7b-base"
|
2 |
-
DATASET_ROOT="HumanEval/data"
|
3 |
-
LANGUAGE="python"
|
4 |
-
CUDA_VISIBLE_DEVICES=5,6,7 python -m accelerate.commands.launch --config_file HumanEval/test_config.yaml HumanEval/eval_pal.py --model_path ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/eval_base_vllm.py
DELETED
@@ -1,162 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import time
|
4 |
-
from argparse import ArgumentParser
|
5 |
-
# from accelerate import Accelerator
|
6 |
-
# from accelerate import DistributedDataParallelKwargs
|
7 |
-
from pathlib import Path
|
8 |
-
|
9 |
-
import numpy as np
|
10 |
-
import torch
|
11 |
-
import torch.nn.functional as F
|
12 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
13 |
-
from tqdm import tqdm
|
14 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
15 |
-
from utils.dataset import HumanEvalDataset
|
16 |
-
from utils.utils import cleanup_code
|
17 |
-
from vllm import LLM, SamplingParams
|
18 |
-
|
19 |
-
|
20 |
-
class HumanEval:
|
21 |
-
"""
|
22 |
-
HumanEval evaluation class.
|
23 |
-
"""
|
24 |
-
|
25 |
-
def __init__(
|
26 |
-
self,
|
27 |
-
data_root,
|
28 |
-
language="python",
|
29 |
-
log_dir=None,
|
30 |
-
issft=False,
|
31 |
-
inference_increment=True,
|
32 |
-
n_sample=1,
|
33 |
-
k_sample=1,
|
34 |
-
):
|
35 |
-
self.data_root = data_root
|
36 |
-
self.k = k_sample
|
37 |
-
self.n_sample = n_sample
|
38 |
-
self.language = language
|
39 |
-
self.log_dir = log_dir
|
40 |
-
self.sft = issft
|
41 |
-
self.inference_increment = inference_increment
|
42 |
-
os.makedirs(self.log_dir, exist_ok=True)
|
43 |
-
|
44 |
-
@torch.no_grad()
|
45 |
-
def eval_model(self, args):
|
46 |
-
"""
|
47 |
-
Evaluate the model on HumanEval.
|
48 |
-
"""
|
49 |
-
assert (
|
50 |
-
self.log_dir is not None
|
51 |
-
), "log_dir should not be None when evaluating humaneval"
|
52 |
-
dataset = HumanEvalDataset(
|
53 |
-
self.data_root,
|
54 |
-
sample_num=self.n_sample,
|
55 |
-
language=self.language,
|
56 |
-
issft=self.sft,
|
57 |
-
)
|
58 |
-
model_name_or_path = args.model_path
|
59 |
-
print("model", model_name_or_path)
|
60 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
61 |
-
print(
|
62 |
-
"load tokenizer {} from {} over.".format(
|
63 |
-
tokenizer.__class__, model_name_or_path
|
64 |
-
)
|
65 |
-
)
|
66 |
-
|
67 |
-
llm = LLM(
|
68 |
-
model=model_name_or_path,
|
69 |
-
tensor_parallel_size=1,
|
70 |
-
max_model_len=4096,
|
71 |
-
trust_remote_code=True,
|
72 |
-
enforce_eager=True,
|
73 |
-
)
|
74 |
-
sampling_params = SamplingParams(
|
75 |
-
temperature=0,
|
76 |
-
max_tokens=1024,
|
77 |
-
top_p=0.95,
|
78 |
-
stop_token_ids=[tokenizer.eos_token_id],
|
79 |
-
)
|
80 |
-
messages_list = []
|
81 |
-
for i in range(len(dataset)):
|
82 |
-
data = dataset[i]
|
83 |
-
prompt = data["prompt"].strip()
|
84 |
-
messages_list.append(prompt)
|
85 |
-
outputs = llm.generate(messages_list, sampling_params=sampling_params)
|
86 |
-
assert len(dataset) == len(outputs), "dataset and outputs different lengths."
|
87 |
-
log_file = os.path.join(self.log_dir, f"{self.language}.json")
|
88 |
-
tmpfile = open(log_file, "w")
|
89 |
-
for i, output in enumerate(tqdm(outputs)):
|
90 |
-
data = dataset[i]
|
91 |
-
output = output.outputs[0].text
|
92 |
-
output = cleanup_code(
|
93 |
-
output,
|
94 |
-
self.language,
|
95 |
-
"humaneval",
|
96 |
-
self.sft,
|
97 |
-
dataset.stopwords,
|
98 |
-
)
|
99 |
-
# sft mode does not need original prompt
|
100 |
-
if not self.sft:
|
101 |
-
suffixprediction = data["original_prompt"] + "\n" + output
|
102 |
-
res = {
|
103 |
-
"task_id": data["task_id"],
|
104 |
-
"generation": suffixprediction,
|
105 |
-
"prompt": data["original_prompt"],
|
106 |
-
}
|
107 |
-
tmpfile.write(json.dumps(res) + "\n")
|
108 |
-
|
109 |
-
tmpfile.close()
|
110 |
-
# calculate the final score of pass@k
|
111 |
-
self._calculate_final_score(log_file)
|
112 |
-
return
|
113 |
-
|
114 |
-
def _calculate_final_score(self, logfilepath):
|
115 |
-
"""
|
116 |
-
Calculate the final score.
|
117 |
-
"""
|
118 |
-
res = evaluate_functional_correctness(
|
119 |
-
input_file=logfilepath,
|
120 |
-
problem_file=os.path.join(
|
121 |
-
self.data_root, f"humaneval-{self.language}.jsonl"
|
122 |
-
),
|
123 |
-
tmp_dir=self.log_dir,
|
124 |
-
language=self.language,
|
125 |
-
)
|
126 |
-
print("score is", res["pass@%d" % self.k])
|
127 |
-
os.remove(logfilepath)
|
128 |
-
return
|
129 |
-
|
130 |
-
|
131 |
-
if __name__ == "__main__":
|
132 |
-
parser = ArgumentParser()
|
133 |
-
parser.add_argument("--logdir", type=str, default="")
|
134 |
-
parser.add_argument(
|
135 |
-
"--model_path",
|
136 |
-
type=str,
|
137 |
-
help="model name or path",
|
138 |
-
default="/data0/pretrained-models/qwen2-7b",
|
139 |
-
)
|
140 |
-
|
141 |
-
parser.add_argument("--language", type=str, default="python")
|
142 |
-
parser.add_argument(
|
143 |
-
"--dataroot",
|
144 |
-
type=str,
|
145 |
-
default="HumanEval/data",
|
146 |
-
)
|
147 |
-
args = parser.parse_args()
|
148 |
-
|
149 |
-
logdir = args.logdir
|
150 |
-
language = args.language
|
151 |
-
|
152 |
-
if logdir == "":
|
153 |
-
logdir = "output/tmp/"
|
154 |
-
|
155 |
-
evaluator = HumanEval(
|
156 |
-
data_root=args.dataroot,
|
157 |
-
log_dir=logdir,
|
158 |
-
n_sample=1,
|
159 |
-
language=language,
|
160 |
-
)
|
161 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
162 |
-
evaluator.eval_model(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/eval_instruct.py
DELETED
@@ -1,168 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
from pathlib import Path
|
5 |
-
|
6 |
-
import torch
|
7 |
-
from tqdm import tqdm
|
8 |
-
|
9 |
-
data_abs_dir = Path(__file__).parent / "data"
|
10 |
-
|
11 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
12 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
13 |
-
from utils.utils import extract_generation_code, languge_settings
|
14 |
-
|
15 |
-
|
16 |
-
def build_deepseekcoder_instruction(languge: str, question: str):
|
17 |
-
return """
|
18 |
-
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
|
19 |
-
```{}
|
20 |
-
{}
|
21 |
-
```
|
22 |
-
""".strip().format(
|
23 |
-
languge.lower(), question.strip()
|
24 |
-
)
|
25 |
-
|
26 |
-
|
27 |
-
def generate_one(example, lang, tokenizer, model):
|
28 |
-
prompt = build_deepseekcoder_instruction(
|
29 |
-
languge_settings[lang]["full_name"], example["prompt"]
|
30 |
-
)
|
31 |
-
inputs = tokenizer.apply_chat_template(
|
32 |
-
[{"role": "user", "content": prompt}],
|
33 |
-
return_tensors="pt",
|
34 |
-
add_generation_prompt=True,
|
35 |
-
).to(model.device)
|
36 |
-
|
37 |
-
stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
|
38 |
-
assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
|
39 |
-
|
40 |
-
outputs = model.generate(
|
41 |
-
inputs,
|
42 |
-
max_new_tokens=1024,
|
43 |
-
do_sample=False,
|
44 |
-
# top_p=0.95,
|
45 |
-
# temperature=temperature,
|
46 |
-
pad_token_id=stop_id,
|
47 |
-
eos_token_id=stop_id,
|
48 |
-
)
|
49 |
-
|
50 |
-
output = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True)
|
51 |
-
example["output"] = output
|
52 |
-
|
53 |
-
return extract_generation_code(example, lang_code=lang)
|
54 |
-
|
55 |
-
|
56 |
-
def generate_main(args):
|
57 |
-
model_name_or_path = args.model
|
58 |
-
lang = args.language
|
59 |
-
saved_path = args.output_path
|
60 |
-
temp_dir = args.temp_dir
|
61 |
-
os.makedirs(temp_dir, exist_ok=True)
|
62 |
-
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
|
63 |
-
|
64 |
-
print("model", model_name_or_path)
|
65 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
66 |
-
print(
|
67 |
-
"load tokenizer {} from {} over.".format(
|
68 |
-
tokenizer.__class__, model_name_or_path
|
69 |
-
)
|
70 |
-
)
|
71 |
-
model = AutoModelForCausalLM.from_pretrained(
|
72 |
-
model_name_or_path,
|
73 |
-
torch_dtype=torch.bfloat16,
|
74 |
-
device_map="auto",
|
75 |
-
# use_flash_attention_2=True
|
76 |
-
)
|
77 |
-
model.eval()
|
78 |
-
examples = [json.loads(x) for x in open(problem_file) if x.strip()]
|
79 |
-
print("Read {} examples for evaluation over.".format(len(examples)))
|
80 |
-
|
81 |
-
generated_examples = []
|
82 |
-
for ex in tqdm(examples, desc="Generating"):
|
83 |
-
gen_example = generate_one(ex, args.language, tokenizer, model)
|
84 |
-
generated_examples.append(gen_example)
|
85 |
-
|
86 |
-
print("Generate all over!!!")
|
87 |
-
with open(saved_path, "w", encoding="utf-8") as fw:
|
88 |
-
for ex in generated_examples:
|
89 |
-
fw.write(json.dumps(ex) + "\n")
|
90 |
-
print(
|
91 |
-
"Save {} processed examples into {} over!".format(
|
92 |
-
len(generated_examples), saved_path
|
93 |
-
)
|
94 |
-
)
|
95 |
-
|
96 |
-
result = evaluate_functional_correctness(
|
97 |
-
input_file=saved_path,
|
98 |
-
tmp_dir=temp_dir,
|
99 |
-
n_workers=8,
|
100 |
-
timeout=3.0,
|
101 |
-
problem_file=problem_file,
|
102 |
-
language=lang,
|
103 |
-
)
|
104 |
-
print(lang, result, model_name_or_path)
|
105 |
-
pass
|
106 |
-
|
107 |
-
|
108 |
-
def evaluation_only(args):
|
109 |
-
lang = args.language
|
110 |
-
temp_dir = args.temp_dir
|
111 |
-
assert os.path.exists(args.output_path), "Not fond output file: {}".format(
|
112 |
-
args.output_path
|
113 |
-
)
|
114 |
-
os.makedirs(temp_dir, exist_ok=True)
|
115 |
-
|
116 |
-
output_name = os.path.basename(args.output_path)
|
117 |
-
output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
|
118 |
-
|
119 |
-
processed_examples = [
|
120 |
-
extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")
|
121 |
-
]
|
122 |
-
processed_path = os.path.join(temp_dir, output_name)
|
123 |
-
with open(processed_path, "w", encoding="utf-8") as fw:
|
124 |
-
for ex in processed_examples:
|
125 |
-
fw.write(json.dumps(ex) + "\n")
|
126 |
-
print(
|
127 |
-
"Save {} processed examples into {} over!".format(
|
128 |
-
len(processed_examples), processed_path
|
129 |
-
)
|
130 |
-
)
|
131 |
-
|
132 |
-
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
|
133 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
134 |
-
|
135 |
-
result = evaluate_functional_correctness(
|
136 |
-
input_file=processed_path,
|
137 |
-
tmp_dir=temp_dir,
|
138 |
-
n_workers=8,
|
139 |
-
timeout=3.0,
|
140 |
-
problem_file=problem_file,
|
141 |
-
language=lang,
|
142 |
-
)
|
143 |
-
print(lang, result)
|
144 |
-
|
145 |
-
|
146 |
-
if __name__ == "__main__":
|
147 |
-
parser = argparse.ArgumentParser()
|
148 |
-
parser.add_argument(
|
149 |
-
"--model",
|
150 |
-
type=str,
|
151 |
-
help="model name or path",
|
152 |
-
default="/data0/pretrained-models/deepseek-coder-6.7b-instruct",
|
153 |
-
)
|
154 |
-
parser.add_argument(
|
155 |
-
"--output_path",
|
156 |
-
type=str,
|
157 |
-
help="output path of your generation",
|
158 |
-
default="/home/qyhuang/DeepSeek-Coder/outputs/deepseek-chat.json",
|
159 |
-
)
|
160 |
-
parser.add_argument("--language", type=str, help="langauge", default="python")
|
161 |
-
parser.add_argument(
|
162 |
-
"--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
|
163 |
-
)
|
164 |
-
args = parser.parse_args()
|
165 |
-
|
166 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
167 |
-
generate_main(args)
|
168 |
-
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py
DELETED
@@ -1,225 +0,0 @@
|
|
1 |
-
import argparse
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
import shutil
|
5 |
-
from pathlib import Path
|
6 |
-
|
7 |
-
import torch
|
8 |
-
import transformers
|
9 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
10 |
-
from tqdm import tqdm
|
11 |
-
from transformers import AutoTokenizer
|
12 |
-
from utils.utils import extract_generation_code, languge_settings
|
13 |
-
from vllm import LLM, SamplingParams
|
14 |
-
|
15 |
-
data_abs_dir = Path(__file__).parent / "data"
|
16 |
-
|
17 |
-
|
18 |
-
def build_deepseekcoder_instruction(languge: str, question: str):
|
19 |
-
return """
|
20 |
-
Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
|
21 |
-
```{}
|
22 |
-
{}
|
23 |
-
```
|
24 |
-
""".strip().format(
|
25 |
-
languge.lower(), question.strip()
|
26 |
-
)
|
27 |
-
|
28 |
-
|
29 |
-
def create_dir(output_dir):
|
30 |
-
if os.path.exists(output_dir):
|
31 |
-
if not os.access(output_dir, os.W_OK):
|
32 |
-
shutil.rmtree(output_dir)
|
33 |
-
os.makedirs(output_dir)
|
34 |
-
os.chmod(output_dir, 0o777)
|
35 |
-
print("not write permission, makedir:", output_dir)
|
36 |
-
else:
|
37 |
-
print(f"{output_dir} exists!")
|
38 |
-
else:
|
39 |
-
os.makedirs(output_dir)
|
40 |
-
os.chmod(output_dir, 0o777)
|
41 |
-
print("makedir:", output_dir)
|
42 |
-
|
43 |
-
|
44 |
-
def get_client_res(messages, example, output_key, open_ai_key=False):
|
45 |
-
try:
|
46 |
-
if open_ai_key:
|
47 |
-
from openai import AzureOpenAI, OpenAI
|
48 |
-
try:
|
49 |
-
api_key = os.environ["OPENAI_API_KEY"]
|
50 |
-
except KeyError:
|
51 |
-
print("环境变量 OPENAI_API_KEY 未设置")
|
52 |
-
api_key = "default_value"
|
53 |
-
|
54 |
-
client = AzureOpenAI(
|
55 |
-
api_key=api_key,
|
56 |
-
api_version="2024-07-01-preview",
|
57 |
-
azure_endpoint="https://zju-tablegpt.openai.azure.com/",
|
58 |
-
)
|
59 |
-
chat_response = client.chat.completions.create(
|
60 |
-
model="gpt-4o",
|
61 |
-
# model="gpt-4o-mini",
|
62 |
-
messages=messages,
|
63 |
-
top_p=0.95,
|
64 |
-
temperature=0,
|
65 |
-
max_tokens=1024,
|
66 |
-
timeout=40,
|
67 |
-
)
|
68 |
-
else:
|
69 |
-
# Set OpenAI's API key and API base to use vLLM's API server.
|
70 |
-
openai_api_key = "EMPTY"
|
71 |
-
openai_api_base = "http://localhost:8080/v1"
|
72 |
-
|
73 |
-
client = OpenAI(
|
74 |
-
api_key=openai_api_key,
|
75 |
-
base_url=openai_api_base,
|
76 |
-
)
|
77 |
-
chat_response = client.chat.completions.create(
|
78 |
-
model="qwen2-7b-sft",
|
79 |
-
messages=messages,
|
80 |
-
top_p=0.3,
|
81 |
-
temperature=0.1,
|
82 |
-
max_tokens=1024,
|
83 |
-
)
|
84 |
-
example[output_key] = chat_response.choices[0].message.content
|
85 |
-
except Exception as e:
|
86 |
-
print(f"An unexpected error occurred: {e}")
|
87 |
-
example[output_key] = None
|
88 |
-
example["input"] = messages
|
89 |
-
return example
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
def generate_main(args):
|
94 |
-
model_name_or_path = args.model_path
|
95 |
-
lang = args.language
|
96 |
-
temp_dir = args.temp_dir
|
97 |
-
create_dir(temp_dir)
|
98 |
-
# os.makedirs(temp_dir, exist_ok=True)
|
99 |
-
problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
|
100 |
-
if not args.api:
|
101 |
-
print("model", model_name_or_path)
|
102 |
-
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
|
103 |
-
print(
|
104 |
-
"load tokenizer {} from {} over.".format(
|
105 |
-
tokenizer.__class__, model_name_or_path
|
106 |
-
)
|
107 |
-
)
|
108 |
-
llm_args = {
|
109 |
-
"model": model_name_or_path,
|
110 |
-
"gpu_memory_utilization": 0.95,
|
111 |
-
"trust_remote_code": True,
|
112 |
-
"tensor_parallel_size": args.gpus_num,
|
113 |
-
"dtype": "half",
|
114 |
-
"max_model_len": 8192,
|
115 |
-
"enforce_eager": True,
|
116 |
-
}
|
117 |
-
|
118 |
-
llm = LLM(**llm_args)
|
119 |
-
sampling_params = SamplingParams(
|
120 |
-
temperature=0,
|
121 |
-
max_tokens=1024,
|
122 |
-
top_p=0.95,
|
123 |
-
stop_token_ids=[tokenizer.eos_token_id],
|
124 |
-
)
|
125 |
-
|
126 |
-
examples = [json.loads(x) for x in open(problem_file) if x.strip()]
|
127 |
-
print("Read {} examples for evaluation over.".format(len(examples)))
|
128 |
-
messages_list = []
|
129 |
-
for example in tqdm(examples, desc="Generating"):
|
130 |
-
prompt = build_deepseekcoder_instruction(
|
131 |
-
languge_settings[lang]["full_name"], example["prompt"]
|
132 |
-
)
|
133 |
-
message = [{"role": "user", "content": prompt}]
|
134 |
-
if args.api:
|
135 |
-
messages_list.append(message)
|
136 |
-
else:
|
137 |
-
messages_list.append(
|
138 |
-
tokenizer.apply_chat_template(
|
139 |
-
message, tokenize=False, add_generation_prompt=True
|
140 |
-
)
|
141 |
-
)
|
142 |
-
if args.api:
|
143 |
-
from joblib import Parallel, delayed
|
144 |
-
examples_ = Parallel(n_jobs=24)(
|
145 |
-
delayed(get_client_res)(inp, examples[i], "output",open_ai_key=True)
|
146 |
-
for i, inp in enumerate(tqdm(messages_list))
|
147 |
-
)
|
148 |
-
|
149 |
-
# 请求错误的重新请求
|
150 |
-
examples = []
|
151 |
-
for example in examples_:
|
152 |
-
if example["output"] == None:
|
153 |
-
example = get_client_res(
|
154 |
-
example["input"], example, "output", open_ai_key=True
|
155 |
-
)
|
156 |
-
del example["input"]
|
157 |
-
examples.append(example)
|
158 |
-
|
159 |
-
generated_examples = []
|
160 |
-
for example in examples:
|
161 |
-
example = extract_generation_code(example, lang_code=lang)
|
162 |
-
generated_examples.append(example)
|
163 |
-
else:
|
164 |
-
outputs = llm.generate(messages_list, sampling_params=sampling_params)
|
165 |
-
generated_examples = []
|
166 |
-
for i, output in enumerate(tqdm(outputs)):
|
167 |
-
output = output.outputs[0].text
|
168 |
-
example = examples[i]
|
169 |
-
example["output"] = output
|
170 |
-
example = extract_generation_code(example, lang_code=lang)
|
171 |
-
generated_examples.append(example)
|
172 |
-
|
173 |
-
print("Generate all over!!!")
|
174 |
-
# os.makedirs(args.save_dir, exist_ok=True)
|
175 |
-
create_dir(args.save_dir)
|
176 |
-
saved_path = os.path.join(args.save_dir, "results_humaneval.json")
|
177 |
-
with open(saved_path, "w", encoding="utf-8") as fw:
|
178 |
-
for ex in generated_examples:
|
179 |
-
fw.write(json.dumps(ex) + "\n")
|
180 |
-
print(
|
181 |
-
"Save {} processed examples into {} over!".format(
|
182 |
-
len(generated_examples), saved_path
|
183 |
-
)
|
184 |
-
)
|
185 |
-
|
186 |
-
result = evaluate_functional_correctness(
|
187 |
-
input_file=saved_path,
|
188 |
-
tmp_dir=temp_dir,
|
189 |
-
n_workers=8,
|
190 |
-
timeout=3.0,
|
191 |
-
problem_file=problem_file,
|
192 |
-
language=lang,
|
193 |
-
out_path=saved_path,
|
194 |
-
)
|
195 |
-
print(lang, result, model_name_or_path)
|
196 |
-
|
197 |
-
|
198 |
-
if __name__ == "__main__":
|
199 |
-
parser = argparse.ArgumentParser()
|
200 |
-
parser.add_argument(
|
201 |
-
"--model_path",
|
202 |
-
type=str,
|
203 |
-
help="model name or path",
|
204 |
-
default="/data4/sft_output/qwen2-instruct-0709/checkpoint-1400",
|
205 |
-
)
|
206 |
-
parser.add_argument(
|
207 |
-
"--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
|
208 |
-
)
|
209 |
-
parser.add_argument(
|
210 |
-
"--save_dir",
|
211 |
-
type=str,
|
212 |
-
help="output path of your generation",
|
213 |
-
default="output",
|
214 |
-
)
|
215 |
-
parser.add_argument("--api", action="store_true", help="infer api type")
|
216 |
-
parser.add_argument("--language", type=str, help="langauge", default="python")
|
217 |
-
parser.add_argument(
|
218 |
-
"--temp_dir", type=str, help="temp dir for evaluation", default="output/tmp"
|
219 |
-
)
|
220 |
-
parser.add_argument("--seed", type=int, help="seed", default=42)
|
221 |
-
args = parser.parse_args()
|
222 |
-
|
223 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
224 |
-
transformers.set_seed(args.seed)
|
225 |
-
generate_main(args)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/eval_pal.py
DELETED
@@ -1,62 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
import subprocess
|
4 |
-
import sys
|
5 |
-
from argparse import ArgumentParser
|
6 |
-
from pathlib import Path
|
7 |
-
|
8 |
-
import numpy as np
|
9 |
-
import pandas as pd
|
10 |
-
import torch
|
11 |
-
import torch.distributed as dist
|
12 |
-
import torch.nn.functional as F
|
13 |
-
from accelerate import Accelerator, DistributedDataParallelKwargs
|
14 |
-
from humaneval import HumanEval as evaltor
|
15 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
16 |
-
|
17 |
-
if __name__ == "__main__":
|
18 |
-
kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
|
19 |
-
accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
|
20 |
-
|
21 |
-
parser = ArgumentParser()
|
22 |
-
parser.add_argument("--logdir", type=str, default="./output")
|
23 |
-
parser.add_argument(
|
24 |
-
"--model_path",
|
25 |
-
type=str,
|
26 |
-
default="/data3/models/DeepSeek/deepseek-coder-6.7b-base",
|
27 |
-
)
|
28 |
-
parser.add_argument("--language", type=str, default="python")
|
29 |
-
parser.add_argument("--dataroot", type=str, default="HumanEval/data")
|
30 |
-
args = parser.parse_args()
|
31 |
-
|
32 |
-
logdir = args.logdir
|
33 |
-
language = args.language
|
34 |
-
model_path = args.model_path
|
35 |
-
|
36 |
-
if logdir == "":
|
37 |
-
logdir = "tmp/"
|
38 |
-
tokenizer = dict(
|
39 |
-
cls=AutoTokenizer,
|
40 |
-
model_path=model_path,
|
41 |
-
)
|
42 |
-
|
43 |
-
dataroot = args.dataroot
|
44 |
-
|
45 |
-
evaluator = evaltor(
|
46 |
-
data_root=dataroot,
|
47 |
-
max_seq_len=4096,
|
48 |
-
tokenizer_cfg=tokenizer,
|
49 |
-
log_dir=logdir,
|
50 |
-
n_sample=1,
|
51 |
-
batch_size=1,
|
52 |
-
language=language,
|
53 |
-
max_gen_len=500,
|
54 |
-
)
|
55 |
-
model = AutoModelForCausalLM.from_pretrained(
|
56 |
-
model_path,
|
57 |
-
device_map=accelerator.device,
|
58 |
-
trust_remote_code=True,
|
59 |
-
torch_dtype=torch.bfloat16,
|
60 |
-
)
|
61 |
-
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
62 |
-
evaluator.eval_model(model, accelerator)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/human_eval/__init__.py
DELETED
File without changes
|
evaluation/general_benchmarks/HumanEval/human_eval/data.py
DELETED
@@ -1,48 +0,0 @@
|
|
1 |
-
import gzip
|
2 |
-
import json
|
3 |
-
import os
|
4 |
-
from typing import Dict, Iterable
|
5 |
-
|
6 |
-
ROOT = os.path.dirname(os.path.abspath(__file__))
|
7 |
-
HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
|
8 |
-
|
9 |
-
|
10 |
-
def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
|
11 |
-
return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
|
12 |
-
|
13 |
-
|
14 |
-
def stream_jsonl(filename: str) -> Iterable[Dict]:
|
15 |
-
"""
|
16 |
-
Parses each jsonl line and yields it as a dictionary
|
17 |
-
"""
|
18 |
-
if filename.endswith(".gz"):
|
19 |
-
with open(filename, "rb") as gzfp:
|
20 |
-
with gzip.open(gzfp, "rt") as fp:
|
21 |
-
for line in fp:
|
22 |
-
if any(not x.isspace() for x in line):
|
23 |
-
yield json.loads(line)
|
24 |
-
else:
|
25 |
-
with open(filename, "r", encoding="utf-8") as fp:
|
26 |
-
for line in fp:
|
27 |
-
if any(not x.isspace() for x in line):
|
28 |
-
yield json.loads(line)
|
29 |
-
|
30 |
-
|
31 |
-
def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
|
32 |
-
"""
|
33 |
-
Writes an iterable of dictionaries to jsonl
|
34 |
-
"""
|
35 |
-
if append:
|
36 |
-
mode = "ab"
|
37 |
-
else:
|
38 |
-
mode = "wb"
|
39 |
-
filename = os.path.expanduser(filename)
|
40 |
-
if filename.endswith(".gz"):
|
41 |
-
with open(filename, mode) as fp:
|
42 |
-
with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
|
43 |
-
for x in data:
|
44 |
-
gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
|
45 |
-
else:
|
46 |
-
with open(filename, mode) as fp:
|
47 |
-
for x in data:
|
48 |
-
fp.write((json.dumps(x) + "\n").encode("utf-8"))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
import sys
|
2 |
-
|
3 |
-
import fire
|
4 |
-
|
5 |
-
from .data import HUMAN_EVAL
|
6 |
-
from .evaluation import evaluate_functional_correctness
|
7 |
-
|
8 |
-
|
9 |
-
def entry_point(
|
10 |
-
sample_file: str,
|
11 |
-
k: str = "1,10,100",
|
12 |
-
n_workers: int = 4,
|
13 |
-
timeout: float = 3.0,
|
14 |
-
problem_file: str = "",
|
15 |
-
is_mbpp: bool = False,
|
16 |
-
):
|
17 |
-
"""
|
18 |
-
Evaluates the functional correctness of generated samples, and writes
|
19 |
-
results to f"{sample_file}_results.jsonl.gz"
|
20 |
-
"""
|
21 |
-
k = list(map(int, k.split(",")))
|
22 |
-
results = evaluate_functional_correctness(
|
23 |
-
sample_file, k, n_workers, timeout, problem_file, is_mbpp
|
24 |
-
)
|
25 |
-
print(results)
|
26 |
-
|
27 |
-
|
28 |
-
def main():
|
29 |
-
fire.Fire(entry_point)
|
30 |
-
|
31 |
-
|
32 |
-
sys.exit(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py
DELETED
@@ -1,351 +0,0 @@
|
|
1 |
-
import gzip
|
2 |
-
import itertools
|
3 |
-
import json
|
4 |
-
import os
|
5 |
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
6 |
-
from typing import *
|
7 |
-
|
8 |
-
import numpy as np
|
9 |
-
from tqdm.auto import tqdm
|
10 |
-
|
11 |
-
from human_eval.data import stream_jsonl
|
12 |
-
from human_eval.execution import check_correctness
|
13 |
-
|
14 |
-
IMPORT_HELPER = {
|
15 |
-
"python": [
|
16 |
-
"import math",
|
17 |
-
"import re",
|
18 |
-
"import sys",
|
19 |
-
"import copy",
|
20 |
-
"import datetime",
|
21 |
-
"import itertools",
|
22 |
-
"import collections",
|
23 |
-
"import heapq",
|
24 |
-
"import functools",
|
25 |
-
"import hashlib",
|
26 |
-
"import numpy",
|
27 |
-
"import numpy as np",
|
28 |
-
"import string",
|
29 |
-
"from typing import *",
|
30 |
-
"from collections import *",
|
31 |
-
],
|
32 |
-
"go": [
|
33 |
-
"math",
|
34 |
-
"strings",
|
35 |
-
"fmt",
|
36 |
-
"strconv",
|
37 |
-
"time",
|
38 |
-
"bytes",
|
39 |
-
"regexp",
|
40 |
-
"sort",
|
41 |
-
"math/rand",
|
42 |
-
"crypto/md5",
|
43 |
-
],
|
44 |
-
"cpp": [
|
45 |
-
"#include<stdlib.h>",
|
46 |
-
"#include<algorithm>",
|
47 |
-
"#include<math.h>",
|
48 |
-
"#include<stdio.h>",
|
49 |
-
"#include<vector>",
|
50 |
-
"#include<string>",
|
51 |
-
"#include<climits>",
|
52 |
-
"#include<cstring>",
|
53 |
-
"#include<iostream>",
|
54 |
-
"#include<cassert>",
|
55 |
-
],
|
56 |
-
"cs": [
|
57 |
-
"using System.Numerics;",
|
58 |
-
"using System.Diagnostics;",
|
59 |
-
"using System.Collections.Generic;",
|
60 |
-
"using System.Linq;",
|
61 |
-
"using System.Text;",
|
62 |
-
"using System.Security.Cryptography;",
|
63 |
-
"using System.Collections.Generic;",
|
64 |
-
],
|
65 |
-
}
|
66 |
-
|
67 |
-
|
68 |
-
LANGUAGE_NAME = {
|
69 |
-
"cpp": "CPP",
|
70 |
-
"go": "Go",
|
71 |
-
"java": "Java",
|
72 |
-
"js": "JavaScript",
|
73 |
-
"python": "Python",
|
74 |
-
}
|
75 |
-
|
76 |
-
|
77 |
-
def read_dataset(
|
78 |
-
data_file: str = None,
|
79 |
-
dataset_type: str = "humaneval",
|
80 |
-
num_shot=None,
|
81 |
-
) -> Dict:
|
82 |
-
"""
|
83 |
-
Reads a dataset and returns a dictionary of tasks.
|
84 |
-
"""
|
85 |
-
if num_shot is not None:
|
86 |
-
print(f"{num_shot}-shot setting...")
|
87 |
-
if "humaneval" in dataset_type.lower():
|
88 |
-
if data_file is None:
|
89 |
-
current_path = os.path.dirname(os.path.abspath(__file__))
|
90 |
-
data_file = os.path.join(
|
91 |
-
current_path,
|
92 |
-
"..",
|
93 |
-
"humaneval-x",
|
94 |
-
"python",
|
95 |
-
"data",
|
96 |
-
"humaneval_python.jsonl.gz",
|
97 |
-
)
|
98 |
-
dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
|
99 |
-
else:
|
100 |
-
raise f"Dataset: {dataset_type} not supported."
|
101 |
-
|
102 |
-
return dataset
|
103 |
-
|
104 |
-
|
105 |
-
def estimate_pass_at_k(
|
106 |
-
num_samples: Union[int, List[int], np.ndarray],
|
107 |
-
num_correct: Union[List[int], np.ndarray],
|
108 |
-
k: int,
|
109 |
-
) -> np.ndarray:
|
110 |
-
"""
|
111 |
-
Estimates pass@k of each problem and returns them in an array.
|
112 |
-
"""
|
113 |
-
|
114 |
-
def estimator(n: int, c: int, k: int) -> float:
|
115 |
-
"""
|
116 |
-
Calculates 1 - comb(n - c, k) / comb(n, k).
|
117 |
-
"""
|
118 |
-
if n - c < k:
|
119 |
-
return 1.0
|
120 |
-
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
|
121 |
-
|
122 |
-
if isinstance(num_samples, int):
|
123 |
-
num_samples_it = itertools.repeat(num_samples, len(num_correct))
|
124 |
-
else:
|
125 |
-
assert len(num_samples) == len(num_correct)
|
126 |
-
num_samples_it = iter(num_samples)
|
127 |
-
|
128 |
-
return np.array(
|
129 |
-
[estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
|
130 |
-
)
|
131 |
-
|
132 |
-
|
133 |
-
def process_humaneval_test(
    sample, problems, example_test=False, is_mbpp=False, language="python"
):
    """
    Processes a sample for evaluation by assembling the program text to run.

    Args:
        sample: Dict with at least "task_id" and "generation"; "prompt" is
            also required unless ``is_mbpp`` is true.
        problems: Dict mapping task ids to problem records ("test", and
            optionally "example_test", "import", "test_setup", "declaration").
        example_test: Prefer the problem's non-empty "example_test" over "test".
        is_mbpp: When true, return the generation followed by the problem's
            "test" entries joined by newlines, ignoring ``language``.
        language: Target language key selecting how the pieces are combined.

    Returns:
        The assembled test program as a string, or ``None`` when ``language``
        is not one of the handled keys. (The original raised
        ``UnboundLocalError`` here although its callers check for ``None``.)
    """
    task_id = sample["task_id"]
    if is_mbpp:
        return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])

    prompt = sample["prompt"]
    if (
        example_test
        and "example_test" in problems[task_id]
        and problems[task_id]["example_test"] != ""
    ):
        test = problems[task_id]["example_test"]
    else:
        test = problems[task_id]["test"]
    code = sample["generation"]

    # Unsupported languages fall through every branch and yield None.
    test_string = None

    # Pre-process for different languages
    if language == "python":
        test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
        test_string = test_setup + code + "\n" + test + "\n"
    elif language == "cpp":
        # Only add the helper includes the prompt does not already contain.
        test_set_up = ""
        for s in IMPORT_HELPER["cpp"]:
            if s not in prompt:
                test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language == "java":
        test_string = code + "\n" + test
    elif language == "cs":
        test_set_up = ""
        for s in IMPORT_HELPER["cs"]:
            test_set_up += s + "\n"
        test_string = test_set_up + "\n" + code + "\n" + test
    elif language in ["js", "javascript", "ts", "sh", "go"]:
        test_string = code + "\n" + test
    elif language == "go232":
        import_string = problems[task_id]["import"]
        prompt = prompt.replace(import_string, "")
        # This branch re-selects the test without the empty-string check above.
        if example_test and "example_test" in problems[task_id]:
            test = problems[task_id]["example_test"]
        else:
            test = problems[task_id]["test"]
        test_setup = problems[task_id]["test_setup"]
        # Import helper packages the setup lacks but the generation references
        # as "<pkg>.".
        other_pkgs = []
        for pkg in IMPORT_HELPER["go"]:
            if pkg not in test_setup:
                p = pkg.split("/")[-1]
                if p + "." in code:
                    other_pkgs.append(f'"{pkg}"')
        if other_pkgs:
            import_other_pkgs = (
                "import (\n" + "    ".join([p + "\n" for p in other_pkgs]) + ")"
            )
            test_string = (
                test_setup
                + "\n"
                + import_other_pkgs
                + "\n"
                + prompt
                + code
                + "\n"
                + test
            )
        else:
            test_string = test_setup + "\n" + prompt + code + "\n" + test
    elif language == "rust":
        main = "\nfn main(){ \n } \n"
        declaration = problems[task_id]["declaration"]
        test_string = main + declaration + prompt + code + test
    elif language == "php":
        if code[:5] != "<?php":
            code = "<?php\n" + code
        test_string = code + "\n" + test + "?>"
    return test_string
|
212 |
-
|
213 |
-
|
214 |
-
def stream_jsonl_all(filename: str) -> Iterable[Dict]:
    """
    Reads a whole JSONL file (gzip-compressed when the name ends in ".gz").

    Args:
        filename: Path to the JSONL file.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.
    """
    # gzip.open on the path (not on a pre-opened file object) lets the context
    # manager close everything; the original left the inner binary file open.
    opener = gzip.open(filename, "rt") if filename.endswith(".gz") else open(filename, "r")
    with opener as fp:
        # Whitespace-only lines are skipped.
        return [json.loads(line) for line in fp if line.strip()]
|
229 |
-
|
230 |
-
|
231 |
-
def evaluate_functional_correctness(
    input_file: str = None,
    tmp_dir: str = "./",
    n_workers: int = 32,
    timeout: float = 10.0,
    problem_file: str = "../data/humaneval_python.jsonl.gz",
    out_path: str = None,
    k: List[int] = [1, 10, 100],
    test_groundtruth: bool = False,
    example_test: bool = False,
    is_mbpp: bool = False,
    language: str = "python",
):
    """
    Evaluates the functional correctness of a model.

    Args:
        input_file: JSONL file of generated samples; not read when
            ``test_groundtruth`` is true.
        tmp_dir: Base directory; runs use ``<tmp_dir>/<lang>/evaluation``.
        n_workers: Thread pool size used to dispatch ``check_correctness``.
        timeout: Per-sample timeout passed to ``check_correctness``.
        problem_file: Problem definitions passed to ``read_dataset``.
        out_path: When set, the per-task results are dumped there as JSON.
        k: The k values for pass@k (never mutated here).
        test_groundtruth: Evaluate each problem's "canonical_solution"
            instead of the samples in ``input_file``.
        example_test: Prefer each problem's "example_test" over "test".
        is_mbpp: Treat samples as MBPP-style (forces language "python").
        language: Language of the generated samples.

    Returns:
        Dict of ``"pass@k"`` means for each k that every task has enough
        samples for. Empty when the samples do not cover every problem, in
        which case only totals are printed. (The original raised
        ``UnboundLocalError`` at ``return`` in that case.)

    Note:
        Results are keyed by task id, so only one result per task is kept.
    """
    if example_test:
        print("Example test...")

    problems = read_dataset(problem_file, dataset_type="humaneval")

    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        futures = []
        completion_id = Counter()
        results = {}

        if test_groundtruth:
            print("Testing ground truth...")
            for sample in tqdm(problems.values()):
                task_id = sample["task_id"]
                lang = task_id.split("/")[0].lower()
                if lang == "javascript":
                    lang = "js"
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["generation"] = sample["canonical_solution"]
                # Keyword arguments: the original passed `language` positionally
                # into the `is_mbpp` slot.
                sample["test_code"] = process_humaneval_test(
                    sample,
                    problems,
                    example_test,
                    is_mbpp=is_mbpp,
                    language=language,
                )
                if sample["test_code"] is None:
                    continue
                args = (
                    task_id,
                    sample,
                    lang,
                    timeout,
                    tmp_dir_,
                    completion_id[task_id],
                )
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1
        else:
            print("Reading samples...")
            # Only read generated samples when they are actually evaluated;
            # ground-truth runs may leave input_file as None.
            sample_jsonl = stream_jsonl_all(input_file)
            for sample in tqdm(sample_jsonl):
                task_id = sample["task_id"]
                if is_mbpp:
                    lang = "python"
                else:
                    lang = "js" if language == "javascript" else language
                tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
                sample["test_code"] = process_humaneval_test(
                    sample, problems, example_test, is_mbpp, language
                )
                if sample["test_code"] is None:
                    continue
                if "completion_id" in sample:
                    completion_id_ = sample["completion_id"]
                else:
                    completion_id_ = completion_id[task_id]
                args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
                futures.append(executor.submit(check_correctness, *args))
                completion_id[task_id] += 1

        # pass@k is only reported when every problem got at least one sample.
        evaluate_pass_at_k = len(completion_id) == len(problems)

        print("Running test suites...")
        for future in tqdm(as_completed(futures), total=len(futures)):
            result = future.result()
            results[result["task_id"]] = result

    # Calculate pass@k.
    total, correct = [], []
    for result in results.values():
        passed = [result["passed"]]
        total.append(len(passed))
        correct.append(sum(passed))
    total = np.array(total)
    correct = np.array(correct)

    pass_at_k = {}
    if evaluate_pass_at_k:
        pass_at_k = {
            f"pass@{cutoff}": estimate_pass_at_k(total, correct, cutoff).mean()
            for cutoff in k
            if (total >= cutoff).all()
        }
        print(pass_at_k)
    else:
        print("Total:", np.sum(total))
        print("Correct:", np.sum(correct))

    if out_path:
        with open(out_path, "w") as f:
            json.dump(list(results.values()), f, ensure_ascii=False)

    return pass_at_k
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/human_eval/execution.py
DELETED
@@ -1,817 +0,0 @@
|
|
1 |
-
import contextlib
|
2 |
-
import faulthandler
|
3 |
-
import gzip
|
4 |
-
import io
|
5 |
-
import json
|
6 |
-
import multiprocessing
|
7 |
-
import os
|
8 |
-
import platform
|
9 |
-
import random
|
10 |
-
import signal
|
11 |
-
import subprocess
|
12 |
-
import tempfile
|
13 |
-
import traceback
|
14 |
-
from typing import *
|
15 |
-
|
16 |
-
# Optional prefixes prepended to the tool names when commands are built
# (e.g. f"{go_exec}go", f"{node_exec}node"); empty means "use the bare name".
java_exec = node_exec = tsc_exec = go_exec = php_exec = cs_exec = ""
|
22 |
-
|
23 |
-
|
24 |
-
def _decode_output(raw):
    """Decode captured subprocess bytes; return the raw value if decoding fails."""
    try:
        return raw.decode()
    except (AttributeError, UnicodeDecodeError):
        return raw


def _failure_text(proc):
    """Return the process's stderr when non-empty, otherwise its stdout."""
    return _decode_output(proc.stderr if proc.stderr else proc.stdout)


def _status_from_returncode(proc):
    """Map a finished process to "passed" (exit code 0) or "failed: <output>"."""
    if proc.returncode == 0:
        return "passed"
    return f"failed: {_failure_text(proc)}"


def _write_text(path, text):
    """Write text to path and close the file before any tool reads it."""
    with open(path, "w") as handle:
        handle.write(text)


def _run_limited(cmd, timeout, **kwargs):
    """Run cmd with captured output under both time_limit and subprocess's timeout."""
    with time_limit(timeout):
        return subprocess.run(cmd, timeout=timeout, capture_output=True, **kwargs)


@contextlib.contextmanager
def _scratch_dir(tmp_dir, task_id):
    """Create a unique working dir below tmp_dir, chdir into it, always clean up."""
    import os
    import shutil

    base = tmp_dir if "tmp" in tmp_dir else os.path.join(tmp_dir, "tmp")
    os.makedirs(base, exist_ok=True)
    # mkdtemp guarantees uniqueness; the original random suffix could collide.
    work_dir = tempfile.mkdtemp(prefix=f"{task_id.replace('/', '-')}-", dir=base)
    origin_path = os.getcwd()
    os.chdir(work_dir)
    try:
        yield work_dir
    finally:
        os.chdir(origin_path)
        shutil.rmtree(work_dir, ignore_errors=True)


def _exec_python(sample, timeout):
    """Exec the Python test program in-process inside a throwaway directory."""
    with create_tempdir():
        # These system calls are needed when cleaning up tempdir.
        import os
        import shutil

        saved_rmtree = shutil.rmtree
        saved_rmdir = os.rmdir
        saved_chdir = os.chdir

        # Disable functionalities that can make destructive changes to the test.
        reliability_guard()

        try:
            exec_globals = {}
            with swallow_io():
                with time_limit(timeout):
                    # WARNING
                    # This executes untrusted model-generated code. Users are
                    # strongly encouraged to sandbox this evaluation suite so
                    # that it cannot act destructively on their host or network.
                    exec(sample["test_code"], exec_globals)
            status = "passed"
        except TimeoutException:
            status = "timed out"
        except AssertionError:
            status = "failed: AssertionError"
        except BaseException as e:
            status = f"failed: {e}"
        finally:
            # Needed for cleaning up.
            shutil.rmtree = saved_rmtree
            os.rmdir = saved_rmdir
            os.chdir = saved_chdir
        return status


def _exec_go(task_id, sample, timeout, tmp_dir):
    """Run the Go test program with `go test`."""
    assert (
        tmp_dir is not None
    ), "Go should be evaluated in a dir where necessary module files installed."
    with _scratch_dir(tmp_dir, task_id):
        _write_text("main_test.go", sample["test_code"])
        try:
            proc = _run_limited(
                [f"{go_exec}go", "test", f"-timeout={timeout}s", "main_test.go"],
                timeout,
            )
        except (TimeoutException, subprocess.TimeoutExpired):
            return "timed out"
        return _status_from_returncode(proc)


def _exec_js(task_id, sample, timeout, tmp_dir):
    """Run the JavaScript test program with node; any output counts as failure."""
    with _scratch_dir(tmp_dir, task_id):
        _write_text("test.js", sample["test_code"])
        try:
            proc = _run_limited([f"{node_exec}node", "test.js"], timeout)
        except (TimeoutException, subprocess.TimeoutExpired):
            return "timed out"
        for stream in (proc.stderr, proc.stdout):
            text = _decode_output(stream)
            if text:
                return f"failed: {text}"
        return "passed"


def _exec_cpp(task_id, sample, timeout, tmp_dir):
    """Compile the C++ test program with g++ and run the resulting binary."""
    with _scratch_dir(tmp_dir, task_id):
        _write_text("test.cpp", sample["test_code"])
        compile_cmd = ["/usr/bin/g++", "-std=c++17", "test.cpp"]
        if "162" in task_id:
            # Task ids containing "162" are linked against OpenSSL.
            compile_cmd += ["-lcrypto", "-lssl"]
        try:
            compiled = subprocess.run(compile_cmd, timeout=timeout, capture_output=True)
        except subprocess.TimeoutExpired:
            return "timed out"
        if compiled.returncode != 0:
            return f"failed: compilation error: {_failure_text(compiled)}"
        try:
            proc = _run_limited(["./a.out"], timeout)
        except (TimeoutException, subprocess.TimeoutExpired):
            return "timed out"
        return _status_from_returncode(proc)


def _exec_shell_command(task_id, sample, timeout, tmp_dir, file_name, cmd):
    """Write the test program to file_name and run the shell command cmd."""
    with _scratch_dir(tmp_dir, task_id):
        _write_text(file_name, sample["test_code"])
        try:
            proc = _run_limited(cmd, timeout, shell=True)
        except (TimeoutException, subprocess.TimeoutExpired):
            return "timed out"
        return _status_from_returncode(proc)


def _compile_and_run_ts(compile_cmd, env, timeout):
    """Compile test.ts with the given tsc command, then run test.js with node."""
    try:
        compiled = subprocess.run(
            compile_cmd, timeout=timeout, capture_output=True, env=env, shell=True
        )
    except subprocess.TimeoutExpired:
        return "timed out"
    if compiled.returncode != 0:
        return f"failed: compilation error: {_failure_text(compiled)}"
    try:
        proc = _run_limited([f"{node_exec}node", "test.js"], timeout)
    except (TimeoutException, subprocess.TimeoutExpired):
        return "timed out"
    return _status_from_returncode(proc)


def _exec_ts(task_id, sample, timeout, tmp_dir):
    """Run the TypeScript test; retry with default tsc options if it fails."""
    import os

    with _scratch_dir(tmp_dir, task_id):
        env = {"PATH": f"{node_exec}:" + os.environ["PATH"]}
        _write_text("test.ts", sample["test_code"])
        status = _compile_and_run_ts(
            f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM", env, timeout
        )
        if status != "passed":
            # The second attempt's outcome replaces the first one.
            status = _compile_and_run_ts(f"{tsc_exec}tsc test.ts", env, timeout)
        return status


def _exec_cs(task_id, sample, timeout, tmp_dir):
    """Compile the C# test with mcs and run it under mono."""
    with _scratch_dir(tmp_dir, task_id):
        _write_text("Program.cs", sample["test_code"])
        compiled = subprocess.run(
            f"{cs_exec}mcs -d:DEBUG Program.cs", shell=True, capture_output=True
        )
        if compiled.returncode != 0:
            return f"failed: compilation error: {_failure_text(compiled)}"
        try:
            proc = _run_limited(
                f"{cs_exec}mono Program.exe",
                timeout,
                shell=True,
                env=dict(MONO_TRACE_LISTENER="Console.Error"),
            )
            # Success is decided by the absence of "Fail" on stderr, not by
            # the exit code.
            if "Fail" not in proc.stderr.decode():
                return "passed"
            return f"failed: {_failure_text(proc)}"
        except (TimeoutException, subprocess.TimeoutExpired):
            return "timed out"
        except Exception as e:
            return f"failed: {e}"


def _exec_rust(sample):
    """Write the Rust test into ./rust/src/bin next to this file and run cargo."""
    import os

    wd = os.path.dirname(os.path.abspath(__file__))
    rust_dir = os.path.join(wd, "rust")
    rust_src = os.path.join(rust_dir, "src")
    rust_bin = os.path.join(rust_src, "bin")
    rust_tmp_dir = os.path.join(rust_dir, "tmp")
    rust_logs = os.path.join(rust_tmp_dir, "logs")

    # Create mandatory tmp directories
    for directory in (rust_tmp_dir, rust_logs, rust_src, rust_bin):
        os.makedirs(directory, exist_ok=True)

    file_prefix = sample["task_id"].lower().replace("/", "_")
    with open(os.path.join(rust_bin, file_prefix + ".rs"), "wb") as handle:
        handle.write(sample["test_code"].encode("utf-8"))

    # Cargo must run from the Rust module root dir.
    os.chdir(rust_dir)

    # Start from a fresh log; both cargo commands append to it.
    log_path = os.path.join(rust_logs, file_prefix + ".jsonl")
    if os.path.exists(log_path):
        os.remove(log_path)

    tail = " --message-format json >> " + log_path
    if os.system("cargo check --bin " + file_prefix + tail) != 0:
        return "failed: compilation error"
    if os.system("cargo test --bin " + file_prefix + tail) != 0:
        return "failed: execution error"
    return "passed"


def _exec_java(task_id, sample, timeout, tmp_dir):
    """Compile Problem.java against javatuples and run it with assertions on."""
    assert tmp_dir is not None, "Java should be evaluated in a temporary dir."

    import os
    import shutil

    # The jar is looked up in the caller's working directory, before chdir.
    jar = os.path.abspath("javatuples-1.2.jar")
    with _scratch_dir(tmp_dir, task_id) as work_dir:
        if os.path.isfile(jar):
            shutil.copy(jar, work_dir)
        _write_text("Problem.java", sample["test_code"])

        # Retry compilation up to five times if javac itself times out.
        compile_returncode = -1
        for _ in range(5):
            try:
                compiled = subprocess.run(
                    f"{java_exec}javac -cp javatuples-1.2.jar Problem.java",
                    timeout=60,
                    capture_output=True,
                    shell=True,
                )
                compile_returncode = compiled.returncode
                break
            except subprocess.TimeoutExpired:
                continue
        if compile_returncode != 0:
            return "failed: compilation error"

        res = "failed: unknown error"
        try:
            proc = subprocess.run(
                f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem",
                timeout=timeout,
                capture_output=True,
                shell=True,
            )
            if proc.returncode == 0:
                res = "passed"
            elif proc.returncode == 1:
                if "AssertionError" in proc.stderr.decode("unicode-escape"):
                    res = "failed: wrong answer"
                else:
                    res = f"failed: {proc.stderr.decode()}"
        except subprocess.TimeoutExpired:
            # Normalized from "time out" to match every other branch.
            res = "timed out"
        except BaseException as e:
            res = f"failed: {e}"
        return res


def check_correctness(
    task_id: str,
    sample: dict,
    language_type: str,
    timeout: float = 3.0,
    tmp_dir: str = None,
    completion_id: Optional[int] = None,
) -> Dict:
    """
    Evaluates the functional correctness of a completion by running the test
    suite provided in the problem.

    WARNING: this executes untrusted model-generated code. Users are strongly
    encouraged to sandbox this evaluation suite so that it does not perform
    destructive actions on their host or network.

    Args:
        task_id: Identifier of the problem; also used to name scratch dirs.
        sample: Dict with "test_code" and "prompt"; "finish" is optional, and
            "task_id" is read by the Rust runner.
        language_type: Language name; the runner is chosen by substring match
            in this order: python, go, js, cpp, php, sh, ts, cs, rust, java.
        timeout: Seconds allowed for the test program.
        tmp_dir: Base directory for scratch dirs (required by all
            non-Python, non-Rust runners).
        completion_id: Opaque id copied into the result.

    Returns:
        Dict with "task_id", "completion_id", "result" ("passed",
        "timed out" or "failed: ..."), "passed", "finish", "test_code" and
        "prompt".
    """

    def unsafe_execute(tmp_dir):
        lang = language_type.lower()
        # Order matters: it mirrors the original if/elif substring chain.
        runners = (
            ("python", lambda: _exec_python(sample, timeout)),
            ("go", lambda: _exec_go(task_id, sample, timeout, tmp_dir)),
            ("js", lambda: _exec_js(task_id, sample, timeout, tmp_dir)),
            ("cpp", lambda: _exec_cpp(task_id, sample, timeout, tmp_dir)),
            (
                "php",
                lambda: _exec_shell_command(
                    task_id, sample, timeout, tmp_dir,
                    "test.php", f"{php_exec}php -f test.php",
                ),
            ),
            (
                "sh",
                lambda: _exec_shell_command(
                    task_id, sample, timeout, tmp_dir,
                    "test.sh", "/bin/bash test.sh",
                ),
            ),
            ("ts", lambda: _exec_ts(task_id, sample, timeout, tmp_dir)),
            ("cs", lambda: _exec_cs(task_id, sample, timeout, tmp_dir)),
            ("rust", lambda: _exec_rust(sample)),
            ("java", lambda: _exec_java(task_id, sample, timeout, tmp_dir)),
        )
        for marker, runner in runners:
            if marker in lang:
                result.append(runner())
                return
        # Unknown language: nothing is recorded, reported as "timed out" below.

    # The manager is shut down deterministically; the original leaked one
    # manager process per call.
    with multiprocessing.Manager() as manager:
        result = manager.list()

        p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
        p.start()
        p.join(timeout=timeout + 1)
        if p.is_alive():
            p.kill()
            p.join()

        # Copy out of the proxy before the manager goes away.
        outcomes = list(result)

    if not outcomes:
        outcomes.append("timed out")

    return {
        "task_id": task_id,
        "completion_id": completion_id,
        "result": outcomes[0],
        "passed": outcomes[0] == "passed",
        "finish": -1 if "finish" not in sample else sample["finish"],
        "test_code": sample["test_code"],
        "prompt": sample["prompt"],
    }
|
641 |
-
|
642 |
-
|
643 |
-
# Copyright (c) OpenAI (https://openai.com)
|
644 |
-
|
645 |
-
# Permission is hereby granted, free of charge, to any person obtaining a copy
|
646 |
-
# of this software and associated documentation files (the "Software"), to deal
|
647 |
-
# in the Software without restriction, including without limitation the rights
|
648 |
-
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
649 |
-
# copies of the Software, and to permit persons to whom the Software is
|
650 |
-
# furnished to do so, subject to the following conditions:
|
651 |
-
|
652 |
-
# The above copyright notice and this permission notice shall be included in
|
653 |
-
# all copies or substantial portions of the Software.
|
654 |
-
|
655 |
-
|
656 |
-
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
657 |
-
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
658 |
-
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
659 |
-
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
660 |
-
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
661 |
-
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
662 |
-
# THE SOFTWARE.
|
663 |
-
# ============================================================================
|
664 |
-
@contextlib.contextmanager
def time_limit(seconds: float):
    """Context manager that interrupts the wrapped block after ``seconds``.

    A real-time interval timer (``ITIMER_REAL``) is armed for ``seconds``;
    when it fires, the SIGALRM handler raises ``TimeoutException`` inside the
    wrapped block. On exit the timer is always disarmed and the SIGALRM
    handler that was active before entry is put back.

    Args:
        seconds: timeout in (possibly fractional) seconds.

    Raises:
        TimeoutException: inside the ``with`` body when the timer expires.
    """

    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")

    # Install the handler BEFORE arming the timer: with a very small timeout
    # the alarm could otherwise fire while the default SIGALRM action (process
    # termination) is still in effect.
    previous_handler = signal.signal(signal.SIGALRM, signal_handler)
    signal.setitimer(signal.ITIMER_REAL, seconds)
    try:
        yield
    finally:
        signal.setitimer(signal.ITIMER_REAL, 0)
        # signal.signal() returns None when the old handler was not installed
        # from Python; such a handler cannot be re-installed from here.
        if previous_handler is not None:
            signal.signal(signal.SIGALRM, previous_handler)
|
675 |
-
|
676 |
-
|
677 |
-
@contextlib.contextmanager
def swallow_io():
    """Redirect stdout, stderr and stdin to one ``WriteOnlyStringIO`` for the
    duration of the ``with`` block."""
    sink = WriteOnlyStringIO()
    with contextlib.redirect_stdout(sink), contextlib.redirect_stderr(
        sink
    ), redirect_stdin(sink):
        yield
|
684 |
-
|
685 |
-
|
686 |
-
@contextlib.contextmanager
def create_tempdir():
    """Create a temporary directory, make it the working directory for the
    ``with`` block, and yield its path."""
    with tempfile.TemporaryDirectory() as dirname, chdir(dirname):
        yield dirname
|
691 |
-
|
692 |
-
|
693 |
-
class TimeoutException(Exception):
    """Raised by the SIGALRM handler of ``time_limit`` when the time budget is exhausted."""
|
695 |
-
|
696 |
-
|
697 |
-
class WriteOnlyStringIO(io.StringIO):
    """A ``StringIO`` that accepts writes but raises ``IOError`` on every read call."""

    def read(self, *args, **kwargs):
        # Reading is never allowed, whatever arguments are passed.
        raise IOError

    def readline(self, *args, **kwargs):
        raise IOError

    def readlines(self, *args, **kwargs):
        raise IOError

    def readable(self, *args, **kwargs):
        """Report that this stream cannot be read: always ``False``."""
        return False
|
712 |
-
|
713 |
-
|
714 |
-
class redirect_stdin(contextlib._RedirectStream):  # type: ignore
    """Context manager that swaps ``sys.stdin`` for the given object while active.

    Built on the same private ``contextlib._RedirectStream`` base that
    ``contextlib.redirect_stdout`` uses; only the stream name differs.
    """

    _stream = "stdin"
|
716 |
-
|
717 |
-
|
718 |
-
@contextlib.contextmanager
def chdir(root):
    """Run the ``with`` block with ``root`` as the working directory.

    ``"."`` is a no-op. For any other path the previous working directory is
    restored on exit, whether the block finishes normally or raises.
    """
    if root == ".":
        yield
        return

    previous = os.getcwd()
    os.chdir(root)
    try:
        yield
    finally:
        # Exceptions from the block propagate unchanged after the restore.
        os.chdir(previous)
|
731 |
-
|
732 |
-
|
733 |
-
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
    """
    This disables various destructive functions and prevents the generated code
    from interfering with the test (e.g. fork bomb, killing other processes,
    removing filesystem files, etc.)

    WARNING
    This function is NOT a security sandbox. Untrusted code, including, model-
    generated code, should not be blindly executed outside of one. See the
    Codex paper for more information about OpenAI's code sandbox, and proceed
    with caution.

    Args:
        maximum_memory_bytes: when given, used as both the soft and hard limit
            for RLIMIT_AS and RLIMIT_DATA (and RLIMIT_STACK except on Darwin).

    Side effects (process-wide, not undone by this function):
        * ``faulthandler`` is disabled;
        * ``builtins.exit``, ``builtins.quit`` and ``builtins.help`` become None;
        * ``OMP_NUM_THREADS`` is set to ``"1"``;
        * the ``os``, ``shutil`` and ``subprocess`` attributes listed below
          become None;
        * importing ``ipdb``, ``joblib``, ``resource``, ``psutil`` or
          ``tkinter`` afterwards fails, because their ``sys.modules`` entries
          are set to None.
    """

    if maximum_memory_bytes is not None:
        import resource

        limits = (maximum_memory_bytes, maximum_memory_bytes)
        resource.setrlimit(resource.RLIMIT_AS, limits)
        resource.setrlimit(resource.RLIMIT_DATA, limits)
        if not platform.uname().system == "Darwin":
            resource.setrlimit(resource.RLIMIT_STACK, limits)

    faulthandler.disable()

    import builtins

    builtins.exit = None
    builtins.quit = None

    import os

    os.environ["OMP_NUM_THREADS"] = "1"

    # setattr also works for names that do not exist on this platform
    # (e.g. lchflags / lchmod), exactly like plain attribute assignment.
    for disabled_name in (
        "kill",
        "system",
        "putenv",
        "remove",
        "removedirs",
        "rmdir",
        "fchdir",
        "setuid",
        "fork",
        "forkpty",
        "killpg",
        "rename",
        "renames",
        "truncate",
        "replace",
        "unlink",
        "fchmod",
        "fchown",
        "chmod",
        "chown",
        "chroot",
        "lchflags",
        "lchmod",
        "lchown",
        "getcwd",
        "chdir",
    ):
        setattr(os, disabled_name, None)

    import shutil

    for disabled_name in ("rmtree", "move", "chown"):
        setattr(shutil, disabled_name, None)

    import subprocess

    subprocess.Popen = None  # type: ignore

    # ``__builtins__`` is a dict in imported modules but the module object in
    # ``__main__``, where item assignment raises TypeError. Going through the
    # ``builtins`` module behaves the same in both situations.
    builtins.help = None

    import sys

    for blocked_module in ("ipdb", "joblib", "resource", "psutil", "tkinter"):
        sys.modules[blocked_module] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/humaneval.py
DELETED
@@ -1,217 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
import json
|
3 |
-
import multiprocessing
|
4 |
-
import os
|
5 |
-
import re
|
6 |
-
import string
|
7 |
-
import subprocess
|
8 |
-
import time
|
9 |
-
|
10 |
-
import numpy as np
|
11 |
-
import torch
|
12 |
-
import torch.distributed as dist
|
13 |
-
# from attrdict import AttrDict
|
14 |
-
from human_eval.evaluation import evaluate_functional_correctness
|
15 |
-
from transformers import AutoTokenizer
|
16 |
-
from utils.dataset import HumanEvalDataset
|
17 |
-
from utils.utils import cleanup_code
|
18 |
-
|
19 |
-
|
20 |
-
class HumanEval:
    """
    HumanEval evaluation class.

    Generates completions for every HumanEval prompt with a model, writes them
    to per-rank JSON-lines files under ``log_dir``, merges those files on the
    local main process and scores the merged file with
    ``evaluate_functional_correctness``.
    """

    def __init__(
        self,
        data_root,
        max_seq_len=2048,
        language="python",
        max_gen_len=200,
        batch_size=512,
        log_dir=None,
        temperature=0,
        issft=False,
        top_p=0.95,
        model_name="",
        inference_increment=True,
        tokenizer_cfg=None,
        n_sample=40,
        k_sample=1,
    ):
        """
        Args:
            data_root: directory holding ``humaneval-{language}.jsonl``.
            max_seq_len: stored on the instance; not used by this class.
            language: benchmark language key.
            max_gen_len: passed to ``generate`` as ``max_new_tokens``.
            batch_size: number of prompts per ``generate`` call.
            log_dir: directory for intermediate and final result files; it is
                created when given and is required by ``eval_model``.
            temperature: 0 selects greedy decoding, anything else sampling.
            issft: whether prompts use the SFT instruction template.
            top_p: nucleus-sampling parameter (sampling mode only).
            model_name: accepted for compatibility; the stored name is always
                derived from ``tokenizer_cfg["model_path"]``.
            inference_increment: stored on the instance; not used by this class.
            tokenizer_cfg: mapping with ``"model_path"`` and optionally
                ``"cls"``. It is copied, so the caller's object is untouched.
            n_sample: number of samples generated per prompt.
            k_sample: the ``k`` of the reported pass@k score.

        Raises:
            TypeError: if ``tokenizer_cfg`` is None.
            AssertionError: if the tokenizer cannot be loaded.
        """
        if tokenizer_cfg is None:
            raise TypeError(
                "tokenizer_cfg must be a mapping containing 'model_path'"
            )
        # Work on a copy: the pops below must not empty the caller's config,
        # otherwise building a second HumanEval from the same dict fails.
        tokenizer_cfg = dict(tokenizer_cfg)

        self.data_root = data_root
        self.max_seq_len = max_seq_len
        self.max_gen_len = max_gen_len
        self.batch_size = batch_size
        self.k = k_sample
        self.n_sample = n_sample
        self.language = language
        self.log_dir = log_dir
        self.sft = issft
        self.temperature = temperature
        self.top_p = top_p
        self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
        self.inference_increment = inference_increment
        # eval_model() asserts that log_dir is set; do not crash here on None.
        if self.log_dir is not None:
            os.makedirs(self.log_dir, exist_ok=True)
        # The "cls" entry is dropped; the tokenizer is always an AutoTokenizer.
        tokenizer_cfg.pop("cls", None)
        try:
            self.tokenizer = AutoTokenizer.from_pretrained(
                tokenizer_cfg.pop("model_path"), trust_remote_code=True
            )
        except Exception as e:
            print(e)
            # An explicit raise (unlike ``assert False``) survives ``python -O``
            # and keeps the original error as the cause.
            raise AssertionError(f"failed to load tokenizer: {e}") from e

    @torch.no_grad()
    def eval_model(self, gpt, accelerator):
        """
        Evaluate the model on HumanEval.

        Each process generates completions for its share of the prompts and
        writes them to its own log file; afterwards the final pass@k score is
        computed via ``_calculate_final_score``.

        Args:
            gpt: model exposing ``eval()`` and ``generate(...)``.
            accelerator: object providing ``process_index``, ``num_processes``,
                ``device``, ``wait_for_everyone()`` and
                ``is_local_main_process``.
        """
        assert (
            self.log_dir is not None
        ), "log_dir should not be None when evaluating humaneval"
        dataset = HumanEvalDataset(
            self.data_root,
            sample_num=self.n_sample,
            language=self.language,
            issft=self.sft,
        )
        nprompt = len(dataset) // self.n_sample
        dp_rank = accelerator.process_index
        dp_size = accelerator.num_processes
        if self.k > 1:
            assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
        gpt.eval()
        # each process will process a subset of the dataset
        prompt_indices_split = np.array_split(range(nprompt), dp_size)
        prompt_indices = prompt_indices_split[dp_rank]
        indices = [
            x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)
        ]
        all_num = len(indices)
        processed_num = 0
        log_file = os.path.join(
            self.log_dir,
            f"{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json",
        )
        start_time = time.time()
        # The context manager closes the log file even if generation fails.
        with open(log_file, "w") as tmpfile:
            # split the dataset into batches and construct a list of inputs
            for idx in range(0, len(indices), self.batch_size):
                prompt_list = []
                prompt_lens = []
                orriginal_prompt_list = []
                tokenized_prompts = []
                taskid = []
                # get the prompts from the dataset
                for j in indices[idx : idx + self.batch_size]:
                    data = dataset[j]
                    fprompt = data["prompt"].strip()
                    prompt_list.append(fprompt)
                    orriginal_prompt_list.append(data["original_prompt"])
                    # character length: used below to slice the decoded text
                    prompt_lens.append(len(fprompt))
                    tokenized_prompts.append(self.tokenizer.encode(fprompt))
                    taskid.append(data["task_id"])
                # NOTE(review): no padding is applied, so torch.tensor() needs
                # every prompt in the batch to have the same token count.
                input_ids = torch.tensor(tokenized_prompts).to(accelerator.device)
                # generate the code
                if self.temperature != 0:
                    decoded = gpt.generate(
                        input_ids=input_ids,
                        max_new_tokens=self.max_gen_len,
                        do_sample=True,
                        eos_token_id=self.tokenizer.eos_token_id,
                        temperature=self.temperature,
                        top_p=self.top_p,
                        pad_token_id=self.tokenizer.eos_token_id,
                    )
                else:
                    decoded = gpt.generate(
                        input_ids=input_ids,
                        max_new_tokens=self.max_gen_len,
                        do_sample=False,
                        eos_token_id=self.tokenizer.eos_token_id,
                        pad_token_id=self.tokenizer.eos_token_id,
                    )
                # save the results to a file
                for local_idx, token_ids in enumerate(decoded):
                    prediction = self.tokenizer.decode(
                        token_ids, skip_special_tokens=True
                    )
                    # drop the echoed prompt, keep only the newly generated text
                    suffixprediction = prediction[prompt_lens[local_idx] :]
                    suffixprediction = cleanup_code(
                        suffixprediction,
                        self.language,
                        "humaneval",
                        self.sft,
                        dataset.stopwords,
                    )
                    # sft mode does not need original prompt
                    if not self.sft:
                        suffixprediction = (
                            orriginal_prompt_list[local_idx] + "\n" + suffixprediction
                        )
                    res = {
                        "task_id": taskid[local_idx],
                        "generation": suffixprediction,
                        "prompt": orriginal_prompt_list[local_idx],
                        "wholecode": prediction,
                    }
                    tmpfile.write(json.dumps(res) + "\n")
                    tmpfile.flush()
                    processed_num += 1
                self.log_score(
                    dp_rank, processed_num, all_num, start_time, self.batch_size
                )
        accelerator.wait_for_everyone()
        # calculate the final score of pass@k
        self._calculate_final_score(accelerator)
        accelerator.wait_for_everyone()
        return

    def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
        """
        Log the score.

        Prints progress, average time per batch, a remaining-time estimate and
        the peak CUDA memory reported by ``torch.cuda.max_memory_allocated``.
        """
        mem = torch.cuda.max_memory_allocated() / (1 << 30)
        avg_time = (time.time() - start_time) / processed_num * bs
        print(
            f"DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} "
            f"avg_time_per_batch:{avg_time:.2f} s "
            f"still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m",
            f"mem:{mem:.3f} GiB bs:{bs}",
            flush=True,
        )
        if processed_num == all_num:
            print(
                f"EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m",
                flush=True,
            )

    def _calculate_final_score(self, accelerator):
        """
        Calculate the final score.

        On the local main process only: concatenates every rank's log file
        into ``final_{model_name}.jsonl`` (deleting the per-rank files), scores
        it with ``evaluate_functional_correctness``, prints pass@k and removes
        the merged file.
        """
        if accelerator.is_local_main_process:
            logfilepath = os.path.join(self.log_dir, f"final_{self.model_name}.jsonl")
            with open(logfilepath, "w") as logfile:
                for i in range(accelerator.num_processes):
                    tmplogfile = os.path.join(
                        self.log_dir,
                        f"{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json",
                    )
                    # close each shard before deleting it
                    with open(tmplogfile) as shard:
                        logfile.write(shard.read().strip() + "\n")
                    os.remove(tmplogfile)
            timeout = 10
            runlang = self.language
            res = evaluate_functional_correctness(
                input_file=logfilepath,
                problem_file=os.path.join(
                    self.data_root, f"humaneval-{self.language}.jsonl"
                ),
                tmp_dir=self.log_dir,
                timeout=timeout,
                language=runlang,
            )
            print("score is", res["pass@%d" % self.k])
            os.remove(logfilepath)
        return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar
DELETED
Binary file (65.5 kB)
|
|
evaluation/general_benchmarks/HumanEval/test_config.yaml
DELETED
@@ -1,15 +0,0 @@
|
|
1 |
-
compute_environment: LOCAL_MACHINE
|
2 |
-
distributed_type: MULTI_GPU
|
3 |
-
downcast_bf16: 'no'
|
4 |
-
gpu_ids: all
|
5 |
-
machine_rank: 0
|
6 |
-
main_training_function: main
|
7 |
-
mixed_precision: 'no'
|
8 |
-
num_machines: 1
|
9 |
-
num_processes: 3
|
10 |
-
rdzv_backend: static
|
11 |
-
same_network: true
|
12 |
-
tpu_env: []
|
13 |
-
tpu_use_cluster: false
|
14 |
-
tpu_use_sudo: false
|
15 |
-
use_cpu: false
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/utils/dataset.py
DELETED
@@ -1,72 +0,0 @@
|
|
1 |
-
import json
|
2 |
-
import os
|
3 |
-
|
4 |
-
import numpy as np
|
5 |
-
|
6 |
-
|
7 |
-
class HumanEvalDataset:
    """In-memory view of one ``humaneval-{language}.jsonl`` file in which every
    problem is repeated ``sample_num`` times."""

    def __init__(self, root, sample_num=1, language="python", issft=False):
        """
        root: the path to the HumanEval dataset
        sample_num: the number of samples for each prompt
        language: the language of the HumanEval dataset
        issft: whether to use the SFT setting
        """
        self.root = root
        # Read through a context manager so the file handle is closed.
        with open(os.path.join(self.root, f"humaneval-{language}.jsonl")) as handle:
            self.data = handle.readlines()

        tmp = self.get_qa_only_data(self.data, issft)
        # Each problem appears sample_num times in a row; the repeats share
        # the same dict object.
        self.clean_data = [item for item in tmp for _ in range(sample_num)]
        # Stop words of the first problem are exposed for the whole dataset;
        # an empty dataset (or sample_num == 0) yields an empty list.
        self.stopwords = self.clean_data[0]["stopwords"] if self.clean_data else []
        np.random.seed(1234)
        print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")

    def get_qa_only_data(self, data_json, sft=False):
        """
        data_json: the jsonl file of HumanEval
        sft: whether to use the SFT setting
        return: a list of dict, each dict contains the prompt, task_id and stopwords
        """
        ans = []
        for line in data_json:
            # Tolerate blank lines (e.g. a trailing empty line) in the file.
            if not line.strip():
                continue
            line = json.loads(line)
            prompt = line["prompt"].strip()
            if "prefix" in line:
                origin_prompt = line["prefix"]
            else:
                origin_prompt = line["prompt"]

            if sft:
                prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
            if "stop_tokens" in line:
                s = line["stop_tokens"]
            else:
                s = []
            ans.append(
                {
                    "prompt": prompt,
                    "task_id": line["task_id"],
                    "original_prompt": origin_prompt,
                    "stopwords": s,
                }
            )
        return ans

    def __len__(self):
        """
        return the number of samples in the dataset
        """
        return len(self.clean_data)

    def __getitem__(self, index):
        """
        return the sample at index
        """
        sample = self.clean_data[index]
        return sample
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/HumanEval/utils/utils.py
DELETED
@@ -1,161 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
|
3 |
-
# Per-language settings keyed by the short language code:
#   full_name - name used after the opening ``` of a fenced code block
#   indent    - default indentation width (in spaces) for that language
#   main      - optional marker of the program entry point; generated code
#               from this marker onwards is discarded
languge_settings = {
    "python": {"full_name": "Python", "indent": 4},
    "cpp": {"full_name": "cpp", "indent": 0, "main": "int main()"},
    "java": {"full_name": "Java", "indent": 4, "main": "public static void main"},
    "cs": {"full_name": "csharp", "indent": 0, "main": "public static void Main"},
    "php": {"full_name": "PHP", "indent": 0},
    "ts": {"full_name": "TypeScript", "indent": 0},
    "js": {"full_name": "JavaScript", "indent": 0},
    "sh": {"full_name": "Bash", "indent": 0},
}
|
34 |
-
|
35 |
-
|
36 |
-
def get_function_name(question: str, lang: str):
    """Split a prompt into the target function's header and the text before it.

    Blank lines are ignored. For Python (case-insensitive match on ``lang``)
    the header is the last line starting with ``"def "``, cut at the first
    ``"("``; an IndexError is raised when there is no such line. For every
    other language the header is the last line, cut at the first ``"{"``.

    Returns:
        ``(func_name, func_prefix)``: the stripped header text and the
        non-blank lines preceding it, joined with newlines.
    """
    lines = [row for row in question.strip().split("\n") if row.strip()]

    if lang.lower() != "python":
        header = lines[-1].split("{")[0].strip()
        return header, "\n".join(lines[:-1])

    def_positions = [pos for pos, row in enumerate(lines) if row.startswith("def ")]
    last_def = def_positions[-1]
    header = lines[last_def].split("(")[0].strip()
    return header, "\n".join(lines[:last_def])
|
50 |
-
|
51 |
-
|
52 |
-
def extract_generation_code(example: dict, lang_code: str, verbose: bool = False):
    """Extract the generated function from a model answer and store it in
    ``example["generation"]``.

    The first fenced code block tagged with the language's ``full_name`` is
    taken from ``example["output"]`` (or ``example["gpt_completion"]``).
    Anything from the language's ``main`` marker onwards is dropped, and the
    text is cut from the target function's header up to the last closing
    brace at the header's indentation. For ``php``/``ts``/``js`` that closing
    brace is appended again. The result is prefixed with the part of the
    prompt that precedes the function header.

    If anything in that process fails, the failure is printed and the
    generation falls back to ``prompt + "\\n" + output``.

    Args:
        example: dict with ``task_id``, ``prompt`` and ``output`` or
            ``gpt_completion``; it is modified in place.
        lang_code: key into ``languge_settings``.
        verbose: print the extracted code block when true.

    Returns:
        The same ``example`` dict with ``"generation"`` set.

    Raises:
        KeyError: if ``lang_code`` is not in ``languge_settings`` or a
            required key is missing from ``example``.
    """
    task_id = example["task_id"]
    output = example.get("output", example.get("gpt_completion"))
    if output is None:
        # A missing answer is treated as empty so that the fallback below can
        # still concatenate strings instead of raising TypeError.
        output = ""
    question = example["prompt"].strip()
    setting = languge_settings[lang_code]
    lang = setting["full_name"]
    indent = setting["indent"]

    try:
        code_block: str = re.findall(
            f"```{lang.lower()}\n(.*?)```", output, re.DOTALL | re.IGNORECASE
        )[0]
        if verbose:
            print(">>> Task: {}\n{}".format(task_id, code_block))

        # Remove main
        if setting.get("main", None) and setting["main"] in code_block:
            main_start = code_block.index(setting["main"])
            code_block = code_block[:main_start]

        func_name, func_prefix = get_function_name(question, lang)

        try:
            start = code_block.lower().index(func_name.lower())
        except ValueError:
            # Header not found: keep the whole block and the language's
            # default indent.
            start = 0
        else:
            # Count the spaces directly in front of the header. The bound is
            # checked on the index actually read, so the scan cannot wrap
            # around to the last character of the block.
            indent = 0
            while start - indent - 1 >= 0 and code_block[start - indent - 1] == " ":
                indent += 1

        try:
            end = code_block.rindex("\n" + " " * indent + "}")
        except ValueError:
            end = len(code_block)

        body = code_block[start:end]

        if lang_code.lower() in ["php", "ts", "js"]:
            body += "\n" + " " * indent + "}"

        generation = func_prefix + "\n" + body + "\n"
        example["generation"] = generation

    except Exception as ex:
        print(
            "Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
                ex, task_id, output
            )
        )
        example["generation"] = example["prompt"] + "\n" + output

    return example
|
108 |
-
|
109 |
-
|
110 |
-
def cleanup_code(
    code: str,
    language_type: str = None,
    dataset: str = None,
    issft: bool = False,
    stop_words=None,
):
    """
    Cleans up the generated code.

    The text is cut at the earliest occurrence of any stop word:

    * python: the fixed list ``\\ndef``, ``\\nclass``, ``\\nif``, ``\\n#``,
      ``\\nprint`` is used and the ``stop_words`` argument is ignored; in SFT
      mode the first python-fenced block is extracted beforehand.
    * ts: ``stop_words`` plus TypeScript import/export/console.log markers.
    * anything else (including no language): ``stop_words`` only.

    Args:
        code: generated text to clean.
        language_type: language key, matched case-insensitively.
        dataset: accepted for call compatibility; not used.
        issft: whether the text comes from an SFT-style answer.
        stop_words: optional list of stop strings; defaults to no stop words.

    Returns:
        The truncated code.
    """
    # None instead of a shared mutable [] default.
    stop_words = [] if stop_words is None else stop_words
    # language_type defaults to None; treat that as "no special handling"
    # instead of failing on None.lower().
    language = (language_type or "").lower()

    if language == "python":
        if issft:
            code = _clean_python_code_for_sft(code)
        stop_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
        code = _truncate_code_at_stopwords(code, stop_words)
    elif language == "ts":
        code = _truncate_code_at_stopwords(
            code,
            stop_words
            + [
                "\nexport",
                "\nimport",
                "\nexport default",
                "\nimport default",
                "\nconsole.log",
            ],
        )
    else:
        code = _truncate_code_at_stopwords(code, stop_words)

    return code
|
142 |
-
|
143 |
-
|
144 |
-
def _clean_python_code_for_sft(code):
|
145 |
-
code = code.replace("\r", "")
|
146 |
-
if "```python" in code:
|
147 |
-
code_start_idx = code.index("```python")
|
148 |
-
code = code[code_start_idx:].replace("```python", "").strip()
|
149 |
-
end_idx = code.find("```") if "```" in code else len(code)
|
150 |
-
code = code[:end_idx].strip()
|
151 |
-
|
152 |
-
return code
|
153 |
-
|
154 |
-
|
155 |
-
def _truncate_code_at_stopwords(code, stop_words):
|
156 |
-
min_stop_idx = len(code)
|
157 |
-
for stop_word in stop_words:
|
158 |
-
stop_index = code.find(stop_word)
|
159 |
-
if 0 <= stop_index < min_stop_idx:
|
160 |
-
min_stop_idx = stop_index
|
161 |
-
return code[:min_stop_idx]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/MATH/LICENSE
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
MIT License
|
2 |
-
|
3 |
-
Copyright (c) 2024 Zhibin Gou
|
4 |
-
|
5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
of this software and associated documentation files (the "Software"), to deal
|
7 |
-
in the Software without restriction, including without limitation the rights
|
8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
copies of the Software, and to permit persons to whom the Software is
|
10 |
-
furnished to do so, subject to the following conditions:
|
11 |
-
|
12 |
-
The above copyright notice and this permission notice shall be included in all
|
13 |
-
copies or substantial portions of the Software.
|
14 |
-
|
15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/MATH/README.md
DELETED
@@ -1,52 +0,0 @@
|
|
1 |
-
### Requirements
|
2 |
-
You can install the required packages with the following command:
|
3 |
-
```bash
|
4 |
-
cd latex2sympy
|
5 |
-
pip install -e .
|
6 |
-
cd ..
|
7 |
-
pip install -r requirements.txt
|
8 |
-
pip install vllm==0.5.1 --no-build-isolation
|
9 |
-
pip install transformers==4.42.3
|
10 |
-
```
|
11 |
-
|
12 |
-
### Evaluation
|
13 |
-
You can evaluate the Qwen2.5-Math-Instruct and Qwen2-Math-Instruct series models with the following commands:
|
14 |
-
```bash
|
15 |
-
# Qwen2.5-Math-Instruct Series
|
16 |
-
PROMPT_TYPE="qwen25-math-cot"
|
17 |
-
# Qwen2.5-Math-1.5B-Instruct
|
18 |
-
export CUDA_VISIBLE_DEVICES="0"
|
19 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-1.5B-Instruct"
|
20 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
21 |
-
|
22 |
-
# Qwen2.5-Math-7B-Instruct
|
23 |
-
export CUDA_VISIBLE_DEVICES="0"
|
24 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-7B-Instruct"
|
25 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
26 |
-
|
27 |
-
# Qwen2.5-Math-72B-Instruct
|
28 |
-
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
29 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-72B-Instruct"
|
30 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
31 |
-
|
32 |
-
|
33 |
-
# Qwen2-Math-Instruct Series
|
34 |
-
PROMPT_TYPE="qwen-boxed"
|
35 |
-
# Qwen2-Math-1.5B-Instruct
|
36 |
-
export CUDA_VISIBLE_DEVICES="0"
|
37 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-1.5B-Instruct"
|
38 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
39 |
-
|
40 |
-
# Qwen2-Math-7B-Instruct
|
41 |
-
export CUDA_VISIBLE_DEVICES="0"
|
42 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-7B-Instruct"
|
43 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
44 |
-
|
45 |
-
# Qwen2-Math-72B-Instruct
|
46 |
-
export CUDA_VISIBLE_DEVICES="0,1,2,3"
|
47 |
-
MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-72B-Instruct"
|
48 |
-
bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
|
49 |
-
```
|
50 |
-
|
51 |
-
## Acknowledgement
|
52 |
-
The codebase is adapted from [math-evaluation-harness](https://github.com/ZubinGou/math-evaluation-harness).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
evaluation/general_benchmarks/MATH/data/aime24/test.jsonl
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:af2b8bd2aa911b6333ad0df32f3ca05c7ae8ed10f1731f4372c8ae26990bf7ac
|
3 |
-
size 156944
|
|
|
|
|
|
|
|