qianxiao1111 committed on
Commit
6a74973
1 Parent(s): 5da0640
This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +0 -4
  2. added_tokens.json +24 -3
  3. config.json +28 -3
  4. evaluation/.gitignore +0 -190
  5. evaluation/README.md +0 -181
  6. evaluation/general_benchmarks/HumanEval/README.md +0 -74
  7. evaluation/general_benchmarks/HumanEval/data/humaneval-cpp +0 -0
  8. evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl +0 -3
  9. evaluation/general_benchmarks/HumanEval/data/humaneval-cs +0 -0
  10. evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl +0 -3
  11. evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl +0 -3
  12. evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl +0 -3
  13. evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl +0 -3
  14. evaluation/general_benchmarks/HumanEval/data/humaneval-java +0 -0
  15. evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl +0 -3
  16. evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl +0 -3
  17. evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl +0 -3
  18. evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl +0 -3
  19. evaluation/general_benchmarks/HumanEval/data/humaneval-php +0 -0
  20. evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl +0 -3
  21. evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl +0 -3
  22. evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl +0 -3
  23. evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl +0 -3
  24. evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl +0 -3
  25. evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl +0 -3
  26. evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl +0 -3
  27. evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl +0 -3
  28. evaluation/general_benchmarks/HumanEval/data/humaneval-sh +0 -0
  29. evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl +0 -3
  30. evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl +0 -3
  31. evaluation/general_benchmarks/HumanEval/data/humaneval-ts +0 -0
  32. evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl +0 -3
  33. evaluation/general_benchmarks/HumanEval/eval.sh +0 -4
  34. evaluation/general_benchmarks/HumanEval/eval_base_vllm.py +0 -162
  35. evaluation/general_benchmarks/HumanEval/eval_instruct.py +0 -168
  36. evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py +0 -225
  37. evaluation/general_benchmarks/HumanEval/eval_pal.py +0 -62
  38. evaluation/general_benchmarks/HumanEval/human_eval/__init__.py +0 -0
  39. evaluation/general_benchmarks/HumanEval/human_eval/data.py +0 -48
  40. evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py +0 -32
  41. evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py +0 -351
  42. evaluation/general_benchmarks/HumanEval/human_eval/execution.py +0 -817
  43. evaluation/general_benchmarks/HumanEval/humaneval.py +0 -217
  44. evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar +0 -0
  45. evaluation/general_benchmarks/HumanEval/test_config.yaml +0 -15
  46. evaluation/general_benchmarks/HumanEval/utils/dataset.py +0 -72
  47. evaluation/general_benchmarks/HumanEval/utils/utils.py +0 -161
  48. evaluation/general_benchmarks/MATH/LICENSE +0 -21
  49. evaluation/general_benchmarks/MATH/README.md +0 -52
  50. evaluation/general_benchmarks/MATH/data/aime24/test.jsonl +0 -3
.gitattributes CHANGED
@@ -47,7 +47,3 @@ rng_state_2.pth filter=lfs diff=lfs merge=lfs -text
47
  model-00003-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
48
  rng_state_4.pth filter=lfs diff=lfs merge=lfs -text
49
  model-00002-of-00004.safetensors filter=lfs diff=lfs merge=lfs -text
50
- *.csv filter=lfs diff=lfs merge=lfs -text
51
- *.json filter=lfs diff=lfs merge=lfs -text
52
- *.jsonl filter=lfs diff=lfs merge=lfs -text
53
- *.sqlite filter=lfs diff=lfs merge=lfs -text
added_tokens.json CHANGED
@@ -1,3 +1,24 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:58b54bbe36fc752f79a24a271ef66a0a0830054b4dfad94bde757d851968060b
3
- size 605
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
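A quick sanity check, not part of the commit: the mapping above should agree with the ids the Qwen2.5 tokenizer itself reports. A minimal sketch, assuming `transformers` is installed and the base checkpoint named in config.json is reachable:

```python
# Sketch: verify a few of the special-token ids listed in added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
print(tok.convert_tokens_to_ids("<|endoftext|>"))  # expected 151643
print(tok.convert_tokens_to_ids("<|im_end|>"))     # expected 151645
print(tok.convert_tokens_to_ids("<tool_call>"))    # expected 151657
```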
config.json CHANGED
@@ -1,3 +1,28 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:82c97ddf1f855ce947c97a62859a064efb2499cb7cbb82aa9c67bdc65b678c17
3
- size 709
1
+ {
2
+ "_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
3
+ "architectures": [
4
+ "Qwen2ForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 32768,
14
+ "max_window_layers": 28,
15
+ "model_type": "qwen2",
16
+ "num_attention_heads": 28,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 4,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_theta": 1000000.0,
21
+ "sliding_window": null,
22
+ "tie_word_embeddings": false,
23
+ "torch_dtype": "bfloat16",
24
+ "transformers_version": "4.44.2",
25
+ "use_cache": false,
26
+ "use_sliding_window": false,
27
+ "vocab_size": 152064
28
+ }
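For reference (not part of the commit), a minimal sketch of loading a checkpoint that carries this config; the local path is a placeholder:

```python
# Sketch: load the checkpoint and confirm the key config fields above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "path/to/this/checkpoint"  # placeholder, replace with the repo snapshot
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
print(model.config.model_type, model.config.hidden_size)  # qwen2 3584
```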
evaluation/.gitignore DELETED
@@ -1,190 +0,0 @@
1
-
2
- # Byte-compiled / optimized / DLL files
3
- __pycache__/
4
- *.py[cod]
5
- *$py.class
6
- *.sql
7
- *.sqlite
8
- *.splite
9
- *.desc
10
- *.txt
11
- *.DS_Store
12
- .DS_Store
13
- !eval_retriever/data/*.json
14
- !eval_retriever/preds/*.json
15
- !reject_eval/*.json
16
- !evalset/*/*.json
17
- !evalset/*.json
18
-
19
- # C extensions
20
- *.so
21
-
22
- # Distribution / packaging
23
- .Python
24
- build/
25
- develop-eggs/
26
- dist/
27
- downloads/
28
- eggs/
29
- .eggs/
30
- lib/
31
- lib64/
32
- parts/
33
- sdist/
34
- var/
35
- wheels/
36
- share/python-wheels/
37
- *.egg-info/
38
- .installed.cfg
39
- *.egg
40
- MANIFEST
41
-
42
- # PyInstaller
43
- # Usually these files are written by a python script from a template
44
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
45
- *.manifest
46
- *.spec
47
-
48
- # Installer logs
49
- pip-log.txt
50
- pip-delete-this-directory.txt
51
-
52
- # Unit test / coverage reports
53
- htmlcov/
54
- .tox/
55
- .nox/
56
- .coverage
57
- .coverage.*
58
- .cache
59
- nosetests.xml
60
- coverage.xml
61
- *.cover
62
- *.py,cover
63
- .hypothesis/
64
- .pytest_cache/
65
- cover/
66
-
67
- # Translations
68
- *.mo
69
- *.pot
70
-
71
- # Django stuff:
72
- *.log
73
- local_settings.py
74
- db.sqlite3
75
- db.sqlite3-journal
76
-
77
- # Flask stuff:
78
- instance/
79
- .webassets-cache
80
-
81
- # Scrapy stuff:
82
- .scrapy
83
-
84
- # Sphinx documentation
85
- docs/_build/
86
-
87
- # PyBuilder
88
- .pybuilder/
89
- target/
90
-
91
- # Jupyter Notebook
92
- .ipynb_checkpoints
93
-
94
- # IPython
95
- profile_default/
96
- ipython_config.py
97
-
98
- # pyenv
99
- # For a library or package, you might want to ignore these files since the code is
100
- # intended to run in multiple environments; otherwise, check them in:
101
- # .python-version
102
-
103
- # pipenv
104
- # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105
- # However, in case of collaboration, if having platform-specific dependencies or dependencies
106
- # having no cross-platform support, pipenv may install dependencies that don't work, or not
107
- # install all needed dependencies.
108
- #Pipfile.lock
109
-
110
- # poetry
111
- # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
112
- # This is especially recommended for binary packages to ensure reproducibility, and is more
113
- # commonly ignored for libraries.
114
- # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
115
- #poetry.lock
116
-
117
- # pdm
118
- # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
119
- #pdm.lock
120
- # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
121
- # in version control.
122
- # https://pdm.fming.dev/#use-with-ide
123
- .pdm.toml
124
-
125
- # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
126
- __pypackages__/
127
-
128
- # Celery stuff
129
- celerybeat-schedule
130
- celerybeat.pid
131
-
132
- # SageMath parsed files
133
- *.sage.py
134
-
135
- # Environments
136
- .env
137
- .venv
138
- env/
139
- venv/
140
- ENV/
141
- env.bak/
142
- venv.bak/
143
-
144
- # Spyder project settings
145
- .spyderproject
146
- .spyproject
147
-
148
- # Rope project settings
149
- .ropeproject
150
-
151
- # mkdocs documentation
152
- /site
153
-
154
- # mypy
155
- .mypy_cache/
156
- .dmypy.json
157
- dmypy.json
158
-
159
- # Pyre type checker
160
- .pyre/
161
-
162
- # pytype static type analyzer
163
- .pytype/
164
-
165
- # Cython debug symbols
166
- cython_debug/
167
-
168
- # PyCharm
169
- # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
170
- # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
171
- # and can be added to the global gitignore or merged into this file. For a more nuclear
172
- # option (not recommended) you can uncomment the following to ignore the entire idea folder.
173
- #.idea/
174
-
175
- output/
176
- images/
177
- .vscode
178
- vllm_encoder/
179
-
180
- !table_related_benchmarks/evalset/bird_data/*.sql
181
- !table_related_benchmarks/evalset/spider_data/*.sql
182
-
183
- table_related_benchmarks/evalset/spider_data/test_database/*
184
- table_related_benchmarks/evalset/bird_data/dev_databases/*
185
- table_related_benchmarks/evalset/spider_data/dev_database/*
186
-
187
-
188
- !table_related_benchmarks/evalset/spider_data/test_database/README.md
189
- !table_related_benchmarks/evalset/bird_data/dev_databases/README.md
190
- !table_related_benchmarks/evalset/spider_data/dev_database/README.md
evaluation/README.md DELETED
@@ -1,181 +0,0 @@
1
- # Benchmarks evaluations for tablegpt
2
-
3
- <p align="center">
4
- <a href="#-About">🔥About</a> •
5
- <a href="#-Usage">💻Usage</a> •
6
- </p>
7
-
8
- ## About
9
-
10
- </div>
11
-
12
- This is a repo for evaluating tablegpt on a range of table-related benchmarks.
13
-
14
- Given the complexity of table QA tasks and the uncertainty of input instructions, we provide evaluation datasets and scripts for 7 capabilities:
15
-
16
- - ✨Code correction based on tables
17
- - ✨Refusal of ambiguous questions
18
- - ✨Table & field recall in multi-table scenarios
19
- ✨Executability of table QA output code
20
- - ✨Table-Bench.
21
- - ✨Text2Sql.
22
- - ✨TableInstruct, which includes a series of table-related evaluation benchmarks.
23
-
24
- In addition, we have integrated other general-ability benchmarks such as HumanEval, MBPP and MMLU/CMMLU.
26
- We have built an inference pipeline that loads a local model path with vLLM as the backend, and defined a set of example prompt templates for the benchmarks above.
26
-
27
- ## Usage
28
-
29
- </div>
30
- </details>
31
-
32
- ⏬ To use this framework, first clone the repository from GitHub and install its requirements:
33
-
34
- ```shell
35
- git clone https://github.com/tablegpt/tablegpt-eval
36
- cd tablegpt-eval
37
- pip install -r requirements.txt
38
- ```
39
-
40
- </div>
41
- </details>
42
-
43
- [!Tips]
44
- 1. You can run all the benchmarks with the default params via `bash run_benchmarks.sh`.
45
- 2. If you want more control over the run parameters, refer to the corresponding Python script.
46
- 3. Download the .db files before running the text2sql evaluation scripts. Download URLs are listed in `/table_related_benchmarks/evalset/bird_data/dev_databases/README.md` (Bird dev), `table_related_benchmarks/evalset/spider_data/dev_database/README.md` (Spider dev) and `table_related_benchmarks/evalset/spider_data/test_database/README.md` (Spider test).
47
-
48
-
49
- ### Code correction eval
50
-
51
- We provide an eval dataset of non-executable Python code to be corrected. Eval dataset path:
52
-
53
- ```python
54
- table_related_benchmarks/evalset/code_correction_test/correction_set.json
55
- ```
56
-
57
- We use the ***executable_pass_rate*** and ***absolute_match_rate*** of the corrected code in pass-1 to evaluate the model's code correction ability. You can perform code-correction evaluation by running the following Python command:
58
-
59
- ```bash
60
- python table_related_benchmarks/run_code_correction_eval.py \
61
- --model_path <EVAL MODEL PATH> \
62
- --template <CHAT_TEMPLATE_NAME, support [llama3, baichuan, chatglm, None], default None> \
63
- --eval_results_save_path <PATH TO SAVE THE EVAL RESULTS> \
64
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
65
- --temperature <ONE OF THE INFERENCE PARAMETER>
66
- ```
67
-
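For illustration only (not part of the original README), a minimal sketch of the two pass-1 metrics named above, assuming the corrected snippets and reference fixes are available as plain strings:

```python
# Sketch: executable_pass_rate = fraction of corrected snippets that run cleanly;
# absolute_match_rate = fraction that exactly match the reference fix.
import subprocess
import sys

def runs_ok(code: str, timeout: float = 10.0) -> bool:
    try:
        proc = subprocess.run([sys.executable, "-c", code],
                              capture_output=True, timeout=timeout)
        return proc.returncode == 0
    except subprocess.TimeoutExpired:
        return False

def pass1_metrics(corrections, references):
    n = len(corrections)
    return {
        "executable_pass_rate": sum(runs_ok(c) for c in corrections) / n,
        "absolute_match_rate": sum(c.strip() == r.strip()
                                   for c, r in zip(corrections, references)) / n,
    }
```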
68
- ### Ambiguous reject eval
69
-
70
- We provide 298 table-based queries, with a ratio of about 1:3 between queries marked as ambiguous (to be rejected) and queries that should be accepted and correctly answered. Dataset path:
71
-
72
- ```python
73
- # test queries
74
- evalset/reject_test/test_query.json
75
- # queries with ground truth
76
- evalset/reject_test/ground_truth.json
77
- ```
78
-
79
- We use **accuracy**, **recall**, and **F1 score** as metrics to evaluate the LLM's ability in this task. You can perform reject evaluation by running the following Python command:
80
-
81
- ```bash
82
- python table_related_benchmarks/run_reject_eval.py \
83
- --model_path <EVAL MODEL PATH> \
84
- --save_path <LLM OUTPUT CONTENT SAVE PATH> \
85
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE> \
86
- --temperature <ONE OF THE INFERENCE PARAMETER>
87
- ```
88
-
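A minimal sketch (not part of the original README) of the metrics above, treating "reject" as the positive class; assumes scikit-learn is available:

```python
# Sketch: score accept/reject decisions against the ground-truth labels.
from sklearn.metrics import accuracy_score, f1_score, recall_score

y_true = [1, 0, 0, 1, 0]  # 1 = ambiguous query that should be rejected
y_pred = [1, 0, 1, 1, 0]  # model decisions

print("accuracy:", accuracy_score(y_true, y_pred))
print("recall:  ", recall_score(y_true, y_pred))
print("f1:      ", f1_score(y_true, y_pred))
```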
89
- ### Table&Fields recall eval
90
-
91
- The provided eval dataset path:
92
-
93
- ```python
94
- table_related_benchmarks/evalset/retrieval_test/recall_set.json
95
- ```
96
-
97
- We use a series of evaluation metrics such as **recall**, **precision**, **Jaccard similarity**, and **Hamming loss** to assess the LLM's performance in table and field retrieval tasks. You can perform recall evaluation by running the following Python command:
98
-
99
- ```bash
100
- python table_related_benchmarks/run_recall_eval.py \
101
- --model_path <EVAL MODEL PATH> \
102
- --temperature <TEMPERATURE> \
103
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
104
- ```
105
-
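A minimal sketch (not part of the original README) of these retrieval metrics for one example, with hypothetical column names:

```python
# Sketch: compare a predicted set of tables/fields against the gold set.
def retrieval_metrics(pred: set, gold: set, universe: set) -> dict:
    tp = len(pred & gold)
    return {
        "recall": tp / len(gold) if gold else 1.0,
        "precision": tp / len(pred) if pred else 0.0,
        "jaccard": tp / len(pred | gold) if (pred | gold) else 1.0,
        # Hamming loss: share of labels in the universe that are mispredicted.
        "hamming_loss": len(pred ^ gold) / len(universe) if universe else 0.0,
    }

print(retrieval_metrics({"id", "name"}, {"id", "age"}, {"id", "name", "age", "city"}))
```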
106
- ### Table QA executable
107
-
108
- We provide 2178 table-based queries. Eval dataset path:
109
-
110
- ```python
111
- table_related_benchmarks/evalset/table_qa_execuate_test/tableqa_samples_with_paths.jsonl
112
- ```
113
-
114
- We employ the ***executable_pass_rate*** at pass-1 to evaluate the model's tableQA code generation ability. You can perform tableQA evaluation by running the following Python command:
115
-
116
- ```bash
117
- python table_related_benchmarks/run_tableqa_execution_eval.py \
118
- --model_path <EVAL MODEL PATH> \
119
- --temperature <ONE OF THE INFERENCE PARAMETER> \
120
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
121
- ```
122
-
123
- ### TableBench evaluation
124
-
125
- The provided eval dataset path:
126
-
127
- ```python
128
- table_related_benchmarks/evalset/TableBench
129
- ```
130
-
131
- In the TableBench evaluation, ROUGE-L is used to assess general QA questions, while pass@1 is used as the metric for visualization-type samples. You can perform TableBench evaluation with the following command:
132
-
133
- ```bash
134
- python table_related_benchmarks/run_table_bench_eval.py \
135
- --model_path <EVAL MODEL PATH> \
136
- --temperature <ONE OF THE INFERENCE PARAMETER> \
137
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
138
- ```
139
-
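As an illustration (not part of the original README), ROUGE-L for a single QA answer can be computed with the `rouge-score` package; the repo may use its own scorer, so this is only a sketch:

```python
# Sketch: ROUGE-L F-measure between a reference answer and a model answer.
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
score = scorer.score("revenue grew by 12% in 2021",       # reference
                     "the revenue increased 12% in 2021")  # prediction
print(score["rougeL"].fmeasure)
```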
140
- ### TableInstruct
141
-
142
- The provided eval dataset path:
143
-
144
- ```python
145
- table_related_benchmarks/evalset/TableInstruct
146
- ```
147
-
148
- You can perform TableInstruct evaluation by the following command:
149
-
150
- ```bash
151
- python table_related_benchmarks/run_table_instruct_eval.py \
152
- --model_path <EVAL MODEL PATH> \
153
- --temperature <ONE OF THE INFERENCE PARAMETER> \
154
- --gpus_num <NUMBER OF GPU TO RUN INFERENCE>
155
- ```
156
-
157
- ### Text2Sql
158
- ```bash
159
- python table_related_benchmarks/run_text2sql_eval.py --model_path <EVAL MODEL PATH>
160
- ```
161
-
162
- ### HumanEval
163
- Perform HumanEval evaluation by the following command:
164
-
165
- ```bash
166
- python general_benchmarks/HumanEval/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
167
- ```
168
-
169
- ### MBPP
170
- Perform MBPP evaluation by the following command:
171
-
172
- ```bash
173
- python general_benchmarks/MBPP/eval_instruct_vllm.py --model_path <EVAL MODEL PATH>
174
- ```
175
-
176
- ### MMLU & CMMLU
177
-
178
- ```bash
179
- python general_benchmarks/MMLU/evaluator.py --task <mmlu or cmmlu> --lang <en or zh> --model_path <EVAL MODEL PATH>
180
- ```
181
-
evaluation/general_benchmarks/HumanEval/README.md DELETED
@@ -1,74 +0,0 @@
1
- ## 1. Introduction
2
-
3
- We provide a test script to evaluate the performance of the **deepseek-coder** model on code generation benchmarks. We select the widely-used benchmarks: **[HumanEval-Python](https://huggingface.co/datasets/openai_humaneval), [HumanEval-Multilingual](https://huggingface.co/datasets/nuprl/MultiPL-E)**.
4
-
5
-
6
-
7
- ## 2. Setup
8
-
9
- ```
10
- pip install accelerate
11
- pip install attrdict
12
- pip install transformers
13
- pip install torch
14
- ```
15
-
16
-
17
- ## 3. Evaluation
18
-
19
- We've created a sample script, **eval.sh**, that demonstrates how to test the **DeepSeek-Coder-1.3b-Base** model on the HumanEval dataset leveraging **8** GPUs. If your use case involves a different model or dataset, simply adjust the script to fit your needs.
20
-
21
- Additionally, for various programming languages, the execution path may differ. Please ensure you update the appropriate paths in the **humaneval/execution.py** file accordingly.
22
-
23
- ```bash
24
- MODEL_NAME_OR_PATH="deepseek-ai/deepseek-coder-1.3b-base"
25
- DATASET_ROOT="data/"
26
- LANGUAGE="python"
27
- python -m accelerate.commands.launch --config_file test_config.yaml eval_pal.py --logdir ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
28
- ```
29
-
30
- To evaluate the instruction-based model, please follow the script below:
31
- ```bash
32
- LANG="python"
33
- OUTPUT_DIR="output"
34
- MODEL="deepseek-coder-33b-instruct"
35
-
36
- CUDA_VISIBLE_DEVICES=0,1 python eval_instruct.py \
37
- --model "deepseek-ai/$MODEL" \
38
- --output_path "$OUPUT_DIR/${LANG}.$MODEL.jsonl" \
39
- --language $LANG \
40
- --temp_dir $OUTPUT_DIR
41
- ```
42
-
43
- ## 4. Experimental Results
44
-
45
- We report experimental results here for 8 mainstream programming languages: **python**, **c++**, **java**, **PHP**, **TypeScript**, **C#**, **Bash**, and **JavaScript**. For all open-source models, we utilize this repository to obtain their performance on the HumanEval dataset. We set the maximum input length to **4096** and the maximum output length to **500**, and employ the **greedy search strategy**.
46
-
47
-
48
- #### (1) Multilingual Base Models
49
-
50
- | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
51
- |-------------------|------|--------|-------|------|------|------|------|------|------|------|
52
- | code-cushman-001 | 12B | 33.5% | 31.9% | 30.6%| 28.9%| 31.3%| 22.1%| 11.7%| - | - |
53
- | CodeShell | 7B | 35.4% | 32.9% | 34.2%| 31.7%| 30.2%| 38.0%| 7.0% | 33.5%| 30.4%|
54
- | CodeGeeX2 | 6B | 36.0% | 29.2% | 25.9%| 23.6%| 20.8%| 29.7%| 6.3% | 24.8%| 24.5%|
55
- | StarCoderBase | 16B | 31.7% | 31.1% | 28.5%| 25.4%| 34.0%| 34.8%| 8.9% | 29.8%| 28.0%|
56
- | CodeLLama | 7B | 31.7% | 29.8% | 34.2%| 23.6%| 36.5%| 36.7%| 12.0%| 29.2%| 29.2%|
57
- | CodeLLama | 13B | 36.0% | 37.9% | 38.0%| 34.2%| 45.2%| 43.0%| 16.5%| 32.3%| 35.4%|
58
- | CodeLLama | 34B | 48.2% | 44.7% | 44.9%| 41.0%| 42.1%| 48.7%| 15.8%| 42.2%| 41.0%|
59
- | | | | | | | | | | | |
60
- | DeepSeek-Coder-Base| 1.3B | 34.8% | 31.1% | 32.3%| 24.2%| 28.9%| 36.7%| 10.1%| 28.6%| 28.3%|
61
- | DeepSeek-Coder-Base| 5.7B | 48.7% | 45.3% | 41.1%| 39.7%| 44.7%| 41.1%| 27.8%| 42.2%| 41.3%|
62
- | DeepSeek-Coder-Base| 6.7B | 49.4% | 50.3% | 43.0%| 38.5%| 49.7%| 50.0%| 28.5%| 48.4%| 44.7%|
63
- | DeepSeek-Coder-Base|33B | **56.1%** | **58.4%** | **51.9%**| **44.1%**| **52.8%**| **51.3%**| **32.3%**| **55.3%**| **50.3%**|
64
-
65
- #### (2) Instruction-Tuned Models
66
- | Model | Size | Python | C++ | Java | PHP | TS | C# | Bash | JS | Avg |
67
- |---------------------|------|--------|-------|------|------|------|------|------|------|------|
68
- | GPT-3.5-Turbo | - | 76.2% | 63.4% | 69.2%| 60.9%| 69.1%| 70.8%| 42.4%| 67.1%| 64.9%|
69
- | GPT-4 | - | **84.1%** | **76.4%** | **81.6%**| **77.2%**| **77.4%**| **79.1%**| **58.2%**| **78.0%**| **76.5%**|
70
- | | | | | | | | | | | |
71
- | DeepSeek-Coder-Instruct | 1.3B | 65.2% | 45.3% | 51.9% | 45.3% | 59.7% |55.1% | 12.7% | 52.2% | 48.4% |
72
- | DeepSeek-Coder-Instruct | 6.7B | 78.9% | 63.4% | 68.4% | 68.9%| 67.2%| 72.8%| 36.7%| 72.7%| 66.1%|
73
- | DeepSeek-Coder-Instruct | 33B | **79.3%** | **68.9%** | **73.4%** | **72.7%**| **67.9%**| **74.1%**| **43.0%**| **73.9%**| **69.2%**|
74
-
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cpp.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8717eabdf202137158c84506144b0fb1e73d5ecccbe5363ec79009ca014df629
3
- size 388688
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cs DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cs-bu.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d281b53b24e0f44cb76f1b1a8702b1ca668ff2a29c7621276ee8b658f5c124c6
3
- size 448701
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-cs.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c3fe18be10addc2d0b96311f4db192ae3232e08628d17768d889d6ab87be224
3
- size 452021
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-d.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:226938c015f90d713a3e30d8f174f4a6d2c88820cf50512379a16890dda70332
3
- size 289365
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-go.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f5dfe3ff001049b1221e9d21a8119b5cbc38eb87c97fefc5d57fa7adc1df888
3
- size 432325
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-java DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-java.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b21b015763452ec4f9f4e0e6425148ec331ace4da1232c8b4d441186185f6265
3
- size 454059
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-jl.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ef21bd920889c9c1ab0e87a825cc26bd895e7a715f569f8ba7de577f870b6815
3
- size 268754
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-js.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:634e2eee8d6e22de07c121b972207683dd96be76256bf44bdfc1a3386b739287
3
- size 297853
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-lua.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d2e91f3603f6aee63db2c2d5754d165397603a8c9bd6130842af7988b27a96fc
3
- size 298314
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-php DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-php.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:bebd86875b1d8e65a8b7e692ed7bdf64612b44aa388825ea4dab40c7047c786b
3
- size 388096
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-pl.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:949048bff3eaea7ae47cd5759042c088a7c227d53c74fe95a80728fd5aefbf77
3
- size 437506
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-python.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0eae07adadbb00d51962fdb78b9e2a26bfa8ade85dc54eb57cae9bffed2f5c54
3
- size 342974
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-r.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c2a3268ea54d2bfb8ce65bb2c808437ac0d9934c6caf99fcf75d6b6a4fb3f911
3
- size 311904
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-rb.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4204395b835019e32d55513325a0fad01c6d382fd1eb97b516f0458d00058302
3
- size 312806
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-rkt.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:083325daf55daf431ae6d6d17cf017afb18a6ec790d4c841b7c2b4752c5807ff
3
- size 312006
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-rs.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e33f562838e973b7e6b8d76dbc1fb84b076d7426629b4cdc624f12678778d2fa
3
- size 306470
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-scala.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:482b72a16613a563755d642e37792da68471399789b069fba5b1249e831445f3
3
- size 384243
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-sh DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-sh.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f702106e9ce4aa7385de568d9248d6c57c90382d782b162cb9e072fbd01ccf8
3
- size 274180
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-swift.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e4418d6f532e4298969e727690cda80dc88b94e4abf836f8ff79ac18a737eaaa
3
- size 344436
 
 
 
 
evaluation/general_benchmarks/HumanEval/data/humaneval-ts DELETED
The diff for this file is too large to render. See raw diff
 
evaluation/general_benchmarks/HumanEval/data/humaneval-ts.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4b1af1050c9f226335d54a8c80553c39454a525cace3e67bdcfcc9092ba02637
3
- size 304732
 
 
 
 
evaluation/general_benchmarks/HumanEval/eval.sh DELETED
@@ -1,4 +0,0 @@
1
- MODEL_NAME_OR_PATH="/data3/models/DeepSeek/deepseek-coder-6.7b-base"
2
- DATASET_ROOT="HumanEval/data"
3
- LANGUAGE="python"
4
- CUDA_VISIBLE_DEVICES=5,6,7 python -m accelerate.commands.launch --config_file HumanEval/test_config.yaml HumanEval/eval_pal.py --model_path ${MODEL_NAME_OR_PATH} --language ${LANGUAGE} --dataroot ${DATASET_ROOT}
 
 
 
 
 
evaluation/general_benchmarks/HumanEval/eval_base_vllm.py DELETED
@@ -1,162 +0,0 @@
1
- import json
2
- import os
3
- import time
4
- from argparse import ArgumentParser
5
- # from accelerate import Accelerator
6
- # from accelerate import DistributedDataParallelKwargs
7
- from pathlib import Path
8
-
9
- import numpy as np
10
- import torch
11
- import torch.nn.functional as F
12
- from human_eval.evaluation import evaluate_functional_correctness
13
- from tqdm import tqdm
14
- from transformers import AutoModelForCausalLM, AutoTokenizer
15
- from utils.dataset import HumanEvalDataset
16
- from utils.utils import cleanup_code
17
- from vllm import LLM, SamplingParams
18
-
19
-
20
- class HumanEval:
21
- """
22
- HumanEval evaluation class.
23
- """
24
-
25
- def __init__(
26
- self,
27
- data_root,
28
- language="python",
29
- log_dir=None,
30
- issft=False,
31
- inference_increment=True,
32
- n_sample=1,
33
- k_sample=1,
34
- ):
35
- self.data_root = data_root
36
- self.k = k_sample
37
- self.n_sample = n_sample
38
- self.language = language
39
- self.log_dir = log_dir
40
- self.sft = issft
41
- self.inference_increment = inference_increment
42
- os.makedirs(self.log_dir, exist_ok=True)
43
-
44
- @torch.no_grad()
45
- def eval_model(self, args):
46
- """
47
- Evaluate the model on HumanEval.
48
- """
49
- assert (
50
- self.log_dir is not None
51
- ), "log_dir should not be None when evaluating humaneval"
52
- dataset = HumanEvalDataset(
53
- self.data_root,
54
- sample_num=self.n_sample,
55
- language=self.language,
56
- issft=self.sft,
57
- )
58
- model_name_or_path = args.model_path
59
- print("model", model_name_or_path)
60
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
61
- print(
62
- "load tokenizer {} from {} over.".format(
63
- tokenizer.__class__, model_name_or_path
64
- )
65
- )
66
-
67
- llm = LLM(
68
- model=model_name_or_path,
69
- tensor_parallel_size=1,
70
- max_model_len=4096,
71
- trust_remote_code=True,
72
- enforce_eager=True,
73
- )
74
- sampling_params = SamplingParams(
75
- temperature=0,
76
- max_tokens=1024,
77
- top_p=0.95,
78
- stop_token_ids=[tokenizer.eos_token_id],
79
- )
80
- messages_list = []
81
- for i in range(len(dataset)):
82
- data = dataset[i]
83
- prompt = data["prompt"].strip()
84
- messages_list.append(prompt)
85
- outputs = llm.generate(messages_list, sampling_params=sampling_params)
86
- assert len(dataset) == len(outputs), "dataset and outputs different lengths."
87
- log_file = os.path.join(self.log_dir, f"{self.language}.json")
88
- tmpfile = open(log_file, "w")
89
- for i, output in enumerate(tqdm(outputs)):
90
- data = dataset[i]
91
- output = output.outputs[0].text
92
- output = cleanup_code(
93
- output,
94
- self.language,
95
- "humaneval",
96
- self.sft,
97
- dataset.stopwords,
98
- )
99
- # sft mode does not need original prompt
100
- if not self.sft:
101
- suffixprediction = data["original_prompt"] + "\n" + output
102
- res = {
103
- "task_id": data["task_id"],
104
- "generation": suffixprediction,
105
- "prompt": data["original_prompt"],
106
- }
107
- tmpfile.write(json.dumps(res) + "\n")
108
-
109
- tmpfile.close()
110
- # calculate the final score of pass@k
111
- self._calculate_final_score(log_file)
112
- return
113
-
114
- def _calculate_final_score(self, logfilepath):
115
- """
116
- Calculate the final score.
117
- """
118
- res = evaluate_functional_correctness(
119
- input_file=logfilepath,
120
- problem_file=os.path.join(
121
- self.data_root, f"humaneval-{self.language}.jsonl"
122
- ),
123
- tmp_dir=self.log_dir,
124
- language=self.language,
125
- )
126
- print("score is", res["pass@%d" % self.k])
127
- os.remove(logfilepath)
128
- return
129
-
130
-
131
- if __name__ == "__main__":
132
- parser = ArgumentParser()
133
- parser.add_argument("--logdir", type=str, default="")
134
- parser.add_argument(
135
- "--model_path",
136
- type=str,
137
- help="model name or path",
138
- default="/data0/pretrained-models/qwen2-7b",
139
- )
140
-
141
- parser.add_argument("--language", type=str, default="python")
142
- parser.add_argument(
143
- "--dataroot",
144
- type=str,
145
- default="HumanEval/data",
146
- )
147
- args = parser.parse_args()
148
-
149
- logdir = args.logdir
150
- language = args.language
151
-
152
- if logdir == "":
153
- logdir = "output/tmp/"
154
-
155
- evaluator = HumanEval(
156
- data_root=args.dataroot,
157
- log_dir=logdir,
158
- n_sample=1,
159
- language=language,
160
- )
161
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
162
- evaluator.eval_model(args)
evaluation/general_benchmarks/HumanEval/eval_instruct.py DELETED
@@ -1,168 +0,0 @@
1
- import argparse
2
- import json
3
- import os
4
- from pathlib import Path
5
-
6
- import torch
7
- from tqdm import tqdm
8
-
9
- data_abs_dir = Path(__file__).parent / "data"
10
-
11
- from human_eval.evaluation import evaluate_functional_correctness
12
- from transformers import AutoModelForCausalLM, AutoTokenizer
13
- from utils.utils import extract_generation_code, languge_settings
14
-
15
-
16
- def build_deepseekcoder_instruction(languge: str, question: str):
17
- return """
18
- Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
19
- ```{}
20
- {}
21
- ```
22
- """.strip().format(
23
- languge.lower(), question.strip()
24
- )
25
-
26
-
27
- def generate_one(example, lang, tokenizer, model):
28
- prompt = build_deepseekcoder_instruction(
29
- languge_settings[lang]["full_name"], example["prompt"]
30
- )
31
- inputs = tokenizer.apply_chat_template(
32
- [{"role": "user", "content": prompt}],
33
- return_tensors="pt",
34
- add_generation_prompt=True,
35
- ).to(model.device)
36
-
37
- stop_id = tokenizer.convert_tokens_to_ids("<|EOT|>")
38
- assert isinstance(stop_id, int), "Invalid tokenizer, EOT id not found"
39
-
40
- outputs = model.generate(
41
- inputs,
42
- max_new_tokens=1024,
43
- do_sample=False,
44
- # top_p=0.95,
45
- # temperature=temperature,
46
- pad_token_id=stop_id,
47
- eos_token_id=stop_id,
48
- )
49
-
50
- output = tokenizer.decode(outputs[0][len(inputs[0]) :], skip_special_tokens=True)
51
- example["output"] = output
52
-
53
- return extract_generation_code(example, lang_code=lang)
54
-
55
-
56
- def generate_main(args):
57
- model_name_or_path = args.model
58
- lang = args.language
59
- saved_path = args.output_path
60
- temp_dir = args.temp_dir
61
- os.makedirs(temp_dir, exist_ok=True)
62
- problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
63
-
64
- print("model", model_name_or_path)
65
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
66
- print(
67
- "load tokenizer {} from {} over.".format(
68
- tokenizer.__class__, model_name_or_path
69
- )
70
- )
71
- model = AutoModelForCausalLM.from_pretrained(
72
- model_name_or_path,
73
- torch_dtype=torch.bfloat16,
74
- device_map="auto",
75
- # use_flash_attention_2=True
76
- )
77
- model.eval()
78
- examples = [json.loads(x) for x in open(problem_file) if x.strip()]
79
- print("Read {} examples for evaluation over.".format(len(examples)))
80
-
81
- generated_examples = []
82
- for ex in tqdm(examples, desc="Generating"):
83
- gen_example = generate_one(ex, args.language, tokenizer, model)
84
- generated_examples.append(gen_example)
85
-
86
- print("Generate all over!!!")
87
- with open(saved_path, "w", encoding="utf-8") as fw:
88
- for ex in generated_examples:
89
- fw.write(json.dumps(ex) + "\n")
90
- print(
91
- "Save {} processed examples into {} over!".format(
92
- len(generated_examples), saved_path
93
- )
94
- )
95
-
96
- result = evaluate_functional_correctness(
97
- input_file=saved_path,
98
- tmp_dir=temp_dir,
99
- n_workers=8,
100
- timeout=3.0,
101
- problem_file=problem_file,
102
- language=lang,
103
- )
104
- print(lang, result, model_name_or_path)
105
- pass
106
-
107
-
108
- def evaluation_only(args):
109
- lang = args.language
110
- temp_dir = args.temp_dir
111
- assert os.path.exists(args.output_path), "Output file not found: {}".format(
112
- args.output_path
113
- )
114
- os.makedirs(temp_dir, exist_ok=True)
115
-
116
- output_name = os.path.basename(args.output_path)
117
- output_examples = [json.loads(x) for x in open(args.output_path) if x.strip()]
118
-
119
- processed_examples = [
120
- extract_generation_code(ex, lang) for ex in tqdm(output_examples, "Processing")
121
- ]
122
- processed_path = os.path.join(temp_dir, output_name)
123
- with open(processed_path, "w", encoding="utf-8") as fw:
124
- for ex in processed_examples:
125
- fw.write(json.dumps(ex) + "\n")
126
- print(
127
- "Save {} processed examples into {} over!".format(
128
- len(processed_examples), processed_path
129
- )
130
- )
131
-
132
- problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
133
- from human_eval.evaluation import evaluate_functional_correctness
134
-
135
- result = evaluate_functional_correctness(
136
- input_file=processed_path,
137
- tmp_dir=temp_dir,
138
- n_workers=8,
139
- timeout=3.0,
140
- problem_file=problem_file,
141
- language=lang,
142
- )
143
- print(lang, result)
144
-
145
-
146
- if __name__ == "__main__":
147
- parser = argparse.ArgumentParser()
148
- parser.add_argument(
149
- "--model",
150
- type=str,
151
- help="model name or path",
152
- default="/data0/pretrained-models/deepseek-coder-6.7b-instruct",
153
- )
154
- parser.add_argument(
155
- "--output_path",
156
- type=str,
157
- help="output path of your generation",
158
- default="/home/qyhuang/DeepSeek-Coder/outputs/deepseek-chat.json",
159
- )
160
- parser.add_argument("--language", type=str, help="langauge", default="python")
161
- parser.add_argument(
162
- "--temp_dir", type=str, help="temp dir for evaluation", default="tmp"
163
- )
164
- args = parser.parse_args()
165
-
166
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
167
- generate_main(args)
168
- pass
evaluation/general_benchmarks/HumanEval/eval_instruct_vllm.py DELETED
@@ -1,225 +0,0 @@
1
- import argparse
2
- import json
3
- import os
4
- import shutil
5
- from pathlib import Path
6
-
7
- import torch
8
- import transformers
9
- from human_eval.evaluation import evaluate_functional_correctness
10
- from tqdm import tqdm
11
- from transformers import AutoTokenizer
12
- from utils.utils import extract_generation_code, languge_settings
13
- from vllm import LLM, SamplingParams
14
-
15
- data_abs_dir = Path(__file__).parent / "data"
16
-
17
-
18
- def build_deepseekcoder_instruction(languge: str, question: str):
19
- return """
20
- Please continue to complete the function. You are not allowed to modify the given code and do the completion only. Please return all completed function in a codeblock. Here is the given code to do completion:
21
- ```{}
22
- {}
23
- ```
24
- """.strip().format(
25
- languge.lower(), question.strip()
26
- )
27
-
28
-
29
- def create_dir(output_dir):
30
- if os.path.exists(output_dir):
31
- if not os.access(output_dir, os.W_OK):
32
- shutil.rmtree(output_dir)
33
- os.makedirs(output_dir)
34
- os.chmod(output_dir, 0o777)
35
- print("not write permission, makedir:", output_dir)
36
- else:
37
- print(f"{output_dir} exists!")
38
- else:
39
- os.makedirs(output_dir)
40
- os.chmod(output_dir, 0o777)
41
- print("makedir:", output_dir)
42
-
43
-
44
- def get_client_res(messages, example, output_key, open_ai_key=False):
45
- try:
46
- if open_ai_key:
47
- from openai import AzureOpenAI, OpenAI
48
- try:
49
- api_key = os.environ["OPENAI_API_KEY"]
50
- except KeyError:
51
- print("环境变量 OPENAI_API_KEY 未设置")
52
- api_key = "default_value"
53
-
54
- client = AzureOpenAI(
55
- api_key=api_key,
56
- api_version="2024-07-01-preview",
57
- azure_endpoint="https://zju-tablegpt.openai.azure.com/",
58
- )
59
- chat_response = client.chat.completions.create(
60
- model="gpt-4o",
61
- # model="gpt-4o-mini",
62
- messages=messages,
63
- top_p=0.95,
64
- temperature=0,
65
- max_tokens=1024,
66
- timeout=40,
67
- )
68
- else:
69
- # Set OpenAI's API key and API base to use vLLM's API server.
70
- openai_api_key = "EMPTY"
71
- openai_api_base = "http://localhost:8080/v1"
72
-
73
- client = OpenAI(
74
- api_key=openai_api_key,
75
- base_url=openai_api_base,
76
- )
77
- chat_response = client.chat.completions.create(
78
- model="qwen2-7b-sft",
79
- messages=messages,
80
- top_p=0.3,
81
- temperature=0.1,
82
- max_tokens=1024,
83
- )
84
- example[output_key] = chat_response.choices[0].message.content
85
- except Exception as e:
86
- print(f"An unexpected error occurred: {e}")
87
- example[output_key] = None
88
- example["input"] = messages
89
- return example
90
-
91
-
92
-
93
- def generate_main(args):
94
- model_name_or_path = args.model_path
95
- lang = args.language
96
- temp_dir = args.temp_dir
97
- create_dir(temp_dir)
98
- # os.makedirs(temp_dir, exist_ok=True)
99
- problem_file = os.path.join(data_abs_dir, f"humaneval-{lang}.jsonl")
100
- if not args.api:
101
- print("model", model_name_or_path)
102
- tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
103
- print(
104
- "load tokenizer {} from {} over.".format(
105
- tokenizer.__class__, model_name_or_path
106
- )
107
- )
108
- llm_args = {
109
- "model": model_name_or_path,
110
- "gpu_memory_utilization": 0.95,
111
- "trust_remote_code": True,
112
- "tensor_parallel_size": args.gpus_num,
113
- "dtype": "half",
114
- "max_model_len": 8192,
115
- "enforce_eager": True,
116
- }
117
-
118
- llm = LLM(**llm_args)
119
- sampling_params = SamplingParams(
120
- temperature=0,
121
- max_tokens=1024,
122
- top_p=0.95,
123
- stop_token_ids=[tokenizer.eos_token_id],
124
- )
125
-
126
- examples = [json.loads(x) for x in open(problem_file) if x.strip()]
127
- print("Read {} examples for evaluation over.".format(len(examples)))
128
- messages_list = []
129
- for example in tqdm(examples, desc="Generating"):
130
- prompt = build_deepseekcoder_instruction(
131
- languge_settings[lang]["full_name"], example["prompt"]
132
- )
133
- message = [{"role": "user", "content": prompt}]
134
- if args.api:
135
- messages_list.append(message)
136
- else:
137
- messages_list.append(
138
- tokenizer.apply_chat_template(
139
- message, tokenize=False, add_generation_prompt=True
140
- )
141
- )
142
- if args.api:
143
- from joblib import Parallel, delayed
144
- examples_ = Parallel(n_jobs=24)(
145
- delayed(get_client_res)(inp, examples[i], "output",open_ai_key=True)
146
- for i, inp in enumerate(tqdm(messages_list))
147
- )
148
-
149
- # Re-request the examples whose first request failed
150
- examples = []
151
- for example in examples_:
152
- if example["output"] == None:
153
- example = get_client_res(
154
- example["input"], example, "output", open_ai_key=True
155
- )
156
- del example["input"]
157
- examples.append(example)
158
-
159
- generated_examples = []
160
- for example in examples:
161
- example = extract_generation_code(example, lang_code=lang)
162
- generated_examples.append(example)
163
- else:
164
- outputs = llm.generate(messages_list, sampling_params=sampling_params)
165
- generated_examples = []
166
- for i, output in enumerate(tqdm(outputs)):
167
- output = output.outputs[0].text
168
- example = examples[i]
169
- example["output"] = output
170
- example = extract_generation_code(example, lang_code=lang)
171
- generated_examples.append(example)
172
-
173
- print("Generate all over!!!")
174
- # os.makedirs(args.save_dir, exist_ok=True)
175
- create_dir(args.save_dir)
176
- saved_path = os.path.join(args.save_dir, "results_humaneval.json")
177
- with open(saved_path, "w", encoding="utf-8") as fw:
178
- for ex in generated_examples:
179
- fw.write(json.dumps(ex) + "\n")
180
- print(
181
- "Save {} processed examples into {} over!".format(
182
- len(generated_examples), saved_path
183
- )
184
- )
185
-
186
- result = evaluate_functional_correctness(
187
- input_file=saved_path,
188
- tmp_dir=temp_dir,
189
- n_workers=8,
190
- timeout=3.0,
191
- problem_file=problem_file,
192
- language=lang,
193
- out_path=saved_path,
194
- )
195
- print(lang, result, model_name_or_path)
196
-
197
-
198
- if __name__ == "__main__":
199
- parser = argparse.ArgumentParser()
200
- parser.add_argument(
201
- "--model_path",
202
- type=str,
203
- help="model name or path",
204
- default="/data4/sft_output/qwen2-instruct-0709/checkpoint-1400",
205
- )
206
- parser.add_argument(
207
- "--gpus_num", type=int, default=1, help="the number of GPUs you want to use."
208
- )
209
- parser.add_argument(
210
- "--save_dir",
211
- type=str,
212
- help="output path of your generation",
213
- default="output",
214
- )
215
- parser.add_argument("--api", action="store_true", help="infer api type")
216
- parser.add_argument("--language", type=str, help="langauge", default="python")
217
- parser.add_argument(
218
- "--temp_dir", type=str, help="temp dir for evaluation", default="output/tmp"
219
- )
220
- parser.add_argument("--seed", type=int, help="seed", default=42)
221
- args = parser.parse_args()
222
-
223
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
224
- transformers.set_seed(args.seed)
225
- generate_main(args)
evaluation/general_benchmarks/HumanEval/eval_pal.py DELETED
@@ -1,62 +0,0 @@
1
- import json
2
- import os
3
- import subprocess
4
- import sys
5
- from argparse import ArgumentParser
6
- from pathlib import Path
7
-
8
- import numpy as np
9
- import pandas as pd
10
- import torch
11
- import torch.distributed as dist
12
- import torch.nn.functional as F
13
- from accelerate import Accelerator, DistributedDataParallelKwargs
14
- from humaneval import HumanEval as evaltor
15
- from transformers import AutoModelForCausalLM, AutoTokenizer
16
-
17
- if __name__ == "__main__":
18
- kwargs_handlers = [DistributedDataParallelKwargs(find_unused_parameters=True)]
19
- accelerator = Accelerator(mixed_precision="bf16", kwargs_handlers=kwargs_handlers)
20
-
21
- parser = ArgumentParser()
22
- parser.add_argument("--logdir", type=str, default="./output")
23
- parser.add_argument(
24
- "--model_path",
25
- type=str,
26
- default="/data3/models/DeepSeek/deepseek-coder-6.7b-base",
27
- )
28
- parser.add_argument("--language", type=str, default="python")
29
- parser.add_argument("--dataroot", type=str, default="HumanEval/data")
30
- args = parser.parse_args()
31
-
32
- logdir = args.logdir
33
- language = args.language
34
- model_path = args.model_path
35
-
36
- if logdir == "":
37
- logdir = "tmp/"
38
- tokenizer = dict(
39
- cls=AutoTokenizer,
40
- model_path=model_path,
41
- )
42
-
43
- dataroot = args.dataroot
44
-
45
- evaluator = evaltor(
46
- data_root=dataroot,
47
- max_seq_len=4096,
48
- tokenizer_cfg=tokenizer,
49
- log_dir=logdir,
50
- n_sample=1,
51
- batch_size=1,
52
- language=language,
53
- max_gen_len=500,
54
- )
55
- model = AutoModelForCausalLM.from_pretrained(
56
- model_path,
57
- device_map=accelerator.device,
58
- trust_remote_code=True,
59
- torch_dtype=torch.bfloat16,
60
- )
61
- os.environ["TOKENIZERS_PARALLELISM"] = "false"
62
- evaluator.eval_model(model, accelerator)
evaluation/general_benchmarks/HumanEval/human_eval/__init__.py DELETED
File without changes
evaluation/general_benchmarks/HumanEval/human_eval/data.py DELETED
@@ -1,48 +0,0 @@
1
- import gzip
2
- import json
3
- import os
4
- from typing import Dict, Iterable
5
-
6
- ROOT = os.path.dirname(os.path.abspath(__file__))
7
- HUMAN_EVAL = os.path.join(ROOT, "..", "data", "HumanEval.jsonl.gz")
8
-
9
-
10
- def read_problems(evalset_file: str = HUMAN_EVAL) -> Dict[str, Dict]:
11
- return {task["task_id"]: task for task in stream_jsonl(evalset_file)}
12
-
13
-
14
- def stream_jsonl(filename: str) -> Iterable[Dict]:
15
- """
16
- Parses each jsonl line and yields it as a dictionary
17
- """
18
- if filename.endswith(".gz"):
19
- with open(filename, "rb") as gzfp:
20
- with gzip.open(gzfp, "rt") as fp:
21
- for line in fp:
22
- if any(not x.isspace() for x in line):
23
- yield json.loads(line)
24
- else:
25
- with open(filename, "r", encoding="utf-8") as fp:
26
- for line in fp:
27
- if any(not x.isspace() for x in line):
28
- yield json.loads(line)
29
-
30
-
31
- def write_jsonl(filename: str, data: Iterable[Dict], append: bool = False):
32
- """
33
- Writes an iterable of dictionaries to jsonl
34
- """
35
- if append:
36
- mode = "ab"
37
- else:
38
- mode = "wb"
39
- filename = os.path.expanduser(filename)
40
- if filename.endswith(".gz"):
41
- with open(filename, mode) as fp:
42
- with gzip.GzipFile(fileobj=fp, mode="wb") as gzfp:
43
- for x in data:
44
- gzfp.write((json.dumps(x) + "\n").encode("utf-8"))
45
- else:
46
- with open(filename, mode) as fp:
47
- for x in data:
48
- fp.write((json.dumps(x) + "\n").encode("utf-8"))
evaluation/general_benchmarks/HumanEval/human_eval/evaluate_functional_correctness.py DELETED
@@ -1,32 +0,0 @@
1
- import sys
2
-
3
- import fire
4
-
5
- from .data import HUMAN_EVAL
6
- from .evaluation import evaluate_functional_correctness
7
-
8
-
9
- def entry_point(
10
- sample_file: str,
11
- k: str = "1,10,100",
12
- n_workers: int = 4,
13
- timeout: float = 3.0,
14
- problem_file: str = "",
15
- is_mbpp: bool = False,
16
- ):
17
- """
18
- Evaluates the functional correctness of generated samples, and writes
19
- results to f"{sample_file}_results.jsonl.gz"
20
- """
21
- k = list(map(int, k.split(",")))
22
- results = evaluate_functional_correctness(
23
- sample_file, k, n_workers, timeout, problem_file, is_mbpp
24
- )
25
- print(results)
26
-
27
-
28
- def main():
29
- fire.Fire(entry_point)
30
-
31
-
32
- sys.exit(main())
evaluation/general_benchmarks/HumanEval/human_eval/evaluation.py DELETED
@@ -1,351 +0,0 @@
1
- import gzip
2
- import itertools
3
- import json
4
- import os
5
- from concurrent.futures import ThreadPoolExecutor, as_completed
6
- from typing import *
7
-
8
- import numpy as np
9
- from tqdm.auto import tqdm
10
-
11
- from human_eval.data import stream_jsonl
12
- from human_eval.execution import check_correctness
13
-
14
- IMPORT_HELPER = {
15
- "python": [
16
- "import math",
17
- "import re",
18
- "import sys",
19
- "import copy",
20
- "import datetime",
21
- "import itertools",
22
- "import collections",
23
- "import heapq",
24
- "import functools",
25
- "import hashlib",
26
- "import numpy",
27
- "import numpy as np",
28
- "import string",
29
- "from typing import *",
30
- "from collections import *",
31
- ],
32
- "go": [
33
- "math",
34
- "strings",
35
- "fmt",
36
- "strconv",
37
- "time",
38
- "bytes",
39
- "regexp",
40
- "sort",
41
- "math/rand",
42
- "crypto/md5",
43
- ],
44
- "cpp": [
45
- "#include<stdlib.h>",
46
- "#include<algorithm>",
47
- "#include<math.h>",
48
- "#include<stdio.h>",
49
- "#include<vector>",
50
- "#include<string>",
51
- "#include<climits>",
52
- "#include<cstring>",
53
- "#include<iostream>",
54
- "#include<cassert>",
55
- ],
56
- "cs": [
57
- "using System.Numerics;",
58
- "using System.Diagnostics;",
59
- "using System.Collections.Generic;",
60
- "using System.Linq;",
61
- "using System.Text;",
62
- "using System.Security.Cryptography;",
63
- "using System.Collections.Generic;",
64
- ],
65
- }
66
-
67
-
68
- LANGUAGE_NAME = {
69
- "cpp": "CPP",
70
- "go": "Go",
71
- "java": "Java",
72
- "js": "JavaScript",
73
- "python": "Python",
74
- }
75
-
76
-
77
- def read_dataset(
78
- data_file: str = None,
79
- dataset_type: str = "humaneval",
80
- num_shot=None,
81
- ) -> Dict:
82
- """
83
- Reads a dataset and returns a dictionary of tasks.
84
- """
85
- if num_shot is not None:
86
- print(f"{num_shot}-shot setting...")
87
- if "humaneval" in dataset_type.lower():
88
- if data_file is None:
89
- current_path = os.path.dirname(os.path.abspath(__file__))
90
- data_file = os.path.join(
91
- current_path,
92
- "..",
93
- "humaneval-x",
94
- "python",
95
- "data",
96
- "humaneval_python.jsonl.gz",
97
- )
98
- dataset = {task["task_id"]: task for task in stream_jsonl(data_file)}
99
- else:
100
- raise f"Dataset: {dataset_type} not supported."
101
-
102
- return dataset
103
-
104
-
105
- def estimate_pass_at_k(
106
- num_samples: Union[int, List[int], np.ndarray],
107
- num_correct: Union[List[int], np.ndarray],
108
- k: int,
109
- ) -> np.ndarray:
110
- """
111
- Estimates pass@k of each problem and returns them in an array.
112
- """
113
-
114
- def estimator(n: int, c: int, k: int) -> float:
115
- """
116
- Calculates 1 - comb(n - c, k) / comb(n, k).
117
- """
118
- if n - c < k:
119
- return 1.0
120
- return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
121
-
122
- if isinstance(num_samples, int):
123
- num_samples_it = itertools.repeat(num_samples, len(num_correct))
124
- else:
125
- assert len(num_samples) == len(num_correct)
126
- num_samples_it = iter(num_samples)
127
-
128
- return np.array(
129
- [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
130
- )
131
-
132
-
133
- def process_humaneval_test(
134
- sample, problems, example_test=False, is_mbpp=False, language="python"
135
- ):
136
- """
137
- Processes a sample for evaluation.
138
- """
139
- task_id = sample["task_id"]
140
- if is_mbpp:
141
- return sample["generation"] + "\n" + "\n".join(problems[task_id]["test"])
142
-
143
- prompt = sample["prompt"]
144
- if (
145
- example_test
146
- and "example_test" in problems[task_id]
147
- and problems[task_id]["example_test"] != ""
148
- ):
149
- test = problems[task_id]["example_test"]
150
- else:
151
- test = problems[task_id]["test"]
152
- code = sample["generation"]
153
-
154
- # Pre-process for different languages
155
- if language == "python":
156
- test_setup = "\n".join(IMPORT_HELPER["python"]) + "\n"
157
- test_string = test_setup + code + "\n" + test + "\n"
158
- elif language == "cpp":
159
- test_set_up = ""
160
- for s in IMPORT_HELPER["cpp"]:
161
- if s not in prompt:
162
- test_set_up += s + "\n"
163
- test_string = test_set_up + "\n" + code + "\n" + test
164
- elif language == "java":
165
- test_string = code + "\n" + test
166
- elif language == "cs":
167
- test_set_up = ""
168
- for s in IMPORT_HELPER["cs"]:
169
- test_set_up += s + "\n"
170
- test_string = test_set_up + "\n" + code + "\n" + test
171
- elif language in ["js", "javascript", "ts", "sh", "go"]:
172
- test_string = code + "\n" + test
173
- elif language == "go232":
174
- import_string = problems[task_id]["import"]
175
- prompt = prompt.replace(import_string, "")
176
- if example_test and "example_test" in problems[task_id]:
177
- test = problems[task_id]["example_test"]
178
- else:
179
- test = problems[task_id]["test"]
180
- test_setup = problems[task_id]["test_setup"]
181
- other_pkgs = []
182
- for pkg in IMPORT_HELPER["go"]:
183
- if pkg not in test_setup:
184
- p = pkg.split("/")[-1]
185
- if p + "." in code:
186
- other_pkgs.append(f'"{pkg}"')
187
- if other_pkgs:
188
- import_other_pkgs = (
189
- "import (\n" + " ".join([p + "\n" for p in other_pkgs]) + ")"
190
- )
191
- test_string = (
192
- test_setup
193
- + "\n"
194
- + import_other_pkgs
195
- + "\n"
196
- + prompt
197
- + code
198
- + "\n"
199
- + test
200
- )
201
- else:
202
- test_string = test_setup + "\n" + prompt + code + "\n" + test
203
- elif language == "rust":
204
- main = "\nfn main(){ \n } \n"
205
- declaration = problems[task_id]["declaration"]
206
- test_string = main + declaration + prompt + code + test
207
- elif language == "php":
208
- if code[:5] != "<?php":
209
- code = "<?php\n" + code
210
- test_string = code + "\n" + test + "?>"
211
- return test_string
212
-
213
-
214
- def stream_jsonl_all(filename: str) -> Iterable[Dict]:
215
- """
216
- Reads an entire JSONL file (optionally gzip-compressed) into a list of records.
217
- """
218
- results = []
219
- if filename.endswith(".gz"):
220
- fp = gzip.open(open(filename, "rb"), "rt")
221
- else:
222
- fp = open(filename, "r")
223
- for line in fp:
224
- if any(not x.isspace() for x in line):
225
- results.append(json.loads(line))
226
- fp.close()
227
-
228
- return results
229
-
230
-
231
- def evaluate_functional_correctness(
232
- input_file: str = None,
233
- tmp_dir: str = "./",
234
- n_workers: int = 32,
235
- timeout: float = 10.0,
236
- problem_file: str = "../data/humaneval_python.jsonl.gz",
237
- out_path: str = None,
238
- k: List[int] = [1, 10, 100],
239
- test_groundtruth: bool = False,
240
- example_test: bool = False,
241
- is_mbpp: bool = False,
242
- language: str = "python",
243
- ):
244
- """
245
- Evaluates the functional correctness of a model.
246
- """
247
- if example_test:
248
- print("Example test...")
249
-
250
- problems = read_dataset(problem_file, dataset_type="humaneval")
251
- sample_jsonl = stream_jsonl_all(input_file)
252
-
253
- with ThreadPoolExecutor(max_workers=n_workers) as executor:
254
-
255
- futures = []
256
- completion_id = Counter()
257
- n_samples = 0
258
- # results = defaultdict(list)
259
- results = {}
260
-
261
- if test_groundtruth:
262
- print("Testing ground truth...")
263
- for sample in tqdm(problems.values()):
264
- task_id = sample["task_id"]
265
- lang = task_id.split("/")[0].lower()
266
- if lang == "javascript":
267
- lang = "js"
268
- tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
269
- sample["generation"] = sample["canonical_solution"]
270
- sample["test_code"] = process_humaneval_test(
271
- sample, problems, example_test, language=language
272
- )
273
- if sample["test_code"] is None:
274
- continue
275
- args = (
276
- task_id,
277
- sample,
278
- lang,
279
- timeout,
280
- tmp_dir_,
281
- completion_id[task_id],
282
- )
283
- future = executor.submit(check_correctness, *args)
284
- futures.append(future)
285
- completion_id[task_id] += 1
286
- n_samples += 1
287
- else:
288
- print("Reading samples...")
289
- for sample in tqdm(sample_jsonl):
290
- task_id = sample["task_id"]
291
- if not is_mbpp:
292
- lang = language
293
- if not is_mbpp and lang == "javascript":
294
- lang = "js"
295
- if is_mbpp:
296
- lang = "python"
297
- tmp_dir_ = os.path.join(tmp_dir, lang, "evaluation")
298
- sample["task_id"] = task_id
299
- sample["test_code"] = process_humaneval_test(
300
- sample, problems, example_test, is_mbpp, language
301
- )
302
- if sample["test_code"] is None:
303
- continue
304
- if "completion_id" in sample:
305
- completion_id_ = sample["completion_id"]
306
- else:
307
- completion_id_ = completion_id[task_id]
308
- args = (task_id, sample, lang, timeout, tmp_dir_, completion_id_)
309
- future = executor.submit(check_correctness, *args)
310
- futures.append(future)
311
- completion_id[task_id] += 1
312
- n_samples += 1
313
-
314
- if len(completion_id) == len(problems):
315
- evaluate_pass_at_k = True
316
- else:
317
- evaluate_pass_at_k = False
318
-
319
- print("Running test suites...")
320
- for future in tqdm(as_completed(futures), total=len(futures)):
321
- result = future.result()
322
- # results[result["task_id"]].append((result["completion_id"], result))
323
- results[result["task_id"]] = result
324
-
325
- # Calculate pass@k.
326
- total, correct = [], []
327
- for result in results.values():
328
- # passed = [r[1]["passed"] for r in result]
329
- passed = [result["passed"]]
330
- total.append(len(passed))
331
- correct.append(sum(passed))
332
- total = np.array(total)
333
- correct = np.array(correct)
334
-
335
- if evaluate_pass_at_k:
336
- ks = k
337
- pass_at_k = {
338
- f"pass@{k}": estimate_pass_at_k(total, correct, k).mean()
339
- for k in ks
340
- if (total >= k).all()
341
- }
342
- print(pass_at_k)
343
- else:
- pass_at_k = {}  # not every problem has a completion, so pass@k is not computed
344
- print("Total:", np.sum(total))
345
- print("Correct:", np.sum(correct))
346
-
347
- if out_path:
348
- with open(out_path, "w") as f:
349
- json.dump(list(results.values()), f, ensure_ascii=False)
350
-
351
- return pass_at_k
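(Note on the deleted evaluation.py above: `estimate_pass_at_k` implements the unbiased estimator 1 - C(n-c, k) / C(n, k) as a numerically stable running product. A minimal, self-contained sketch with made-up numbers, purely for illustration:)

```python
import numpy as np

def pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased estimator 1 - C(n - c, k) / C(n, k), written as a running product
    # so large binomial coefficients are never materialised.
    if n - c < k:
        return 1.0
    return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

# Hypothetical task: 10 generations sampled, 3 of them pass the unit tests.
print(pass_at_k(10, 3, 1))  # 0.3 (equals c / n when k = 1)
print(pass_at_k(10, 3, 5))  # ~0.9167
```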
 
evaluation/general_benchmarks/HumanEval/human_eval/execution.py DELETED
@@ -1,817 +0,0 @@
1
- import contextlib
2
- import faulthandler
3
- import gzip
4
- import io
5
- import json
6
- import multiprocessing
7
- import os
8
- import platform
9
- import random
10
- import signal
11
- import subprocess
12
- import tempfile
13
- import traceback
14
- from typing import *
15
-
16
- java_exec = ""
17
- node_exec = ""
18
- tsc_exec = ""
19
- go_exec = ""
20
- php_exec = ""
21
- cs_exec = ""
22
-
23
-
24
- def check_correctness(
25
- task_id: str,
26
- sample: dict,
27
- language_type: str,
28
- timeout: float = 3.0,
29
- tmp_dir: str = None,
30
- completion_id: Optional[int] = None,
31
- ) -> Dict:
32
- """
33
- Evaluates the functional correctness of a completion by running the test
34
- suite provided in the problem.
35
- """
36
-
37
- def unsafe_execute(tmp_dir):
38
- random_id = random.randint(1, 100000)
39
- if "python" in language_type.lower():
40
- with create_tempdir():
41
-
42
- # These system calls are needed when cleaning up tempdir.
43
- import os
44
- import shutil
45
-
46
- rmtree = shutil.rmtree
47
- rmdir = os.rmdir
48
- chdir = os.chdir
49
-
50
- # Disable functionalities that can make destructive changes to the test.
51
- reliability_guard()
52
-
53
- try:
54
- exec_globals = {}
55
- with swallow_io():
56
- with time_limit(timeout):
57
- # WARNING
58
- # This program exists to execute untrusted model-generated code. Although
59
- # it is highly unlikely that model-generated code will do something overtly
60
- # malicious in response to this test suite, model-generated code may act
61
- # destructively due to a lack of model capability or alignment.
62
- # Users are strongly encouraged to sandbox this evaluation suite so that it
63
- # does not perform destructive actions on their host or network.
64
- # Once you have read this disclaimer and taken appropriate precautions,
65
- # uncomment the following line and proceed at your own risk:
66
- exec(sample["test_code"], exec_globals)
67
- result.append("passed")
68
- except TimeoutException:
69
- result.append("timed out")
70
- except AssertionError as e:
71
- result.append(f"failed: AssertionError")
72
- except BaseException as e:
73
- result.append(f"failed: {e}")
74
- # print(sample["test_code"])
75
- # print(result)
76
- # Needed for cleaning up.
77
- shutil.rmtree = rmtree
78
- os.rmdir = rmdir
79
- os.chdir = chdir
80
-
81
- elif "go" in language_type.lower():
82
- assert (
83
- tmp_dir is not None
84
- ), "Go should be evaluated in a dir where necessary module files installed."
85
-
86
- import os
87
- import shutil
88
-
89
- if "tmp" not in tmp_dir:
90
- tmp_dir = os.path.join(tmp_dir, "tmp")
91
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
92
- if not os.path.exists(tmp_dir):
93
- os.makedirs(tmp_dir)
94
- origin_path = os.getcwd()
95
- os.chdir(tmp_dir)
96
- open(f"main_test.go", "w").write(sample["test_code"])
97
- try:
98
- exec_result = None
99
- with time_limit(timeout):
100
- # WARNING
101
- # This program exists to execute untrusted model-generated code. Although
102
- # it is highly unlikely that model-generated code will do something overtly
103
- # malicious in response to this test suite, model-generated code may act
104
- # destructively due to a lack of model capability or alignment.
105
- # Users are strongly encouraged to sandbox this evaluation suite so that it
106
- # does not perform destructive actions on their host or network.
107
- # Once you have read this disclaimer and taken appropriate precautions,
108
- # uncomment the following line and proceed at your own risk:
109
- exec_result = subprocess.run(
110
- [
111
- f"{go_exec}go",
112
- "test",
113
- f"-timeout={timeout}s",
114
- "main_test.go",
115
- ],
116
- timeout=timeout,
117
- capture_output=True,
118
- )
119
-
120
- if exec_result.returncode == 0:
121
- result.append("passed")
122
- else:
123
- if exec_result.stderr:
124
- try:
125
- err = exec_result.stderr.decode()
126
- except:
127
- err = exec_result.stderr
128
- else:
129
- try:
130
- err = exec_result.stdout.decode()
131
- except:
132
- err = exec_result.stdout
133
- result.append(f"failed: {err}")
134
-
135
- except TimeoutException:
136
- result.append("timed out")
137
- os.chdir(origin_path)
138
- shutil.rmtree(tmp_dir)
139
- elif "js" in language_type.lower():
140
- import os
141
- import shutil
142
-
143
- if "tmp" not in tmp_dir:
144
- tmp_dir = os.path.join(tmp_dir, "tmp")
145
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
146
- if not os.path.exists(tmp_dir):
147
- os.makedirs(tmp_dir)
148
- origin_path = os.getcwd()
149
- os.chdir(tmp_dir)
150
- open(f"test.js", "w").write(sample["test_code"])
151
- try:
152
- exec_result = None
153
- with time_limit(timeout):
154
- # WARNING
155
- # This program exists to execute untrusted model-generated code. Although
156
- # it is highly unlikely that model-generated code will do something overtly
157
- # malicious in response to this test suite, model-generated code may act
158
- # destructively due to a lack of model capability or alignment.
159
- # Users are strongly encouraged to sandbox this evaluation suite so that it
160
- # does not perform destructive actions on their host or network.
161
- # Once you have read this disclaimer and taken appropriate precautions,
162
- # uncomment the following line and proceed at your own risk:
163
- exec_result = subprocess.run(
164
- [f"{node_exec}node", "test.js"],
165
- timeout=timeout,
166
- capture_output=True,
167
- )
168
-
169
- if exec_result.stderr.decode():
170
- err = exec_result.stderr.decode()
171
- result.append(f"failed: {err}")
172
- elif exec_result.stdout.decode():
173
- err = exec_result.stdout.decode()
174
- result.append(f"failed: {err}")
175
- else:
176
- result.append("passed")
177
-
178
- except TimeoutException:
179
- result.append("timed out")
180
- os.chdir(origin_path)
181
- shutil.rmtree(tmp_dir)
182
- elif "cpp" in language_type.lower():
183
- import os
184
- import shutil
185
-
186
- origin_path = os.getcwd()
187
- if "tmp" not in tmp_dir:
188
- tmp_dir = os.path.join(tmp_dir, "tmp")
189
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
190
- if not os.path.exists(tmp_dir):
191
- os.makedirs(tmp_dir)
192
-
193
- os.chdir(tmp_dir)
194
- open(f"test.cpp", "w").write(sample["test_code"])
195
- if "162" in task_id:
196
- compilation_result = subprocess.run(
197
- ["/usr/bin/g++", "-std=c++17", "test.cpp", "-lcrypto", "-lssl"],
198
- timeout=timeout,
199
- capture_output=True,
200
- )
201
- else:
202
- compilation_result = subprocess.run(
203
- ["/usr/bin/g++", "-std=c++17", "test.cpp"],
204
- timeout=timeout,
205
- capture_output=True,
206
- )
207
- if compilation_result.returncode != 0:
208
- if compilation_result.stderr:
209
- err = compilation_result.stderr.decode()
210
- else:
211
- err = compilation_result.stdout.decode()
212
- result.append(f"failed: compilation error: {err}")
213
- else:
214
- try:
215
- exec_result = None
216
- with time_limit(timeout):
217
- # WARNING
218
- # This program exists to execute untrusted model-generated code. Although
219
- # it is highly unlikely that model-generated code will do something overtly
220
- # malicious in response to this test suite, model-generated code may act
221
- # destructively due to a lack of model capability or alignment.
222
- # Users are strongly encouraged to sandbox this evaluation suite so that it
223
- # does not perform destructive actions on their host or network.
224
- # Once you have read this disclaimer and taken appropriate precautions,
225
- # uncomment the following line and proceed at your own risk:
226
- exec_result = subprocess.run(
227
- ["./a.out"], timeout=timeout, capture_output=True
228
- )
229
-
230
- if exec_result.returncode == 0:
231
- result.append("passed")
232
- else:
233
- if exec_result.stderr:
234
- try:
235
- err = exec_result.stderr.decode()
236
- except:
237
- err = exec_result.stderr
238
- else:
239
- try:
240
- err = exec_result.stdout.decode()
241
- except:
242
- err = exec_result.stdout
243
- result.append(f"failed: {err}")
244
- except TimeoutException:
245
- result.append("timed out")
246
- # print(result[-1])
247
- # print(sample["test_code"])
248
- os.chdir(origin_path)
249
- shutil.rmtree(tmp_dir)
250
- elif "php" in language_type.lower():
251
- import os
252
- import shutil
253
-
254
- origin_path = os.getcwd()
255
- if "tmp" not in tmp_dir:
256
- tmp_dir = os.path.join(tmp_dir, "tmp")
257
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
258
- if not os.path.exists(tmp_dir):
259
- os.makedirs(tmp_dir)
260
-
261
- os.chdir(tmp_dir)
262
- open(f"test.php", "w").write(sample["test_code"])
263
- try:
264
- exec_result = None
265
- with time_limit(timeout):
266
- cmd = f"{php_exec}php -f test.php"
267
- exec_result = subprocess.run(
268
- cmd, timeout=timeout, capture_output=True, shell=True
269
- )
270
-
271
- if exec_result.returncode == 0:
272
- result.append("passed")
273
- else:
274
- if exec_result.stderr:
275
- try:
276
- err = exec_result.stderr.decode()
277
- except:
278
- err = exec_result.stderr
279
- else:
280
- try:
281
- err = exec_result.stdout.decode()
282
- except:
283
- err = exec_result.stdout
284
- result.append(f"failed: {err}")
285
- except TimeoutException:
286
- result.append("timed out")
287
- print(result[-1])
288
- print(sample["test_code"])
289
- os.chdir(origin_path)
290
- shutil.rmtree(tmp_dir)
291
- elif "sh" in language_type.lower():
292
- import os
293
- import shutil
294
-
295
- origin_path = os.getcwd()
296
- if "tmp" not in tmp_dir:
297
- tmp_dir = os.path.join(tmp_dir, "tmp")
298
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
299
- if not os.path.exists(tmp_dir):
300
- os.makedirs(tmp_dir)
301
-
302
- os.chdir(tmp_dir)
303
- open(f"test.sh", "w").write(sample["test_code"])
304
- try:
305
- exec_result = None
306
- with time_limit(timeout):
307
- cmd = "/bin/bash test.sh"
308
- exec_result = subprocess.run(
309
- cmd, timeout=10, capture_output=True, shell=True
310
- )
311
-
312
- if exec_result.returncode == 0:
313
- result.append("passed")
314
- else:
315
- if exec_result.stderr:
316
- try:
317
- err = exec_result.stderr.decode()
318
- except:
319
- err = exec_result.stderr
320
- else:
321
- try:
322
- err = exec_result.stdout.decode()
323
- except:
324
- err = exec_result.stdout
325
- result.append(f"failed: {err}")
326
- except TimeoutException:
327
- result.append("timed out")
328
- # print(result[-1])
329
- # print(sample["test_code"])
330
- os.chdir(origin_path)
331
- shutil.rmtree(tmp_dir)
332
- elif "ts" in language_type.lower():
333
- import os
334
- import shutil
335
-
336
- origin_path = os.getcwd()
337
- if "tmp" not in tmp_dir:
338
- tmp_dir = os.path.join(tmp_dir, "tmp")
339
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
340
- if not os.path.exists(tmp_dir):
341
- os.makedirs(tmp_dir)
342
-
343
- os.chdir(tmp_dir)
344
- env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
345
- open(f"test.ts", "w").write(sample["test_code"])
346
- cmd = f"{tsc_exec}tsc test.ts --target ES2015 --lib ES2015,DOM"
347
- compilation_result = subprocess.run(
348
- cmd, timeout=timeout, capture_output=True, env=env, shell=True
349
- )
350
- if compilation_result.returncode != 0:
351
- if compilation_result.stderr:
352
- err = compilation_result.stderr.decode()
353
- else:
354
- err = compilation_result.stdout.decode()
355
- result.append(f"failed: compilation error: {err}")
356
- else:
357
- try:
358
- exec_result = None
359
- with time_limit(timeout):
360
- exec_result = subprocess.run(
361
- [f"{node_exec}node", "test.js"],
362
- timeout=timeout,
363
- capture_output=True,
364
- )
365
-
366
- if exec_result.returncode == 0:
367
- result.append("passed")
368
- else:
369
- if exec_result.stderr:
370
- try:
371
- err = exec_result.stderr.decode()
372
- except:
373
- err = exec_result.stderr
374
- else:
375
- try:
376
- err = exec_result.stdout.decode()
377
- except:
378
- err = exec_result.stdout
379
- result.append(f"failed: {err}")
380
- except TimeoutException:
381
- result.append("timed out")
382
- if result[-1] != "passed":
383
- env = {"PATH": f"{node_exec}:" + subprocess.os.environ["PATH"]}
384
- cmd = f"{tsc_exec}tsc test.ts"
385
- compilation_result = subprocess.run(
386
- cmd, timeout=timeout, capture_output=True, env=env, shell=True
387
- )
388
- if compilation_result.returncode != 0:
389
- if compilation_result.stderr:
390
- err = compilation_result.stderr.decode()
391
- else:
392
- err = compilation_result.stdout.decode()
393
- result[-1] = f"failed: compilation error: {err}"
394
- else:
395
- try:
396
- exec_result = None
397
- with time_limit(timeout):
398
- exec_result = subprocess.run(
399
- [f"{node_exec}node", "test.js"],
400
- timeout=timeout,
401
- capture_output=True,
402
- )
403
-
404
- if exec_result.returncode == 0:
405
- result[-1] = "passed"
406
- else:
407
- if exec_result.stderr:
408
- try:
409
- err = exec_result.stderr.decode()
410
- except:
411
- err = exec_result.stderr
412
- else:
413
- try:
414
- err = exec_result.stdout.decode()
415
- except:
416
- err = exec_result.stdout
417
- result[-1] = f"failed: {err}"
418
- except TimeoutException:
419
- result[-1] = "timed out"
420
-
421
- os.chdir(origin_path)
422
- shutil.rmtree(tmp_dir)
423
- elif "cs" in language_type.lower():
424
- import os
425
- import shutil
426
-
427
- origin_path = os.getcwd()
428
- if "tmp" not in tmp_dir:
429
- tmp_dir = os.path.join(tmp_dir, "tmp")
430
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
431
- if not os.path.exists(tmp_dir):
432
- os.makedirs(tmp_dir)
433
- os.chdir(tmp_dir)
434
- open(f"Program.cs", "w").write(sample["test_code"])
435
- cmd = f"{cs_exec}mcs -d:DEBUG Program.cs"
436
- compilation_result = subprocess.run(cmd, shell=True, capture_output=True)
437
- if compilation_result.returncode != 0:
438
- if compilation_result.stderr:
439
- err = compilation_result.stderr.decode()
440
- else:
441
- err = compilation_result.stdout.decode()
442
- result.append(f"failed: compilation error: {err}")
443
- else:
444
- try:
445
- exec_result = None
446
- cmd = f"{cs_exec}mono Program.exe"
447
- env = dict(MONO_TRACE_LISTENER="Console.Error")
448
- with time_limit(timeout):
449
- exec_result = subprocess.run(
450
- cmd,
451
- timeout=timeout,
452
- shell=True,
453
- capture_output=True,
454
- env=env,
455
- )
456
-
457
- if "Fail" not in exec_result.stderr.decode():
458
- result.append("passed")
459
- else:
460
- if exec_result.stderr:
461
- try:
462
- err = exec_result.stderr.decode()
463
- except:
464
- err = exec_result.stderr
465
- else:
466
- try:
467
- err = exec_result.stdout.decode()
468
- except:
469
- err = exec_result.stdout
470
- result.append(f"failed: {err}")
471
- except TimeoutException:
472
- result.append("timed out")
473
- except Exception as e:
474
- result.append(f"failed: {e}")
475
- os.chdir(origin_path)
476
- shutil.rmtree(tmp_dir)
477
- elif "rust" in language_type.lower():
478
- import os
479
-
480
- WD: str = os.path.dirname(os.path.abspath(__file__))
481
- RUST_DIR: str = os.path.join(WD, "rust")
482
- RUST_SRC: str = os.path.join(RUST_DIR, "src")
483
- RUST_BIN: str = os.path.join(RUST_SRC, "bin")
484
- RUST_TMP_DIR: str = os.path.join(RUST_DIR, "tmp")
485
- RUST_LOGS: str = os.path.join(RUST_TMP_DIR, "logs")
486
- RUST_EXT: str = ".rs"
487
-
488
- # Create mandatory tmp directories
489
- os.makedirs(RUST_TMP_DIR, exist_ok=True)
490
- os.makedirs(RUST_LOGS, exist_ok=True)
491
- os.makedirs(RUST_SRC, exist_ok=True)
492
- os.makedirs(RUST_BIN, exist_ok=True)
493
-
494
- with tempfile.NamedTemporaryFile(dir=RUST_BIN, delete=False) as f:
495
- # temporary file name
496
- file_prefix = sample["task_id"].lower().replace("/", "_")
497
- file_name: str = file_prefix + RUST_EXT
498
-
499
- os.rename(f.name, os.path.join(RUST_BIN, file_name))
500
-
501
- # Sample to pure Rust function
502
- rust_code: str = sample["test_code"]
503
-
504
- # dump the Rust source code into the target temporary file
505
- f.write(rust_code.encode("utf-8"))
506
-
507
- # Move to the Rust module root dir before compiling the binaries.
508
- os.chdir(RUST_DIR)
509
-
510
- # Two possible outcomes
511
- # Pass OR Fail compilation
512
- log_filename: str = file_prefix + ".jsonl"
513
- log_path: str = os.path.join(RUST_LOGS, log_filename)
514
- cargo_check: str = (
515
- "cargo check --bin "
516
- + file_prefix
517
- + " --message-format json >> "
518
- + log_path
519
- )
520
- # Compilation build status
521
- returned_val_compilation: int
522
-
523
- # Overwrite file content
524
- if os.path.exists(log_path):
525
- if (file_size := os.path.getsize(log_path)) >= 0:
526
- os.remove(log_path)
527
- returned_val_compilation = os.system(cargo_check)
528
-
529
- else:
530
- returned_val_compilation = os.system(cargo_check)
531
-
532
- # 0 means success
533
- if returned_val_compilation == 0:
534
-
535
- # Execution pipeline
536
- cargo_test: str = (
537
- "cargo test --bin "
538
- + file_prefix
539
- + " --message-format json >> "
540
- + log_path
541
- )
542
- returned_val_execution = os.system(cargo_test)
543
-
544
- if returned_val_execution == 0:
545
- result.append("passed")
546
- else:
547
- result.append(f"failed: execution error")
548
-
549
- else:
550
- result.append(f"failed: compilation error")
551
-
552
- elif "java" in language_type.lower():
553
- assert tmp_dir is not None, "Java should be evaluated in a temporary dir."
554
-
555
- import os
556
- import shutil
557
-
558
- if "tmp" not in tmp_dir:
559
- tmp_dir = os.path.join(tmp_dir, "tmp")
560
- tmp_dir = os.path.join(tmp_dir, f"{task_id.replace('/', '-')}-{random_id}")
561
- if not os.path.exists(tmp_dir):
562
- os.makedirs(tmp_dir)
563
- open(os.path.join(tmp_dir, "Problem.java"), "w").write(sample["test_code"])
564
- origin_path = os.getcwd()
565
- os.system(f"cp ./javatuples-1.2.jar {tmp_dir}/")
566
- os.chdir(tmp_dir)
567
- res = "failed: unknown error"
568
- compile_returncode = -1
569
- for _ in range(5):
570
- try:
571
- cmd = f"{java_exec}javac -cp javatuples-1.2.jar Problem.java"
572
- compilation_result = subprocess.run(
573
- cmd, timeout=60, capture_output=True, shell=True
574
- )
575
- compile_returncode = compilation_result.returncode
576
- break
577
- except subprocess.TimeoutExpired as e:
578
- continue
579
- if compile_returncode != 0:
580
- res = "failed: compilation error"
581
- else:
582
- exec_result = None
583
- try:
584
- # WARNING
585
- # This program exists to execute untrusted model-generated code. Although
586
- # it is highly unlikely that model-generated code will do something overtly
587
- # malicious in response to this test suite, model-generated code may act
588
- # destructively due to a lack of model capability or alignment.
589
- # Users are strongly encouraged to sandbox this evaluation suite so that it
590
- # does not perform destructive actions on their host or network.
591
- # Once you have read this disclaimer and taken appropriate precautions,
592
- # uncomment the following line and proceed at your own risk:
593
- cmd = f"{java_exec}java -ea -cp .:javatuples-1.2.jar Problem"
594
- exec_result = subprocess.run(
595
- cmd, timeout=timeout, capture_output=True, shell=True
596
- )
597
- if exec_result.returncode == 0:
598
- res = "passed"
599
- elif exec_result.returncode == 1:
600
- if "AssertionError" in exec_result.stderr.decode(
601
- "unicode-escape"
602
- ):
603
- res = "failed: wrong answer"
604
- else:
605
- res = f"failed: {exec_result.stderr.decode()}"
606
- except subprocess.TimeoutExpired as e:
607
- res = "time out"
608
- except BaseException as e:
609
- res = f"failed: {e}"
610
-
611
- result.append(res)
612
- os.chdir(origin_path)
613
- shutil.rmtree(tmp_dir)
614
-
615
- manager = multiprocessing.Manager()
616
- result = manager.list()
617
-
618
- p = multiprocessing.Process(target=unsafe_execute, args=(tmp_dir,))
619
- p.start()
620
- p.join(timeout=timeout + 1)
621
- if p.is_alive():
622
- p.kill()
623
-
624
- if not result:
625
- result.append("timed out")
626
-
627
- return {
628
- "task_id": task_id,
629
- "completion_id": completion_id,
630
- "result": result[0],
631
- "passed": result[0] == "passed",
632
- "finish": -1 if "finish" not in sample else sample["finish"],
633
- "test_code": sample["test_code"],
634
- "prompt": sample["prompt"],
635
- # "canonical_solution" : sample["canonical_solution"],
636
- # "test" : sample["test"],
637
- # "text" : sample["text"],
638
- # "output" : sample["output"],
639
- # "generation" : sample["generation"],
640
- }
641
-
642
-
643
- # Copyright (c) OpenAI (https://openai.com)
644
-
645
- # Permission is hereby granted, free of charge, to any person obtaining a copy
646
- # of this software and associated documentation files (the "Software"), to deal
647
- # in the Software without restriction, including without limitation the rights
648
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
649
- # copies of the Software, and to permit persons to whom the Software is
650
- # furnished to do so, subject to the following conditions:
651
-
652
- # The above copyright notice and this permission notice shall be included in
653
- # all copies or substantial portions of the Software.
654
-
655
-
656
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
657
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
658
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
659
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
660
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
661
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
662
- # THE SOFTWARE.
663
- # ============================================================================
664
- @contextlib.contextmanager
665
- def time_limit(seconds: float):
666
- def signal_handler(signum, frame):
667
- raise TimeoutException("Timed out!")
668
-
669
- signal.setitimer(signal.ITIMER_REAL, seconds)
670
- signal.signal(signal.SIGALRM, signal_handler)
671
- try:
672
- yield
673
- finally:
674
- signal.setitimer(signal.ITIMER_REAL, 0)
675
-
676
-
677
- @contextlib.contextmanager
678
- def swallow_io():
679
- stream = WriteOnlyStringIO()
680
- with contextlib.redirect_stdout(stream):
681
- with contextlib.redirect_stderr(stream):
682
- with redirect_stdin(stream):
683
- yield
684
-
685
-
686
- @contextlib.contextmanager
687
- def create_tempdir():
688
- with tempfile.TemporaryDirectory() as dirname:
689
- with chdir(dirname):
690
- yield dirname
691
-
692
-
693
- class TimeoutException(Exception):
694
- pass
695
-
696
-
697
- class WriteOnlyStringIO(io.StringIO):
698
- """StringIO that throws an exception when it's read from"""
699
-
700
- def read(self, *args, **kwargs):
701
- raise IOError
702
-
703
- def readline(self, *args, **kwargs):
704
- raise IOError
705
-
706
- def readlines(self, *args, **kwargs):
707
- raise IOError
708
-
709
- def readable(self, *args, **kwargs):
710
- """Returns True if the IO object can be read."""
711
- return False
712
-
713
-
714
- class redirect_stdin(contextlib._RedirectStream): # type: ignore
715
- _stream = "stdin"
716
-
717
-
718
- @contextlib.contextmanager
719
- def chdir(root):
720
- if root == ".":
721
- yield
722
- return
723
- cwd = os.getcwd()
724
- os.chdir(root)
725
- try:
726
- yield
727
- except BaseException as exc:
728
- raise exc
729
- finally:
730
- os.chdir(cwd)
731
-
732
-
733
- def reliability_guard(maximum_memory_bytes: Optional[int] = None):
734
- """
735
- This disables various destructive functions and prevents the generated code
736
- from interfering with the test (e.g. fork bomb, killing other processes,
737
- removing filesystem files, etc.)
738
-
739
- WARNING
740
- This function is NOT a security sandbox. Untrusted code, including, model-
741
- generated code, should not be blindly executed outside of one. See the
742
- Codex paper for more information about OpenAI's code sandbox, and proceed
743
- with caution.
744
- """
745
-
746
- if maximum_memory_bytes is not None:
747
- import resource
748
-
749
- resource.setrlimit(
750
- resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes)
751
- )
752
- resource.setrlimit(
753
- resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes)
754
- )
755
- if not platform.uname().system == "Darwin":
756
- resource.setrlimit(
757
- resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes)
758
- )
759
-
760
- faulthandler.disable()
761
-
762
- import builtins
763
-
764
- builtins.exit = None
765
- builtins.quit = None
766
-
767
- import os
768
-
769
- os.environ["OMP_NUM_THREADS"] = "1"
770
-
771
- os.kill = None
772
- os.system = None
773
- os.putenv = None
774
- os.remove = None
775
- os.removedirs = None
776
- os.rmdir = None
777
- os.fchdir = None
778
- os.setuid = None
779
- os.fork = None
780
- os.forkpty = None
781
- os.killpg = None
782
- os.rename = None
783
- os.renames = None
784
- os.truncate = None
785
- os.replace = None
786
- os.unlink = None
787
- os.fchmod = None
788
- os.fchown = None
789
- os.chmod = None
790
- os.chown = None
791
- os.chroot = None
792
- os.fchdir = None
793
- os.lchflags = None
794
- os.lchmod = None
795
- os.lchown = None
796
- os.getcwd = None
797
- os.chdir = None
798
-
799
- import shutil
800
-
801
- shutil.rmtree = None
802
- shutil.move = None
803
- shutil.chown = None
804
-
805
- import subprocess
806
-
807
- subprocess.Popen = None # type: ignore
808
-
809
- __builtins__["help"] = None
810
-
811
- import sys
812
-
813
- sys.modules["ipdb"] = None
814
- sys.modules["joblib"] = None
815
- sys.modules["resource"] = None
816
- sys.modules["psutil"] = None
817
- sys.modules["tkinter"] = None
 
evaluation/general_benchmarks/HumanEval/humaneval.py DELETED
@@ -1,217 +0,0 @@
1
- import datetime
2
- import json
3
- import multiprocessing
4
- import os
5
- import re
6
- import string
7
- import subprocess
8
- import time
9
-
10
- import numpy as np
11
- import torch
12
- import torch.distributed as dist
13
- # from attrdict import AttrDict
14
- from human_eval.evaluation import evaluate_functional_correctness
15
- from transformers import AutoTokenizer
16
- from utils.dataset import HumanEvalDataset
17
- from utils.utils import cleanup_code
18
-
19
-
20
- class HumanEval:
21
- """
22
- HumanEval evaluation class.
23
- """
24
-
25
- def __init__(
26
- self,
27
- data_root,
28
- max_seq_len=2048,
29
- language="python",
30
- max_gen_len=200,
31
- batch_size=512,
32
- log_dir=None,
33
- temperature=0,
34
- issft=False,
35
- top_p=0.95,
36
- model_name="",
37
- inference_increment=True,
38
- tokenizer_cfg=None,
39
- n_sample=40,
40
- k_sample=1,
41
- ):
42
- self.data_root = data_root
43
- self.max_seq_len = max_seq_len
44
- self.max_gen_len = max_gen_len
45
- self.batch_size = batch_size
46
- self.k = k_sample
47
- self.n_sample = n_sample
48
- self.language = language
49
- self.log_dir = log_dir
50
- self.sft = issft
51
- self.temperature = temperature
52
- self.top_p = top_p
53
- self.model_name = tokenizer_cfg["model_path"].replace("/", "_")
54
- self.inference_increment = inference_increment
55
- os.makedirs(self.log_dir, exist_ok=True)
56
- tokenizer_cls = tokenizer_cfg.pop("cls")
57
- try:
58
- self.tokenizer = AutoTokenizer.from_pretrained(
59
- tokenizer_cfg.pop("model_path"), trust_remote_code=True
60
- )
61
- except Exception as e:
62
- print(e)
63
- assert False
64
-
65
- @torch.no_grad()
66
- def eval_model(self, gpt, accelerator):
67
- """
68
- Evaluate the model on HumanEval.
69
- """
70
- assert (
71
- self.log_dir is not None
72
- ), "log_dir should not be None when evaluating humaneval"
73
- dataset = HumanEvalDataset(
74
- self.data_root,
75
- sample_num=self.n_sample,
76
- language=self.language,
77
- issft=self.sft,
78
- )
79
- nprompt = len(dataset) // self.n_sample
80
- dp_rank = accelerator.process_index
81
- dp_size = accelerator.num_processes
82
- if self.k > 1:
83
- assert self.n_sample >= 100, "HumanEval PASS@100 needs n_sample >= 100"
84
- gpt.eval()
85
- # each process will process a subset of the dataset
86
- prompt_indices_split = np.array_split(range(nprompt), dp_size)
87
- prompt_indices = prompt_indices_split[dp_rank]
88
- indices = [
89
- x * self.n_sample + j for x in prompt_indices for j in range(self.n_sample)
90
- ]
91
- all_num = len(indices)
92
- processed_num = 0
93
- log_file = os.path.join(
94
- self.log_dir,
95
- f"{self.model_name}_rank{dp_rank}_bs{self.batch_size}_shot_log_{self.language}.json",
96
- )
97
- tmpfile = open(log_file, "w")
98
- start_time = time.time()
99
- # split the dataset into batches and construct a list of inputs
100
- for idx in range(0, len(indices), self.batch_size):
101
- prompt_list = []
102
- prompt_lens = []
103
- original_prompt_list = []
104
- tokenized_prompt_lens = []
105
- taskid = []
106
- # get the prompts from the dataset
107
- for j in indices[idx : idx + self.batch_size]:
108
- data = dataset[j]
109
- fprompt = data["prompt"].strip()
110
- prompt_list.append(fprompt)
111
- tmp = self.tokenizer.encode(fprompt)
112
- original_prompt_list.append(data["original_prompt"])
113
- prompt_lens.append(len(fprompt))
114
- tokenized_prompt_lens.append(tmp)
115
- taskid.append(data["task_id"])
116
- input_ids = torch.tensor(tokenized_prompt_lens).to(accelerator.device)
117
- # generate the code
118
- if self.temperature != 0:
119
- decoded = gpt.generate(
120
- input_ids=input_ids,
121
- max_new_tokens=self.max_gen_len,
122
- do_sample=True,
123
- eos_token_id=self.tokenizer.eos_token_id,
124
- temperature=self.temperature,
125
- top_p=self.top_p,
126
- pad_token_id=self.tokenizer.eos_token_id,
127
- )
128
- else:
129
- decoded = gpt.generate(
130
- input_ids=input_ids,
131
- max_new_tokens=self.max_gen_len,
132
- do_sample=False,
133
- eos_token_id=self.tokenizer.eos_token_id,
134
- pad_token_id=self.tokenizer.eos_token_id,
135
- )
136
- # save the results to a file
137
- for local_idx, text in enumerate(decoded):
138
- prediction = decoded[local_idx]
139
- prediction = self.tokenizer.decode(prediction, skip_special_tokens=True)
140
- suffixprediction = prediction[prompt_lens[local_idx] :]
141
- suffixprediction = cleanup_code(
142
- suffixprediction,
143
- self.language,
144
- "humaneval",
145
- self.sft,
146
- dataset.stopwords,
147
- )
148
- # sft mode does not need original prompt
149
- if not self.sft:
150
- suffixprediction = (
151
- original_prompt_list[local_idx] + "\n" + suffixprediction
152
- )
153
- res = {
154
- "task_id": taskid[local_idx],
155
- "generation": suffixprediction,
156
- "prompt": orriginal_prompt_list[local_idx],
157
- "wholecode": prediction,
158
- }
159
- tmpfile.write(json.dumps(res) + "\n")
160
- tmpfile.flush()
161
- processed_num += 1
162
- self.log_score(dp_rank, processed_num, all_num, start_time, self.batch_size)
163
- tmpfile.close()
164
- accelerator.wait_for_everyone()
165
- # calculate the final score of pass@k
166
- self._calculate_final_score(accelerator)
167
- accelerator.wait_for_everyone()
168
- return
169
-
170
- def log_score(self, dp_rank, processed_num, all_num, start_time, bs):
171
- """
172
- Log the score.
173
- """
174
- mem = torch.cuda.max_memory_allocated() / (1 << 30)
175
- avg_time = (time.time() - start_time) / processed_num * bs
176
- print(
177
- f"DP RANK:{dp_rank} process_num/all_num:{int(processed_num)}/{all_num} "
178
- f"avg_time_per_batch:{avg_time:.2f} s "
179
- f"still_need:{((all_num - processed_num) // bs + 1) * avg_time / 60:.2f} m",
180
- f"mem:{mem:.3f} GiB bs:{bs}",
181
- flush=True,
182
- )
183
- if processed_num == all_num:
184
- print(
185
- f"EVAL DONE! Process time {(time.time() - start_time) / 60:.2f} m",
186
- flush=True,
187
- )
188
-
189
- def _calculate_final_score(self, accelerator):
190
- """
191
- Calculate the final score.
192
- """
193
- if accelerator.is_local_main_process:
194
- logfilepath = os.path.join(self.log_dir, f"final_{self.model_name}.jsonl")
195
- logfile = open(logfilepath, "w")
196
- for i in range(accelerator.num_processes):
197
- tmplogfile = os.path.join(
198
- self.log_dir,
199
- f"{self.model_name}_rank{i}_bs{self.batch_size}_shot_log_{self.language}.json",
200
- )
201
- logfile.write(open(tmplogfile).read().strip() + "\n")
202
- os.remove(tmplogfile)
203
- logfile.close()
204
- timeout = 10
205
- runlang = self.language
206
- res = evaluate_functional_correctness(
207
- input_file=logfilepath,
208
- problem_file=os.path.join(
209
- self.data_root, f"humaneval-{self.language}.jsonl"
210
- ),
211
- tmp_dir=self.log_dir,
212
- timeout=timeout,
213
- language=runlang,
214
- )
215
- print("score is", res["pass@%d" % self.k])
216
- os.remove(logfilepath)
217
- return
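(A sketch of how the `HumanEval` class above might be driven. The model name, data and log paths are placeholders, and the data directory is assumed to contain `humaneval-python.jsonl` in the format read by `HumanEvalDataset`; this is not the repository's own launch script.)

```python
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

MODEL = "deepseek-ai/deepseek-coder-1.3b-base"  # placeholder checkpoint

evaluator = HumanEval(
    data_root="data",                 # assumed to hold data/humaneval-python.jsonl
    language="python",
    log_dir="logs/humaneval",
    temperature=0.0,
    n_sample=1,
    k_sample=1,
    batch_size=8,
    tokenizer_cfg={"cls": "AutoTokenizer", "model_path": MODEL},
)

accelerator = Accelerator()
model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16)
model = accelerator.prepare(model)
evaluator.eval_model(model, accelerator)
```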
 
evaluation/general_benchmarks/HumanEval/javatuples-1.2.jar DELETED
Binary file (65.5 kB)
 
evaluation/general_benchmarks/HumanEval/test_config.yaml DELETED
@@ -1,15 +0,0 @@
1
- compute_environment: LOCAL_MACHINE
2
- distributed_type: MULTI_GPU
3
- downcast_bf16: 'no'
4
- gpu_ids: all
5
- machine_rank: 0
6
- main_training_function: main
7
- mixed_precision: 'no'
8
- num_machines: 1
9
- num_processes: 3
10
- rdzv_backend: static
11
- same_network: true
12
- tpu_env: []
13
- tpu_use_cluster: false
14
- tpu_use_sudo: false
15
- use_cpu: false
 
evaluation/general_benchmarks/HumanEval/utils/dataset.py DELETED
@@ -1,72 +0,0 @@
1
- import json
2
- import os
3
-
4
- import numpy as np
5
-
6
-
7
- class HumanEvalDataset:
8
-
9
- def __init__(self, root, sample_num=1, language="python", issft=False):
10
- """
11
- root: the path to the HumanEval dataset
12
- sample_num: the number of samples for each prompt
13
- language: the language of the HumanEval dataset
14
- issft: whether to use the SFT setting
15
- """
16
- self.root = root
17
- self.data = open(
18
- os.path.join(self.root, f"humaneval-{language}.jsonl")
19
- ).readlines()
20
-
21
- tmp = self.get_qa_only_data(self.data, issft)
22
- self.clean_data = []
23
- for i in range(len(tmp)):
24
- for j in range(sample_num):
25
- self.clean_data.append(tmp[i])
26
- self.stopwords = self.clean_data[0]["stopwords"]
27
- np.random.seed(1234)
28
- print(f"Read HumanEval from {root}, number of samples {len(self.clean_data)}")
29
-
30
- def get_qa_only_data(self, data_json, sft=False):
31
- """
32
- data_json: the jsonl file of HumanEval
33
- sft: whether to use the SFT setting
34
- return: a list of dict, each dict contains the prompt, task_id and stopwords
35
- """
36
- ans = []
37
- for line in data_json:
38
- line = json.loads(line)
39
- prompt = line["prompt"].strip()
40
- if "prefix" in line:
41
- origin_prompt = line["prefix"]
42
- else:
43
- origin_prompt = line["prompt"]
44
-
45
- if sft:
46
- prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context.\nWrite a response that appropriately completes the request.\n\n### Instruction:\nWrite a program to perform the given task.\n\nInput:\n{prompt}\n\n### Response:\n"""
47
- if "stop_tokens" in line:
48
- s = line["stop_tokens"]
49
- else:
50
- s = []
51
- ans.append(
52
- {
53
- "prompt": prompt,
54
- "task_id": line["task_id"],
55
- "original_prompt": origin_prompt,
56
- "stopwords": s,
57
- }
58
- )
59
- return ans
60
-
61
- def __len__(self):
62
- """
63
- return the number of samples in the dataset
64
- """
65
- return len(self.clean_data)
66
-
67
- def __getitem__(self, index):
68
- """
69
- return the sample at index
70
- """
71
- sample = self.clean_data[index]
72
- return sample
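(Usage sketch for the dataset wrapper above; the `data` directory is a placeholder and must contain a `humaneval-python.jsonl` file in the format this class expects.)

```python
dataset = HumanEvalDataset(root="data", sample_num=1, language="python", issft=False)

print(len(dataset))            # number of prompts x sample_num
item = dataset[0]
print(item["task_id"])
print(item["prompt"][:80])     # the (optionally SFT-wrapped) prompt
print(item["stopwords"])       # stop tokens carried over from the jsonl
```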
 
evaluation/general_benchmarks/HumanEval/utils/utils.py DELETED
@@ -1,161 +0,0 @@
1
- import re
2
-
3
- language_settings = {
4
- "python": {
5
- "full_name": "Python",
6
- "indent": 4,
7
- },
8
- "cpp": {
9
- "full_name": "cpp",
10
- "indent": 0,
11
- "main": "int main()",
12
- },
13
- "java": {
14
- "full_name": "Java",
15
- "indent": 4,
16
- "main": "public static void main",
17
- },
18
- "cs": {
19
- "full_name": "csharp",
20
- "indent": 0,
21
- "main": "public static void Main",
22
- },
23
- "php": {
24
- "full_name": "PHP",
25
- "indent": 0,
26
- },
27
- "ts": {
28
- "full_name": "TypeScript",
29
- "indent": 0,
30
- },
31
- "js": {"full_name": "JavaScript", "indent": 0},
32
- "sh": {"full_name": "Bash", "indent": 0},
33
- }
34
-
35
-
36
- def get_function_name(question: str, lang: str):
37
- func_lines = [x for x in question.strip().split("\n") if x.strip()]
38
-
39
- if lang.lower() == "python":
40
- func_idx = [
41
- i for i in range(len(func_lines)) if func_lines[i].startswith("def ")
42
- ][-1]
43
- func_name = func_lines[func_idx].split("(")[0].strip()
44
- func_prefix = "\n".join(func_lines[:func_idx])
45
- return func_name, func_prefix
46
-
47
- func_name = func_lines[-1].split("{")[0].strip()
48
- func_prefix = "\n".join(func_lines[:-1])
49
- return func_name, func_prefix
50
-
51
-
52
- def extract_generation_code(example: str, lang_code: str, verbose: bool = False):
53
- task_id = example["task_id"]
54
- output = example.get("output", example.get("gpt_completion"))
55
- question = example["prompt"].strip()
56
- setting = language_settings[lang_code]
57
- lang = setting["full_name"]
58
- indent = setting["indent"]
59
-
60
- try:
61
- code_block: str = re.findall(
62
- f"```{lang.lower()}\n(.*?)```", output, re.DOTALL | re.IGNORECASE
63
- )[0]
64
- if verbose:
65
- print(">>> Task: {}\n{}".format(task_id, code_block))
66
-
67
- # Remove main
68
- if setting.get("main", None) and setting["main"] in code_block:
69
- main_start = code_block.index(setting["main"])
70
- code_block = code_block[:main_start]
71
-
72
- func_name, func_prefix = get_function_name(question, lang)
73
-
74
- try:
75
- start = code_block.lower().index(func_name.lower())
76
- indent = 0
77
- while start - indent >= 0 and code_block[start - indent - 1] == " ":
78
- indent += 1
79
-
80
- try:
81
- end = code_block.rindex("\n" + " " * indent + "}")
82
- except:
83
- end = len(code_block)
84
- except:
85
- start = 0
86
- try:
87
- end = code_block.rindex("\n" + " " * indent + "}")
88
- except:
89
- end = len(code_block)
90
-
91
- body = code_block[start:end]
92
-
93
- if lang_code.lower() in ["php", "ts", "js"]:
94
- body += "\n" + " " * indent + "}"
95
-
96
- generation = func_prefix + "\n" + body + "\n"
97
- example["generation"] = generation
98
-
99
- except Exception as ex:
100
- print(
101
- "Failed to extract code block with error `{}`:\n>>> Task: {}\n>>> Output:\n{}".format(
102
- ex, task_id, output
103
- )
104
- )
105
- example["generation"] = example["prompt"] + "\n" + output
106
-
107
- return example
108
-
109
-
110
- def cleanup_code(
111
- code: str,
112
- language_type: str = None,
113
- dataset: str = None,
114
- issft: bool = False,
115
- stop_words=[],
116
- ):
117
- """
118
- Cleans up the generated code.
119
- """
120
-
121
- if language_type.lower() == "python":
122
- if issft:
123
- code = _clean_python_code_for_sft(code)
124
- stop_words = ["\ndef", "\nclass", "\nif", "\n#", "\nprint"]
125
- code = _truncate_code_at_stopwords(code, stop_words)
126
- elif language_type.lower() == "ts":
127
- code = _truncate_code_at_stopwords(
128
- code,
129
- stop_words
130
- + [
131
- "\nexport",
132
- "\nimport",
133
- "\nexport default",
134
- "\nimport default",
135
- "\nconsole.log",
136
- ],
137
- )
138
- else:
139
- code = _truncate_code_at_stopwords(code, stop_words)
140
-
141
- return code
142
-
143
-
144
- def _clean_python_code_for_sft(code):
145
- code = code.replace("\r", "")
146
- if "```python" in code:
147
- code_start_idx = code.index("```python")
148
- code = code[code_start_idx:].replace("```python", "").strip()
149
- end_idx = code.find("```") if "```" in code else len(code)
150
- code = code[:end_idx].strip()
151
-
152
- return code
153
-
154
-
155
- def _truncate_code_at_stopwords(code, stop_words):
156
- min_stop_idx = len(code)
157
- for stop_word in stop_words:
158
- stop_index = code.find(stop_word)
159
- if 0 <= stop_index < min_stop_idx:
160
- min_stop_idx = stop_index
161
- return code[:min_stop_idx]
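(Illustrative call to `cleanup_code` from the file above: for Python it truncates a raw completion at the default stop words `\ndef`, `\nclass`, `\nif`, `\n#`, `\nprint`. The completion string below is made up.)

```python
raw_completion = (
    "    return sorted(numbers)\n"
    "\n"
    "def another_function():\n"
    "    pass\n"
)

cleaned = cleanup_code(raw_completion, language_type="python", dataset="humaneval")
print(repr(cleaned))  # '    return sorted(numbers)\n' -- everything from '\ndef' on is cut
```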
 
evaluation/general_benchmarks/MATH/LICENSE DELETED
@@ -1,21 +0,0 @@
1
- MIT License
2
-
3
- Copyright (c) 2024 Zhibin Gou
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
 
evaluation/general_benchmarks/MATH/README.md DELETED
@@ -1,52 +0,0 @@
1
- ### Requirements
2
- You can install the required packages with the following command:
3
- ```bash
4
- cd latex2sympy
5
- pip install -e .
6
- cd ..
7
- pip install -r requirements.txt
8
- pip install vllm==0.5.1 --no-build-isolation
9
- pip install transformers==4.42.3
10
- ```
11
-
12
- ### Evaluation
13
- You can evaluate the Qwen2.5-Math-Instruct and Qwen2-Math-Instruct series models with the following commands:
14
- ```bash
15
- # Qwen2.5-Math-Instruct Series
16
- PROMPT_TYPE="qwen25-math-cot"
17
- # Qwen2.5-Math-1.5B-Instruct
18
- export CUDA_VISIBLE_DEVICES="0"
19
- MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-1.5B-Instruct"
20
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
21
-
22
- # Qwen2.5-Math-7B-Instruct
23
- export CUDA_VISIBLE_DEVICES="0"
24
- MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-7B-Instruct"
25
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
26
-
27
- # Qwen2.5-Math-72B-Instruct
28
- export CUDA_VISIBLE_DEVICES="0,1,2,3"
29
- MODEL_NAME_OR_PATH="Qwen/Qwen2.5-Math-72B-Instruct"
30
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
31
-
32
-
33
- # Qwen2-Math-Instruct Series
34
- PROMPT_TYPE="qwen-boxed"
35
- # Qwen2-Math-1.5B-Instruct
36
- export CUDA_VISIBLE_DEVICES="0"
37
- MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-1.5B-Instruct"
38
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
39
-
40
- # Qwen2-Math-7B-Instruct
41
- export CUDA_VISIBLE_DEVICES="0"
42
- MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-7B-Instruct"
43
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
44
-
45
- # Qwen2-Math-72B-Instruct
46
- export CUDA_VISIBLE_DEVICES="0,1,2,3"
47
- MODEL_NAME_OR_PATH="Qwen/Qwen2-Math-72B-Instruct"
48
- bash sh/eval.sh $PROMPT_TYPE $MODEL_NAME_OR_PATH
49
- ```
50
-
51
- ## Acknowledgement
52
- The codebase is adapted from [math-evaluation-harness](https://github.com/ZubinGou/math-evaluation-harness).
 
evaluation/general_benchmarks/MATH/data/aime24/test.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:af2b8bd2aa911b6333ad0df32f3ca05c7ae8ed10f1731f4372c8ae26990bf7ac
3
- size 156944