"""The CodeEval metric estimates the pass@k metric for code synthesis. |
|
This is an evaluation harness for the HumanEval problem solving dataset |
|
described in the paper "Evaluating Large Language Models Trained on Code" |
|
(https://arxiv.org/abs/2107.03374).""" |
|
|
|
import itertools
import os
import time
from concurrent.futures import CancelledError, ThreadPoolExecutor
from typing import Any, List, Optional, Union
|
|
|
import datasets |
|
import evaluate |
|
import numpy as np |
|
from tqdm import tqdm |
|
from pydantic import BaseModel |
|
|
|
from .execute import check_correctness |
|
|
|
_CITATION = """\ |
|
@misc{chen2021evaluating, |
|
title={Evaluating Large Language Models Trained on Code}, |
|
author={Mark Chen and Jerry Tworek and Heewoo Jun and Qiming Yuan \ |
|
and Henrique Ponde de Oliveira Pinto and Jared Kaplan and Harri Edwards \ |
|
and Yuri Burda and Nicholas Joseph and Greg Brockman and Alex Ray \ |
|
and Raul Puri and Gretchen Krueger and Michael Petrov and Heidy Khlaaf \ |
|
and Girish Sastry and Pamela Mishkin and Brooke Chan and Scott Gray \ |
|
and Nick Ryder and Mikhail Pavlov and Alethea Power and Lukasz Kaiser \ |
|
and Mohammad Bavarian and Clemens Winter and Philippe Tillet \ |
|
and Felipe Petroski Such and Dave Cummings and Matthias Plappert \ |
|
and Fotios Chantzis and Elizabeth Barnes and Ariel Herbert-Voss \ |
|
and William Hebgen Guss and Alex Nichol and Alex Paino and Nikolas Tezak \ |
|
and Jie Tang and Igor Babuschkin and Suchir Balaji and Shantanu Jain \ |
|
and William Saunders and Christopher Hesse and Andrew N. Carr \ |
|
and Jan Leike and Josh Achiam and Vedant Misra and Evan Morikawa \ |
|
and Alec Radford and Matthew Knight and Miles Brundage and Mira Murati \ |
|
and Katie Mayer and Peter Welinder and Bob McGrew and Dario Amodei \ |
|
and Sam McCandlish and Ilya Sutskever and Wojciech Zaremba}, |
|
year={2021}, |
|
eprint={2107.03374}, |
|
archivePrefix={arXiv}, |
|
primaryClass={cs.LG} |
|
} |
|
""" |
|
|
|
_DESCRIPTION = """\ |
|
This metric implements the evaluation harness for the HumanEval problem solving dataset |
|
described in the paper "Evaluating Large Language Models Trained on Code" |
|
(https://arxiv.org/abs/2107.03374). |
|
""" |
|
|
|
|
|
_KWARGS_DESCRIPTION = """ |
|
Estimates the pass@k metric for code candidates, given reference test cases.
Args:
    predictions: list of candidates to evaluate. Each element should be a list of
        strings with several candidate programs for the corresponding problem.
    references: list of test cases, one entry per prediction. Each entry should be a
        list of strings, where each string is a test that checks the correctness of
        a code candidate.
    task_ids: optional list of identifiers for the problems. Defaults to the indices
        of `predictions`.
    k: number of code candidates to consider in the evaluation (Default: [1, 10, 100])
    num_workers: number of workers used to evaluate the candidate programs (Default: 4).
    timeout: maximum time in seconds a test program may run before it is marked as
        failed (Default: 3.0).
    early_stop: if True, cancel a candidate's remaining test cases once one of its
        test cases fails, since the candidate can no longer pass (Default: False).
|
Returns: |
|
pass_at_k: dict with pass rates for each k |
|
results: dict with granular results of each unittest |
|
Examples: |
|
>>> code_eval = evaluate.load("code_eval") |
|
    >>> test_cases = [["assert add(2,3)==5"]]
|
>>> candidates = [["def add(a,b): return a*b", "def add(a, b): return a+b"]] |
|
>>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1, 2]) |
|
>>> print(pass_at_k) |
|
{'pass@1': 0.5, 'pass@2': 1.0} |
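
    An example with several test cases per problem (a candidate counts as passing
    only if it passes all of its test cases):

    >>> test_cases = [["assert add(2,3)==5", "assert add(0,0)==0"]]
    >>> candidates = [["def add(a,b): return a+b"]]
    >>> pass_at_k, results = code_eval.compute(references=test_cases, predictions=candidates, k=[1])
    >>> print(pass_at_k)
    {'pass@1': 1.0}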
|
""" |
|
|
|
|
|
_WARNING = """ |
|
################################################################################ |
|
!!!WARNING!!! |
|
################################################################################ |
|
The "code_eval" metric executes untrusted model-generated code in Python. |
|
Although it is highly unlikely that model-generated code will do something |
|
overtly malicious in response to this test suite, model-generated code may act |
|
destructively due to a lack of model capability or alignment. |
|
Users are strongly encouraged to sandbox this evaluation suite so that it |
|
does not perform destructive actions on their host or network. For more |
|
information on how OpenAI sandboxes its code, see the paper "Evaluating Large |
|
Language Models Trained on Code" (https://arxiv.org/abs/2107.03374). |
|
|
|
Once you have read this disclaimer and taken appropriate precautions, |
|
set the environment variable HF_ALLOW_CODE_EVAL="1". Within Python you can do this
|
with: |
|
|
|
>>> import os |
|
>>> os.environ["HF_ALLOW_CODE_EVAL"] = "1" |
|
|
|
################################################################################\ |
|
""" |
|
|
|
_LICENSE = """The MIT License |
|
|
|
Copyright (c) OpenAI (https://openai.com) |
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy |
|
of this software and associated documentation files (the "Software"), to deal |
|
in the Software without restriction, including without limitation the rights |
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
|
copies of the Software, and to permit persons to whom the Software is |
|
furnished to do so, subject to the following conditions: |
|
|
|
The above copyright notice and this permission notice shall be included in |
|
all copies or substantial portions of the Software. |
|
|
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
|
THE SOFTWARE.""" |
|
|
|
|
|
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) |
|
class CodeEval(evaluate.Metric): |
|
def _info(self): |
|
return evaluate.MetricInfo( |
|
|
|
description=_DESCRIPTION, |
|
citation=_CITATION, |
|
inputs_description=_KWARGS_DESCRIPTION, |
|
|
|
features=datasets.Features( |
|
{ |
|
"predictions": datasets.Sequence(datasets.Value("string")), |
|
"references": datasets.Sequence(datasets.Value("string")), |
|
} |
|
), |
|
homepage="https://github.com/openai/human-eval", |
|
codebase_urls=["https://github.com/openai/human-eval"], |
|
reference_urls=["https://github.com/openai/human-eval"], |
|
license=_LICENSE, |
|
) |
|
|
|
def _compute( |
|
self, |
|
predictions, |
|
references, |
|
task_ids=None, |
|
k=[1, 10, 100], |
|
num_workers=4, |
|
timeout=3.0, |
|
early_stop=False, |
|
    ):
        """Runs the candidate programs against their test cases and returns the pass@k scores."""

        if os.getenv("HF_ALLOW_CODE_EVAL", "0") != "1":
            raise ValueError(_WARNING)
|
|
|
if os.name == "nt": |
|
raise NotImplementedError( |
|
"This metric is currently not supported on Windows." |
|
) |
|
|
|
task_ids = task_ids or list(range(len(predictions))) |
|
|
|
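        # Submit one check_correctness job per (candidate, test case) pair; each
        # candidate gets a Result object that tracks the futures of its test cases.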
with ThreadPoolExecutor(max_workers=num_workers) as executor: |
|
results = {} |
|
for tid, pred, ref in zip(task_ids, predictions, references): |
|
results[tid] = [] |
|
for cid, candidate in enumerate(pred): |
|
result = Result(task_id=tid, completion_id=cid) |
|
for test_case in ref: |
|
assert isinstance(test_case, str) |
|
test_program = candidate + "\n" + test_case |
|
args = (test_program, timeout, tid, cid) |
|
future = executor.submit(check_correctness, *args) |
|
result.add(future) |
|
results[tid].append(result) |
|
|
|
            # Poll until every candidate has a final pass/fail status, updating the
            # progress bar as results come in.
            pbar = tqdm(total=sum(len(r) for r in results.values()))
            prev_done_count = 0
            done = False
            while not done:
                done = True
                cur_done_count = 0
                for result in results.values():
                    for r in result:
                        if not r.done():
                            r.refresh(early_stop)
                        if r.done():
                            cur_done_count += 1
                        else:
                            done = False
                pbar.update(cur_done_count - prev_done_count)
                prev_done_count = cur_done_count
                if not done:
                    time.sleep(1)
            pbar.close()
|
|
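        # Keep only the serializable fields of each Result (dropping the futures),
        # keyed by task id.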
|
results = { |
|
task_id: [(r.completion_id, r.dict(exclude={"futures"})) for r in result] |
|
for task_id, result in results.items() |
|
} |
|
|
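        # Aggregate, per problem, how many candidates were evaluated and how many
        # passed, then apply the unbiased pass@k estimator from the Codex paper.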
|
total, correct = [], [] |
|
for result in results.values(): |
|
passed = [r[1]["passed"] for r in result] |
|
total.append(len(passed)) |
|
correct.append(sum(passed)) |
|
total = np.array(total) |
|
correct = np.array(correct) |
|
|
|
ks = k |
|
pass_at_k = { |
|
f"pass@{k}": estimate_pass_at_k(total, correct, k).mean() |
|
for k in ks |
|
if (total >= k).all() |
|
} |
|
|
|
return pass_at_k, results |
|
|
|
|
|
def estimate_pass_at_k(num_samples, num_correct, k): |
|
"""Estimates pass@k of each problem and returns them in an array.""" |
|
|
|
def estimator(n: int, c: int, k: int) -> float: |
|
"""Calculates 1 - comb(n - c, k) / comb(n, k).""" |
|
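        # Example: with n=5 samples, c=2 correct and k=2,
        # pass@2 = 1 - C(3, 2) / C(5, 2) = 1 - 3/10 = 0.7, which the product form
        # below computes as 1 - (1 - 2/4) * (1 - 2/5) = 0.7.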
if n - c < k: |
|
return 1.0 |
|
return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1)) |
|
|
|
if isinstance(num_samples, int): |
|
num_samples_it = itertools.repeat(num_samples, len(num_correct)) |
|
else: |
|
assert len(num_samples) == len(num_correct) |
|
num_samples_it = iter(num_samples) |
|
|
|
return np.array( |
|
[estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)] |
|
) |
|
|
|
|
|
class Result(BaseModel):
    """Collects the per-test-case execution results of a single code candidate."""

    task_id: Union[int, str]
    completion_id: int

    passed: Optional[bool] = None
    result: List[Optional[str]] = []
    futures: List[Any] = []
|
|
|
def add(self, future): |
|
self.futures.append(future) |
|
self.result.append(None) |
|
|
|
    def refresh(self, early_stop=False):
        for i, future in enumerate(self.futures):
            if self.result[i] is None and future.done():
                try:
                    self.result[i] = future.result()["result"]
                except CancelledError:
                    self.result[i] = "Early Stopped"
                except Exception as e:
                    self.result[i] = str(e)

                # With early stopping, a failed (or errored) test case means the
                # candidate can no longer pass, so cancel its remaining test cases.
                if early_stop and self.result[i] != "passed":
                    for pending in self.futures[i + 1 :]:
                        pending.cancel()
|
|
|
if all(r is not None for r in self.result): |
|
self.passed = all(r == "passed" for r in self.result) |
|
|
|
def done(self): |
|
return self.passed is not None |
|
|
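# A minimal usage sketch (for illustration only): it assumes this script and its
# companion execute.py live together in a local folder, e.g. "./code_eval" (a
# hypothetical path), and that HF_ALLOW_CODE_EVAL has been set as described in
# the warning above.
#
#   >>> import evaluate
#   >>> code_eval = evaluate.load("./code_eval")
#   >>> pass_at_k, results = code_eval.compute(
#   ...     references=[["assert add(2,3)==5"]],
#   ...     predictions=[["def add(a,b): return a*b", "def add(a, b): return a+b"]],
#   ...     k=[1, 2],
#   ... )
#   >>> print(pass_at_k)
#   {'pass@1': 0.5, 'pass@2': 1.0}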