loubnabnl HF staff committed on
Commit
2a517dd
1 Parent(s): 8805542

Create new file

Browse files
Files changed (1) hide show
  1. apps_metric.py +81 -0
apps_metric.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Evaluation module that scores code generations on the APPS benchmark."""
15
+
16
+ import evaluate
17
+ import datasets
18
+ from utils import compute_metrics
19
+
20
+
21
+ _CITATION = """\
22
+ @article{hendrycksapps2021,
23
+ title={Measuring Coding Challenge Competence With APPS},
24
+ author={Dan Hendrycks and Steven Basart and Saurav Kadavath and Mantas Mazeika and Akul Arora and Ethan Guo and Collin Burns and Samir Puranik and Horace He and Dawn Song and Jacob Steinhardt},
25
+ journal={NeurIPS},
26
+ year={2021}
27
+ }
28
+ """
29
+
30
+
31
+ _DESCRIPTION = """\
32
+ This is a metric to evaluate code generation using the APPS benchmark "Measuring Coding Challenge Competence With
33
+ APPS" (https://arxiv.org/pdf/2105.09938.pdf).
34
+ """
35
+
36
+
37
+ # TODO: Add description of the arguments of the module here
38
+ _KWARGS_DESCRIPTION = """
39
+ Computes Average accuracy and strict accuracy for single generations, and pass@k for multiple generations.
40
+ Args:
41
+ predictions: list of code generations to score. It's a list of list(s), each corresponding to a problem from APPS dataset.
42
+
43
+ Returns:
44
+ metrics: dict of three metrics: average accuracy, stric accuracy, and pass@k.
45
+ Examples:
46
+ >>> my_new_module = evaluate.load("loubnabnl/apps_metric")
47
+ >>> results = my_new_module.compute(references=["s=inpu()\nprint(s)"])
48
+ >>> print(results)
49
+ {'avg_accuracy': 0, 'strict_accuracy': 0, 'pass_at_k': None}
50
+ """
51
+
52
+
53
+
54
+
55
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class apps_metric(evaluate.EvaluationModule):
    """Evaluate code generation on APPS benchmark.

    The generations are compiled and their corresponding unit tests are run.
    The scoring itself is delegated to ``compute_metrics`` from ``utils``.
    """

    def _info(self):
        """Return the module metadata and the schema of the accepted inputs.

        The only declared input feature is ``predictions``: a sequence of
        strings per example.
        """
        return evaluate.EvaluationModuleInfo(
            module_type="metric",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Sequence(datasets.Value("string")),
            }),
            homepage="https://github.com/hendrycks/apps",
            reference_urls=["https://huggingface.co/datasets/codeparrot/apps"],
        )

    def _compute(self, predictions=None, k_list=None, count_errors=True, level=None, generations=None):
        """Return the scores computed by ``compute_metrics``.

        Args:
            predictions: the code generations to score. This is the name of
                the feature declared in ``_info``; ``evaluate`` forwards
                inputs as keyword arguments named after the declared
                features, so the parameter has to carry this name for
                ``compute(predictions=...)`` to work.
            k_list: forwarded to ``compute_metrics``; defaults to
                ``[1, 10, 100]`` when omitted.
            count_errors: forwarded to ``compute_metrics``.
            level: forwarded to ``compute_metrics``; defaults to ``["all"]``
                when omitted.
            generations: backward-compatible alias of ``predictions`` (the
                original name of the first parameter).

        Returns:
            Whatever ``compute_metrics`` returns for the given inputs.

        Raises:
            TypeError: if neither or both of ``predictions`` and
                ``generations`` are supplied.
        """
        if generations is not None:
            if predictions is not None:
                raise TypeError("pass either 'predictions' or 'generations', not both")
            predictions = generations
        if predictions is None:
            raise TypeError("_compute() missing 1 required argument: 'predictions'")

        # Build fresh lists per call instead of sharing mutable default
        # arguments across calls.
        k_list = [1, 10, 100] if k_list is None else k_list
        level = ["all"] if level is None else level

        return compute_metrics(predictions, k_list=k_list, count_errors=count_errors, level=level)