fix _temp_run can't be pickled; pass indices to allow evaluation on subset
Files changed:
- apps_metric.py  +2 -2
- tests.py  +20 -10
- utils.py  +17 -10
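Context for the first half of the fix: with the "spawn" start method (the default on Windows, and on macOS since Python 3.8), multiprocessing.Process has to pickle its target, and a function nested inside check_correctness cannot be pickled. A minimal standalone sketch of the failure and of the module-level workaround applied below (hypothetical names, not code from this repo):

import multiprocessing


def _module_level_worker(result):
    # Module-level functions are pickled by qualified name, so a "spawn"
    # child process can import and run them.
    result.append(42)


def broken():
    # Nested function: fine under "fork", but under "spawn" Process.start()
    # fails with "Can't pickle local object 'broken.<locals>._nested_worker'".
    def _nested_worker(result):
        result.append(42)

    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=_nested_worker, args=(result,))
    p.start()
    p.join()
    return list(result)


def fixed():
    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=_module_level_worker, args=(result,))
    p.start()
    p.join()
    return list(result)


if __name__ == "__main__":
    multiprocessing.set_start_method("spawn", force=True)
    print(fixed())    # [42]
    # broken()        # raises a pickling error on the nested target

The if __name__ == '__main__': guard added to tests.py matters for the same reason, since "spawn" re-imports the script in each child process; freeze_support() additionally covers frozen Windows executables.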
apps_metric.py
CHANGED
@@ -76,7 +76,7 @@ class apps_metric(evaluate.EvaluationModule):
 
 
 
-    def _compute(self, predictions, k_list=[1, 10, 100], count_errors=True, level="all", debug=False):
+    def _compute(self, predictions, indices=None, k_list=[1, 10, 100], count_errors=True, level="all", debug=False):
         """Returns the scores"""
-        metrics = compute_metrics(predictions, k_list=k_list, count_errors=count_errors, level=level, debug=debug)
+        metrics = compute_metrics(predictions, indices=indices, k_list=k_list, count_errors=count_errors, level=level, debug=debug)
         return metrics
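Sketch of how the new indices argument could be used to score generations for only part of the APPS test split; loading via evaluate.load follows the metric's existing usage, while the sample solutions and indices here are purely illustrative:

from evaluate import load

# Load the metric from the Hub (tests.py below instantiates apps_metric() directly instead).
metric = load("codeparrot/apps_metric")

# Candidate solutions for APPS test problems 0 and 5 only (contents illustrative).
generations = [
    ["print(sum(map(int, input().split())))"],   # generation(s) for problem 0
    ["n = int(input())\nprint(n * 2)"],          # generation(s) for problem 5
]

# `indices` tells the metric which dataset rows the generations correspond to;
# omitting it keeps the old behaviour (generations cover the whole split in order).
results = metric.compute(predictions=generations, indices=[0, 5], level="all")
print(results)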
tests.py
CHANGED
@@ -1,14 +1,24 @@
 import json
-from evaluate import load
+from multiprocessing import freeze_support
 
-solution_sample1 = json.load(open("test_examples/solutions_problem_1.json", "r"))
-solution_sample2 = json.load(open("test_examples/solutions_problem_2.json", "r"))
-single_solutions = [solution_sample1[:1], solution_sample2[:1]]
-multiple_solutions = [solution_sample1[:3], solution_sample2[:3]]
+from apps_metric import apps_metric
 
-metric = load("codeparrot/apps_metric")
-result_1 = metric.compute(predictions=single_solutions, level="all")
-result_2 = metric.compute(predictions=multiple_solutions, level="all", k_list=[1, 2, 3])
 
-assert result_1 == {'avg_accuracy': 1.0, 'strict_accuracy': 1.0, 'pass_at_k': None}
-assert result_2 == {'avg_accuracy': None, 'strict_accuracy': None, 'pass_at_k': {'pass@1': 1.0, 'pass@2': 1.0, 'pass@3': 1.0}}
+if __name__ == '__main__':
+    """
+    Verify by checking if reference solutions pass all test cases (with strict accuracy == 1).
+    Note that some reference solutions may not pass all test cases. So only throw a warning.
+    """
+    freeze_support()
+
+    solution_sample1 = json.load(open("test_examples/solutions_problem_1.json", "r"))
+    solution_sample2 = json.load(open("test_examples/solutions_problem_2.json", "r"))
+    single_solutions = [solution_sample1[:1], solution_sample2[:1]]
+    multiple_solutions = [solution_sample1[:3], solution_sample2[:3]]
+
+    metric = apps_metric()
+    result_1 = metric.compute(predictions=single_solutions, level="all")
+    result_2 = metric.compute(predictions=multiple_solutions, level="all", k_list=[1, 2, 3])
+
+    assert result_1 == {'avg_accuracy': 1.0, 'strict_accuracy': 1.0, 'pass_at_k': None}
+    assert result_2 == {'avg_accuracy': None, 'strict_accuracy': None, 'pass_at_k': {'pass@1': 1.0, 'pass@2': 1.0, 'pass@3': 1.0}}
utils.py
CHANGED
@@ -9,13 +9,14 @@ from .testing_util import run_test
 DATASET = "codeparrot/apps"
 TIMEOUT = 10
 
+
+def _temp_run(sample, generation, debug, result):
+    result.append(run_test(sample, test=generation, debug=debug))
+
 def check_correctness(sample, generation, timeout, debug=True):
     """Check correctness of code generation with a global timeout.
     The global timeout is to catch some extreme/rare cases not handled by the timeouts
     inside `run_test`"""
-    def _temp_run(sample, generation, debug, result):
-        result.append(run_test(sample, test=generation, debug=debug))
-
     manager = multiprocessing.Manager()
     result = manager.list()
     p = multiprocessing.Process(target=_temp_run, args=(sample, generation, debug, result))
@@ -32,12 +33,13 @@ def check_correctness(sample, generation, timeout, debug=True):
     return result[0]
 
 
-def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
+def evaluate_generations(generations: list, indices: list = [], level: str = "all", debug: bool = False):
     """We take the list of code generations and try to compile them
     and then run their corresponding unit tests, which are retrieved from the APPS dataset.
 
     Args:
         generations: list of code generations (same order as samples in APPS dataset)
+        indices: list of indices of the problems to evaluate; if None, evaluate all problems
         level: difficulty level used in the generation, can be "all", "introductory", "interview" or "competition"
 
     Returns:
@@ -47,10 +49,14 @@ def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
 
     # generations are code generations in the same order of the dataset
     apps_eval = load_dataset(DATASET, split="test", difficulties=[level])
+
+    if indices is None:
+        indices = range(len(generations))
+
     results = {}
-    for index in range(len(generations)):
+    for index, generation in zip(indices, generations):
         # code generations for problem (index)
-        problem_generations = generations[index]
+        problem_generations = generation
         # get corresponding samples from APPS dataset
         sample = apps_eval[index]
         res = []
@@ -74,7 +80,7 @@ def evaluate_generations(generations: list, level: str = "all", debug: bool = False):
                         print(f"Results were not True for all test cases")
             except Exception as e:
                 if debug:
-                    print(f"Compilation failed, test framework exception = {repr(e)}{e}\n")
+                    print(f"Compilation failed, test framework exception = {repr(e)}\n")
                 break
             finally:
                 assert isinstance(curr_res, list)
@@ -125,7 +131,7 @@ def get_results(results: Dict[int, list], count_errors: bool = False, k_list: list = [1, 10, 100]):
 
     metrics = {"avg_accuracy": None, "strict_accuracy": None, "pass_at_k": None}
 
-    if len(results[0]) == 1:
+    if len(list(results.values())[0]) == 1:
         # for single generations we compute average accuracy and strict accuracy: original APPS metrics
         print("Computing accuracy metrics...")
         res = []
@@ -173,10 +179,11 @@ def get_results(results: Dict[int, list], count_errors: bool = False, k_list: list = [1, 10, 100]):
     metrics["pass_at_k"] = pass_at_k
     return metrics
 
-def compute_metrics(generations, level="all", k_list=[1, 10, 100], count_errors=True, debug=False):
+def compute_metrics(generations, indices=None, level="all", k_list=[1, 10, 100], count_errors=True, debug=False):
     """Return metrics for the given generations.
     Args:
         generations: list of code generations for each problem (each generation is a list of generations)
+        indices: list of indices of the problems (if None, the generations cover all problems)
         k_list: list of k values to compute pass@k when using multiple generations
         count_errors: whether to count compilation and runtime errors when using single generations
         level: difficulty level in APPS dataset that was used for the given generations (from: "all", "introductory", "interview", "competition")
@@ -204,7 +211,7 @@ def compute_metrics(generations, level="all", k_list=[1, 10, 100], count_errors=True, debug=False):
     {'pass@1': 1.0, 'pass@2': 1.0, 'pass@3': 1.0}
     {'avg_accuracy': None, 'strict_accuracy': None, 'pass_at_k': {'pass@1': 1.0, 'pass@2': 1.0, 'pass@3': 1.0}}
     """
-    results = evaluate_generations(generations, level=level, debug=debug)
+    results = evaluate_generations(generations, indices=indices, level=level, debug=debug)
     metrics = get_results(results, count_errors=count_errors, k_list=k_list)
     return metrics
 
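The subset handling above boils down to pairing each list of generations with a dataset row index. A standalone sketch of that pairing (hypothetical helper that mirrors the zip/None logic rather than calling the real evaluate_generations); note the check is `is None`, so callers wanting the whole split should pass None (or just omit the argument at the compute_metrics level) rather than an empty list:

def pair_with_indices(generations, indices=None):
    # None means "the generations cover the whole split, in order".
    if indices is None:
        indices = range(len(generations))
    # Each generations list is evaluated against the APPS row at its paired index.
    return list(zip(indices, generations))


print(pair_with_indices([["sol_a"], ["sol_b"]]))       # [(0, ['sol_a']), (1, ['sol_b'])]
print(pair_with_indices([["sol_b"]], indices=[1]))     # [(1, ['sol_b'])]
print(pair_with_indices([["sol_a"]], indices=[]))      # [] -- an empty list pairs nothing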