diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..317c0291b96f68e5dafe73fa0d704bd33e0eaa9a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/__init__.py @@ -0,0 +1 @@ +from .evaluator import evaluate, simple_evaluate diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/__main__.py b/scripts/yans/lm-evaluation-harness/lm_eval/__main__.py new file mode 100644 index 0000000000000000000000000000000000000000..439cdbeaeef51fdd9976eac33632dd049e7ba173 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/__main__.py @@ -0,0 +1,461 @@ +import argparse +import json +import logging +import os +import sys +from functools import partial +from typing import Union + +from lm_eval import evaluator, utils +from lm_eval.evaluator import request_caching_arg_to_dict +from lm_eval.loggers import EvaluationTracker, WandbLogger +from lm_eval.tasks import TaskManager +from lm_eval.utils import handle_non_serializable, make_table, simple_parse_args_string + + +def _int_or_none_list_arg_type( + min_len: int, max_len: int, defaults: str, value: str, split_char: str = "," +): + def parse_value(item): + item = item.strip().lower() + if item == "none": + return None + try: + return int(item) + except ValueError: + raise argparse.ArgumentTypeError(f"{item} is not an integer or None") + + items = [parse_value(v) for v in value.split(split_char)] + num_items = len(items) + + if num_items == 1: + # Makes downstream handling the same for single and multiple values + items = items * max_len + elif num_items < min_len or num_items > max_len: + raise argparse.ArgumentTypeError( + f"Argument requires {max_len} integers or None, separated by '{split_char}'" + ) + elif num_items != max_len: + logging.warning( + f"Argument requires {max_len} integers or None, separated by '{split_char}'. " + "Missing values will be filled with defaults." + ) + default_items = [parse_value(v) for v in defaults.split(split_char)] + items.extend( + default_items[num_items:] + ) # extend items list with missing defaults + + return items + + +def check_argument_types(parser: argparse.ArgumentParser): + """ + Check to make sure all CLI args are typed, raises error if not + """ + for action in parser._actions: + if action.dest != "help" and not action.const: + if action.type is None: + raise ValueError( + f"Argument '{action.dest}' doesn't have a type specified." + ) + else: + continue + + +def setup_parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + "--model", "-m", type=str, default="hf", help="Name of model e.g. `hf`" + ) + parser.add_argument( + "--tasks", + "-t", + default=None, + type=str, + metavar="task1,task2", + help="Comma-separated list of task names or task groupings to evaluate on.\nTo get full list of tasks, use one of the commands `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above", + ) + parser.add_argument( + "--model_args", + "-a", + default="", + type=str, + help="Comma separated string arguments for model, e.g. 
`pretrained=EleutherAI/pythia-160m,dtype=float32`", + ) + parser.add_argument( + "--num_fewshot", + "-f", + type=int, + default=None, + metavar="N", + help="Number of examples in few-shot context", + ) + parser.add_argument( + "--batch_size", + "-b", + type=str, + default=1, + metavar="auto|auto:N|N", + help="Acceptable values are 'auto', 'auto:N' or N, where N is an integer. Default 1.", + ) + parser.add_argument( + "--max_batch_size", + type=int, + default=None, + metavar="N", + help="Maximal batch size to try with --batch_size auto.", + ) + parser.add_argument( + "--device", + type=str, + default=None, + help="Device to use (e.g. cuda, cuda:0, cpu).", + ) + parser.add_argument( + "--output_path", + "-o", + default=None, + type=str, + metavar="DIR|DIR/file.json", + help="The path to the output file where the result metrics will be saved. If the path is a directory and log_samples is true, the results will be saved in the directory. Else the parent directory will be used.", + ) + parser.add_argument( + "--limit", + "-L", + type=float, + default=None, + metavar="N|0 argparse.Namespace: + check_argument_types(parser) + return parser.parse_args() + + +def cli_evaluate(args: Union[argparse.Namespace, None] = None) -> None: + if not args: + # we allow for args to be passed externally, else we parse them ourselves + parser = setup_parser() + args = parse_eval_args(parser) + + if args.wandb_args: + wandb_logger = WandbLogger(**simple_parse_args_string(args.wandb_args)) + + eval_logger = utils.eval_logger + eval_logger.setLevel(getattr(logging, f"{args.verbosity}")) + eval_logger.info(f"Verbosity set to {args.verbosity}") + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + # update the evaluation tracker args with the output path and the HF token + if args.output_path: + args.hf_hub_log_args += f",output_path={args.output_path}" + if os.environ.get("HF_TOKEN", None): + args.hf_hub_log_args += f",token={os.environ.get('HF_TOKEN')}" + evaluation_tracker_args = simple_parse_args_string(args.hf_hub_log_args) + evaluation_tracker = EvaluationTracker(**evaluation_tracker_args) + + if args.predict_only: + args.log_samples = True + if (args.log_samples or args.predict_only) and not args.output_path: + raise ValueError( + "Specify --output_path if providing --log_samples or --predict_only" + ) + + if args.fewshot_as_multiturn and args.apply_chat_template is False: + raise ValueError( + "If fewshot_as_multiturn is set, apply_chat_template must be set to True." + ) + + if ( + args.num_fewshot is None or args.num_fewshot == 0 + ) and args.fewshot_as_multiturn: + raise ValueError( + "If fewshot_as_multiturn is set, num_fewshot must be greater than 0." + ) + + if args.include_path is not None: + eval_logger.info(f"Including path: {args.include_path}") + task_manager = TaskManager(args.verbosity, include_path=args.include_path) + + if "push_samples_to_hub" in evaluation_tracker_args and not args.log_samples: + eval_logger.warning( + "Pushing samples to the Hub requires --log_samples to be set. Samples will not be pushed to the Hub." + ) + + if args.limit: + eval_logger.warning( + " --limit SHOULD ONLY BE USED FOR TESTING." + "REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
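A note on the `_int_or_none_list_arg_type` helper defined near the top of `__main__.py`: a single value is broadcast to all slots and missing trailing values fall back to the defaults string, which is how a `--seed`-style argument can feed the four seeds (`args.seed[0]`..`args.seed[3]`) consumed later in `cli_evaluate`. A standalone sketch of that parsing rule (illustrative only; the name `parse_int_or_none_list` is invented here, range validation omitted, and the defaults mirror the evaluator's `0,1234,1234,1234`):

```python
from typing import List, Optional

def parse_int_or_none_list(value: str, max_len: int, defaults: str) -> List[Optional[int]]:
    def parse_one(item: str) -> Optional[int]:
        item = item.strip().lower()
        return None if item == "none" else int(item)

    items = [parse_one(v) for v in value.split(",")]
    if len(items) == 1:
        return items * max_len                       # a single value is broadcast to all slots
    default_items = [parse_one(v) for v in defaults.split(",")]
    return items + default_items[len(items):]        # missing trailing values fall back to defaults

print(parse_int_or_none_list("42", 4, "0,1234,1234,1234"))      # [42, 42, 42, 42]
print(parse_int_or_none_list("0,none", 4, "0,1234,1234,1234"))  # [0, None, 1234, 1234]
```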
+ ) + + if args.tasks is None: + eval_logger.error("Need to specify task to evaluate.") + sys.exit() + elif args.tasks == "list": + print(task_manager.list_all_tasks()) + sys.exit() + elif args.tasks == "list_groups": + print(task_manager.list_all_tasks(list_subtasks=False, list_tags=False)) + sys.exit() + elif args.tasks == "list_tags": + print(task_manager.list_all_tasks(list_groups=False, list_subtasks=False)) + sys.exit() + elif args.tasks == "list_subtasks": + print(task_manager.list_all_tasks(list_groups=False, list_tags=False)) + sys.exit() + else: + if os.path.isdir(args.tasks): + import glob + + task_names = [] + yaml_path = os.path.join(args.tasks, "*.yaml") + for yaml_file in glob.glob(yaml_path): + config = utils.load_yaml_config(yaml_file) + task_names.append(config) + else: + task_list = args.tasks.split(",") + task_names = task_manager.match_tasks(task_list) + for task in [task for task in task_list if task not in task_names]: + if os.path.isfile(task): + config = utils.load_yaml_config(task) + task_names.append(config) + task_missing = [ + task for task in task_list if task not in task_names and "*" not in task + ] # we don't want errors if a wildcard ("*") task name was used + + if task_missing: + missing = ", ".join(task_missing) + eval_logger.error( + f"Tasks were not found: {missing}\n" + f"{utils.SPACING}Try `lm-eval --tasks list` for list of available tasks", + ) + raise ValueError( + f"Tasks not found: {missing}. Try `lm-eval --tasks {{list_groups,list_subtasks,list_tags,list}}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues." + ) + + # Respect user's value passed in via CLI, otherwise default to True and add to comma-separated model args + if args.trust_remote_code: + eval_logger.info( + "Passed `--trust_remote_code`, setting environment variable `HF_DATASETS_TRUST_REMOTE_CODE=true`" + ) + # HACK: import datasets and override its HF_DATASETS_TRUST_REMOTE_CODE value internally, + # because it's already been determined based on the prior env var before launching our + # script--`datasets` gets imported by lm_eval internally before these lines can update the env. 
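The `--tasks` resolution above accepts the `list*` commands, a directory of YAML configs, comma-separated names with optional wildcards, or paths to individual YAML files. A hypothetical interactive session using the same `TaskManager` entry points (the task names are examples and may not exist in every install):

```python
from lm_eval.tasks import TaskManager

tm = TaskManager("INFO")                      # same object cli_evaluate builds
print(tm.list_all_tasks())                    # equivalent to `lm-eval --tasks list`

requested = "hellaswag,arc_easy,not_a_real_task".split(",")
found = tm.match_tasks(requested)             # names known to the registry
missing = [t for t in requested if t not in found and "*" not in t]
print(found)                                  # e.g. ['hellaswag', 'arc_easy']
print(missing)                                # ['not_a_real_task'] -> the CLI raises ValueError
```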
+ import datasets + + datasets.config.HF_DATASETS_TRUST_REMOTE_CODE = True + + args.model_args = args.model_args + ",trust_remote_code=True" + + eval_logger.info(f"Selected Tasks: {task_names}") + + request_caching_args = request_caching_arg_to_dict( + cache_requests=args.cache_requests + ) + + results = evaluator.simple_evaluate( + model=args.model, + model_args=args.model_args, + tasks=task_names, + num_fewshot=args.num_fewshot, + batch_size=args.batch_size, + max_batch_size=args.max_batch_size, + device=args.device, + use_cache=args.use_cache, + limit=args.limit, + check_integrity=args.check_integrity, + write_out=args.write_out, + log_samples=args.log_samples, + evaluation_tracker=evaluation_tracker, + system_instruction=args.system_instruction, + apply_chat_template=args.apply_chat_template, + fewshot_as_multiturn=args.fewshot_as_multiturn, + gen_kwargs=args.gen_kwargs, + task_manager=task_manager, + verbosity=args.verbosity, + predict_only=args.predict_only, + random_seed=args.seed[0], + numpy_random_seed=args.seed[1], + torch_random_seed=args.seed[2], + fewshot_random_seed=args.seed[3], + **request_caching_args, + ) + + if results is not None: + if args.log_samples: + samples = results.pop("samples") + dumped = json.dumps( + results, indent=2, default=handle_non_serializable, ensure_ascii=False + ) + if args.show_config: + print(dumped) + + batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) + + # Add W&B logging + if args.wandb_args: + try: + wandb_logger.post_init(results) + wandb_logger.log_eval_result() + if args.log_samples: + wandb_logger.log_eval_samples(samples) + except Exception as e: + eval_logger.info(f"Logging to Weights and Biases failed due to {e}") + + evaluation_tracker.save_results_aggregated( + results=results, samples=samples if args.log_samples else None + ) + + if args.log_samples: + for task_name, config in results["configs"].items(): + evaluation_tracker.save_results_samples( + task_name=task_name, samples=samples[task_name] + ) + + if ( + evaluation_tracker.push_results_to_hub + or evaluation_tracker.push_samples_to_hub + ): + evaluation_tracker.recreate_metadata_card() + + print( + f"{args.model} ({args.model_args}), gen_kwargs: ({args.gen_kwargs}), limit: {args.limit}, num_fewshot: {args.num_fewshot}, " + f"batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" + ) + print(make_table(results)) + if "groups" in results: + print(make_table(results, "groups")) + + if args.wandb_args: + # Tear down wandb run once all the logging is done. 
+ wandb_logger.run.finish() + + +if __name__ == "__main__": + cli_evaluate() diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/evaluator.py b/scripts/yans/lm-evaluation-harness/lm_eval/evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..ea4746753c6e036efcbc988a7ccc3a67bf0dbf0c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/evaluator.py @@ -0,0 +1,649 @@ +import itertools +import json +import logging +import random +import time +from collections import defaultdict +from typing import TYPE_CHECKING, List, Optional, Union + +import numpy as np +import torch + +import lm_eval.api.metrics +import lm_eval.api.registry +import lm_eval.api.task +import lm_eval.models +from lm_eval.caching.cache import delete_cache +from lm_eval.evaluator_utils import ( + consolidate_group_results, + consolidate_results, + get_sample_size, + get_subtask_list, + get_task_list, + prepare_print_tasks, + print_writeout, + run_task_tests, +) +from lm_eval.loggers import EvaluationTracker +from lm_eval.loggers.utils import add_env_info, add_tokenizer_info, get_git_commit_hash +from lm_eval.tasks import ( + TaskManager, + get_task_dict, +) +from lm_eval.utils import ( + eval_logger, + handle_non_serializable, + hash_string, + positional_deprecated, + simple_parse_args_string, +) + + +if TYPE_CHECKING: + from lm_eval.api.model import LM + from lm_eval.api.task import Task + + +@positional_deprecated +def simple_evaluate( + model, + model_args: Optional[Union[str, dict]] = None, + tasks: Optional[List[Union[str, dict, object]]] = None, + num_fewshot: Optional[int] = None, + batch_size: Optional[Union[int, str]] = None, + max_batch_size: Optional[int] = None, + device: Optional[str] = None, + use_cache: Optional[str] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + delete_requests_cache: bool = False, + limit: Optional[Union[int, float]] = None, + bootstrap_iters: int = 100000, + check_integrity: bool = False, + write_out: bool = False, + log_samples: bool = True, + evaluation_tracker: Optional[EvaluationTracker] = None, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + gen_kwargs: Optional[str] = None, + task_manager: Optional[TaskManager] = None, + verbosity: str = "INFO", + predict_only: bool = False, + random_seed: int = 0, + numpy_random_seed: int = 1234, + torch_random_seed: int = 1234, + fewshot_random_seed: int = 1234, +): + """Instantiate and evaluate a model on a list of tasks. + + :param model: Union[str, LM] + Name of model or LM object, see lm_eval.models.get_model + :param model_args: Optional[str, dict] + String or dict arguments for each model class, see LM.create_from_arg_string and LM.create_from_arg_object. + Ignored if `model` argument is a LM object. + :param tasks: list[Union[str, dict, Task]] + List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. + :param num_fewshot: int + Number of examples in few-shot context + :param batch_size: int or str, optional + Batch size for model + :param max_batch_size: int, optional + Maximal batch size to try with automatic batch size detection + :param device: str, optional + PyTorch device (e.g. "cpu" or "cuda:0") for running models + :param use_cache: str, optional + A path to a sqlite db file for caching model responses. `None` if not caching. 
+ :param cache_requests: bool, optional + Speed up evaluation by caching the building of dataset requests. `None` if not caching. + :param rewrite_requests_cache: bool, optional + Rewrites all of the request cache if set to `True`. `None` if not desired. + :param delete_requests_cache: bool, optional + Deletes all of the request cache if set to `True`. `None` if not desired. + :param limit: int or float, optional + Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. + :param bootstrap_iters: + Number of iterations for bootstrap statistics, used when calculating stderrs. set to 0 for no stderr calculations to be performed. + :param check_integrity: bool + Whether to run the relevant part of the test suite for the tasks + :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param system_instruction: str + System instruction to be applied to the prompt + :param apply_chat_template: bool + If True, apply chat template to the prompt + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :param gen_kwargs: str + String arguments for model generation + Ignored for all tasks with loglikelihood output_type + :param predict_only: bool + If true only model outputs will be generated and returned. Metrics will not be evaluated + :param random_seed: int + Random seed for python's random module. If set to None, the seed will not be set. + :param numpy_random_seed: int + Random seed for numpy. If set to None, the seed will not be set. + :param torch_random_seed: int + Random seed for torch. If set to None, the seed will not be set. + :param fewshot_random_seed: int + Random seed for fewshot sampler random generator. If set to None, the seed of generator will be set to None. + + :return + Dictionary of results + """ + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + start_date = time.time() + + if delete_requests_cache: + eval_logger.info("Deleting requests cache...") + delete_cache() + + seed_message = [] + if random_seed is not None: + # See https://github.com/EleutherAI/lm-evaluation-harness/pull/1412 + seed_message.append(f"Setting random seed to {random_seed}") + random.seed(random_seed) + + if numpy_random_seed is not None: + seed_message.append(f"Setting numpy seed to {numpy_random_seed}") + np.random.seed(numpy_random_seed) + + if torch_random_seed is not None: + seed_message.append(f"Setting torch manual seed to {torch_random_seed}") + torch.manual_seed(torch_random_seed) + + if seed_message: + eval_logger.info(" | ".join(seed_message)) + + if tasks is None: + tasks = [] + if len(tasks) == 0: + raise ValueError( + "No tasks specified, or no tasks found. Please verify the task names." + ) + + if gen_kwargs is not None: + gen_kwargs = simple_parse_args_string(gen_kwargs) + eval_logger.warning( + "generation_kwargs specified through cli, these settings will update set parameters in yaml tasks. " + "Ensure 'do_sample=True' for non-greedy decoding!" + ) + if gen_kwargs == "": + gen_kwargs = None + + if isinstance(model, str): + if model_args is None: + eval_logger.warning("model_args not specified. 
Using defaults.") + model_args = "" + + if isinstance(model_args, dict): + eval_logger.info( + f"Initializing {model} model, with arguments: {model_args}" + ) + lm = lm_eval.api.registry.get_model(model).create_from_arg_obj( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + + else: + eval_logger.info( + f"Initializing {model} model, with arguments: {simple_parse_args_string(model_args)}" + ) + lm = lm_eval.api.registry.get_model(model).create_from_arg_string( + model_args, + { + "batch_size": batch_size, + "max_batch_size": max_batch_size, + "device": device, + }, + ) + else: + if not isinstance(model, lm_eval.api.model.LM): + raise TypeError + eval_logger.info("Using pre-initialized model") + lm = model + + if use_cache is not None: + eval_logger.info(f"Using cache at {use_cache + '_rank' + str(lm.rank) + '.db'}") + lm = lm_eval.api.model.CachingLM( + lm, + use_cache + # each rank receives a different cache db. + # necessary to avoid multiple writes to cache at once + + "_rank" + + str(lm.rank) + + ".db", + ) + + if task_manager is None: + task_manager = TaskManager(verbosity) + + task_dict = get_task_dict(tasks, task_manager) + + # helper function to recursively apply config overrides to leaf subtasks, skipping their constituent groups. + # (setting of num_fewshot ; bypassing metric calculation ; setting fewshot seed) + def _adjust_config(task_dict): + adjusted_task_dict = {} + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + adjusted_task_dict = { + **adjusted_task_dict, + **{task_name: _adjust_config(task_obj)}, + } + + else: + if task_obj.get_config("output_type") == "generate_until": + if gen_kwargs is not None: + task_obj.set_config( + key="generation_kwargs", value=gen_kwargs, update=True + ) + + if predict_only: + eval_logger.info( + f"Processing {task_name} in output-only mode. Metrics will not be calculated!" + ) + # we have to change the class properties post-hoc. This is pretty hacky. + task_obj.override_metric(metric_name="bypass") + + # override tasks' fewshot values to the provided num_fewshot arg value + # except if tasks have it set to 0 manually in their configs--then we should never overwrite that + if num_fewshot is not None: + if (default_num_fewshot := task_obj.get_config("num_fewshot")) == 0: + eval_logger.info( + f"num_fewshot has been set to 0 for {task_name} in its config. Manual configuration will be ignored." + ) + else: + eval_logger.warning( + f"Overwriting default num_fewshot of {task_name} from {default_num_fewshot} to {num_fewshot}" + ) + task_obj.set_config(key="num_fewshot", value=num_fewshot) + else: + # if num_fewshot not provided, and the task does not define a default one, default to 0 + if ( + default_num_fewshot := task_obj.get_config("num_fewshot") + ) is None: + task_obj.set_config(key="num_fewshot", value=0) + # fewshot_random_seed set for tasks, even with a default num_fewshot (e.g. 
in the YAML file) + task_obj.set_fewshot_seed(seed=fewshot_random_seed) + eval_logger.info( + f"Setting fewshot random generator seed to {fewshot_random_seed}" + ) + + adjusted_task_dict[task_name] = task_obj + + return adjusted_task_dict + + task_dict = _adjust_config(task_dict) + + if check_integrity: + run_task_tests(task_list=tasks) + + if evaluation_tracker is not None: + evaluation_tracker.general_config_tracker.log_experiment_args( + model_source=model, + model_args=model_args, + system_instruction=system_instruction, + chat_template=lm.chat_template if apply_chat_template else None, + fewshot_as_multiturn=fewshot_as_multiturn, + ) + + results = evaluate( + lm=lm, + task_dict=task_dict, + limit=limit, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + bootstrap_iters=bootstrap_iters, + write_out=write_out, + log_samples=True if predict_only else log_samples, + system_instruction=system_instruction, + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, + verbosity=verbosity, + ) + + if lm.rank == 0: + if isinstance(model, str): + model_name = model + elif hasattr(model, "config") and hasattr(model.config, "_name_or_path"): + model_name = model.config._name_or_path + else: + model_name = type(model).__name__ + + # add info about the model and few shot config + results["config"] = { + "model": model_name, + "model_args": model_args, + } + # add more detailed model info if available + if isinstance(lm, lm_eval.models.huggingface.HFLM): + results["config"].update(lm.get_model_info()) + # add info about execution + results["config"].update( + { + "batch_size": batch_size, + "batch_sizes": ( + list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [] + ), + "device": device, + "use_cache": use_cache, + "limit": limit, + "bootstrap_iters": bootstrap_iters, + "gen_kwargs": gen_kwargs, + "random_seed": random_seed, + "numpy_seed": numpy_random_seed, + "torch_seed": torch_random_seed, + "fewshot_seed": fewshot_random_seed, + } + ) + results["git_hash"] = get_git_commit_hash() + results["date"] = start_date + add_env_info(results) # additional environment info to results + add_tokenizer_info(results, lm) # additional info about tokenizer + return results + else: + return None + + +@positional_deprecated +def evaluate( + lm: "LM", + task_dict, + limit: Optional[int] = None, + cache_requests: bool = False, + rewrite_requests_cache: bool = False, + bootstrap_iters: Optional[int] = 100000, + write_out: bool = False, + log_samples: bool = True, + system_instruction: Optional[str] = None, + apply_chat_template: bool = False, + fewshot_as_multiturn: bool = False, + verbosity: str = "INFO", +): + """Instantiate and evaluate a model on a list of tasks. + + :param lm: obj + Language Model + :param task_dict: dict[str, Task] + Dictionary of tasks. Tasks will be taken to have name type(task).config.task . + :param limit: int, optional + Limit the number of examples per task (only use this for testing) + :param bootstrap_iters: + Number of iterations for bootstrap statistics, used when calculating stderr. Set to 0 for skipping all stderr calculations. 
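Because `simple_evaluate` and `evaluate` are re-exported from the package root in `__init__.py`, the evaluator defined above can also be driven programmatically rather than through the CLI. A minimal sketch, assuming the `hf` backend and a small task such as `lambada_openai` are available in this install (model and task names are illustrative):

```python
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m,dtype=float32",
    tasks=["lambada_openai"],
    num_fewshot=0,
    batch_size=8,
    limit=10,            # smoke test only; limited runs do not produce real metrics
    log_samples=False,
)
if results is not None:  # non-zero ranks return None
    print(results["results"]["lambada_openai"])
    print(results["config"]["model"], results["n-shot"])
```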
+ :param write_out: bool + If True, write out an example document and model input for checking task integrity + :param log_samples: bool + If True, write out all model outputs and documents for per-sample measurement and post-hoc analysis + :param system_instruction: str + System instruction to be applied to the prompt + :param apply_chat_template: bool + If True, apply chat template to the prompt + :param fewshot_as_multiturn: bool + Whether to provide the fewshot examples as a multiturn conversation or a single user turn. + :return + Dictionary of results + """ + + eval_logger.setLevel(getattr(logging, f"{verbosity}")) + + # tracks all Instances/requests a model must generate output on. + requests = defaultdict(list) + # stores the amount to pad out reqs per req. type so that + # number of fwd passes per distributed rank is equal + padding_requests = defaultdict(int) + + # get lists of group hierarchy and each type of request + eval_tasks = get_task_list(task_dict) + if not log_samples: + if not all( + "bypass" not in getattr(task_output.task, "_metric_fn_list", {}).keys() + for task_output in eval_tasks + ): + raise ValueError("log_samples must be True for 'bypass' metric-only tasks") + for task_output in eval_tasks: + task: Task = task_output.task + limit = get_sample_size(task, limit) + task.build_all_requests( + limit=limit, + rank=lm.rank, + world_size=lm.world_size, + cache_requests=cache_requests, + rewrite_requests_cache=rewrite_requests_cache, + system_instruction=system_instruction, + apply_chat_template=apply_chat_template, + fewshot_as_multiturn=fewshot_as_multiturn, + chat_template=getattr(lm, "apply_chat_template") + if apply_chat_template + else None, + tokenizer_name=getattr(lm, "tokenizer_name", "") + if apply_chat_template + else "", + ) + eval_logger.debug( + f"Task: {task_output.task_name}; number of requests on this rank: {len(task.instances)}" + ) + if write_out: + print_writeout(task) + # aggregate Instances by LM method requested to get output. + for instance in task.instances: + reqtype = instance.request_type + requests[reqtype].append(instance) + + if lm.world_size > 1: + instances_rnk = torch.tensor(len(task._instances), device=lm.device) + gathered_item = ( + lm.accelerator.gather(instances_rnk).cpu().detach().numpy().tolist() + ) + # "multiple_choice" task types dispatch (several) "loglikelihood" request types + reqtype = ( + "loglikelihood" + if task.OUTPUT_TYPE == "multiple_choice" + else task.OUTPUT_TYPE + ) + # compute number of pseudo-batches to pad with (FSDP/DDP require even batches among ranks) + numpad = max(gathered_item) - gathered_item[lm.rank] + # todo: may not account for padding in cases like SquadV2 which has multiple req types + padding_requests[reqtype] += numpad + + ### Run LM on inputs, get all outputs ### + # execute each type of request + for reqtype, reqs in requests.items(): + eval_logger.info(f"Running {reqtype} requests") + # create `K` copies of each request `req` based off `K = req.repeats` + cloned_reqs = [] + for req in reqs: + cloned_reqs.extend([req] * req.repeats) + + if (lm.world_size > 1) and (padding_requests[reqtype] > 0): + for _ in range(padding_requests[reqtype]): + cloned_reqs.extend([req] * req.repeats) + + # run requests through model + resps = getattr(lm, reqtype)(cloned_reqs) + + # put responses from model into a list of length K for each request. 
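The request loop above fans each `Instance` out `repeats` times, batches by request type, and appends every model response back onto its originating instance. A toy illustration of that bookkeeping with stand-in objects (`FakeInstance` and the canned responses are invented for the example; the real code calls `getattr(lm, reqtype)(cloned_reqs)`):

```python
from collections import defaultdict
from dataclasses import dataclass, field

@dataclass
class FakeInstance:
    request_type: str
    prompt: str
    repeats: int = 1
    resps: list = field(default_factory=list)

instances = [
    FakeInstance("generate_until", "2+2=", repeats=3),
    FakeInstance("loglikelihood", "Paris is in France", repeats=1),
]

requests = defaultdict(list)
for inst in instances:
    requests[inst.request_type].append(inst)          # group by LM method requested

for reqtype, reqs in requests.items():
    cloned = [r for r in reqs for _ in range(r.repeats)]   # K copies of each request
    fake_resps = [f"<{reqtype} resp>"] * len(cloned)        # stands in for the model call
    for resp, inst in zip(fake_resps, cloned):
        inst.resps.append(resp)                             # K responses land back on each instance

print(len(instances[0].resps), len(instances[1].resps))     # 3 1
```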
+ for x, req in zip(resps, cloned_reqs): + req.resps.append(x) + + if lm.world_size > 1: + lm.accelerator.wait_for_everyone() + + RANK = lm.rank + WORLD_SIZE = lm.world_size + ### Postprocess outputs ### + # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately) + for task_output in eval_tasks: + task = task_output.task + task.apply_filters() + + ### Collect values of metrics on all datapoints ### + # # unpack results and sort back in order and return control to Task + # TODO: make it possible to use a different metric per filter + # Pre-process task.instances to group by doc_id + instances_by_doc_id = defaultdict(list) + for instance in task.instances: + instances_by_doc_id[instance.doc_id].append(instance) + # Sort instances within each group + for instances in instances_by_doc_id.values(): + instances.sort(key=lambda x: x.idx) + # iterate over different filters used + for filter_key in task.instances[0].filtered_resps.keys(): + doc_iterator = task.doc_iterator( + rank=RANK, limit=limit, world_size=WORLD_SIZE + ) + for doc_id, doc in doc_iterator: + requests = instances_by_doc_id[doc_id] + metrics = task.process_results( + doc, [req.filtered_resps[filter_key] for req in requests] + ) + if log_samples: + target = task.doc_to_target(doc) + example = { + "doc_id": doc_id, + "doc": doc, + "target": target, + "arguments": [req.args for req in requests], + "resps": [req.resps for req in requests], + "filtered_resps": [ + req.filtered_resps[filter_key] for req in requests + ], + "doc_hash": hash_string( + json.dumps( + requests[0].doc, + indent=2, + default=handle_non_serializable, + ensure_ascii=False, + ) + ), + "prompt_hash": hash_string(requests[0].arguments[0]), + "target_hash": hash_string(str(target)), + } + example.update(metrics) + task_output.logged_samples.append(example) + for metric, value in metrics.items(): + task_output.sample_metrics[(metric, filter_key)].append(value) + + if WORLD_SIZE > 1: + # if multigpu, then gather data across all ranks to rank 0 + # first gather logged samples across all ranks + for task_output in eval_tasks: + if log_samples: + # for task_name, task_samples in list(samples.items()): + full_samples = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.logged_samples, + object_gather_list=full_samples, + dst=0, + ) + + if RANK == 0: + task_output.logged_samples = list( + itertools.chain.from_iterable(full_samples) + ) + + # then collect metrics across all ranks + for metrics in task_output.sample_metrics: + metric_list = [None] * WORLD_SIZE if RANK == 0 else None + torch.distributed.gather_object( + obj=task_output.sample_metrics[metrics], + object_gather_list=metric_list, + dst=0, + ) + if RANK == 0: + task_output.sample_metrics[metrics] = list( + itertools.chain.from_iterable(metric_list) + ) + + if RANK == 0: + ### Aggregate results over all datapoints ### + # aggregate results ; run bootstrap CIs + for task_output in eval_tasks: + task_output.calculate_aggregate_metric(bootstrap_iters=bootstrap_iters) + ( + results, + samples, + configs, + versions, + num_fewshot, + higher_is_better, + ) = consolidate_results(eval_tasks) + + ### Calculate group metrics ### + if bool(results): + results, versions, show_group_table, *_ = consolidate_group_results( + results, versions, task_dict + ) + + results_agg, group_agg = prepare_print_tasks(task_dict, results) + subtask_list = get_subtask_list(task_dict) + + # collect all higher_is_better values for metrics + # in the group's 
subtasks. + # TODO: clean this up ; unify with the below metric_list loop? + _higher_is_better = {} + for group, task_list in subtask_list.items(): + if ( + len(task_list) != 0 + ): # subtask list will list "task_name": [] for solo tasks + for task in task_list: + for m, h in higher_is_better[task].items(): + if m not in _higher_is_better.keys(): + _higher_is_better[m] = h + + if ( + m in _higher_is_better + and _higher_is_better[m] is not None + and _higher_is_better[m] != h + ): + eval_logger.warning( + f"Higher_is_better values for metric {m} in group {group} are not consistent. Defaulting to None." + ) + _higher_is_better[m] = None + higher_is_better[group] = _higher_is_better + + results_dict = { + "results": dict(results_agg.items()), + **( + {"groups": dict(group_agg.items())} + if (bool(group_agg) & show_group_table) + else {} + ), + "group_subtasks": dict(reversed(subtask_list.items())), + "configs": dict(sorted(configs.items())), + "versions": dict(sorted(versions.items())), + "n-shot": dict(sorted(num_fewshot.items())), + "higher_is_better": dict(sorted(higher_is_better.items())), + "n-samples": { + task_output.task_name: { + "original": len(task_output.task.eval_docs), + "effective": min( + limit if limit else len(task_output.task.eval_docs), + len(task_output.task.eval_docs), + ), + } + for task_output in eval_tasks + }, + } + if log_samples: + results_dict["samples"] = dict(samples) + + return results_dict + + else: + return None + + +def request_caching_arg_to_dict(cache_requests: str) -> dict: + request_caching_args = { + "cache_requests": cache_requests in {"true", "refresh"}, + "rewrite_requests_cache": cache_requests == "refresh", + "delete_requests_cache": cache_requests == "delete", + } + + return request_caching_args diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/evaluator_utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/evaluator_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..80ef759adeef060920eef76ee85e7b699131cbda --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/evaluator_utils.py @@ -0,0 +1,542 @@ +import collections +import math +import pathlib +import sys +from typing import List, Optional, Tuple, Union + +from lm_eval.api.group import ConfigurableGroup +from lm_eval.api.metrics import ( + aggregate_subtask_metrics, + pooled_sample_stderr, + stderr_for_metric, +) +from lm_eval.api.task import Task +from lm_eval.utils import eval_logger, positional_deprecated + + +class TaskOutput: + """ + Wrapper class for Task outputs.It contains various attributes and methods to manage and calculate metrics for the task. + + Attributes: + task (object): The task object. + task_name (str): The name of the task. + task_config (dict): The configuration of the task. + version (str): The version of the task. + group_name (str): The name of the task group. + n_shot (int): The number of shots for the task. + task_alias (str): The alias of the task. + group_alias (str): The alias of the task group. + is_group (bool): Indicates if the task is a group. + logged_samples (list): The list of logged samples. + sample_len (int): The length of the samples. + sample_metrics (defaultdict): The dictionary of samples' metrics. + agg_metrics (defaultdict): The dictionary of aggregate metrics. + + Methods: + from_taskdict(cls, task_name: str, task): + Creates a TaskOutput instance from a task dictionary. + + calculate_aggregate_metric(bootstrap_iters=100000) -> None: + Calculates the aggregate metrics for the task. 
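`request_caching_arg_to_dict` at the end of `evaluator.py` translates the CLI's `--cache_requests` string into the three boolean flags `simple_evaluate` expects. The mapping, spelled out as a quick check:

```python
from lm_eval.evaluator import request_caching_arg_to_dict

for mode in ("true", "refresh", "delete"):
    print(mode, request_caching_arg_to_dict(cache_requests=mode))
# true    -> {'cache_requests': True,  'rewrite_requests_cache': False, 'delete_requests_cache': False}
# refresh -> {'cache_requests': True,  'rewrite_requests_cache': True,  'delete_requests_cache': False}
# delete  -> {'cache_requests': False, 'rewrite_requests_cache': False, 'delete_requests_cache': True}
```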
+ """ + + def __init__( + self, + task=None, + task_name=None, + task_config=None, + version=None, + group_name=None, + n_shot=None, + task_alias=None, + group_alias=None, + is_group=None, + ): + self.task = task + self.task_config = task_config + self.task_name = task_name + self.group_name = group_name + self.version = version + self.n_shot = n_shot + self.task_alias = task_alias + self.group_alias = group_alias + self.is_group = is_group + self.logged_samples = [] + self.sample_len = None + self.sample_metrics = collections.defaultdict(list) + self.agg_metrics = collections.defaultdict(list) + + @classmethod + def from_taskdict(cls, task_name: str, task): + if isinstance(task, tuple): + group_name, task = task + else: + group_name = None + if not task: + # these gets filtered out in get_task_list + # once they are added to group hierarchy + is_group = True + return cls( + task=task, task_name=task_name, is_group=is_group, group_name=group_name + ) + version = task.VERSION + task_config = dict(task.dump_config()) + if (n_shot := task_config.get("num_fewshot")) == 0: + n_shot = task_config.get("metadata", {}).get("num_fewshot", 0) + task_alias = task_config.get("alias") + group_alias = task_config.get("group_alias") + return cls( + task=task, + task_name=task_name, + task_config=task_config, + group_name=group_name, + version=version, + n_shot=n_shot, + task_alias=task_alias, + group_alias=group_alias, + ) + + def calculate_aggregate_metric(self, bootstrap_iters=100000) -> None: + for (metric, filter_key), items in self.sample_metrics.items(): + agg_fn = self.task.aggregation()[metric] + metric_key = f"{metric},{filter_key}" + self.agg_metrics[metric_key] = agg_fn(items) + self.sample_len = len(items) # TODO: same sample size for each metric? + if isinstance(bootstrap_iters, int): + stderr_fn = stderr_for_metric( + metric=agg_fn, + bootstrap_iters=min(bootstrap_iters, 100) + if metric in ["bleu", "chrf", "ter"] + else bootstrap_iters, + ) + self.agg_metrics[f"{metric}_stderr,{filter_key}"] = ( + stderr_fn(items) if (stderr_fn and len(items) > 1) else "N/A" + ) + else: + raise ValueError( + f"Received bootstrap_iters '{bootstrap_iters}' but expected an integer. Set to 0 to turn off stderr calculations." 
+ ) + + def __repr__(self): + return ( + f"TaskOutput(task_name={self.task_name}, " + f"group_name={self.group_name}, " + f"version={self.version}, " + f"n_shot={self.n_shot}, " + f"task_alias={self.task_alias}, " + f"group_alias={self.group_alias})" + ) + + +def get_task_list(task_dict: dict) -> List[TaskOutput]: + outputs = [] + for task_name, task_obj in task_dict.items(): + if isinstance(task_obj, dict): + _outputs = get_task_list(task_obj) + outputs.extend(_outputs) + else: + task_output = TaskOutput.from_taskdict(task_name, task_obj) + outputs.append(task_output) + + return outputs + + +def get_subtask_list(task_dict, task_root=None, depth=0): + subtask_list = {} + for group_obj, task_obj in task_dict.items(): + if isinstance(group_obj, ConfigurableGroup): + # group_name = group_obj.group_name + group_name = group_obj.group_name + else: + group_name = group_obj + if isinstance(task_obj, dict): + _subtask_list = get_subtask_list( + task_obj, task_root=group_name, depth=depth + 1 + ) + if task_root: + subtask_list.setdefault((task_root, depth), []).extend( + [ + _task + for (_task, _depth) in _subtask_list.keys() + if (_depth - 1) == depth + ] + ) + + subtask_list = {**subtask_list, **_subtask_list} + else: + if isinstance(task_obj, ConfigurableGroup): + # group_or_task_name = task_obj.group_name + group_or_task_name = task_obj.group_name + elif isinstance(task_obj, Task): + # group_or_task_name = task_obj.task_name + group_or_task_name = task_obj.task_name + + if task_root is None: + subtask_list.setdefault((group_or_task_name, depth), []) + else: + subtask_list.setdefault((task_root, depth), []).append( + group_or_task_name + ) + + if depth == 0: + _subtask_list = {} + for group_key, task_list in subtask_list.items(): + group_name, depth = group_key + _subtask_list[group_name] = task_list + subtask_list = _subtask_list + + return subtask_list + + +def print_writeout(task) -> None: + for inst in task.instances: + # print the prompt for the first few documents + if inst.doc_id < 1: + eval_logger.info( + f"Task: {task}; document {inst.doc_id}; context prompt (starting on next line):\ + \n{inst.args[0]}\n(end of prompt on previous line)\ntarget string or answer choice index (starting on next line):\n{task.doc_to_target(inst.doc)}\n(end of target on previous line)" + ) + eval_logger.info(f"Request: {str(inst)}") + + +def get_sample_size(task, limit: Optional[int]) -> Union[int, None]: + if limit is not None: + limit = ( + int(math.ceil(len(task.eval_docs) * limit)) if limit < 1.0 else int(limit) + ) + return limit + + +def prepare_print_tasks( + task_dict: dict, + results: dict, + task_depth=0, + group_depth=0, +) -> Tuple[dict, dict]: + """ + @param task_dict: Dictionary representing the group hierarchy of tasks. Each key is a group name and its + value is a list of task names. + @param results: Dictionary containing the results of each task. Each key is a + group name and its value is a dictionary of task results. + @param task_depth: The indentation level for printing the task + hierarchy. Default is 0. + @param group_depth: The indentation level for printing the group + hierarchy. Default is 0. + @return: A tuple of two dictionaries: results_agg and groups_agg. results_agg contains + aggregated results for each task, and groups_agg contains aggregated results for each group. + + Prepares the task hierarchy and aggregates the results for each task and group recursively for printing. + """ + + def _sort_task_dict(task_dict): + """ + Helper utility. 
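`get_sample_size` above gives `--limit` its dual meaning: a value below 1.0 is treated as a fraction of the task's eval docs, anything else as an absolute count. A self-contained mirror of that rule (the function name here is illustrative):

```python
import math

def sample_size(n_docs: int, limit):
    if limit is None:
        return None                                   # no limit: evaluate everything
    return int(math.ceil(n_docs * limit)) if limit < 1.0 else int(limit)

print(sample_size(1000, 0.25))   # 250 docs (fractional limit)
print(sample_size(1000, 64))     # 64 docs (absolute limit)
print(sample_size(1000, None))   # None
```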
Sorts the task dict at the current level of the hierarchy based on alphabetized task name. + Required so that we end up sorting within each sub-header correctly. + """ + + return dict( + sorted( + task_dict.items(), + key=lambda item: item[0].group_name + if isinstance(item[0], ConfigurableGroup) + else item[0], + ) + ) + + task_agg = collections.defaultdict(dict) + group_agg = collections.defaultdict(dict) + task_dict = _sort_task_dict(task_dict) + for task_or_group_name, task_or_group_obj in task_dict.items(): + tab_string = " " * task_depth + "- " if task_depth > 0 else "" + if isinstance(task_or_group_name, ConfigurableGroup): + # string_name = task_or_group_name.group_name + name = task_or_group_name.group_name + from_configurable_group = True + task_or_group_obj = _sort_task_dict(task_or_group_obj) + elif isinstance(task_or_group_name, str): + name = task_or_group_name + if isinstance(task_or_group_obj, Task): + # string_name = task_or_group_obj.task_name + name = task_or_group_obj.task_name + from_configurable_group = False + + task_agg[name] = results[name].copy() + if from_configurable_group: + if task_or_group_name.group_alias is not None: + alias = task_or_group_name.group_alias + else: + alias = task_or_group_name.group + else: + if "alias" in task_agg[name]: + alias = task_agg[name]["alias"] + else: + alias = name + + task_agg[name]["alias"] = tab_string + alias + if "samples" in task_agg[name]: + task_agg[name].pop("samples") + + if from_configurable_group and (" " not in results[name]): + group_tab_string = " " * group_depth + "- " if group_depth > 0 else "" + group_agg[name] = results[name].copy() + group_agg[name]["alias"] = group_tab_string + alias + if "samples" in group_agg[name]: + group_agg[name].pop("samples") + + if isinstance(task_or_group_obj, dict): + task_depth += 1 + group_depth += 1 + _task_agg, _group_agg = prepare_print_tasks( + task_or_group_obj, results, task_depth, group_depth + ) + task_agg = { + **task_agg, + **_task_agg, + } + group_agg = {**group_agg, **_group_agg} + task_depth -= 1 + group_depth -= 1 + return task_agg, group_agg + + +def consolidate_results( + eval_tasks: List[TaskOutput], +) -> Tuple[dict, dict, dict, dict, dict, dict]: + """ + @param eval_tasks: list(TaskOutput). + @return: A tuple containing the consolidated results, samples, configs, versions, and num_fewshot. + + Consolidates the results of multiple evaluation tasks into a single structure. + + The method iterates over each evaluation instance and extracts relevant information to create the consolidated + results structure. The consolidated results structure has the following properties: + + - results: A defaultdict with task names as keys and dictionaries as values. Each dictionary contains + metric/filter pairs as keys and corresponding metric values as values. The "alias" key is used to store task + aliases specified in the task configuration. + - samples: A defaultdict with task names as keys and lists of log samples as values. + - configs: A defaultdict with task names as keys and task configurations as values. + - versions: A defaultdict with task names as keys and task versions as values. + - num_fewshot: A defaultdict with task names as keys and number of few-shot samples as values. + - higher_is_better: A defaultdict with task names as keys and indicators of whether higher values are better + for each metric as values. + + The method then returns the consolidated results, samples, configs, versions, and num_fewshot as a tuple. 
+ """ + # stores the final result for each task, for each metric/filter pair. + results = collections.defaultdict(dict) + # logs info about each document evaluated. + samples = collections.defaultdict(list) + # store num-fewshot value per task + num_fewshot = collections.defaultdict(int) + # Tracks the YAML configs of all chosen task + configs = collections.defaultdict(dict) + # Tracks each task's version. + versions = collections.defaultdict(dict) + # Track `higher_is_better` for each metric + higher_is_better = collections.defaultdict(dict) + + for task_output in eval_tasks: + if "task_alias" in (task_config := task_output.task_config): + results[task_output.task_name]["alias"] = task_config["task_alias"] + else: + results[task_output.task_name]["alias"] = task_output.task_name + if group_alias := task_output.group_alias: + if group_alias not in results and (group_name := task_output.group_name): + results[group_name]["alias"] = group_alias + num_fewshot[task_output.task_name] = task_output.n_shot + configs[task_output.task_name] = task_output.task_config + versions[task_output.task_name] = task_output.version + samples[task_output.task_name] = task_output.logged_samples + higher_is_better[task_output.task_name] = task_output.task.higher_is_better() + for (metric, filter_key), items in task_output.sample_metrics.items(): + metric_key = f"{metric},{filter_key}" + results[task_output.task_name][metric_key] = task_output.agg_metrics[ + metric_key + ] + results[task_output.task_name]["samples"] = task_output.sample_len + results[task_output.task_name][f"{metric}_stderr,{filter_key}"] = ( + task_output.agg_metrics[f"{metric}_stderr,{filter_key}"] + ) + return results, samples, configs, versions, num_fewshot, higher_is_better + + +def consolidate_group_results( + results, + versions, + task_dict, + task_root=None, + show_group_table=False, + task_aggregation_list=None, +) -> Tuple[dict, dict, bool, Union[None,]]: + """ + (Recursively) calculates groups' aggregated metrics and updates the results and versions dictionaries with this info. + + @return: a tuple [results, versions, show_group_table, task_aggregation_list] with formats described below: + + - results: A defaultdict with task names (and, after this function is called, group names of + groups that perform aggregation) as keys, and dictionaries with "alias" and metric,filter_name pairs as keys. + - versions: A defaultdict with task names (and, after this function is called, group names of + groups that perform aggregation) as keys, and float values representing the task or group's version if a version is specified. (defaulting to None). + - show_group_table: a boolean which is true if there exists a group that requires printing of its aggregated scores in a group table. + - task_aggregation_list: a defaultdict listing the subtasks to average over to produce a given group's end metric. + + The method then returns the updated results, versions, show_group_table, and task_aggregation_list as a tuple. + In the top-level invocation of this function, task_aggregation_list is ignored. 
+ """ + if task_root is None: + task_root = {} + + if task_aggregation_list is None: + task_aggregation_list = {} + + for group_or_task, group_or_task_info in task_dict.items(): + # Convert to string + if isinstance(group_or_task, ConfigurableGroup): + group_config = group_or_task.config + group_or_task = group_or_task.group_name + else: + group_config = None + + if isinstance(group_or_task_info, Task): + if task_root: + task_aggregation_list.setdefault(task_root, []).append( + group_or_task_info.task_name + ) + else: + ( + results, + versions, + show_group_table, + _task_aggregation_list, + ) = consolidate_group_results( + results, + versions, + group_or_task_info, + group_or_task, + show_group_table, + task_aggregation_list, + ) + if task_root: + task_aggregation_list.setdefault(task_root, []).extend( + task_aggregation_list.get(group_or_task, []) + ) + + if (group_config is None) or ( + group_config["aggregate_metric_list"] is None + ): + results[group_or_task][" "] = " " + continue + + if "aggregate_metric_list" in group_config: + agg_metric_list = group_config["aggregate_metric_list"] + + show_group_table = show_group_table | bool( + group_config["aggregate_metric_list"] + ) + + task_list = _task_aggregation_list[group_or_task] + + metric_list = list( + { + key + for task in task_list + for key in results[task].keys() + if "_stderr" not in key and key not in ["task", "alias", "samples"] + } + ) + for metric in metric_list: + stderr = "_stderr,".join(metric.split(",")) + + # gather metrics, sizes, and stderrs from subtasks + metrics = [ + results[task][metric] + for task in task_list + if metric in results[task] + ] # TODO: copy? + stderrs = [ + results[task][stderr] + for task in task_list + if stderr in results[task] + ] + sizes = [ + results[task]["samples"] + for task in task_list + if metric in results[task] + ] + + for metric_config in agg_metric_list: + for filter_name in metric_config["filter_list"]: + if metric != ",".join([metric_config["metric"], filter_name]): + continue + + # compute group's pooled metric and stderr + if metric_config["aggregation"] == "mean": + aggregate_fn = aggregate_subtask_metrics + else: + raise ValueError( + f"Currently, only 'mean' is supported for automatically aggregating scores across groups' subtasks. Got '{metric_config['aggregation']}' for group '{group_or_task}'" + ) + + results[group_or_task][metric] = aggregate_fn( + metrics, + sizes, + metric_config["weight_by_size"], + ) + # TODO: calculate groups' metrics using arbitrary agg fns + if "N/A" in stderrs: + results[group_or_task][stderr] = "N/A" + else: + # NOTE: this assumes we are using the mean to aggregate. 
There are warnings about this elsewhere + results[group_or_task][stderr] = pooled_sample_stderr( + stderrs, sizes + ) + + results[group_or_task]["samples"] = sum(sizes) + group_metadata = group_config.get("metadata", None) + if group_metadata is not None: + versions[group_or_task] = group_metadata.get("version", None) + # print(results) + return results, versions, show_group_table, task_aggregation_list + + +@positional_deprecated +def find_test_root(start_path: pathlib.Path) -> pathlib.Path: + """ + Search upward in the directory tree to a maximum of three layers + to find and return the package root (containing the 'tests' folder) + """ + cur_path = start_path.resolve() + max_layers = 3 + for _ in range(max_layers): + if (cur_path / "tests" / "test_version_stable.py").exists(): + return cur_path + else: + cur_path = cur_path.parent.resolve() + raise FileNotFoundError( + f"Unable to find package root within {max_layers} upwards" + f"of {start_path}" + ) + + +@positional_deprecated +def run_task_tests(task_list: List[str]): + """ + Find the package root and run the tests for the given tasks + """ + import pytest + + package_root = find_test_root(start_path=pathlib.Path(__file__)) + task_string = " or ".join(task_list) + args = [ + f"{package_root}/tests/test_version_stable.py", + f"--rootdir={package_root}", + "-k", + f"{task_string}", + ] + sys.path.append(str(package_root)) + pytest_return_val = pytest.main(args) + if pytest_return_val: + raise ValueError( + f"Not all tests for the specified tasks ({task_list}) ran successfully! Error code: {pytest_return_val}" + ) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..46fa4acd4cc4f06d1f62f25840b3c4d9ffc92b7e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__init__.py @@ -0,0 +1,25 @@ +from functools import partial +from typing import List + +from lm_eval.api.filter import FilterEnsemble +from lm_eval.api.registry import get_filter + +from . import extraction, selection, transformation + + +def build_filter_ensemble( + filter_name: str, components: List[List[str]] +) -> FilterEnsemble: + """ + Create a filtering pipeline. 
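When a group config requests `aggregation: mean`, the code above combines subtask scores via `aggregate_subtask_metrics`, optionally weighting by subtask size. A sketch of that computation under the assumption that the helper (which lives in `lm_eval.api.metrics`) is a size-weighted mean; `weighted_group_mean` is an invented name for illustration:

```python
def weighted_group_mean(metrics, sizes, weight_by_size=True):
    if not weight_by_size:
        sizes = [1] * len(metrics)
    return sum(m * n for m, n in zip(metrics, sizes)) / sum(sizes)

# two subtasks: acc 0.70 over 1000 docs, acc 0.50 over 500 docs
print(weighted_group_mean([0.70, 0.50], [1000, 500]))         # 0.6333...
print(weighted_group_mean([0.70, 0.50], [1000, 500], False))  # 0.60 (unweighted)
```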
+ """ + filters = [] + for function, kwargs in components: + if kwargs is None: + kwargs = {} + # create a filter given its name in the registry + f = partial(get_filter(function), **kwargs) + # add the filter as a pipeline step + filters.append(f) + + return FilterEnsemble(name=filter_name, filters=filters) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/__init__.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..99cf59d6c780b9de400a317f5e0e4caa7648f467 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/extraction.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/extraction.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1d4daf22e3c801b0f2f05ab905e42d65b79f985e Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/extraction.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/selection.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/selection.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a8356ba2a420fcf8a8e3bfe3291ed2c6951f923e Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/selection.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/transformation.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/transformation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b02d85f6a663194a88d87b018614c5b6d224a240 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/filters/__pycache__/transformation.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/decontamination.py b/scripts/yans/lm-evaluation-harness/lm_eval/filters/decontamination.py new file mode 100644 index 0000000000000000000000000000000000000000..4eda4e022445355f191926790b2edf8f0cfa4bbd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/filters/decontamination.py @@ -0,0 +1,25 @@ +from lm_eval.api.filter import Filter +from lm_eval.api.registry import register_filter + + +@register_filter("decontaminate") +class DecontaminationFilter(Filter): + """ + A filter which evaluates + """ + + name = "track_decontamination" + + def __init__(self, path) -> None: + """ + + TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path"). 
+ should further cache result on a given (task_name, doc_id) + """ + self._decontam_results = None + + def apply(self, resps, docs) -> None: + """ + Return {"no_contamination", "only_contamination"} keys for the 2 different subsets + """ + pass diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/extraction.py b/scripts/yans/lm-evaluation-harness/lm_eval/filters/extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..41dc6208ce67ce36d69b2d91dcb6815a3fefb5a9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/filters/extraction.py @@ -0,0 +1,184 @@ +import re +import sys +import unicodedata + +from lm_eval.api.filter import Filter +from lm_eval.api.registry import register_filter + + +@register_filter("regex") +class RegexFilter(Filter): + """ """ + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ) -> None: + """ + pass a string `regex` to run `re.compile(r"regex")` on. + `fallback` defines the output returned if no matches for the regex are located. + """ + self.regex_pattern = regex_pattern + self.regex = re.compile(regex_pattern) + self.group_select = group_select + self.fallback = fallback + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + def filter_set(inst): + filtered = [] + for resp in inst: + match = self.regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + else: + match = self.fallback + filtered.append(match) + return filtered + + # print(resps) + filtered_resps = list(map(lambda x: filter_set(x), resps)) + # print(filtered_resps) + + return filtered_resps + + +@register_filter("remove_whitespace") +class WhitespaceFilter(Filter): + """ """ + + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + filtered_resp = [] + for resp in inst: + resp = resp.lstrip() + filtered_resp.append(resp) + return filtered_resp + + filtered_resps = [filter_set(resp) for resp in resps] + + return filtered_resps + + +@register_filter("multi_choice_regex") +class MultiChoiceRegexFilter(RegexFilter): + """ + A filter used to extract a model's answer on multiple choice questions with + letter answers. assumes each document has a "choices" field + containing the list of answer choices and that the answer label symbols + are of the form (A), (B), (C), ... or A, B, C. + """ + + def __init__( + self, + regex_pattern: str = r"#### (\-?[0-9\.\,]+)", + group_select=0, + fallback: str = "[invalid]", + ignore_case=False, + ignore_punctuation=False, + regexes_to_ignore=None, + ) -> None: + """ + regex_pattern: The basic regex pattern to use. If fails to match, we will use the customized match procedure + - step 1 : We parse the choices between ([A-Z])s then try to find these choices in the response. + - step 2 : We parse the choice with regex :[\s]*([A-?]), where ? varies by number of choices. + group_select: Selects the (group_select)th match from the findall result. 
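The `regex` filter registered above is how final answers are typically pulled out of generated text; its default pattern targets GSM8K-style `#### <number>` answers and falls back to `[invalid]` when nothing matches. A quick usage check (the sample responses are made up):

```python
from lm_eval.filters.extraction import RegexFilter

f = RegexFilter()   # default pattern r"#### (\-?[0-9\.\,]+)", fallback "[invalid]"
resps = [
    ["The answer is 6 apples.\n#### 6", "#### 7"],   # two sampled responses for doc 0
    ["no final answer given"],                        # one response for doc 1
]
print(f.apply(resps, docs=None))   # [['6', '7'], ['[invalid]']] -- docs is unused by this filter
```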
+ ignore_case: Ignores the case during step 1 matching + ignore_punctuation: Remove the punctuation during step 1 matching + regexes_to_ignore: Remove these regexes during step 1 matching + """ + super().__init__(regex_pattern, group_select, fallback) + self.ignore_case = ignore_case + self.ignore_punctuation = ignore_punctuation + self.regexes_to_ignore = regexes_to_ignore + + def apply(self, resps, docs): + # here, we assume we have a list, in which each element is + # a list of model responses for some particular input/target pair. + # so we process each of these (same input/target response sets) + # independently (and keep them a list.) + + def find_match(regex, resp, convert_dict={}): + match = regex.findall(resp) + if match: + match = match[self.group_select] + if isinstance(match, tuple): + match = [m for m in match if m][0] + match = match.strip() + if match and match in convert_dict: + match = convert_dict[match] + return match + + punct_tbl = dict.fromkeys( + i + for i in range(sys.maxunicode) + if unicodedata.category(chr(i)).startswith("P") + ) + + def filter_ignores(st): + if self.regexes_to_ignore is not None: + for s in self.regexes_to_ignore: + st = re.sub(s, "", st) + + if self.ignore_case: + st = st.lower() + + if self.ignore_punctuation: + # https://stackoverflow.com/a/266162 + st = st.translate(punct_tbl) + return st + + filtered_resps = [] + + for r, doc in zip(resps, docs): + fallback_regexes = [] + choice_to_alpha = {} + next_alpha = "A" + + without_paren_fallback_regexes = [] + without_paren_to_target = {} + + choices = doc["choices"] + for c in choices: + m = filter_ignores(c.strip()) + fallback_regexes.append(f"{re.escape(m)}") + choice_to_alpha[m] = f"({next_alpha})" + + without_paren_fallback_regexes.append(next_alpha) + without_paren_to_target[next_alpha] = f"({next_alpha})" + + next_alpha = chr(ord(next_alpha) + 1) + fallback_regex = re.compile("|".join(fallback_regexes)) + without_paren_fallback_regex = "|".join(without_paren_fallback_regexes) + without_paren_fallback_regex = re.compile( + f":[\s]*({without_paren_fallback_regex})" + ) + + filtered = [] + for resp in r: + match = find_match(self.regex, resp) + if not match: + match = find_match( + fallback_regex, filter_ignores(resp), choice_to_alpha + ) + if not match: + match = find_match( + without_paren_fallback_regex, resp, without_paren_to_target + ) + if not match: + match = self.fallback + filtered.append(match) + filtered_resps.append(filtered) + + return filtered_resps diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/selection.py b/scripts/yans/lm-evaluation-harness/lm_eval/filters/selection.py new file mode 100644 index 0000000000000000000000000000000000000000..6e368b5980626c8008ed48c45a360046660db13e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/filters/selection.py @@ -0,0 +1,61 @@ +from collections import Counter + +from lm_eval.api.filter import Filter +from lm_eval.api.registry import register_filter + + +# TODO: implement "arg_max" filter. either it should take in an arbitrary "scoring"/reward function +# that takes an input and returns a scalar and then should select the max reward, +# or should implement different filters for different ways of handling a reward model's inference. + + +@register_filter("take_first") +class TakeFirstFilter(Filter): + def __init__(self) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. 
+ """ + + def apply(self, resps, docs): + """ + Assuming each entry of `resps` is a list of model responses, we discard all but the first response. + """ + return map(lambda r: r[0], resps) + + +@register_filter("take_first_k") +class TakeKFilter(Filter): + def __init__(self, **kwargs) -> None: + self.k = kwargs.pop("k") + + super().__init__(**kwargs) + + def apply(self, resps, docs): + # need resp to be subscriptable to check below + resps = list(resps) + # check we have at least k responses per doc, else we can't take the first k + assert ( + len(resps[0]) >= self.k + ), f"Need at least {self.k} responses per doc to take first {self.k}, but got {len(resps[0])} only! Please increase TaskConfig.repeats ." + return map(lambda r: r[: self.k], resps) + + +@register_filter("majority_vote") +class MajorityVoteFilter(Filter): + def __init__(self) -> None: + """ + Can define custom behavior here, if an individual instantiation of a Filter class should have state. + """ + + def apply(self, resps, docs): + """ + Each entry of `resps` is a list of model responses. + We select the response that occurs most frequently in each entry of `resps`. + """ + + def select_majority(resp): + counts = Counter(resp) + vote = counts.most_common(1)[0][0] + return vote + + return map(lambda r: [select_majority(r)], resps) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/filters/transformation.py b/scripts/yans/lm-evaluation-harness/lm_eval/filters/transformation.py new file mode 100644 index 0000000000000000000000000000000000000000..cac1c5921dafe74be0b8416bd3a0678dc1fa1570 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/filters/transformation.py @@ -0,0 +1,56 @@ +from lm_eval.api.filter import Filter +from lm_eval.api.registry import register_filter + + +@register_filter("lowercase") +class LowercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.lower() for resp in inst] + + return [filter_set(resp) for resp in resps] + + +@register_filter("uppercase") +class UppercaseFilter(Filter): + def __init__(self) -> None: + pass + + def apply(self, resps, docs): + def filter_set(inst): + return [resp.upper() for resp in inst] + + return [filter_set(resp) for resp in resps] + + +@register_filter("map") +class MapFilter(Filter): + def __init__(self, mapping_dict: dict = None, default_value=None) -> None: + """ + Initializes the MapFilter with a given mapping dictionary and default value. + + Args: + - mapping_dict (dict): A dictionary containing the key-value mappings. + Default is an empty dictionary. + - default_value (Any): The value to be returned when a key is not found in the mapping_dict. + Default is None. 
+ + Example: + mapper = MapFilter({'A': 1, 'B': 2}, default_value=0) + """ + if mapping_dict is None: + mapping_dict = {} + assert isinstance( + mapping_dict, dict + ), "Provided mapping_dict is not a dictionary" + self.mapping_dict = mapping_dict + self.default_value = default_value + + def apply(self, resps, docs): + def filter_set(inst): + return [self.mapping_dict.get(resp, self.default_value) for resp in inst] + + return [filter_set(resp) for resp in resps] diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1f814214de4afaabd1367854c74dc2143c346744 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__init__.py @@ -0,0 +1,126 @@ +import ast +import os +from typing import Dict + +from lm_eval import utils +from lm_eval.utils import eval_logger + + +# Prompt library. +# Stores prompts in a dictionary indexed by 2 levels: +# prompt category name, and prompt name. +# This allows us to access prompts +PROMPT_REGISTRY: Dict[str, Dict[str, str]] = { + "qa-basic": { + "question-newline-answer": "Question: {{question}}\nAnswer:", + "q-newline-a": "Q: {{question}}\nA:", + }, +} + + +def get_prompt(prompt_id: str, dataset_name: str = None, subset_name: str = None): + # unpack prompt name + category_name, prompt_name = prompt_id.split(":") + if subset_name is None: + dataset_full_name = dataset_name + else: + dataset_full_name = f"{dataset_name}-{subset_name}" + eval_logger.info(f"Loading prompt from {category_name} for {dataset_full_name}") + if category_name == "promptsource": + try: + from promptsource.templates import DatasetTemplates + except ModuleNotFoundError: + raise Exception( + "Tried to load a Promptsource template, but promptsource is not installed ", + "please install promptsource via pip install lm-eval[promptsource] or pip install -e .[promptsource]", + ) + try: + if subset_name is None: + prompts = DatasetTemplates(dataset_name=dataset_name) + else: + prompts = DatasetTemplates( + dataset_name=dataset_name, subset_name=subset_name + ) + except Exception: + raise ValueError(f"{dataset_name} and {subset_name} not found") + if prompt_name in prompts.all_template_names: + return prompts[prompt_name] + else: + raise ValueError( + f"{prompt_name} not in prompt list {prompts.all_template_names}" + ) + elif ".yaml" in category_name: + import yaml + + with open(category_name, "rb") as file: + prompt_yaml_file = yaml.full_load(file) + + prompt_string = prompt_yaml_file["prompts"][prompt_name] + return PromptString(prompt_string) + else: + try: + return PROMPT_REGISTRY[category_name][prompt_name] + except Exception: + raise ValueError( + f"expected only a single `:` as separator between \ + prompt category and name, but got `{prompt_id}` instead" + ) + + +def load_prompt_list( + use_prompt: str, dataset_name=None, subset_name=None, yaml_path=None, **kwargs +): + category_name, prompt_name = use_prompt.split(":") + + if category_name == "promptsource": + from promptsource.templates import DatasetTemplates + + if subset_name is None: + prompts = DatasetTemplates(dataset_name=dataset_name) + else: + prompts = DatasetTemplates( + dataset_name=dataset_name, subset_name=subset_name + ) + + prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) + + elif ".yaml" in category_name: + import yaml + + if yaml_path is not None: + category_name = os.path.realpath(os.path.join(yaml_path, category_name)) + + with 
open(category_name, "rb") as file: + prompt_yaml_file = yaml.full_load(file) + + prompt_list = utils.pattern_match( + prompt_name, prompt_yaml_file["prompts"].keys() + ) + + # category_name, *prompt_name = use_prompt.split(":") + # TODO allow to multiple prompt naming + # if len(prompt_name) > 1: + # prompt_list = [] + # for prompt in prompt_name: + # prompt_list.append(utils.pattern_match(prompt_name, prompts.all_template_names)) + # else: + # prompt_list = utils.pattern_match(prompt_name, prompts.all_template_names) + return [":".join([category_name, prompt]) for prompt in prompt_list] + + +class PromptString: + def __init__(self, prompt_string): + self.prompt_string = prompt_string + + def apply(self, doc): + doc_to_text = self.prompt_string["doc_to_text"] + doc_to_target = self.prompt_string["doc_to_target"] + + # TODO need a way to process doc_to_choice + if "doc_to_choice" in self.prompt_string: + raise Exception("Not yet implemented to accept doc_to_choice") + + text_string = utils.apply_template(doc_to_text, doc) + target_string = utils.apply_template(doc_to_target, doc) + + return [text_string, target_string] diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__pycache__/__init__.cpython-310.pyc b/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c5f56c93c7ce0254dc23181cdc27e3a160107780 Binary files /dev/null and b/scripts/yans/lm-evaluation-harness/lm_eval/prompts/__pycache__/__init__.cpython-310.pyc differ diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/README.md new file mode 100644 index 0000000000000000000000000000000000000000..06b3111fbea68e14a41bc545b2ef5eb9eec9cdc4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/README.md @@ -0,0 +1,119 @@ + +# Tasks + + A list of supported tasks and task groupings can be viewed with `lm-eval --tasks list`. + + For more information, including a full list of task names and their precise meanings or sources, follow the links provided to the individual README.md files for each subfolder. + +| Task Family | Description | Language(s) | +|-------------|-------------|-------------| +| [aclue](aclue/README.md) | Tasks focusing on ancient Chinese language understanding and cultural aspects. | Ancient Chinese | +| [aexams](aexams/README.md) | Tasks in Arabic related to various academic exams covering a range of subjects. | Arabic | +| [agieval](agieval/README.md) | Tasks involving historical data or questions related to history and historical texts. | English, Chinese | +| [anli](anli/README.md) | Adversarial natural language inference tasks designed to test model robustness. | English | +| [arabicmmlu](arabicmmlu/README.md) | Localized Arabic version of MMLU with multiple-choice questions from 40 subjects. | Arabic | +| [arc](arc/README.md) | Tasks involving complex reasoning over a diverse set of questions. | English | +| [arithmetic](arithmetic/README.md) | Tasks involving numerical computations and arithmetic reasoning. | English | +| [asdiv](asdiv/README.md) | Tasks involving arithmetic and mathematical reasoning challenges. | English | +| [babi](babi/README.md) | Tasks designed as question and answering challenges based on simulated stories. | English | +| [basqueglue](basqueglue/README.md) | Tasks designed to evaluate language understanding in Basque language. 
| Basque | +| [bbh](bbh/README.md) | Tasks focused on deep semantic understanding through hypothesization and reasoning. | English, German | +| [belebele](belebele/README.md) | Language understanding tasks in a variety of languages and scripts. | Multiple (122 languages) | +| benchmarks | General benchmarking tasks that test a wide range of language understanding capabilities. | | +| [bertaqa](bertaqa/README.md) | Local Basque cultural trivia QA tests in English and Basque languages. | English, Basque, Basque (MT) | +| [bigbench](bigbench/README.md) | Broad tasks from the BIG-bench benchmark designed to push the boundaries of large models. | Multiple | +| [blimp](blimp/README.md) | Tasks testing grammatical phenomena to evaluate language model's linguistic capabilities. | English | +| [ceval](ceval/README.md) | Tasks that evaluate language understanding and reasoning in an educational context. | Chinese | +| [cmmlu](cmmlu/README.md) | Multi-subject multiple choice question tasks for comprehensive academic assessment. | Chinese | +| code_x_glue | Tasks that involve understanding and generating code across multiple programming languages. | Go, Java, JS, PHP, Python, Ruby | +| [commonsense_qa](commonsense_qa/README.md) | CommonsenseQA, a multiple-choice QA dataset for measuring commonsense knowledge. | English | +| [copal_id](copal_id/README.md) | Indonesian causal commonsense reasoning dataset that captures local nuances. | Indonesian | +| [coqa](coqa/README.md) | Conversational question answering tasks to test dialog understanding. | English | +| [crows_pairs](crows_pairs/README.md) | Tasks designed to test model biases in various sociodemographic groups. | English, French | +| csatqa | Tasks related to SAT and other standardized testing questions for academic assessment. | Korean | +| [drop](drop/README.md) | Tasks requiring numerical reasoning, reading comprehension, and question answering. | English | +| [eq_bench](eq_bench/README.md) | Tasks focused on equality and ethics in question answering and decision-making. | English | +| [eus_exams](eus_exams/README.md) | Tasks based on various professional and academic exams in the Basque language. | Basque | +| [eus_proficiency](eus_proficiency/README.md) | Tasks designed to test proficiency in the Basque language across various topics. | Basque | +| [eus_reading](eus_reading/README.md) | Reading comprehension tasks specifically designed for the Basque language. | Basque | +| [eus_trivia](eus_trivia/README.md) | Trivia and knowledge testing tasks in the Basque language. | Basque | +| [fda](fda/README.md) | Tasks for extracting key-value pairs from FDA documents to test information extraction. | English | +| [fld](fld/README.md) | Tasks involving free-form and directed dialogue understanding. | English | +| [french_bench](french_bench/README.md) | Set of tasks designed to assess language model performance in French. | French| +| [glue](glue/README.md) | General Language Understanding Evaluation benchmark to test broad language abilities. | English | +| [gpqa](gpqa/README.md) | Tasks designed for general public question answering and knowledge verification. | English | +| [gsm8k](gsm8k/README.md) | A benchmark of grade school math problems aimed at evaluating reasoning capabilities. | English | +| [haerae](haerae/README.md) | Tasks focused on assessing detailed factual and historical knowledge. | Korean | +| [headqa](headqa/README.md) | A high-level education-based question answering dataset to test specialized knowledge. 
| Spanish, English | +| [hellaswag](hellaswag/README.md) | Tasks to predict the ending of stories or scenarios, testing comprehension and creativity. | English | +| [hendrycks_ethics](hendrycks_ethics/README.md) | Tasks designed to evaluate the ethical reasoning capabilities of models. | English | +| [hendrycks_math](hendrycks_math/README.md) | Mathematical problem-solving tasks to test numerical reasoning and problem-solving. | English | +| [ifeval](ifeval/README.md) | Interactive fiction evaluation tasks for narrative understanding and reasoning. | English | +| [inverse_scaling](inverse_scaling/README.md) | Multiple-choice tasks from the Inverse Scaling Prize, designed to find settings where larger language models perform worse. | English | +| [kmmlu](kmmlu/README.md) | Knowledge-based multi-subject multiple choice questions for academic evaluation. | Korean | +| [kobest](kobest/README.md) | A collection of tasks designed to evaluate understanding in Korean language. | Korean | +| [kormedmcqa](kormedmcqa/README.md) | Medical question answering tasks in Korean to test specialized domain knowledge. | Korean | +| [lambada](lambada/README.md) | Tasks designed to predict the endings of text passages, testing language prediction skills. | English | +| [lambada_cloze](lambada_cloze/README.md) | Cloze-style LAMBADA dataset. | English | +| [lambada_multilingual](lambada_multilingual/README.md) | Multilingual LAMBADA dataset. This is a legacy version of the multilingual dataset, and users should instead use `lambada_multilingual_stablelm`. | German, English, Spanish, French, Italian | +| [lambada_multilingual_stablelm](lambada_multilingual_stablelm/README.md) | Multilingual LAMBADA dataset. Users should prefer evaluating on this version of the multilingual dataset instead of on `lambada_multilingual`. | German, English, Spanish, French, Italian, Dutch, Portuguese | +| [leaderboard](leaderboard/README.md) | Task group used by Hugging Face's [Open LLM Leaderboard v2](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard). Those tasks are static and will not change through time | English | +| [logiqa](logiqa/README.md) | Logical reasoning tasks requiring advanced inference and deduction. | English, Chinese | +| [logiqa2](logiqa2/README.md) | Large-scale logical reasoning dataset adapted from the Chinese Civil Service Examination. | English, Chinese | +| [mathqa](mathqa/README.md) | Question answering tasks involving mathematical reasoning and problem-solving. | English | +| [mc_taco](mc_taco/README.md) | Question-answer pairs that require temporal commonsense comprehension. | English | +| [med_concepts_qa](med_concepts_qa/README.md) | Benchmark for evaluating LLMs on their abilities to interpret medical codes and distinguish between medical concept. | English | +| medmcqa | Medical multiple choice questions assessing detailed medical knowledge. | English | +| medqa | Multiple choice question answering based on the United States Medical License Exams. | | +| [mgsm](mgsm/README.md) | Benchmark of multilingual grade-school math problems. | Spanish, French, German, Russian, Chinese, Japanese, Thai, Swahili, Bengali, Telugu | +| [minerva_math](minerva_math/README.md) | Mathematics-focused tasks requiring numerical reasoning and problem-solving skills. | English | +| mmlu | Massive Multitask Language Understanding benchmark for broad domain language evaluation. Several variants are supported. | English | +| [mmlusr](mmlusr/README.md) | Variation of MMLU designed to be more rigourous. 
| English | +| model_written_evals | Evaluation tasks auto-generated for evaluating a collection of AI Safety concerns. | | +| [mutual](mutual/README.md) | A retrieval-based dataset for multi-turn dialogue reasoning. | English | +| [nq_open](nq_open/README.md) | Open domain question answering tasks based on the Natural Questions dataset. | English | +| [okapi/arc_multilingual](okapi/arc_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | +| [okapi/hellaswag_multilingual](okapi/hellaswag_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (30 languages) **Machine Translated.** | +| okapi/mmlu_multilingual | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (34 languages) **Machine Translated.** | +| [okapi/truthfulqa_multilingual](okapi/truthfulqa_multilingual/README.md) | Tasks that involve reading comprehension and information retrieval challenges. | Multiple (31 languages) **Machine Translated.** | +| [openbookqa](openbookqa/README.md) | Open-book question answering tasks that require external knowledge and reasoning. | English | +| [paloma](paloma/README.md) | Paloma is a comprehensive benchmark designed to evaluate open language models across a wide range of domains, ranging from niche artist communities to mental health forums on Reddit. | English | +| [paws-x](paws-x/README.md) | Paraphrase Adversaries from Word Scrambling, focusing on cross-lingual capabilities. | English, French, Spanish, German, Chinese, Japanese, Korean | +| [pile](pile/README.md) | Open source language modelling data set that consists of 22 smaller, high-quality datasets. | English | +| [pile_10k](pile_10k/README.md) | The first 10K elements of The Pile, useful for debugging models trained on it. | English | +| [piqa](piqa/README.md) | Physical Interaction Question Answering tasks to test physical commonsense reasoning. | English | +| [polemo2](polemo2/README.md) | Sentiment analysis and emotion detection tasks based on Polish language data. | Polish | +| [prost](prost/README.md) | Tasks requiring understanding of professional standards and ethics in various domains. | English | +| [pubmedqa](pubmedqa/README.md) | Question answering tasks based on PubMed research articles for biomedical understanding. | English | +| [qa4mre](qa4mre/README.md) | Question Answering for Machine Reading Evaluation, assessing comprehension and reasoning. | English | +| [qasper](qasper/README.md) | Question Answering dataset based on academic papers, testing in-depth scientific knowledge. | English | +| [race](race/README.md) | Reading comprehension assessment tasks based on English exams in China. | English | +| realtoxicityprompts | Tasks to evaluate language models for generating text with potential toxicity. | | +| [sciq](sciq/README.md) | Science Question Answering tasks to assess understanding of scientific concepts. | English | +| [scrolls](scrolls/README.md) | Tasks that involve long-form reading comprehension across various domains. | English | +| [siqa](siqa/README.md) | Social Interaction Question Answering to evaluate common sense and social reasoning. | English | +| [squad_completion](squad_completion/README.md) | A variant of the SQuAD question answering task designed for zero-shot evaluation of small LMs. 
| English | +| [squadv2](squadv2/README.md) | Stanford Question Answering Dataset version 2, a reading comprehension benchmark. | English | +| [storycloze](storycloze/README.md) | Tasks to predict story endings, focusing on narrative logic and coherence. | English | +| [super_glue](super_glue/README.md) | A suite of challenging tasks designed to test a range of language understanding skills. | English | +| [swag](swag/README.md) | Situations With Adversarial Generations, predicting the next event in videos. | English | +| [swde](swde/README.md) | Information extraction tasks from semi-structured web pages. | English | +| [tinyBenchmarks](tinyBenchmarks/README.md) | Evaluation of large language models with fewer examples using tiny versions of popular benchmarks. | English | +| [tmmluplus](tmmluplus/README.md) | An extended set of tasks under the TMMLU framework for broader academic assessments. | Traditional Chinese | +| [toxigen](toxigen/README.md) | Tasks designed to evaluate language models on their propensity to generate toxic content. | English | +| [translation](translation/README.md) | Tasks focused on evaluating the language translation capabilities of models. | Arabic, English, Spanish, Basque, Hindi, Indonesian, Burmese, Russian, Swahili, Telugu, Chinese | +| [triviaqa](triviaqa/README.md) | A large-scale dataset for trivia question answering to test general knowledge. | English | +| [truthfulqa](truthfulqa/README.md) | A QA task aimed at evaluating the truthfulness and factual accuracy of model responses. | English | +| [unitxt](unitxt/README.md) | A number of tasks implemented using the unitxt library for flexible, shareable, and reusable data preparation and evaluation for generative AI. | English | +| [unscramble](unscramble/README.md) | Tasks involving the rearrangement of scrambled sentences to test syntactic understanding. | English | +| [webqs](webqs/README.md) | Web-based question answering tasks designed to evaluate internet search and retrieval. | English | +| [wikitext](wikitext/README.md) | Tasks based on text from Wikipedia articles to assess language modeling and generation. | English | +| [winogrande](winogrande/README.md) | A large-scale dataset for coreference resolution, inspired by the Winograd Schema Challenge. | English | +| [wmdp](wmdp/README.md) | A benchmark with the objective of minimizing performance, based on potentially-sensitive multiple-choice knowledge questions. | English | +| [wmt2016](wmt2016/README.md) | Tasks from the WMT 2016 shared task, focusing on translation between multiple languages. | English, Czech, German, Finnish, Russian, Romanian, Turkish | +| [wsc273](wsc273/README.md) | The Winograd Schema Challenge, a test of commonsense reasoning and coreference resolution. | English | +| [xcopa](xcopa/README.md) | Cross-lingual Choice of Plausible Alternatives, testing reasoning in multiple languages. | Estonian, Haitian, Indonesian, Italian, Quechua, Swahili, Tamil, Thai, Turkish, Vietnamese, Chinese | +| [xnli](xnli/README.md) | Cross-Lingual Natural Language Inference to test understanding across different languages. | Arabic, Bulgarian, German, Greek, English, Spanish, French, Hindi, Russian, Swahili, Thai, Turkish, Urdu, Vietnamese, Chinese | +| [xnli_eu](xnli_eu/README.md) | Cross-lingual Natural Language Inference tasks in Basque. | Basque | +| [xstorycloze](xstorycloze/README.md) | Cross-lingual narrative understanding tasks to predict story endings in multiple languages. 
| Russian, Simplified Chinese, Spanish, Arabic, Hindi, Indonesian, Telugu, Swahili, Basque, Burmese | +| [xwinograd](xwinograd/README.md) | Cross-lingual Winograd schema tasks for coreference resolution in multiple languages. | English, French, Japanese, Portuguese, Russian, Chinese | diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__init__.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b892717956240e6514ee2af6033da696686d1c71 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/__init__.py @@ -0,0 +1,650 @@ +import collections +import inspect +import logging +import os +from functools import partial +from typing import Dict, List, Mapping, Optional, Union + +from lm_eval import utils +from lm_eval.api.group import ConfigurableGroup, GroupConfig +from lm_eval.api.task import ConfigurableTask, Task +from lm_eval.evaluator_utils import get_subtask_list + + +GROUP_ONLY_KEYS = list(GroupConfig().to_dict().keys()) + + +class TaskManager: + """TaskManager indexes all tasks from the default `lm_eval/tasks/` + and an optional directory if provided. + + """ + + def __init__( + self, + verbosity="INFO", + include_path: Optional[Union[str, List]] = None, + include_defaults: bool = True, + ) -> None: + self.verbosity = verbosity + self.include_path = include_path + self.logger = utils.eval_logger + self.logger.setLevel(getattr(logging, f"{verbosity}")) + + self._task_index = self.initialize_tasks( + include_path=include_path, include_defaults=include_defaults + ) + self._all_tasks = sorted(list(self._task_index.keys())) + + self._all_groups = sorted( + [x for x in self._all_tasks if self._task_index[x]["type"] == "group"] + ) + self._all_subtasks = sorted( + [x for x in self._all_tasks if self._task_index[x]["type"] == "task"] + ) + self._all_tags = sorted( + [x for x in self._all_tasks if self._task_index[x]["type"] == "tag"] + ) + + self.task_group_map = collections.defaultdict(list) + + def initialize_tasks( + self, + include_path: Optional[Union[str, List]] = None, + include_defaults: bool = True, + ): + """Creates a dictionary of tasks index. + + :param include_path: Union[str, List] = None + An additional path to be searched for tasks recursively. + Can provide more than one such path as a list. + :param include_defaults: bool = True + If set to false, default tasks (those in lm_eval/tasks/) are not indexed. + :return + Dictionary of task names as key and task metadata + """ + if include_defaults: + all_paths = [os.path.dirname(os.path.abspath(__file__)) + "/"] + else: + all_paths = [] + if include_path is not None: + if isinstance(include_path, str): + include_path = [include_path] + all_paths.extend(include_path) + + task_index = {} + for task_dir in all_paths: + tasks = self._get_task_and_group(task_dir) + task_index = {**tasks, **task_index} + + return task_index + + @property + def all_tasks(self): + return self._all_tasks + + @property + def all_groups(self): + return self._all_groups + + @property + def all_subtasks(self): + return self._all_subtasks + + @property + def all_tags(self): + return self._all_tags + + @property + def task_index(self): + return self._task_index + + def list_all_tasks( + self, list_groups=True, list_tags=True, list_subtasks=True + ) -> str: + from pytablewriter import MarkdownTableWriter + + def sanitize_path(path): + # don't print full path if we are within the lm_eval/tasks dir ! + # if we aren't though, provide the full path. 
+ if "lm_eval/tasks/" in path: + return "lm_eval/tasks/" + path.split("lm_eval/tasks/")[-1] + else: + return path + + group_table = MarkdownTableWriter() + group_table.headers = ["Group", "Config Location"] + gt_values = [] + for g in self.all_groups: + path = self.task_index[g]["yaml_path"] + if path == -1: + path = "---" + else: + path = sanitize_path(path) + gt_values.append([g, path]) + group_table.value_matrix = gt_values + + tag_table = MarkdownTableWriter() + tag_table.headers = ["Tag"] + tag_table.value_matrix = [[t] for t in self.all_tags] + + subtask_table = MarkdownTableWriter() + subtask_table.headers = ["Task", "Config Location", "Output Type"] + st_values = [] + for t in self.all_subtasks: + path = self.task_index[t]["yaml_path"] + + output_type = "" + + # read the yaml file to determine the output type + if path != -1: + config = utils.load_yaml_config(path, mode="simple") + if "output_type" in config: + output_type = config["output_type"] + elif ( + "include" in config + ): # if no output type, check if there is an include with an output type + include_path = path.split("/")[:-1] + config["include"] + include_config = utils.load_yaml_config(include_path, mode="simple") + if "output_type" in include_config: + output_type = include_config["output_type"] + + if path == -1: + path = "---" + else: + path = sanitize_path(path) + st_values.append([t, path, output_type]) + subtask_table.value_matrix = st_values + + result = "\n" + if list_groups: + result += group_table.dumps() + "\n\n" + if list_tags: + result += tag_table.dumps() + "\n\n" + if list_subtasks: + result += subtask_table.dumps() + "\n\n" + return result + + def match_tasks(self, task_list): + return utils.pattern_match(task_list, self.all_tasks) + + def _name_is_registered(self, name) -> bool: + if name in self.all_tasks: + return True + return False + + def _name_is_task(self, name) -> bool: + if self._name_is_registered(name) and (self.task_index[name]["type"] == "task"): + return True + return False + + def _name_is_tag(self, name) -> bool: + if self._name_is_registered(name) and (self.task_index[name]["type"] == "tag"): + return True + return False + + def _name_is_group(self, name) -> bool: + if self._name_is_registered(name) and ( + self.task_index[name]["type"] == "group" + ): + return True + return False + + def _name_is_python_task(self, name): + if self._name_is_registered(name) and ( + self.task_index[name]["type"] == "python_task" + ): + return True + return False + + def _config_is_task(self, config) -> bool: + if ("task" in config) and isinstance(config["task"], str): + return True + return False + + def _config_is_group(self, config) -> bool: + if ("task" in config) and isinstance(config["task"], list): + return True + return False + + def _config_is_python_task(self, config) -> bool: + if "class" in config: + return True + return False + + def _get_yaml_path(self, name): + if name not in self.task_index: + raise ValueError + return self.task_index[name]["yaml_path"] + + def _get_config(self, name): + if name not in self.task_index: + raise ValueError + yaml_path = self._get_yaml_path(name) + if yaml_path == -1: + return {} + else: + return utils.load_yaml_config(yaml_path, mode="full") + + def _get_tasklist(self, name): + if self._name_is_task(name): + raise ValueError + return self.task_index[name]["task"] + + def _process_alias(self, config, group=None): + # If the group is not the same as the original + # group which the group alias was intended for, + # Set the group_alias to None instead. 
+ if ("group_alias" in config) and ("group" in config) and group is not None: + if config["group"] != group: + config["group_alias"] = None + return config + + def _class_has_config_in_constructor(self, cls): + constructor = getattr(cls, "__init__", None) + return ( + "config" in inspect.signature(constructor).parameters + if constructor + else False + ) + + def _load_individual_task_or_group( + self, + name_or_config: Optional[Union[str, dict]] = None, + parent_name: Optional[str] = None, + update_config: Optional[dict] = None, + ) -> Mapping: + def _load_task(config, task): + if "include" in config: + config = { + **utils.load_yaml_config( + yaml_path=None, + yaml_config={"include": config.pop("include")}, + mode="full", + ), + **config, + } + if self._config_is_python_task(config): + if self._class_has_config_in_constructor(config["class"]): + task_object = config["class"](config=config) + else: + task_object = config["class"]() + if isinstance(task_object, ConfigurableTask): + # very scuffed: set task name here. TODO: fixme? + task_object.config.task = config["task"] + else: + task_object = ConfigurableTask(config=config) + + return {task: task_object} + + def _get_group_and_subtask_from_config(config): + group_name = ConfigurableGroup(config=config) + subtask_list = [] + for task in group_name.config["task"]: + if isinstance(task, str) and self._name_is_tag(task): + subtask_list.extend(self._get_tasklist(task)) + else: + subtask_list.append(task) + return group_name, subtask_list + + def _process_group_config(config, update_config=None): + if update_config is not None: + config = {**config, **update_config} + _update_config = { + k: v for k, v in config.items() if k not in GROUP_ONLY_KEYS + } + if not bool(_update_config): + _update_config = None + + group_config = {k: v for k, v in config.items() if k in GROUP_ONLY_KEYS} + return group_config, _update_config + + if isinstance(name_or_config, str): + if update_config is not None: + # Process name_or_config as a dict instead + name_or_config = {"task": name_or_config, **update_config} + elif self._name_is_task(name_or_config) or self._name_is_python_task( + name_or_config + ): + task_config = self._get_config(name_or_config) + return _load_task(task_config, task=name_or_config) + else: + subtask_list = self._get_tasklist(name_or_config) + if subtask_list == -1: + group_config = self._get_config(name_or_config) + group_config, update_config = _process_group_config(group_config) + group_name, subtask_list = _get_group_and_subtask_from_config( + group_config + ) + else: + if self._name_is_tag(name_or_config): + fn = partial( + self._load_individual_task_or_group, + update_config=name_or_config + if isinstance(name_or_config, dict) + else None, + ) + return dict( + collections.ChainMap(*map(fn, reversed(subtask_list))) + ) + else: + group_name = ConfigurableGroup( + config={"group": name_or_config, "task": subtask_list} + ) + + if isinstance(name_or_config, dict): + if self._config_is_task(name_or_config): + name = name_or_config.pop("task") + if update_config is not None: + name_or_config = {**name_or_config, **update_config} + # If the name is registered as a group + if self._name_is_group(name): + group_config = self._get_config(name) + + group_config, update_config = _process_group_config( + group_config, name_or_config + ) + group_name, subtask_list = _get_group_and_subtask_from_config( + group_config + ) + elif self._name_is_tag(name): + subtask_list = self._get_tasklist(name) + fn = partial( + self._load_individual_task_or_group, + 
update_config=name_or_config, + ) + return dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) + else: + if self._name_is_registered(name): + base_task_config = self._get_config(name) + + # Check if this is a duplicate. + if parent_name is not None: + num_duplicate = len( + list( + filter( + lambda x: x.startswith(name), + self.task_group_map[parent_name], + ) + ) + ) + if num_duplicate > 0: + name = f"{name}-{num_duplicate}" + self.task_group_map[parent_name].append(name) + + task_config = { + **base_task_config, + **name_or_config, + } + else: + task_config = name_or_config + return _load_task(task_config, task=name) + else: + group_config, update_config = _process_group_config(name_or_config) + group_name, subtask_list = _get_group_and_subtask_from_config( + group_config + ) + + fn = partial( + self._load_individual_task_or_group, + parent_name=group_name, + update_config=update_config, + ) + return { + group_name: dict(collections.ChainMap(*map(fn, reversed(subtask_list)))) + } + + def load_task_or_group(self, task_list: Optional[Union[str, list]] = None) -> dict: + """Loads a dictionary of task objects from a list + + :param task_list: Union[str, list] = None + Single string or list of string of task names to be loaded + + :return + Dictionary of task objects + """ + if isinstance(task_list, str): + task_list = [task_list] + + all_loaded_tasks = dict( + collections.ChainMap(*map(self._load_individual_task_or_group, task_list)) + ) + return all_loaded_tasks + + def load_config(self, config: Dict): + return self._load_individual_task_or_group(config) + + def _get_task_and_group(self, task_dir: str): + """Creates a dictionary of tasks index with the following metadata, + - `type`, that can be either `task`, `python_task`, `group` or `tags`. + `task` refer to regular task configs, `python_task` are special + yaml files that only consists of `task` and `class` parameters. + `group` are group configs. `tags` are labels that can be assigned + to tasks to assist in sorting and calling tasks of certain themes. + - `yaml_path`, path to the yaml file. If the entry is a `group` that + was configured through a task config, the yaml_path will be -1 + and all subtasks will be listed in `task` (see below) + - `task`, reserved for entries with `type` as `group`. This will list + all subtasks. When a group config is created (as opposed to task + config having `group` parameter set), this will be set to -1 to + avoid recursive indexing. The whole list of subtasks will be loaded + at evaluation. + + :param task_dir: str + A directory to check for tasks + + :return + Dictionary of task names as key and task metadata + """ + # TODO: remove group in next release + print_info = True + ignore_dirs = [ + "__pycache__", + ".ipynb_checkpoints", + ] + tasks_and_groups = collections.defaultdict() + for root, dirs, file_list in os.walk(task_dir): + dirs[:] = [d for d in dirs if d not in ignore_dirs] + for f in file_list: + if f.endswith(".yaml"): + yaml_path = os.path.join(root, f) + config = utils.load_yaml_config(yaml_path, mode="simple") + if self._config_is_python_task(config): + # This is a python class config + tasks_and_groups[config["task"]] = { + "type": "python_task", + "yaml_path": yaml_path, + } + elif self._config_is_group(config): + # This is a group config + tasks_and_groups[config["group"]] = { + "type": "group", + "task": -1, # This signals that + # we don't need to know + # the task list for indexing + # as it can be loaded + # when called. 
+ "yaml_path": yaml_path, + } + + # # Registered the level 1 tasks from a group config + # for config in config["task"]: + # if isinstance(config, dict) and self._config_is_task(config): + # task = config["task"] + # tasks_and_groups[task] = { + # "type": "task", + # "yaml_path": yaml_path, + # } + + elif self._config_is_task(config): + # This is a task config + task = config["task"] + tasks_and_groups[task] = { + "type": "task", + "yaml_path": yaml_path, + } + + # TODO: remove group in next release + for attr in ["tag", "group"]: + if attr in config: + if attr == "group" and print_info: + self.logger.info( + "`group` and `group_alias` keys in tasks' configs will no longer be used in the next release of lm-eval. " + "`tag` will be used to allow to call a collection of tasks just like `group`. " + "`group` will be removed in order to not cause confusion with the new ConfigurableGroup " + "which will be the offical way to create groups with addition of group-wide configuations." + ) + print_info = False + # attr = "tag" + + attr_list = config[attr] + if isinstance(attr_list, str): + attr_list = [attr_list] + + for tag in attr_list: + if tag not in tasks_and_groups: + tasks_and_groups[tag] = { + "type": "tag", + "task": [task], + "yaml_path": -1, + } + elif tasks_and_groups[tag]["type"] != "tag": + self.logger.info( + f"The tag {tag} is already registered as a group, this tag will not be registered. " + "This may affect tasks you want to call." + ) + break + else: + tasks_and_groups[tag]["task"].append(task) + else: + self.logger.debug(f"File {f} in {root} could not be loaded") + + return tasks_and_groups + + +def get_task_name_from_config(task_config: Dict[str, str]) -> str: + if "task" in task_config: + return task_config["task"] + if "dataset_name" in task_config: + return "{dataset_path}_{dataset_name}".format(**task_config) + else: + return "{dataset_path}".format(**task_config) + + +def get_task_name_from_object(task_object): + if hasattr(task_object, "config"): + return task_object._config["task"] + + # TODO: scrap this + # this gives a mechanism for non-registered tasks to have a custom name anyways when reporting + return ( + task_object.EVAL_HARNESS_NAME + if hasattr(task_object, "EVAL_HARNESS_NAME") + else type(task_object).__name__ + ) + + +def _check_duplicates(task_dict: dict) -> List[str]: + """helper function solely used in validating get_task_dict output. + Takes the output of lm_eval.evaluator_utils.get_subtask_list and + returns a list of all leaf subtasks contained within, and errors if any such leaf subtasks are + "oversubscribed" to several disjoint groups. + """ + subtask_names = [] + for key, value in task_dict.items(): + subtask_names.extend(value) + + duplicate_tasks = { + task_name for task_name in subtask_names if subtask_names.count(task_name) > 1 + } + + # locate the potentially problematic groups that seem to 'compete' for constituent subtasks + competing_groups = [ + group + for group in task_dict.keys() + if len(set(task_dict[group]).intersection(duplicate_tasks)) > 0 + ] + + if len(duplicate_tasks) > 0: + raise ValueError( + f"Found 1 or more tasks while trying to call get_task_dict() that were members of more than 1 called group: {list(duplicate_tasks)}. Offending groups: {competing_groups}. Please call groups which overlap their constituent tasks in separate evaluation runs." 
+ ) + + +def get_task_dict( + task_name_list: Union[str, List[Union[str, Dict, Task]]], + task_manager: Optional[TaskManager] = None, +): + """Creates a dictionary of task objects from either a name of task, config, or prepared Task object. + + :param task_name_list: List[Union[str, Dict, Task]] + Name of model or LM object, see lm_eval.models.get_model + :param task_manager: TaskManager = None + A TaskManager object that stores indexed tasks. If not set, + task_manager will load one. This should be set by the user + if there are additional paths that want to be included + via `include_path` + + :return + Dictionary of task objects + """ + + task_name_from_string_dict = {} + task_name_from_config_dict = {} + task_name_from_object_dict = {} + + if isinstance(task_name_list, str): + task_name_list = [task_name_list] + elif isinstance(task_name_list, list): + if not all([isinstance(task, (str, dict, Task)) for task in task_name_list]): + raise TypeError( + "Expected all list items to be of types 'str', 'dict', or 'Task', but at least one entry did not match." + ) + else: + raise TypeError( + f"Expected a 'str' or 'list' but received {type(task_name_list)}." + ) + + string_task_name_list = [task for task in task_name_list if isinstance(task, str)] + others_task_name_list = [ + task for task in task_name_list if not isinstance(task, str) + ] + if len(string_task_name_list) > 0: + if task_manager is None: + task_manager = TaskManager() + + task_name_from_string_dict = task_manager.load_task_or_group( + string_task_name_list + ) + + for task_element in others_task_name_list: + if isinstance(task_element, dict): + task_name_from_config_dict = { + **task_name_from_config_dict, + **task_manager.load_config(config=task_element), + } + + elif isinstance(task_element, Task): + task_name_from_object_dict = { + **task_name_from_object_dict, + get_task_name_from_object(task_element): task_element, + } + + if not set(task_name_from_string_dict.keys()).isdisjoint( + set(task_name_from_object_dict.keys()) + ): + raise ValueError + + final_task_dict = { + **task_name_from_string_dict, + **task_name_from_config_dict, + **task_name_from_object_dict, + } + + # behavior can get odd if one tries to invoke several groups that "compete" for the same task. + # (notably, because one could request several num_fewshot values at once in GroupConfig overrides for the subtask + # and we'd be unsure which to use and report.) + # we explicitly check and error in this case. + _check_duplicates(get_subtask_list(final_task_dict)) + + return final_task_dict diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ba3f99d4826f0604f583772a2b48fe676a6f3e06 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/README.md @@ -0,0 +1,56 @@ +# ANLI + +### Paper + +Title: `Adversarial NLI: A New Benchmark for Natural Language Understanding` + +Paper Link: https://arxiv.org/abs/1910.14599 + +Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial +human-and-model-in-the-loop procedure. It consists of three rounds that progressively +increase in difficulty and complexity, and each question-answer includes annotator- +provided explanations. 
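As a quick orientation (not part of the upstream README), a minimal sketch of evaluating a single ANLI round through the harness's Python entry point might look like the following; `simple_evaluate` is re-exported from `lm_eval/__init__.py`, while the model backend and `pretrained` checkpoint shown here are illustrative assumptions only.

```python
# Minimal sketch, assuming a Hugging Face causal LM checkpoint is reachable
# locally or via the Hub; the checkpoint below is an example, not a recommendation.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",                                      # Hugging Face backend
    model_args="pretrained=EleutherAI/pythia-160m",  # illustrative checkpoint
    tasks=["anli_r1"],                               # or ["anli"] for all three rounds
    num_fewshot=0,
)
# Per-task metrics (here: accuracy) are keyed by task name under "results".
print(results["results"]["anli_r1"])
```

The equivalent command-line invocation would be along the lines of `lm-eval --model hf --model_args pretrained=EleutherAI/pythia-160m --tasks anli_r1`.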
+ +Homepage: https://github.com/facebookresearch/anli + +### Citation + +``` +@inproceedings{nie-etal-2020-adversarial, + title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding", + author = "Nie, Yixin and + Williams, Adina and + Dinan, Emily and + Bansal, Mohit and + Weston, Jason and + Kiela, Douwe", + booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", + year = "2020", + publisher = "Association for Computational Linguistics", +} +``` + +### Groups and Tasks + +#### Groups + +* `anli`: Evaluates `anli_r1`, `anli_r2`, and `anli_r3` + +#### Tasks +* `anli_r1`: The data collected adversarially in the first round. +* `anli_r2`: The data collected adversarially in the second round, after training on the previous round's data. +* `anli_r3`: The data collected adversarially in the third round, after training on the previous multiple rounds of data. + + +### Checklist + +For adding novel benchmarks/datasets to the library: + * [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r1.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2de1d259600c85a31f6d2bec69d37783cc0cd0f8 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r1.yaml @@ -0,0 +1,26 @@ +tag: + - anli +task: anli_r1 +dataset_path: anli +dataset_name: null +output_type: multiple_choice +training_split: train_r1 +validation_split: dev_r1 +test_split: test_r1 +doc_to_text: "{{premise}}\nQuestion: {{hypothesis}} True, False, or Neither?\nAnswer:" +# True = entailment +# False = contradiction +# Neither = neutral +doc_to_target: "{{['True', 'Neither', 'False'][label]}}" +doc_to_choice: + - "True" + - "Neither" + - "False" +should_decontaminate: true +doc_to_decontamination_query: premise +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r2.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..85f28d67cf230fa36cd38dd8d6a345f6e679c53e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r2.yaml @@ -0,0 +1,5 @@ +include: anli_r1.yaml +task: anli_r2 +training_split: train_r2 +validation_split: dev_r2 +test_split: test_r2 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r3.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6b9f98a867f7d03b90e84a425dc8b044b4cc96fb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/anli/anli_r3.yaml @@ -0,0 +1,5 @@ +include: anli_r1.yaml +task: anli_r3 +training_split: train_r3 +validation_split: dev_r3 +test_split: test_r3 diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/README.md new file mode 100644 index 0000000000000000000000000000000000000000..6b7fc47b7165034bd74c524048f5f54ea8d041cf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/README.md @@ -0,0 +1,53 @@ +# DROP + +### Paper + +Title: `DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs` + +Abstract: https://aclanthology.org/attachments/N19-1246.Supplementary.pdf + +DROP is a QA dataset which tests comprehensive understanding of paragraphs. In +this crowdsourced, adversarially-created, 96k question-answering benchmark, a +system must resolve multiple references in a question, map them onto a paragraph, +and perform discrete operations over them (such as addition, counting, or sorting). + +Homepage: https://allenai.org/data/drop + +Acknowledgement: This implementation is based on the official evaluation for `DROP`: +https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py + +### Citation + +``` +@misc{dua2019drop, + title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, + author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner}, + year={2019}, + eprint={1903.00161}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet. + +#### Tasks + +* `drop` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/default.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/default.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4a936121524950e8a89822058cb2b29f244f31a4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/default.yaml @@ -0,0 +1,26 @@ +task: drop +dataset_path: EleutherAI/drop +output_type: generate_until +training_split: train +validation_split: validation +process_docs: !function utils.process_docs +doc_to_text: "{{passage}} {{question}}" +doc_to_target: "{{ answer|join(',')}}" +target_delimiter: "" +process_results: !function utils.process_results +should_decontaminate: true +doc_to_decontamination_query: "{{passage}} {{question}}" +generation_kwargs: + until: + - "." 
+metric_list: + - metric: em + aggregation: mean + higher_is_better: true + - metric: f1 + aggregation: mean + higher_is_better: true +metadata: + version: 3.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fc4e7d4b4db1775cdae632d4a5334adeeeffb318 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/drop/utils.py @@ -0,0 +1,205 @@ +import re +import string + +import numpy as np + + +_ARTICLES = re.compile(r"\b(a|an|the)\b", re.UNICODE) + + +def process_docs(dataset): + def _process(doc): + return { + "id": doc["query_id"], + "passage": doc["passage"], + "question": doc["question"], + "answers": get_answers(doc), + } + + return dataset.map(_process) + + +def get_answers(doc): + def _flatten_validated_answers(validated_answers): + """Flattens a dict of lists of validated answers. + {"number": ['1', '8'], ...} + -> [{"number": ['1'], ...}, {"number": ['8'], ...}] + """ + valid_answers = [] + for i in range(len(validated_answers["number"])): + valid_answers.append( + { + "number": validated_answers["number"][i], + "date": validated_answers["date"][i], + "spans": validated_answers["spans"][i], + } + ) + return valid_answers + + answers = [] + answers_set = set() + candidates = [doc["answer"]] + _flatten_validated_answers(doc["validated_answers"]) + for candidate in candidates: + answer = parse_answer(candidate) + if answer in answers_set: + continue + answers_set.add(answer) + answers.append(answer) + return answers + + +def parse_answer(answer): + # NOTE: Everything is returned as a tuple for uniformity and hashability. + if answer["number"] != "": + return (str(answer["number"]),) + if answer["spans"] != []: + return tuple(answer["spans"]) + return ( + " ".join( + [answer["date"]["day"], answer["date"]["month"], answer["date"]["year"]] + ).strip(), + ) + + +def process_results(doc, results): + preds, golds = results, doc["answers"] + max_em = 0 + max_f1 = 0 + for gold_answer in golds: + exact_match, f1_score = get_metrics(preds, gold_answer) + if gold_answer[0].strip(): + max_em = max(max_em, exact_match) + max_f1 = max(max_f1, f1_score) + return {"em": max_em, "f1": max_f1} + + +def get_metrics(predicted, gold): + """ + Takes a predicted answer and a gold answer (that are both either a string or a list of + strings), and returns exact match and the DROP F1 metric for the prediction. If you are + writing a script for evaluating objects in memory (say, the output of predictions during + validation, or while training), this is the function you want to call, after using + :func:`answer_json_to_strings` when reading the gold answer from the released data file. 
+ """ + predicted_bags = _answer_to_bags(predicted) + gold_bags = _answer_to_bags(gold) + + if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len( + gold_bags[0] + ): + exact_match = 1.0 + else: + exact_match = 0.0 + + f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) + f1 = np.mean(f1_per_bag) + f1 = round(f1, 2) + return exact_match, f1 + + +def _answer_to_bags(answer): + if isinstance(answer, (list, tuple)): + raw_spans = answer + else: + raw_spans = [answer] + normalized_spans = [] + token_bags = [] + for raw_span in raw_spans: + normalized_span = _normalize(raw_span) + normalized_spans.append(normalized_span) + token_bags.append(set(normalized_span.split())) + return normalized_spans, token_bags + + +def _align_bags(predicted, gold): + """ + Takes gold and predicted answer sets and first finds the optimal 1-1 alignment + between them and gets maximum metric values over all the answers. + """ + from scipy.optimize import linear_sum_assignment + + scores = np.zeros([len(gold), len(predicted)]) + for gold_index, gold_item in enumerate(gold): + for pred_index, pred_item in enumerate(predicted): + if _match_numbers_if_present(gold_item, pred_item): + scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) + row_ind, col_ind = linear_sum_assignment(-scores) + + max_scores = np.zeros([max(len(gold), len(predicted))]) + for row, column in zip(row_ind, col_ind): + max_scores[row] = max(max_scores[row], scores[row, column]) + return max_scores + + +def _compute_f1(predicted_bag, gold_bag): + intersection = len(gold_bag.intersection(predicted_bag)) + if not predicted_bag: + precision = 1.0 + else: + precision = intersection / float(len(predicted_bag)) + if not gold_bag: + recall = 1.0 + else: + recall = intersection / float(len(gold_bag)) + f1 = ( + (2 * precision * recall) / (precision + recall) + if not (precision == 0.0 and recall == 0.0) + else 0.0 + ) + return f1 + + +def _match_numbers_if_present(gold_bag, predicted_bag): + gold_numbers = set() + predicted_numbers = set() + for word in gold_bag: + if _is_number(word): + gold_numbers.add(word) + for word in predicted_bag: + if _is_number(word): + predicted_numbers.add(word) + if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): + return True + return False + + +def _is_number(text): + try: + float(text) + return True + except ValueError: + return False + + +def _remove_articles(text): + return _ARTICLES.sub(" ", text) + + +def _white_space_fix(text): + return " ".join(text.split()) + + +def _remove_punc(text): + exclude = set(string.punctuation) + if not _is_number(text): + return "".join(ch for ch in text if ch not in exclude) + else: + return text + + +def _fix_number(text): + return str(float(text)) if _is_number(text) else text + + +def _tokenize(text): + return re.split(" |-", text) + + +def _normalize(answer): + tokens = [ + _white_space_fix(_remove_articles(_fix_number(_remove_punc(token.lower())))) + for token in _tokenize(answer) + ] + tokens = [token for token in tokens if token.strip()] + normalized = " ".join(tokens).strip() + return normalized diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c8a328fcff7773e630939b1ef1cbae8ff1b0d02 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/README.md @@ -0,0 +1,54 @@ +# MATH + +## Paper +Measuring Mathematical 
Problem Solving With the MATH Dataset +https://arxiv.org/abs/2103.03874 + +Many intellectual endeavors require mathematical problem solving, but this skill remains beyond the capabilities of computers. To measure this ability in machine learning models, we introduce MATH, a new dataset of 12,500 challenging competition mathematics problems. Each problem in MATH has a full step-by-step solution which can be used to teach models to generate answer derivations and explanations. + +NOTE: This task corresponds to the MATH (`hendrycks_math`) implementation at https://github.com/EleutherAI/lm-evaluation-harness/tree/master . For the variant which uses the custom 4-shot prompt in the Minerva paper (https://arxiv.org/abs/2206.14858), and SymPy answer checking as done by Minerva, see `lm_eval/tasks/minerva_math`. + +Homepage: https://github.com/hendrycks/math + + +## Citation +``` +@article{hendrycksmath2021, + title={Measuring Mathematical Problem Solving With the MATH Dataset}, + author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, + journal={NeurIPS}, + year={2021} +} +``` + +### Groups and Tasks + +#### Groups + +- `hendrycks_math`: the MATH benchmark from Hendrycks et al. 0- or few-shot. + +#### Tasks + +- `hendrycks_math_algebra` +- `hendrycks_math_counting_and_prob` +- `hendrycks_math_geometry` +- `hendrycks_math_intermediate_algebra` +- `hendrycks_math_num_theory` +- `hendrycks_math_prealgebra` +- `hendrycks_math_precalc` + +### Checklist + +The checklist is the following: + +For adding novel benchmarks/datasets to the library: +* [x] Is the task an existing benchmark in the literature? + * [x] Have you referenced the original paper that introduced the task? + * [x] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + * Answer extraction code is taken from the original MATH benchmark paper's repository. + + +If other tasks on this dataset are already supported: +* [x] Is the "Main" variant of this task clearly denoted? +* [x] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [x] Have you noted which, if any, published evaluation setups are matched by this variant? 
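+### Answer extraction example
+
+The accompanying `utils.py` extracts the final `\boxed{...}` expression from a solution string and compares it to a target answer after LaTeX normalization. The snippet below is a minimal, illustrative sketch of that flow, not part of the harness itself: the solution string is invented, the module alias `hendrycks_math_utils` is arbitrary, the file path assumes you run from the repository root, and the `datasets` package must be installed because `utils.py` imports it.
+
+```python
+import importlib.util
+
+# Load the task's utils.py by file path (adjust the path to your checkout).
+spec = importlib.util.spec_from_file_location(
+    "hendrycks_math_utils",
+    "scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/utils.py",
+)
+utils = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(utils)
+
+# Invented solution text ending in a boxed answer, as MATH solutions do.
+solution = r"Combining the fractions gives $\boxed{\dfrac{1}{2}}$."
+
+extracted = utils.remove_boxed(utils.last_boxed_only_string(solution))
+print(extracted)                                  # \dfrac{1}{2}
+print(utils.is_equiv(extracted, r"\frac{1}{2}"))  # True: \dfrac is normalized to \frac
+```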
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d01cf9b2465b4e825ed8b5c67fe0aca281c31781 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math.yaml @@ -0,0 +1,15 @@ +group: hendrycks_math +task: + - hendrycks_math_algebra + - hendrycks_math_counting_and_prob + - hendrycks_math_geometry + - hendrycks_math_intermediate_algebra + - hendrycks_math_num_theory + - hendrycks_math_prealgebra + - hendrycks_math_precalc +aggregate_metric_list: + - metric: exact_match + aggregation: mean + weight_by_size: true +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9ce9c9c5ad64d894732054caa679d73b1df23881 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_algebra.yaml @@ -0,0 +1,25 @@ +tag: + - math_word_problems +task: hendrycks_math_algebra +dataset_path: EleutherAI/hendrycks_math +process_docs: !function utils.process_docs +dataset_name: algebra +output_type: generate_until +training_split: train +test_split: test +doc_to_text: "Problem: {{problem}}\nAnswer:" +process_results: !function utils.process_results +doc_to_target: "{{answer}}" +generation_kwargs: + until: + - "Problem:" + do_sample: false + temperature: 0 +metric_list: + - metric: exact_match + aggregation: mean + higher_is_better: true +metadata: + version: 1.0 +dataset_kwargs: + trust_remote_code: true diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6de3e140c252803afd1246e155c14c9c66672351 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_counting_and_prob.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: counting_and_probability +task: hendrycks_math_counting_and_prob diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2016439a8673cd970fe8869b61f001ee71c54b36 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_geometry.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: geometry +task: hendrycks_math_geometry diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..022e1cdd085568dbee4e9171721e31a5079dfa48 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_intermediate_algebra.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: intermediate_algebra +task: hendrycks_math_intermediate_algebra diff --git 
a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a1e53bc306336834dde059168d35fefc454d0b4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_num_theory.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: number_theory +task: hendrycks_math_num_theory diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7a0baa388a3531cbe00765fb545b83ae2b540842 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_prealgebra.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: prealgebra +task: hendrycks_math_prealgebra diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml new file mode 100644 index 0000000000000000000000000000000000000000..af81fec9362204da67b8bb4f193773a0cc67cb67 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/hendrycks_math_precalc.yaml @@ -0,0 +1,3 @@ +include: hendrycks_math_algebra.yaml +dataset_name: precalculus +task: hendrycks_math_precalc diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0edd59a16a30be95f79403239e73af65c12d8d66 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/hendrycks_math/utils.py @@ -0,0 +1,231 @@ +from typing import Dict, List + +import datasets + + +def process_docs(dataset: datasets.Dataset) -> datasets.Dataset: + def _process_doc(doc: dict) -> dict: + out_doc = { + "problem": doc["problem"], + "solution": doc["solution"], + "answer": remove_boxed(last_boxed_only_string(doc["solution"])), + } + return out_doc + + return dataset.map(_process_doc) + + +def process_results(doc: dict, results: List[str]) -> Dict[str, int]: + retval = 0 + indices = [pos for pos, char in enumerate(results[0]) if char == "$"] + if len(indices) <= 1: + answer = results[0] + else: + answer = results[0][indices[0] + 1 : indices[-1]] + + if is_equiv(answer, remove_boxed(last_boxed_only_string(doc["solution"]))): + retval = 1 + + results = { + "exact_match": retval, + } + return results + + +# string normalization from https://github.com/EleutherAI/lm-evaluation-harness/blob/master/lm_eval/tasks/hendrycks_math.py +def is_equiv(str1, str2, verbose=False): + if str1 is None and str2 is None: + print("WARNING: Both None") + return True + if str1 is None or str2 is None: + return False + + try: + ss1 = strip_string(str1) + ss2 = strip_string(str2) + if verbose: + print(ss1, ss2) + return ss1 == ss2 + except Exception: + return str1 == str2 + + +def remove_boxed(s): + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] + + left = "\\boxed{" + + assert s[: len(left)] == left + assert s[-1] == "}" + + return s[len(left) : -1] + + +def last_boxed_only_string(string): + idx = string.rfind("\\boxed") + if "\\boxed " in 
string: + return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0] + if idx < 0: + idx = string.rfind("\\fbox") + if idx < 0: + return None + + i = idx + right_brace_idx = None + num_left_braces_open = 0 + while i < len(string): + if string[i] == "{": + num_left_braces_open += 1 + if string[i] == "}": + num_left_braces_open -= 1 + if num_left_braces_open == 0: + right_brace_idx = i + break + i += 1 + + if right_brace_idx is None: + retval = None + else: + retval = string[idx : right_brace_idx + 1] + + return retval + + +def fix_fracs(string): + substrs = string.split("\\frac") + new_str = substrs[0] + if len(substrs) > 1: + substrs = substrs[1:] + for substr in substrs: + new_str += "\\frac" + if substr[0] == "{": + new_str += substr + else: + try: + assert len(substr) >= 2 + except AssertionError: + return string + a = substr[0] + b = substr[1] + if b != "{": + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}{" + b + "}" + post_substr + else: + new_str += "{" + a + "}{" + b + "}" + else: + if len(substr) > 2: + post_substr = substr[2:] + new_str += "{" + a + "}" + b + post_substr + else: + new_str += "{" + a + "}" + b + string = new_str + return string + + +def fix_a_slash_b(string): + if len(string.split("/")) != 2: + return string + a = string.split("/")[0] + b = string.split("/")[1] + try: + a = int(a) + b = int(b) + assert string == "{}/{}".format(a, b) + new_string = "\\frac{" + str(a) + "}{" + str(b) + "}" + return new_string + except AssertionError: + return string + + +def remove_right_units(string): + # "\\text{ " only ever occurs (at least in the val set) when describing units + if "\\text{ " in string: + splits = string.split("\\text{ ") + assert len(splits) == 2 + return splits[0] + else: + return string + + +def fix_sqrt(string): + if "\\sqrt" not in string: + return string + splits = string.split("\\sqrt") + new_string = splits[0] + for split in splits[1:]: + if split[0] != "{": + a = split[0] + new_substr = "\\sqrt{" + a + "}" + split[1:] + else: + new_substr = "\\sqrt" + split + new_string += new_substr + return new_string + + +def strip_string(string): + # linebreaks + string = string.replace("\n", "") + + # remove inverse spaces + string = string.replace("\\!", "") + + # replace \\ with \ + string = string.replace("\\\\", "\\") + + # replace tfrac and dfrac with frac + string = string.replace("tfrac", "frac") + string = string.replace("dfrac", "frac") + + # remove \left and \right + string = string.replace("\\left", "") + string = string.replace("\\right", "") + + # Remove circ (degrees) + string = string.replace("^{\\circ}", "") + string = string.replace("^\\circ", "") + + # remove dollar signs + string = string.replace("\\$", "") + + # remove units (on the right) + string = remove_right_units(string) + + # remove percentage + string = string.replace("\\%", "") + string = string.replace("\%", "") # noqa: W605 + + # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string + string = string.replace(" .", " 0.") + string = string.replace("{.", "{0.") + # if empty, return empty string + if len(string) == 0: + return string + if string[0] == ".": + string = "0" + string + + # to consider: get rid of e.g. 
"k = " or "q = " at beginning + if len(string.split("=")) == 2: + if len(string.split("=")[0]) <= 2: + string = string.split("=")[1] + + # fix sqrt3 --> sqrt{3} + string = fix_sqrt(string) + + # remove spaces + string = string.replace(" ", "") + + # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \\frac{a}{b} + string = fix_fracs(string) + + # manually change 0.5 --> \frac{1}{2} + if string == "0.5": + string = "\\frac{1}{2}" + + # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y + string = fix_a_slash_b(string) + + return string diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ca58844b90079a607dd1a6a8a049106c26f57deb --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/README.md @@ -0,0 +1,37 @@ +# Social IQA + +### Paper + +Title: Social IQA: Commonsense Reasoning about Social Interactions + +Abstract: https://arxiv.org/abs/1904.09728 + +> We introduce Social IQa, the first largescale benchmark for commonsense reasoning about social situations. Social IQa contains 38,000 multiple choice questions for probing emotional and social intelligence in a variety of everyday situations (e.g., Q: "Jordan wanted to tell Tracy a secret, so Jordan leaned towards Tracy. Why did Jordan do this?" A: "Make sure no one else could hear"). Through crowdsourcing, we collect commonsense questions along with correct and incorrect answers about social interactions, using a new framework that mitigates stylistic artifacts in incorrect answers by asking workers to provide the right answer to a different but related question. Empirical results show that our benchmark is challenging for existing question-answering models based on pretrained language models, compared to human performance (>20% gap). Notably, we further establish Social IQa as a resource for transfer learning of commonsense knowledge, achieving state-of-the-art performance on multiple commonsense reasoning tasks (Winograd Schemas, COPA). + +Homepage: https://allenai.org/data/socialiqa + + +### Citation + +``` +@inproceedings{sap2019social, + title={Social IQa: Commonsense Reasoning about Social Interactions}, + author={Sap, Maarten and Rashkin, Hannah and Chen, Derek and Le Bras, Ronan and Choi, Yejin}, + booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, + pages={4463--4473}, + year={2019} +} +``` + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [X] Is the task an existing benchmark in the literature? + * [X] Have you referenced the original paper that introduced the task? + * [X] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? The original paper doesn't have an associated implementation, but there is an official entry in [BigBench](https://github.com/google/BIG-bench/tree/main/bigbench/benchmark_tasks/social_iqa). I use the same prompting format as BigBench. + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? 
+* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/siqa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/siqa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6e387a73a9e005520f426b9d097a10d433279ce2 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/siqa/siqa.yaml @@ -0,0 +1,16 @@ +task: social_iqa +dataset_path: social_i_qa +dataset_name: null +output_type: multiple_choice +training_split: train +validation_split: validation +doc_to_text: "Q: {{context}} {{question}}\nA:" +target_delimiter: " " +doc_to_choice: "{{[answerA, answerB, answerC]}}" +doc_to_target: "{{ (label|int) - 1 }}" +metric_list: + - metric: acc + aggregation: mean + higher_is_better: true +metadata: + version: 0.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bad0c4e2d80ec17c3f4a4c2f15db2ce6a6632db4 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/README.md @@ -0,0 +1,54 @@ +# SQuAD2.0 + +### Paper + +Title: `Know What You Don’t Know: Unanswerable Questions for SQuAD` +Abstract: https://arxiv.org/abs/1806.03822 + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. +To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. + +Homepage: https://rajpurkar.github.io/SQuAD-explorer/ + + +### Citation + +``` +@misc{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Pranav Rajpurkar and Robin Jia and Percy Liang}, + year={2018}, + eprint={1806.03822}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +``` + +### Groups and Tasks + +#### Groups + +* Not part of a group yet + +#### Tasks + +* `squadv2`: Default SQuAD2.0 task: free-form answer generation scored with the official SQuAD2.0 metrics (exact match, F1, and the HasAns/NoAns breakdowns) + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
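+### Scoring example
+
+The accompanying `task.py` builds one prediction dict per document (the generated answer text plus the exponentiated loglikelihood of "unanswerable") and one reference dict, and hands them to the Hugging Face `squad_v2` metric. The sketch below is illustrative only, not part of the task: the id, answer text, and `answer_start` value are invented, and it calls `datasets.load_metric` because that is what `task.py` itself uses; newer `datasets` releases move metric loading into the separate `evaluate` package.
+
+```python
+import datasets
+
+# One made-up question whose prediction matches the gold answer exactly.
+predictions = [{
+    "id": "example-0",
+    "prediction_text": "the Norman dynasty",
+    "no_answer_probability": 0.01,
+}]
+references = [{
+    "id": "example-0",
+    "answers": {"text": ["the Norman dynasty"], "answer_start": [0]},
+}]
+
+squad_v2 = datasets.load_metric("squad_v2")
+scores = squad_v2.compute(predictions=predictions, references=references)
+print(scores["exact"], scores["f1"])  # 100.0 100.0 for this matching pair
+```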
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/squadv2.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/squadv2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13e451645cc23284f3b45f15527c365410118617 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/squadv2.yaml @@ -0,0 +1,2 @@ +task: squadv2 +class: !function task.SQuAD2 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/task.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/task.py new file mode 100644 index 0000000000000000000000000000000000000000..32c44c6022203aea400b91872b9e31ee22ff902a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/squadv2/task.py @@ -0,0 +1,241 @@ +""" +Know What You Don’t Know: Unanswerable Questions for SQuAD +https://arxiv.org/pdf/1806.03822.pdf + +Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, +consisting of questions posed by crowdworkers on a set of Wikipedia articles, +where the answer to every question is a segment of text, or span, from the +corresponding reading passage, or the question might be unanswerable. +SQuAD2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable +questions written adversarially by crowdworkers to look similar to answerable ones. +To do well on SQuAD2.0, systems must not only answer questions when possible, but +also determine when no answer is supported by the paragraph and abstain from answering. + +Homepage: https://rajpurkar.github.io/SQuAD-explorer/ +""" + +from functools import partial +from math import exp + +import datasets +from packaging import version + +from lm_eval.api.instance import Instance +from lm_eval.api.task import ConfigurableTask + + +_CITATION = """ +@misc{rajpurkar2018know, + title={Know What You Don't Know: Unanswerable Questions for SQuAD}, + author={Pranav Rajpurkar and Robin Jia and Percy Liang}, + year={2018}, + eprint={1806.03822}, + archivePrefix={arXiv}, + primaryClass={cs.CL} +} +""" + + +def _squad_metric(predictions, references): + squad_metric = datasets.load_metric("squad_v2") + return squad_metric.compute(predictions=predictions, references=references) + + +def _squad_agg(key, items): + predictions, references = zip(*items) + + return _squad_metric(predictions=predictions, references=references).get(key, 0) + + +class SQuAD2(ConfigurableTask): + VERSION = 3 + DATASET_PATH = "squad_v2" + DATASET_NAME = None + + def __init__(self, config=None): + super().__init__(config={"metadata": {"version": self.VERSION}}) + + # HF changed squad on us so we have to make sure we aren't running the old one + assert version.parse(datasets.__version__) >= version.parse( + "1.11.0" + ), "datasets v1.11.0 or later required for SQuAD" + + def has_training_docs(self): + return True + + def has_validation_docs(self): + return True + + def has_test_docs(self): + return False + + def training_docs(self): + return self.dataset["train"] + + def validation_docs(self): + return self.dataset["validation"] + + def doc_to_text(self, doc): + return ( + "Title: " + + doc["title"] + + "\n\n" + + "Background: " + + doc["context"] + + "\n\n" + + "Question: " + + doc["question"] + + "\n\n" + + "Answer:" + ) + + def should_decontaminate(self): + return True + + def doc_to_decontamination_query(self, doc): + return doc["context"] + + def doc_to_target(self, doc): + answer_list = doc["answers"]["text"] + if len(answer_list) > 0: + answer = answer_list[0] + else: + answer = "unanswerable" + 
return " " + answer + + def construct_requests(self, doc, ctx, **kwargs): + """Uses RequestFactory to construct Requests and returns an iterable of + Requests which will be sent to the LM. + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param ctx: str + The context string, generated by fewshot_context. This includes the natural + language description, as well as the few shot examples, and the question + part of the document for `doc`. + """ + + return [ + Instance( + request_type="generate_until", + doc=doc, + arguments=(ctx, {"until": ["\n"]}), + idx=0, + **kwargs, + ), + Instance( + request_type="loglikelihood", + doc=doc, + arguments=(ctx, " " + "unanswerable"), + idx=0, + **kwargs, + ), + ] + + def process_results(self, doc, results): + """Take a single document and the LM results and evaluates, returning a + dict where keys are the names of submetrics and values are the values of + the metric for that one document + + :param doc: + The document as returned from training_docs, validation_docs, or test_docs. + :param results: + The results of the requests created in construct_requests. + """ + + continuation, (logprob_unanswerable, _) = results + + no_answer_probability = exp(logprob_unanswerable) + + predictions = { + "id": doc["id"], + "prediction_text": continuation, + "no_answer_probability": no_answer_probability, + } + + references = { + "id": doc["id"], + "answers": doc["answers"], + } + + return { + "exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": ( + predictions, + references, + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": ( + predictions, + references, + ), # The F-score of predicted tokens versus the gold answer + "best_exact": ( + predictions, + references, + ), # Best exact match (with varying threshold) + "best_f1": (predictions, references), # Best F1 (with varying threshold) + } + + def aggregation(self): + """ + :returns: {str: [float] -> float} + A dictionary where keys are the names of submetrics and values are + functions that aggregate a list of metrics + """ + return { + "exact": partial( + _squad_agg, "exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "f1": partial( + _squad_agg, "f1" + ), # The F-score of predicted tokens versus the gold answer + "HasAns_exact": partial( + _squad_agg, "HasAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": partial( + _squad_agg, "HasAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "NoAns_exact": partial( + _squad_agg, "NoAns_exact" + ), # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": partial( + _squad_agg, "NoAns_f1" + ), # The F-score of predicted tokens versus the gold answer + "best_exact": partial( + _squad_agg, "best_exact" + ), # Best exact match (with varying threshold) + "best_f1": partial( + _squad_agg, "best_f1" + ), # Best F1 (with varying threshold) + } + + def higher_is_better(self): + """ + :returns: {str: bool} + A dictionary where keys are the names of submetrics and values are + whether a 
higher value of the submetric is better + """ + return { + "exact": True, # Exact match (the normalized answer exactly match the gold answer) + "f1": True, # The F-score of predicted tokens versus the gold answer + "HasAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "HasAns_f1": True, # The F-score of predicted tokens versus the gold answer + "NoAns_exact": True, # Exact match (the normalized answer exactly match the gold answer) + "NoAns_f1": True, # The F-score of predicted tokens versus the gold answer + "best_exact": True, # Best exact match (with varying threshold) + "best_f1": True, # Best F1 (with varying threshold) + } diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/README.md b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/README.md new file mode 100644 index 0000000000000000000000000000000000000000..7f5ea3c3be4f84c0bf5c733dccce3c8d95931bda --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/README.md @@ -0,0 +1,60 @@ +# XCOPA + +### Paper + +Title: `XCOPA: A Multilingual Dataset for Causal Commonsense Reasoning` + +Abstract: https://ducdauge.github.io/files/xcopa.pdf + +The Cross-lingual Choice of Plausible Alternatives dataset is a benchmark to evaluate the ability of machine learning models to transfer commonsense reasoning across languages. +The dataset is the translation and reannotation of the English COPA (Roemmele et al. 2011) and covers 11 languages from 11 families and several areas around the globe. +The dataset is challenging as it requires both the command of world knowledge and the ability to generalise to new languages. +All the details about the creation of XCOPA and the implementation of the baselines are available in the paper. + +Homepage: https://github.com/cambridgeltl/xcopa + +### Citation + +``` +@inproceedings{ponti2020xcopa, + title={{XCOPA: A} Multilingual Dataset for Causal Commonsense Reasoning}, + author={Edoardo M. Ponti, Goran Glava\v{s}, Olga Majewska, Qianchu Liu, Ivan Vuli\'{c} and Anna Korhonen}, + booktitle={Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)}, + year={2020}, + url={https://ducdauge.github.io/files/xcopa.pdf} +} +``` + +### Groups and Tasks + +#### Groups + +* `xcopa` + +#### Tasks + +* `xcopa_et`: Estonian +* `xcopa_ht`: Haitian Creole +* `xcopa_id`: Indonesian +* `xcopa_it`: Italian +* `xcopa_qu`: Cusco-Collao Quechua +* `xcopa_sw`: Kiswahili +* `xcopa_ta`: Tamil +* `xcopa_th`: Thai +* `xcopa_tr`: Turkish +* `xcopa_vi`: Vietnamese +* `xcopa_zh`: Mandarin Chinese + + +### Checklist + +For adding novel benchmarks/datasets to the library: +* [ ] Is the task an existing benchmark in the literature? + * [ ] Have you referenced the original paper that introduced the task? + * [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test? + + +If other tasks on this dataset are already supported: +* [ ] Is the "Main" variant of this task clearly denoted? +* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates? +* [ ] Have you noted which, if any, published evaluation setups are matched by this variant? 
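+### Prompt construction example
+
+Each XCOPA document provides a premise, a `question` field ("cause" or "effect"), and two alternatives; the accompanying `utils.py` drops the premise's final period, appends a language-specific connective, and lowercases the first letter of each alternative. The sketch below is illustrative only, not part of the task: the Italian document is invented, the module alias `xcopa_utils` is arbitrary, and the file path assumes you run from the repository root.
+
+```python
+import importlib.util
+
+# Load the task's utils.py by file path (adjust the path to your checkout).
+spec = importlib.util.spec_from_file_location(
+    "xcopa_utils", "scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/utils.py"
+)
+xcopa_utils = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(xcopa_utils)
+
+doc = {  # invented Italian example
+    "premise": "L'uomo aprì il rubinetto.",
+    "question": "effect",
+    "choice1": "L'acqua scorreva.",
+    "choice2": "La luce si spense.",
+}
+
+print(xcopa_utils.doc_to_text_it(doc))  # L'uomo aprì il rubinetto quindi
+print(xcopa_utils.doc_to_choice(doc))   # ["l'acqua scorreva.", 'la luce si spense.']
+```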
diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/_xcopa.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/_xcopa.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c73141e6e810155ec7fcb7dcb864c8991176a195 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/_xcopa.yaml @@ -0,0 +1,19 @@ +group: xcopa +task: + - xcopa_et + - xcopa_ht + - xcopa_id + - xcopa_it + - xcopa_qu + - xcopa_sw + - xcopa_ta + - xcopa_th + - xcopa_tr + - xcopa_vi + - xcopa_zh +aggregate_metric_list: + - metric: acc + aggregation: mean + weight_by_size: True +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_et.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_et.yaml new file mode 100644 index 0000000000000000000000000000000000000000..76e750ffe322f0f91b4ccdf146600e3f5bd28bdf --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_et.yaml @@ -0,0 +1,13 @@ +task: xcopa_et +dataset_path: xcopa +dataset_name: et +output_type: multiple_choice +validation_split: validation +test_split: test +doc_to_text: !function utils.doc_to_text_et +doc_to_target: label +doc_to_choice: !function utils.doc_to_choice +metric_list: + - metric: acc +metadata: + version: 1.0 diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ht.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ht.yaml new file mode 100644 index 0000000000000000000000000000000000000000..21e22e1a6ecfe560de9f8ee2f19423b182d0df39 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ht.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_ht +dataset_name: ht +doc_to_text: !function utils.doc_to_text_ht diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_id.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_id.yaml new file mode 100644 index 0000000000000000000000000000000000000000..08fda55c8bba30023936fc11c2efa8de6007125c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_id.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_id +dataset_name: id +doc_to_text: !function utils.doc_to_text_id diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_it.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_it.yaml new file mode 100644 index 0000000000000000000000000000000000000000..51ba1002cb4cd0c97e2f9ec1e96c249a4b449db5 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_it.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_it +dataset_name: it +doc_to_text: !function utils.doc_to_text_it diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_qu.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_qu.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6f356001076e79a09cd8020b6fb6b0a4c052c25 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_qu.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_qu +dataset_name: qu +doc_to_text: !function utils.doc_to_text_qu diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_sw.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_sw.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4174cb0ef3b639ad5d2817dc45640c66bd9401c7 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_sw.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_sw +dataset_name: sw +doc_to_text: !function utils.doc_to_text_sw diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ta.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..216cacf89bd233858e613909e32e4b909c6bb338 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_ta.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_ta +dataset_name: ta +doc_to_text: !function utils.doc_to_text_ta diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_th.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_th.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90346b8c85be2ccff6e12ffcd64f3bd9ccb1ed70 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_th.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_th +dataset_name: th +doc_to_text: !function utils.doc_to_text_th diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_tr.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_tr.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81dac28670f00227b641fe4af46ad1542f7d173e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_tr.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_tr +dataset_name: tr +doc_to_text: !function utils.doc_to_text_tr diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_vi.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_vi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c08cdd1a2c08c86e792f0d91ce46838c4a27798a --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_vi.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_vi +dataset_name: vi +doc_to_text: !function utils.doc_to_text_vi diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_zh.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_zh.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ad681e6a86dca8a3aae5b06af8835eb96bf1768c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/default_zh.yaml @@ -0,0 +1,4 @@ +include: default_et.yaml +task: xcopa_zh +dataset_name: zh +doc_to_text: !function utils.doc_to_text_zh diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..fe9d85920baa7098fd20f853da6eadcbc787dedd --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xcopa/utils.py @@ -0,0 +1,114 @@ +from functools import partial + + +def convert_choice(choice): + return choice[0].lower() + choice[1:] + + +def doc_to_text(doc, connector): + # Drop the period + conn = connector[doc["question"]] + return doc["premise"].strip()[:-1] + f" {conn}" + + +def doc_to_choice(doc): + return [convert_choice(doc["choice1"]), convert_choice(doc["choice2"])] + + +doc_to_text_et = partial( + doc_to_text, + connector={ + "cause": "sest", + "effect": "seetõttu", + }, +) + + +doc_to_text_ht = partial( + doc_to_text, + connector={ + "cause": "poukisa", + "effect": "donk sa", + }, +) + + +doc_to_text_it = partial( + 
doc_to_text, + connector={ + "cause": "perché", + "effect": "quindi", + }, +) + + +doc_to_text_id = partial( + doc_to_text, + connector={ + "cause": "karena", + "effect": "maka", + }, +) + + +doc_to_text_qu = partial( + doc_to_text, + connector={ + "cause": "imataq", + "effect": "chaymi", + }, +) + + +doc_to_text_sw = partial( + doc_to_text, + connector={ + "cause": "kwa sababu", + "effect": "kwa hiyo", + }, +) + + +doc_to_text_zh = partial( + doc_to_text, + connector={ + "cause": "因为", + "effect": "所以", + }, +) + + +doc_to_text_ta = partial( + doc_to_text, + connector={ + "cause": "காரணமாக", + "effect": "எனவே", + }, +) + + +doc_to_text_th = partial( + doc_to_text, + connector={ + "cause": "เพราะ", + "effect": "ดังนั้น", + }, +) + + +doc_to_text_tr = partial( + doc_to_text, + connector={ + "cause": "çünkü", + "effect": "bu yüzden", + }, +) + + +doc_to_text_vi = partial( + doc_to_text, + connector={ + "cause": "bởi vì", + "effect": "vì vậy", + }, +) diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_de.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_de.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8eef760eea5cb31fb5e205443dd1deb5f5880af7 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_de.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: de +doc_to_choice: '{{[premise+", richtig? Ja, "+hypothesis,premise+", richtig? Auch, + "+hypothesis,premise+", richtig? Nein, "+hypothesis]}}' +doc_to_text: '' +include: xnli_common_yaml +task: xnli_de diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_es.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_es.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7b00a8d9e3bb7b172cc73ea8f4fd4e07f6534da1 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_es.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: es +doc_to_choice: '{{[premise+", correcto? Sí, "+hypothesis,premise+", correcto? Asi + que, "+hypothesis,premise+", correcto? No, "+hypothesis]}}' +doc_to_text: '' +include: xnli_common_yaml +task: xnli_es diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_th.yaml b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_th.yaml new file mode 100644 index 0000000000000000000000000000000000000000..9d725e83c388900a33f0e9df31abecceef697e8b --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/tasks/xnli/xnli_th.yaml @@ -0,0 +1,7 @@ +# Generated by utils.py +dataset_name: th +doc_to_choice: '{{[premise+", ถูกต้อง? ใช่, "+hypothesis,premise+", ถูกต้อง? ดังนั้น, + "+hypothesis,premise+", ถูกต้อง? 
ไม่, "+hypothesis]}}' +doc_to_text: '' +include: xnli_common_yaml +task: xnli_th diff --git a/scripts/yans/lm-evaluation-harness/lm_eval/utils.py b/scripts/yans/lm-evaluation-harness/lm_eval/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..7166e24d0723e397f00347d6f14eb14e5902a452 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/lm_eval/utils.py @@ -0,0 +1,501 @@ +import collections +import fnmatch +import functools +import hashlib +import importlib.util +import inspect +import json +import logging +import os +import re +from dataclasses import asdict, is_dataclass +from itertools import islice +from typing import Any, Callable, List + +import numpy as np +import yaml +from jinja2 import BaseLoader, Environment, StrictUndefined + + +logging.basicConfig( + format="%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d] %(message)s", + datefmt="%Y-%m-%d:%H:%M:%S", + level=logging.INFO, +) +eval_logger = logging.getLogger("lm-eval") + +SPACING = " " * 47 + +HIGHER_IS_BETTER_SYMBOLS = { + True: "↑", + False: "↓", +} + + +def hash_string(string: str) -> str: + return hashlib.sha256(string.encode("utf-8")).hexdigest() + + +def escaped_split(text, sep_char, maxsplit=-1): + """Split text into a list on occurrences of the given separation + character `sep_char`. The separation character may be escaped by a + backslash to avoid splitting at that location. + + The separation character must be a string of size 1. + + If `maxsplit` is given, at most `maxsplit` splits are done (thus, + the list will have at most `maxsplit + 1` elements). If `maxsplit` + is not specified or less than 0, then there is no limit on the + number of splits (all possible splits are made). + """ + assert ( + len(sep_char) == 1 + ), "separation string must be a single character for escaped splitting" + + if maxsplit == 0: + return text + maxsplit = max(0, maxsplit) + + return re.split(r"(? str: + """ + Given the sample results filenames, extracts and returns the task name. + """ + return filename[filename.find("_") + 1 : filename.rfind("_")] + + +def get_file_datetime(filename: str) -> str: + """ + Given the results and sample results filenames, extracts and returns the datetime. + """ + return filename[filename.rfind("_") + 1 :].replace(".jsonl", "") + + +def sanitize_model_name(model_name: str) -> str: + """ + Given the model name, returns a sanitized version of it. + """ + return re.sub(r"[\"<>:/\|\\?\*\[\]]+", "__", model_name) + + +def sanitize_task_name(task_name: str) -> str: + """ + Given the task name, returns a sanitized version of it. + """ + return re.sub(r"\W", "_", task_name) + + +def get_latest_filename(filenames: List[str]) -> str: + """ + Given a list of filenames, returns the filename with the latest datetime. + """ + return max(filenames, key=lambda f: get_file_datetime(f)) + + +def get_results_filenames(filenames: List[str]) -> List[str]: + """ + Extracts filenames that correspond to aggregated results. + """ + return [f for f in filenames if "/results_" in f and ".json" in f] + + +def get_sample_results_filenames(filenames: List[str]) -> List[str]: + """ + Extracts filenames that correspond to sample results. 
+ """ + return [f for f in filenames if "/samples_" in f and ".json" in f] + + +def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len): + """ + - context_len allows for a rolling window context, allowing each prediction window to potentially + condition on some context + + :param token_list: list + List of tokens to be PREDICTED + :param max_seq_len: int + max_seq_len of model (or max_seq_len we want to use) + :param context_len: int + Amount of desired token context for prediction. Needs to be at least 1. + :param prefix_token: token + Dummy token like so the first token has something to condition on + :return: generator + Generator of tuples + (input_tokens, pred_tokens) + Note: Score only the last len(pred_tokens) logits of the LM + """ + assert 1 <= context_len <= max_seq_len + if not token_list: + return + # +1 offset, going from input->preds + pred_len = max_seq_len - context_len + 1 + predicted = 0 + + # Special handling for first window: predict all tokens + first_seq_len = min(max_seq_len, len(token_list)) + yield ([prefix_token] + token_list[: first_seq_len - 1], token_list[:first_seq_len]) + predicted += first_seq_len + + while predicted < len(token_list): + window_pred_len = min(len(token_list) - predicted, pred_len) + window_end = predicted + window_pred_len + + yield ( + token_list[window_end - max_seq_len - 1 : window_end - 1], + token_list[window_end - window_pred_len : window_end], + ) + predicted += window_pred_len + + +def make_disjoint_window(pair): + """Takes output from get_rolling_token_windows and makes the context not overlap with the continuation""" + a, b = pair + return a[: len(a) - (len(b) - 1)], b + + +class EnhancedJSONEncoder(json.JSONEncoder): + """ + Provides a proper json encoding for the loggers and trackers json dumps. + Notably manages the json encoding of dataclasses. + """ + + def default(self, o): + if is_dataclass(o): + return asdict(o) + return super().default(o) + + +class Reorderer: + def __init__(self, arr: List[Any], fn: Callable) -> None: + """Reorder an array according to some function + + Args: + arr (List[Any]): The initial array + fn (Callable[[Any], Any]): A function to determine the priority of elements + """ + self.size = len(arr) + arr = list(enumerate(arr)) + arr = group(arr, lambda x: fn(x[1])) + # arr = [([y[0] for y in x], x[0][1]) for x in arr] + # TODO: overhaul reorderer. 
It currently grouped requests by content but we don't want this + arr = [([y[0]], x[0][1]) for x in arr for y in x] + arr.sort(key=lambda x: fn(x[1])) + + self.arr = arr + + def get_reordered(self): + """Gets the reordered array + + Returns: + List[Any]: The reordered array + """ + return [x[1] for x in self.arr] + + def get_original(self, newarr): + """Restores the original order of a new array based on the old array's order + + Args: + newarr (List[Any]): The array to be restored + + Returns: + List[Any]: The array restored to the original order + """ + res = [None] * self.size + cov = [False] * self.size + + for (inds, _), v in zip(self.arr, newarr): + for ind in inds: + res[ind] = v + cov[ind] = True + + assert all(cov) + + return res + + +def make_table(result_dict, column: str = "results", sort_results: bool = False): + """Generate table of results.""" + from pytablewriter import LatexTableWriter, MarkdownTableWriter + + if column == "results": + column_name = "Tasks" + elif column == "groups": + column_name = "Groups" + + all_headers = [ + column_name, + "Version", + "Filter", + "n-shot", + "Metric", + "", + "Value", + "", + "Stderr", + ] + + md_writer = MarkdownTableWriter() + latex_writer = LatexTableWriter() + md_writer.headers = all_headers + latex_writer.headers = all_headers + + values = [] + + keys = result_dict[column].keys() + if sort_results: + # sort entries alphabetically by task or group name. + # NOTE: we default here to false, because order matters for multi-level table printing a la mmlu. + # sorting here would mess that up + keys = sorted(keys) + for k in keys: + dic = result_dict[column][k] + version = result_dict["versions"].get(k, " N/A") + n = str(result_dict.get("n-shot", " ").get(k, " ")) + higher_is_better = result_dict.get("higher_is_better", {}).get(k, {}) + + if "alias" in dic: + k = dic.pop("alias") + + metric_items = dic.items() + metric_items = sorted(metric_items) + + for (mf), v in metric_items: + m, _, f = mf.partition(",") + if m.endswith("_stderr"): + continue + + hib = HIGHER_IS_BETTER_SYMBOLS.get(higher_is_better.get(m), "") + + v = "%.4f" % v if isinstance(v, float) else v + + if m + "_stderr" + "," + f in dic: + se = dic[m + "_stderr" + "," + f] + se = " N/A" if se == "N/A" else "%.4f" % se + values.append([k, version, f, n, m, hib, v, "±", se]) + else: + values.append([k, version, f, n, m, hib, v, "", ""]) + k = "" + version = "" + md_writer.value_matrix = values + latex_writer.value_matrix = values + + # todo: make latex table look good + # print(latex_writer.dumps()) + + return md_writer.dumps() + + +def positional_deprecated(fn): + """ + A decorator to nudge users into passing only keyword args (`kwargs`) to the + wrapped function, `fn`. + """ + + @functools.wraps(fn) + def _wrapper(*args, **kwargs): + if len(args) != 1 if inspect.ismethod(fn) else 0: + print( + f"WARNING: using {fn.__name__} with positional arguments is " + "deprecated and will be disallowed in a future version of " + "lm-evaluation-harness!" 
+ ) + return fn(*args, **kwargs) + + return _wrapper + + +def ignore_constructor(loader, node): + return node + + +def import_function(loader, node): + function_name = loader.construct_scalar(node) + yaml_path = os.path.dirname(loader.name) + + *module_name, function_name = function_name.split(".") + if isinstance(module_name, list): + module_name = ".".join(module_name) + module_path = os.path.normpath(os.path.join(yaml_path, "{}.py".format(module_name))) + + spec = importlib.util.spec_from_file_location(module_name, module_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + function = getattr(module, function_name) + return function + + +def load_yaml_config(yaml_path=None, yaml_config=None, yaml_dir=None, mode="full"): + if mode == "simple": + constructor_fn = ignore_constructor + elif mode == "full": + constructor_fn = import_function + + # Add the import_function constructor to the YAML loader + yaml.add_constructor("!function", constructor_fn) + if yaml_config is None: + with open(yaml_path, "rb") as file: + yaml_config = yaml.full_load(file) + + if yaml_dir is None: + yaml_dir = os.path.dirname(yaml_path) + + assert yaml_dir is not None + + if "include" in yaml_config: + include_path = yaml_config["include"] + del yaml_config["include"] + + if isinstance(include_path, str): + include_path = [include_path] + + # Load from the last one first + include_path.reverse() + final_yaml_config = {} + for path in include_path: + # Assumes that path is a full path. + # If not found, assume the included yaml + # is in the same dir as the original yaml + if not os.path.isfile(path): + path = os.path.join(yaml_dir, path) + + try: + included_yaml_config = load_yaml_config(yaml_path=path, mode=mode) + final_yaml_config.update(included_yaml_config) + except Exception as ex: + # If failed to load, ignore + raise ex + + final_yaml_config.update(yaml_config) + return final_yaml_config + return yaml_config + + +def regex_replace(string, pattern, repl, count: int = 0): + """Implements the `re.sub` function as a custom Jinja filter.""" + return re.sub(pattern, repl, string, count=count) + + +env = Environment( + loader=BaseLoader, undefined=StrictUndefined, keep_trailing_newline=True +) +env.filters["regex_replace"] = regex_replace + + +def apply_template(template: str, doc: dict) -> str: + rtemplate = env.from_string(template) + return rtemplate.render(**doc) + + +def create_iterator(raw_iterator, *, rank=0, world_size=1, limit=None): + """ + Method for creating a (potentially) sliced and limited + iterator from a raw document iterator. 
Used for splitting data + among ranks in multigpu setting or only pulling a sample of documents + """ + return islice(raw_iterator, rank, limit, world_size) + + +def weighted_f1_score(items): + from sklearn.metrics import f1_score + + unzipped_list = list(zip(*items)) + golds = unzipped_list[0] + preds = unzipped_list[1] + fscore = f1_score(golds, preds, average="weighted") + return fscore diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..6dc08ebbaa852afef27dbd6002575ada16870eb0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/anli_r2-v0-res.json @@ -0,0 +1 @@ +{"results": {"anli_r2": {"acc": 0.356, "acc_stderr": 0.015149042659306628}}, "versions": {"anli_r2": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..49f34a73061139ee50a27896a5de9d0f1613941c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/arc_challenge-v0-res.json @@ -0,0 +1 @@ +{"results": {"arc_challenge": {"acc": 0.24488054607508533, "acc_norm": 0.2440273037542662, "acc_norm_stderr": 0.012551447627856257, "acc_stderr": 0.012566273985131354}}, "versions": {"arc_challenge": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5dad8bf864709209d905dadb52930eaf43ff3eb0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_english_age-v0-res.json @@ -0,0 +1 @@ +{"results": {"crows_pairs_english_age": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_english_age": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..16127e96ad2c7051d8daf0bc0ad5114a6a07eb69 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_race_color-v0-loglikelihood @@ -0,0 +1 @@ +6f9119026abff33c5c882d6172e092e806a8b21bd86864022978b1961839350f \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5bb8a4336d89c12896186dc53f0bdd7f480c8df0 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/crows_pairs_french_sexual_orientation-v0-res.json @@ -0,0 +1 @@ +{"results": {"crows_pairs_french_sexual_orientation": {"likelihood_difference": 0.3160680928470684, "likelihood_difference_stderr": 0.02397758321605678, "pct_stereotype": 0.43956043956043955, "pct_stereotype_stderr": 0.05231815698566189}}, "versions": {"crows_pairs_french_sexual_orientation": 0}} \ No newline at end 
of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..0fb76aa9ba10e63a931ec3707ac6d9f3d84292aa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_geography-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_geography": {"acc": 0.2474747474747475, "acc_norm": 0.2777777777777778, "acc_norm_stderr": 0.03191178226713547, "acc_stderr": 0.03074630074212452}}, "versions": {"hendrycksTest-high_school_geography": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..12ea726b4ba81f264017a7fd71d18a6ac318b0ab --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_government_and_politics-v0-loglikelihood @@ -0,0 +1 @@ +11f40d8f48ba5cd739e21d54c3c04d3761f81df5cb7ddd77df868d24ced44b49 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..5b7a76909c08ab9c47c5e1eb6c06945e990fb639 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/hendrycksTest-high_school_us_history-v0-res.json @@ -0,0 +1 @@ +{"results": {"hendrycksTest-high_school_us_history": {"acc": 0.29901960784313725, "acc_norm": 0.28431372549019607, "acc_norm_stderr": 0.03166009679399814, "acc_stderr": 0.03213325717373618}}, "versions": {"hendrycksTest-high_school_us_history": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_de-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_de-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..12f5d349ebd170ee5295656bc3907f872453eca6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_de-v0-res.json @@ -0,0 +1 @@ +{"results": {"lambada_openai_mt_de": {"acc": 0.0, "acc_stderr": 0.0, "ppl": 1.6479047769869253, "ppl_stderr": 0.006497321146240192}}, "versions": {"lambada_openai_mt_de": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-loglikelihood b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-loglikelihood new file mode 100644 index 0000000000000000000000000000000000000000..ca3fd80298aa1c565c978b26e992ccd42c7144f6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/lambada_openai_mt_it-v0-loglikelihood @@ -0,0 +1 @@ +fd87c6c5cf4e0499c5f9f80e5bd7ee6a4f3d2991902a0cc3ec9e6eaf22d6760a \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..9c0348826363dfe77e60cb28cf546110e25bab35 --- /dev/null +++ 
b/scripts/yans/lm-evaluation-harness/tests/testdata/mutual_plus-v0-res.json @@ -0,0 +1 @@ +{"results": {"mutual_plus": {"mrr": 0.5275583145221953, "mrr_stderr": 0.009940894824430708, "r@1": 0.2595936794582393, "r@1_stderr": 0.014737047402750955, "r@2": 0.45372460496614, "r@2_stderr": 0.01673517854461967}}, "versions": {"mutual_plus": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..cc06a45501fb498db32e56d0677ef01f10869cc9 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_github-v1-res.json @@ -0,0 +1 @@ +{"results": {"pile_github": {"bits_per_byte": 0.00013764216145332133, "byte_perplexity": 1.0000954108274611, "word_perplexity": 1.0009643183931227}}, "versions": {"pile_github": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..6e5f1efe495f7030764f96e45460a4d47315b1e3 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/pile_pubmed-central-v0-res.json @@ -0,0 +1 @@ +{"results": {"pile_pubmed-central": {"bits_per_byte": 1.5812411832795375e-05, "byte_perplexity": 1.0000158125368497, "word_perplexity": 1.000123107107861}}, "versions": {"pile_pubmed-central": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/reversed_words-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/reversed_words-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..9285ff2694c140b120aca438098daa39fc282a87 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/reversed_words-v0-res.json @@ -0,0 +1 @@ +{"results": {"reversed_words": {"acc": 0.0, "acc_stderr": 0.0}}, "versions": {"reversed_words": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-res.json new file mode 100644 index 0000000000000000000000000000000000000000..dd69f00abb989ba3d254b9a6925087e10737b8d6 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/squad2-v1-res.json @@ -0,0 +1 @@ +{"results": {"squad2": {"HasAns_exact": 0.0, "HasAns_f1": 0.0, "NoAns_exact": 0.0, "NoAns_f1": 0.0, "best_exact": 50.07159100480081, "best_f1": 50.07159100480081, "exact": 0.0, "f1": 0.0}}, "versions": {"squad2": 1}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl new file mode 100644 index 0000000000000000000000000000000000000000..46e0dbbbda5bd6902bd4cd205d59976e71a3b0fa --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/textsynth_test_9d5f33dbfe1e254928c89f5ed85e4c010d888065f55a8f1b863bc1eb0340a5f2.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a06027365696479403ecb4abb20da8ea6befb2aba6d0098f1dae42df661b542d +size 1813 diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt14-en-fr-v0-greedy_until 
b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt14-en-fr-v0-greedy_until new file mode 100644 index 0000000000000000000000000000000000000000..6d48d5579e95eb72bdc6c4dc8b4149e5f495b55e --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt14-en-fr-v0-greedy_until @@ -0,0 +1 @@ +368ae7eec0f902b5123f2d5197caa5109a23942011c53fe68d9eaeee20180e46 \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-pl-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-pl-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..a2f5cb31be388a5ca081567e72d16e42774aa008 --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-pl-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-pl-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.01353367757716276, "chrf_stderr": 0.00018386199249976465, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-pl-en": 0}} \ No newline at end of file diff --git a/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ps-en-v0-res.json b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ps-en-v0-res.json new file mode 100644 index 0000000000000000000000000000000000000000..00c9c742e4d34830628a52aa34cfd7faaa86516c --- /dev/null +++ b/scripts/yans/lm-evaluation-harness/tests/testdata/wmt20-ps-en-v0-res.json @@ -0,0 +1 @@ +{"results": {"wmt20-ps-en": {"bleu": 0.0, "bleu_stderr": 0.0, "chrf": 0.015192865365105723, "chrf_stderr": 0.00011334541381539086, "ter": 1.0, "ter_stderr": 0.0}}, "versions": {"wmt20-ps-en": 0}} \ No newline at end of file
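Note on the helpers added at the top of this patch section: the `islice(raw_iterator, rank, limit, world_size)` call strides through the documents so that each rank takes every `world_size`-th item starting at its own index (stopping at `limit`), and `weighted_f1_score` expects an iterable of `(gold, pred)` pairs that it unzips before passing to scikit-learn. The sketch below is a minimal, hypothetical usage example, not part of the patch; the wrapper name `split_among_ranks` and the sample data are assumptions chosen only to illustrate how the two helpers behave.

    from itertools import islice

    from sklearn.metrics import f1_score


    def split_among_ranks(raw_iterator, rank, world_size, limit=None):
        # Hypothetical stand-in for the helper in this hunk: rank r of
        # world_size takes every world_size-th document starting at index r,
        # stopping at `limit` (None means no cap).
        return islice(raw_iterator, rank, limit, world_size)


    def weighted_f1_score(items):
        # Mirrors the patched helper: `items` is an iterable of
        # (gold, pred) pairs collected per document.
        golds, preds = zip(*items)
        return f1_score(golds, preds, average="weighted")


    if __name__ == "__main__":
        docs = list(range(10))
        # With world_size=2, rank 0 sees the even indices, rank 1 the odd ones.
        assert list(split_among_ranks(docs, rank=0, world_size=2)) == [0, 2, 4, 6, 8]
        assert list(split_among_ranks(docs, rank=1, world_size=2)) == [1, 3, 5, 7, 9]

        # (gold, pred) label pairs, as a per-document aggregation would produce them.
        items = [(1, 1), (0, 0), (1, 0), (0, 0)]
        print(f"weighted F1: {weighted_f1_score(items):.3f}")

Because the split is a strided slice rather than a contiguous chunk, the union of all ranks' slices covers every document exactly once, which is why the same helper can also serve to pull a subsample when `limit` is set.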