Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes.
- .gitattributes +3 -0
- scripts/yans/baseline_phi2/averaging_checkpoint.sh +20 -0
- scripts/yans/baseline_phi2/preprocess_train.sh +15 -0
- scripts/yans/baseline_phi2/save_hf_model.sh +9 -0
- scripts/yans/baseline_phi2/train.sh +101 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/README.md +33 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/__init__.py +0 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py +73 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py +216 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py +94 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp +208 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py +131 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py +63 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/cost_estimate.py +98 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/harness_example.py +38 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/main_eval.py +127 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/make_table_tasks.py +52 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/merge_json.py +48 -0
- scripts/yans/eval/lm-evaluation-harness/scripts/run_task_for_models.sh +28 -0
- scripts/yans/lm-evaluation-harness/.coveragerc +28 -0
- scripts/yans/lm-evaluation-harness/.flake8 +5 -0
- scripts/yans/lm-evaluation-harness/.gitignore +24 -0
- scripts/yans/lm-evaluation-harness/.pre-commit-config.yaml +54 -0
- scripts/yans/lm-evaluation-harness/CITATION.bib +10 -0
- scripts/yans/lm-evaluation-harness/CODEOWNERS +1 -0
- scripts/yans/lm-evaluation-harness/LICENSE.md +21 -0
- scripts/yans/lm-evaluation-harness/README.md +497 -0
- scripts/yans/lm-evaluation-harness/bin/python +3 -0
- scripts/yans/lm-evaluation-harness/bin/python3 +3 -0
- scripts/yans/lm-evaluation-harness/bin/python3.10 +3 -0
- scripts/yans/lm-evaluation-harness/docs/API_guide.md +198 -0
- scripts/yans/lm-evaluation-harness/docs/CONTRIBUTING.md +79 -0
- scripts/yans/lm-evaluation-harness/docs/README.md +11 -0
- scripts/yans/lm-evaluation-harness/docs/decontamination.md +71 -0
- scripts/yans/lm-evaluation-harness/docs/img/fewshot_example_gpt3.png +0 -0
- scripts/yans/lm-evaluation-harness/docs/interface.md +162 -0
- scripts/yans/lm-evaluation-harness/docs/model_guide.md +163 -0
- scripts/yans/lm-evaluation-harness/docs/new_task_guide.md +492 -0
- scripts/yans/lm-evaluation-harness/docs/task_guide.md +317 -0
- scripts/yans/lm-evaluation-harness/eval.sh +5 -0
- scripts/yans/lm-evaluation-harness/eval2.sh +5 -0
- scripts/yans/lm-evaluation-harness/eval3.sh +5 -0
- scripts/yans/lm-evaluation-harness/eval4.sh +5 -0
- scripts/yans/lm-evaluation-harness/examples/lm-eval-overview.ipynb +1230 -0
- scripts/yans/lm-evaluation-harness/examples/visualize-wandb.ipynb +170 -0
- scripts/yans/lm-evaluation-harness/examples/visualize-zeno.ipynb +115 -0
- scripts/yans/lm-evaluation-harness/ignore.txt +8 -0
- scripts/yans/lm-evaluation-harness/ja_eval.sh +5 -0
- scripts/yans/lm-evaluation-harness/ja_eval2.sh +5 -0
.gitattributes
CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+scripts/yans/lm-evaluation-harness/bin/python3.10 filter=lfs diff=lfs merge=lfs -text
+scripts/yans/lm-evaluation-harness/bin/python3 filter=lfs diff=lfs merge=lfs -text
+scripts/yans/lm-evaluation-harness/bin/python filter=lfs diff=lfs merge=lfs -text
scripts/yans/baseline_phi2/averaging_checkpoint.sh
ADDED
@@ -0,0 +1,20 @@
set -eux
LLM_RECIPES_DIR=/code/llm-recipes

CHECKPOINTS=(
    /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/iter_0019200/model.pt
    /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/iter_0019400/model.pt
    /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/iter_0019600/model.pt
    /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/iter_0019800/model.pt
    /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/iter_0020000/model.pt
)
# Concatenate the checkpoints
INPUTS=${CHECKPOINTS[@]}

OUTPUT_DIR=/work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/averaged/averaged-19200-20000.pt
mkdir -p $(dirname $OUTPUT_DIR)

python $LLM_RECIPES_DIR/tools/merge_checkpoints.py \
    --inputs $INPUTS \
    --output $OUTPUT_DIR
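Note: `tools/merge_checkpoints.py` itself is not included in this diff. As a rough sketch of what checkpoint averaging of this kind usually does, the hypothetical snippet below takes the element-wise mean of the listed `model.pt` files, assuming each one is a flat PyTorch state dict of floating-point tensors; the actual llm-recipes tool may organize its checkpoints differently.

# Hypothetical sketch only -- not the actual llm-recipes merge_checkpoints.py.
# Assumes each --inputs file is a flat state dict of float tensors.
import argparse

import torch


def average_checkpoints(paths):
    # Use the first checkpoint as the accumulator, then add the rest.
    avg = torch.load(paths[0], map_location="cpu")
    for path in paths[1:]:
        state = torch.load(path, map_location="cpu")
        for key in avg:
            avg[key] += state[key]
    # Divide once at the end to get the element-wise mean.
    for key in avg:
        avg[key] /= len(paths)
    return avg


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--inputs", nargs="+", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()
    torch.save(average_checkpoints(args.inputs), args.output)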
scripts/yans/baseline_phi2/preprocess_train.sh
ADDED
@@ -0,0 +1,15 @@
set -eux
LLM_RECIPES_DIR=/project
INPUT_JSONL_FILE_PATH=${1:-/share/yans/datasets/jsonl/llm-jp-corpus-v1/ja/ja_wiki/train_0.jsonl}
OUTPUT_FILE_PREFIX=${2:-/work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data}
TOKENIZER_PATH=${3:-/share/pretrained_lm/Phi/Phi-2}
mkdir -p $(dirname $OUTPUT_FILE_PREFIX)

python $LLM_RECIPES_DIR/megatron_lm/tools/preprocess_data.py \
    --input $INPUT_JSONL_FILE_PATH \
    --output-prefix $OUTPUT_FILE_PREFIX \
    --tokenizer-type HFPreTrainedTokenizer \
    --tokenizer-model $TOKENIZER_PATH \
    --workers 32 \
    --append-eod \
    --log-interval 1000
scripts/yans/baseline_phi2/save_hf_model.sh
ADDED
@@ -0,0 +1,9 @@
set -eux
LLM_RECIPES_DIR=/code/llm-recipes
source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh

python $LLM_RECIPES_DIR/tools/save_hf_model.py \
    --base_model_name_or_path mistralai/Mistral-7B-v0.1 \
    --state_dict_path /work/models/additiona_trained/mistral-llm-recipes-mistral-ja-v1/averaged/averaged-19200-20000.pt \
    --output_dir /work/models/additiona_trained_hf/mistral-llm-recipes-mistral-ja-v1-averaged-19200-20000 \
    --hf_repo_id skim-wmt24/mistral-llm-recipes-mistral-ja-v1-averaged-19200-20000-hf
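Like `merge_checkpoints.py`, `tools/save_hf_model.py` is not part of this diff. Conceptually it loads the base Hugging Face model, replaces its weights with the averaged state dict, and saves/uploads the result; the following is a hypothetical sketch under those assumptions (the real tool's argument handling and checkpoint layout may differ).

# Hypothetical sketch only -- not the actual llm-recipes save_hf_model.py.
# Assumes --state_dict_path holds a state dict compatible with the base model.
import argparse

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--base_model_name_or_path", required=True)
parser.add_argument("--state_dict_path", required=True)
parser.add_argument("--output_dir", required=True)
parser.add_argument("--hf_repo_id", default=None)
args = parser.parse_args()

model = AutoModelForCausalLM.from_pretrained(args.base_model_name_or_path)
state_dict = torch.load(args.state_dict_path, map_location="cpu")
model.load_state_dict(state_dict)

tokenizer = AutoTokenizer.from_pretrained(args.base_model_name_or_path)
model.save_pretrained(args.output_dir)
tokenizer.save_pretrained(args.output_dir)
if args.hf_repo_id:
    model.push_to_hub(args.hf_repo_id)
    tokenizer.push_to_hub(args.hf_repo_id)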
scripts/yans/baseline_phi2/train.sh
ADDED
@@ -0,0 +1,101 @@
set -eux
NOW=`date +%Y-%m-%d-%H:%M:%S`
LLM_RECIPES_DIR=/project
source $LLM_RECIPES_DIR/scripts/wmt2024/tokens.sh
rm -f /tmp/hffs-*

export WANDB_NOTES="Train sample"
wandb login
NUM_GPU_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
NUM_NODES=1
NUM_GPUS=$((${NUM_NODES} * ${NUM_GPU_PER_NODE}))

# training config
SEQ_LENGTH=4096
SLIDING_WINDOW_SIZE=131072
DATA_PARALLEL_SIZE=$NUM_GPUS

MICRO_BATCH_SIZE=1
GLOBAL_BATCH_SIZE=320
TRAIN_STEPS=20000
VALID_MICRO_BATCH_SIZE=1

# optimizer config
LR=2e-5
MIN_LR=1e-6
LR_WARMUP_STEPS=500
LR_DECAY_STEPS=$TRAIN_STEPS
WEIGHT_DECAY=0.1
GRAD_CLIP=1.0

# checkpoint & tokenizer
TOKENIZER_MODEL=/share/pretrained_lm/Phi/Phi-2
BASE_MODEL=$TOKENIZER_MODEL

LOAD_DIR=$BASE_MODEL
SAVE_DIR=/work/llm_recipes/models/yans-baseline-Phi-2
mkdir -p $(dirname $SAVE_DIR)
SAVE_BASE_NAME=$(basename $SAVE_DIR)
LOG_FILE_PATH=$SAVE_DIR/train_${NOW}.log

mkdir -p ${SAVE_DIR}

# data config
TRAIN_DATA_PATH="519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
TRAIN_DATA_PATH="${TRAIN_DATA_PATH} 519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"

VALID_DATA_PATH="519177757 /work/llm_recipes/datasets/bin/baseline_phi2/llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
TEST_DATA_PATH=${VALID_DATA_PATH}


set +e
cd $LLM_RECIPES_DIR
# run

DISTRIBUTED_ARGS="--nproc_per_node $NUM_GPU_PER_NODE --nnodes 1 --node_rank 0 --master_addr localhost --master_port 8000"
torchrun $DISTRIBUTED_ARGS examples/finetuning.py \
    --seq-length ${SEQ_LENGTH} \
    --sliding-window-size ${SLIDING_WINDOW_SIZE} \
    --micro-batch-size ${MICRO_BATCH_SIZE} \
    --valid_micro_batch_size ${VALID_MICRO_BATCH_SIZE} \
    --global-batch-size ${GLOBAL_BATCH_SIZE} \
    --train-iters ${TRAIN_STEPS} \
    --tokenizer-type HFPreTrainedTokenizer \
    --tokenizer-model ${TOKENIZER_MODEL} \
    --train-data-path ${TRAIN_DATA_PATH} \
    --valid-data-path ${VALID_DATA_PATH} \
    --test-data-path ${TEST_DATA_PATH} \
    --lr ${LR} \
    --min-lr ${MIN_LR} \
    --lr-decay-style cosine \
    --lr-warmup-iters ${LR_WARMUP_STEPS} \
    --lr-decay-iters ${LR_DECAY_STEPS} \
    --weight-decay ${WEIGHT_DECAY} \
    --grad-clip-norm ${GRAD_CLIP} \
    --optimizer anyprecision \
    --adam-beta1 0.9 \
    --adam-beta2 0.95 \
    --adam-eps 1e-6 \
    --save-interval 500 \
    --eval-interval 500 \
    --eval-iters 10 \
    --bf16 \
    --mixed-precision \
    --base-model ${BASE_MODEL} \
    --save ${SAVE_DIR} \
    --load ${SAVE_DIR} \
    --fsdp-activation-checkpointing \
    --sharding-strategy FULL_SHARD \
    --checkpoint-type LOCAL_STATE_DICT \
    --save-n-checkpoints 10 \
    --upload-all-checkpoints-to-hf \
    --hf-upload-retry-limit 2 \
    --hf-repo-id shirayukikun/$SAVE_BASE_NAME \
    --wandb-entity "keitokudo" \
    --wandb-project "llm_tutorial" \
    --wandb-name ${SAVE_BASE_NAME}_train_${NOW} 2>&1 | tee $LOG_FILE_PATH

# --attn-implementation eager \
# --upload-all-checkpoints-to-hf

rm -f /tmp/hffs-*
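Note on the data configuration above: `TRAIN_DATA_PATH` is built as a single string of alternating `<weight> <path-prefix>` pairs (the same ja_wiki shard is listed twice here), which matches the Megatron-style blended-dataset convention. The leading `519177757` appears to act as a sampling weight (likely a token or document count), though exactly how `examples/finetuning.py` consumes it is not shown in this diff. An illustrative parser for that string format:

# Illustrative only: split a "weight path weight path ..." string,
# as assembled for TRAIN_DATA_PATH above, into (weight, path) pairs.
def parse_blended_data_path(spec: str):
    tokens = spec.split()
    assert len(tokens) % 2 == 0, "expected alternating <weight> <path> pairs"
    return [(float(tokens[i]), tokens[i + 1]) for i in range(0, len(tokens), 2)]


if __name__ == "__main__":
    spec = (
        "519177757 /work/llm_recipes/datasets/bin/baseline_phi2/"
        "llm_jp_corpus_v1_ja_wiki_train_0/data_text_document"
    )
    print(parse_blended_data_path(spec))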
scripts/yans/eval/lm-evaluation-harness/scripts/__init__.py
ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/README.md
ADDED
@@ -0,0 +1,33 @@
janitor.py contains a script to remove benchmark data contamination from training data sets.
It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165).

## Algorithm
1) Collects all contamination text files that are to be removed from training data
2) Filters training data by finding `N`gram matches between the training data
   and any contamination
   1) `N`grams ignore case and punctuation and are split on whitespace.
   2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around
      the match, splitting the training data into chunks
   3) Any chunks less than `minimum_slice_length` are removed
   4) Training data sets split into more than `too_dirty_cutoff` chunks are considered
      completely contaminated and removed

OpenAI used:
```
ngram_n = 13
window_to_remove = 200
minimum_slice_length = 200
too_dirty_cutoff = 10
```

## Compiling

Janitor can be used as a pure python program, but it is much faster if the ngram
code is run in C++. To compile the C++ code, run

```
pip install pybind11
c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
```

If your compiler isn't linked to python, you may need to add `-undefined dynamic_lookup` to the command above.
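For orientation, the pure-Python path mentioned above is what the other scripts in this directory use: `generate_13_grams.py` imports `Janitor` and `word_ngrams` from `lm_eval.decontamination.janitor`. A minimal sketch of producing the normalized 13-grams for a single document with those helpers, assuming they behave as used in that script:

# Minimal sketch, mirroring the usage in generate_13_grams.py:
# normalize a document, then emit lowercase, punctuation-stripped 13-grams.
from lm_eval.decontamination.janitor import Janitor, word_ngrams

janitor = Janitor()
document = "..."  # a training document; it needs at least 13 words to yield any 13-gram
for ngram in word_ngrams(janitor.normalize_string(document), 13):
    print(ngram)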
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/__init__.py
ADDED
File without changes
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/compress_and_package.py
ADDED
@@ -0,0 +1,73 @@
import glob
import argparse
import os
import subprocess
import shutil

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)


def process_task(
    working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm
):
    command = f"zstd {bucket_file_path}"
    logger.info(command)
    subprocess.call(command, shell=True)

    compressed_file = bucket_file_path + ".zst"
    if output_directory:
        shutil.move(compressed_file, output_directory)

    os.remove(bucket_file_path)
    global_tqdm.update()


def compress_and_move(working_directory, output_directory, process_count):
    os.makedirs(output_directory, exist_ok=True)
    original_info_file_path = os.path.join(working_directory, "info.json")
    assert os.path.exists(original_info_file_path)

    tasks = []
    bucket_file_paths = glob.glob(
        os.path.join(working_directory, "output", f"*.bkt.txt.sorted")
    )
    for bucket_file_path in bucket_file_paths:
        task = (process_task, (working_directory, output_directory, bucket_file_path))
        tasks.append(task)

    pool = TqdmMultiProcessPool(process_count)

    def on_done(_):
        return None

    def on_error(_):
        return None

    global_progress = tqdm(
        total=len(bucket_file_paths), dynamic_ncols=True, unit="file"
    )
    _ = pool.map(global_progress, tasks, on_error, on_done)

    shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json"))


parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", required=True)
parser.add_argument("-output", "--output_directory", required=True)
parser.add_argument("-procs", "--process_count", type=int, default=8)

if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")

    logfile_path = "compress_and_package.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    compress_and_move(args.working_directory, args.output_directory, args.process_count)
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/generate_13_grams.py
ADDED
@@ -0,0 +1,216 @@
"""
Outputs all 13-grams found in The Pile.

Loops through all documents and uses the logic found in janitor.py to extract 13-grams.
We bucket each 13-gram by hash into separate file buckets to allow easy parallel processing in the
next stage. We also include the current pile document_id with each ngram instance to allow the
filtering to exclude 13-grams that match more than 10 unique documents (done further down the pipeline).

We didn't use lm_dataformat to output as it increases time 4x (slow jsonify) and makes
resuming hard (and we had the storage).

Arguments
---------
--working_directory (-dir)
    Directory containing the pile distribution. An "output" subdirectory will be created underneath
    to store the bucketed 13-grams, checkpoint and done files. Default: current directory
--n_value (-n)
    n value in n-gram, added for later use if ever needed. Default: 13
--bucket_count (-buckets)
    Number of file buckets to use when generating 13grams. Default: 500
"""

import argparse
import json
import pickle
import os
import sys
from pathlib import Path
import glob
import signal
from signal import SIGINT

from tqdm import tqdm

from lm_eval.decontamination.janitor import Janitor, word_ngrams
from lm_eval.decontamination.archiver import TextArchive, Reader

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)

terminate = False


def handler(signal_received, frame):
    global terminate
    terminate = True


def yield_pile(start_offsets=None, checkpoint_offset=None):
    directory = "pile"

    if not os.path.exists(directory):
        print(
            "We expect the pile archives to be in the 'pile' directory, but this was not found."
        )
        raise Exception("Pile directory not found.")

    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))

    pile_global_offset = 0
    start_file = 0
    if checkpoint_offset:
        for file_i, start_offset in enumerate(start_offsets):
            if start_offset > checkpoint_offset:
                break

            start_file = file_i
            pile_global_offset = start_offset

    for file_i, file in enumerate(files):
        if file_i < start_file:
            logger.info(f"Skipping file {file}")
            continue
        logger.info(f"Reading from pile file: {file}")
        reader = Reader()
        for document in reader.read(file):
            yield (pile_global_offset, document)
            pile_global_offset += 1


# Hash buckets > disk backed files. Supports file position checkpointing and resuming
# Allows you to write continuously and checkpoint intermittently. If a failure occurs
# the buckets are simply truncated at your last checkpoint.
class Buckets:
    def __init__(self, directory, num_buckets):
        self.bucket_files = [
            os.path.join(directory, f"ngrams_{i}.bkt.txt") for i in range(num_buckets)
        ]
        self.buckets = list(map(TextArchive, self.bucket_files))
        self.checkpoint_file = os.path.join(directory, f"bucket_offsets.ckpt")

        if os.path.exists(self.checkpoint_file):
            self.bucket_offsets = pickle.load(open(self.checkpoint_file, "rb"))
        else:
            self.bucket_offsets = [0 for i in range(len(self.buckets))]

        for i, offset in enumerate(self.bucket_offsets):
            bucket = self.buckets[i]
            bucket.fh.seek(offset)
            bucket.fh.truncate()

    def add_data(self, key, value):
        i = hash(key) % len(self.buckets)
        bucket = self.buckets[i]
        bucket.add_data(value)

    def save_checkpoint(self):
        for bucket in self.buckets:
            bucket.fh.flush()

        bucket_offsets = [bucket.fh.tell() for bucket in self.buckets]
        pickle.dump(bucket_offsets, open(self.checkpoint_file, "wb"))

    def close_buckets(self):
        for bucket in self.buckets:
            bucket.commit()


def do_ngrams_in_buckets(n_value, working_directory, bucket_count):

    pile_statistics = json.load(open("pile_statistics.json", "r"))
    pile_document_count = pile_statistics["Document Count"]
    start_offsets = pile_statistics["File Start Offsets"]

    output_directory = os.path.join(working_directory, "output")
    os.makedirs(output_directory, exist_ok=True)

    logger.info(f"Generating {n_value}-grams and bucketing.")

    # Done file
    done_file = os.path.join(output_directory, f"ngram_buckets.done")
    if os.path.exists(done_file):
        logger.info("ngrams already generated and bucketed, skipping")
        return

    # Checkpoint
    checkpoint_file = os.path.join(working_directory, f"pile_offset.ckpt")
    if os.path.exists(checkpoint_file):
        checkpoint_offset = pickle.load(open(checkpoint_file, "rb"))
        iterate = True
    else:
        checkpoint_offset = 0
        iterate = False

    logger.info(f"Starting at pile document index {checkpoint_offset}")
    buckets = Buckets(output_directory, bucket_count)

    janitor = Janitor()
    batch_size = 1000
    batch_counter = 0

    with tqdm(total=checkpoint_offset, dynamic_ncols=True, unit="docs") as progress:
        for offset, document in yield_pile(start_offsets, checkpoint_offset):
            if iterate:
                logger.info(f"Iterating to offset {checkpoint_offset} from {offset}")
                progress.update(offset)
                iterate = False

            if offset < checkpoint_offset:
                progress.update()

                if terminate:
                    return
                continue

            if offset == checkpoint_offset:
                progress.reset(total=pile_document_count)
                progress.update(checkpoint_offset)

            # Save checkpoint every "batch_size", only allow terminate after checkpoint
            if batch_counter == batch_size:
                progress.update(batch_size)
                batch_counter = 0
                buckets.save_checkpoint()
                pickle.dump(offset, open(checkpoint_file, "wb"))
                if terminate:
                    buckets.close_buckets()
                    return

            ngrams = word_ngrams(janitor.normalize_string(document), n_value)
            for ngram in ngrams:
                buckets.add_data(ngram, f"{ngram} {offset}")

            batch_counter += 1

    buckets.close_buckets()
    Path(done_file).touch()


parser = argparse.ArgumentParser(description="Generate 13 grams from Pile.")
parser.add_argument("-dir", "--working_directory", default="")
parser.add_argument("-n", "--n_value", type=int, default=13)
parser.add_argument("-buckets", "--bucket_count", type=int, default=500)

if __name__ == "__main__":
    version = 1.00
    print(f"Running version {version}")

    if "PYTHONHASHSEED" not in os.environ or os.environ["PYTHONHASHSEED"] != "0":
        print("Please run 'export PYTHONHASHSEED=0' before running generate.")
        sys.exit()

    # Handle sigint (ctrl-c) cleanly
    previous_signal_int = signal.signal(SIGINT, handler)

    logfile_path = "ngrams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    do_ngrams_in_buckets(args.n_value, args.working_directory, args.bucket_count)

    info_dict = {"title": "dataset ngrams", "ngram_size": 13}
    info_dict_path = os.path.join(args.working_directory, "info.json")
    json.dump(info_dict, open(info_dict_path, "w"))
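A note on the `PYTHONHASHSEED` guard above: `Buckets.add_data` picks a file via `hash(key) % len(self.buckets)`, and Python randomizes string hashing per interpreter process unless `PYTHONHASHSEED` is fixed, so without `export PYTHONHASHSEED=0` the same 13-gram could land in different buckets across runs or worker processes and checkpoint resumption would silently scatter data. An illustrative check of the stabilized bucket assignment:

# Illustrative: with PYTHONHASHSEED exported (e.g. 0) before launching Python,
# hash() of the same string is stable across runs, so the bucket index
# computed as in Buckets.add_data is reproducible.
import os

assert os.environ.get("PYTHONHASHSEED") == "0", "export PYTHONHASHSEED=0 first"

num_buckets = 500
ngram = "example thirteen gram goes here one two three four five six seven eight"
print(hash(ngram) % num_buckets)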
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/investigate_pile.py
ADDED
@@ -0,0 +1,94 @@
from lm_eval.decontamination.archiver import Reader
import os
import json
from functools import reduce
import glob
import tqdm

from tqdm_multiprocess import TqdmMultiProcessPool


def get_file_stats(file_path, tqdm_func, global_tqdm):
    reader = Reader()
    total_documents = 0
    total_size = 0
    update_frequency = 10000
    current_file_position = 0

    with tqdm_func(
        total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1
    ) as progress:
        for document in reader.read(file_path, get_meta=True):
            total_size += len(document)
            total_documents += 1

            if total_documents % update_frequency == 0:
                new_file_pos = reader.fh.tell()
                bytes_read = new_file_pos - current_file_position
                current_file_position = new_file_pos
                progress.update(bytes_read)
                global_tqdm.update(bytes_read)

    return (total_documents, total_size)


def get_files():
    directory = "pile"
    files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*"))))
    print(files)
    return files


def get_stats():
    files = get_files()
    total_size_bytes = sum(map(lambda x: os.path.getsize(x), files))

    pool = TqdmMultiProcessPool(4)
    global_tqdm = tqdm.tqdm(
        total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1
    )

    # Generate minhashes with pool
    tasks = [(get_file_stats, (file,)) for file in files]

    def on_done(_):
        return None

    def on_error(_):
        return None

    results = pool.map(global_tqdm, tasks, on_error, on_done)

    total_documents, total_size = reduce(
        lambda x, y: (x[0] + y[0], x[1] + y[1]), results
    )

    start_offsets = []
    current_offset = 0
    for file_document_count, _ in results:
        start_offsets.append(current_offset)
        current_offset += file_document_count

    return (total_documents, total_size, start_offsets)


if __name__ == "__main__":
    version = 1.01
    print(f"Running version {version}")

    stats_file_path = "pile_statistics.json"
    if os.path.exists(stats_file_path):
        stats = json.load(open(stats_file_path, "r"))
    else:
        document_count, total_document_size_chars, start_offsets = get_stats()
        stats = {
            "Data": "Pile statistics",
            "Document Count": document_count,
            "Total Pile Characters": total_document_size_chars,
            "File Start Offsets": start_offsets,
        }
        json.dump(stats, open(stats_file_path, "w"), indent=4)

    print(f"document_count: {stats['Document Count']}")
    print(f"total_chars: {stats['Total Pile Characters']}")
    print(f"start_offsets: {stats['File Start Offsets']}")
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/janitor_util.cpp
ADDED
@@ -0,0 +1,208 @@
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <queue>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

bool is_whitespace(char ch) noexcept {
  // " \t\n\r\x0b\x0c" (python string.whitespace)
  return ch == 32 or (9 <= ch and ch <= 13);
  // return ch <= 32; // arguably too general, but slightly faster
}

bool is_punctuation(char c) noexcept {
  // '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' ascii values: 33-47, 58-64,
  // 91-96, 123-126
  return (33 <= c and c <= 47) or (58 <= c and c <= 64) or
         (91 <= c and c <= 96) or (123 <= c and c <= 126);
}

// Takes a string and makes ngrams of length N, splitting grams on whitespace
// and ignoring ignored characters. Returns a LARGE array of ngrams
std::vector<std::string> clean_ngram(std::string const &input,
                                     std::string const &ignore,
                                     size_t ngram_n) noexcept {

  size_t num_grams = 0;
  std::vector<std::string> ngram_list;
  std::vector<uint8_t> gram_lengths;
  std::string current_ngram;

  // Max gram length is set to 10 below.
  current_ngram.reserve(11 * ngram_n);
  gram_lengths.reserve(ngram_n);

  bool started_gram = false;
  gram_lengths.push_back(0);

  // for (size_t i=0; i<input.length(); i++) {
  // this is slightly faster, and we don't need the index in this one
  for (auto iter = input.begin(); iter != input.end(); iter++) {

    // If whitespace, end the current ngram and start the next
    // alternatively, (perhaps marginally) faster: if (is_whitespace(ch)) { ...
    // }
    if (is_whitespace(*iter) || gram_lengths.back() > 10) {

      // Skip all whitespace
      while (++iter != input.end() && is_whitespace(*iter))
        ;
      iter--;

      if (started_gram) {
        num_grams += 1;

        // Building 1grams is a special case
        if (ngram_n == 1) {
          ngram_list.push_back(current_ngram);
          current_ngram = current_ngram.substr(gram_lengths.front());
          gram_lengths.back() = 0;

          // If there are enough grams to form an ngram, save
        } else if (num_grams >= ngram_n) {
          // Save the current ngram
          ngram_list.push_back(current_ngram);

          // Start the next ngram by dropping the first gram and its space from
          // the ngram
          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
          current_ngram += ' ';

          // Drop the length of the first gram and prepare to record the length
          // of the new gram
          gram_lengths.erase(gram_lengths.begin());
          gram_lengths.push_back(0);

          // Otherwise, continue building
        } else {
          current_ngram += ' ';
          gram_lengths.push_back(0);
        }

        started_gram = false;
      }

      // Skip ignored characters
      // alternatively, (perhaps marginally) faster: if (is_punctuation(ch))
      // continue;
    } else if (ignore.find(*iter) != std::string::npos) {
      continue;
    }

    // If it is a non-ignored character, add it to the ngram and update the last
    // gram's length
    else {
      current_ngram += tolower(*iter);
      gram_lengths.back() += 1;
      started_gram = true;
    }
  }

  return ngram_list;
}

// Takes a string and makes ngrams of length N, splitting grams on whitespace
// and ignoring ignored characters. Returns a LARGE array of tuples of (ngram,
// start_idx, end_idx)
std::vector<std::tuple<std::string, size_t, size_t>>
clean_ngram_with_indices(std::string const &input, std::string const &ignore,
                         size_t ngram_n) noexcept {

  size_t num_grams = 0;
  std::vector<std::tuple<std::string, size_t, size_t>> ngram_list;
  std::vector<uint8_t> gram_lengths;
  std::vector<size_t> gram_start_indices;
  std::string current_ngram;

  // Max gram length is set to 10 below.
  current_ngram.reserve(11 * ngram_n);

  bool started_gram = false;
  gram_lengths.push_back(0);
  gram_start_indices.push_back(0);

  for (size_t i = 0; i < input.length(); i++) {
    char ch = input[i];

    // If whitespace, end the current ngram and start the next
    if (is_whitespace(ch) || gram_lengths.back() > 10) {

      // Skip all whitespace
      while (++i < input.length() && is_whitespace(input[i]))
        ;
      i--;

      if (started_gram) {
        num_grams += 1;

        // Building 1grams is a special case
        if (ngram_n == 1) {
          ngram_list.push_back(
              std::make_tuple(current_ngram, gram_start_indices.front(), i));
          current_ngram = current_ngram.substr(gram_lengths.front());
          gram_lengths.back() = 0;
          gram_start_indices.back() = i + 1;

          // If there are enough grams to form an ngram, save
        } else if (num_grams >= ngram_n) {

          // Save the current ngram
          ngram_list.push_back(
              std::make_tuple(current_ngram, gram_start_indices.front(), i));

          // Start the next ngram by dropping the first gram and its space from
          // the ngram
          current_ngram = current_ngram.substr(gram_lengths.front() + 1);
          current_ngram += ' ';

          // Drop the length of the first gram and prepare to record the length
          // of the new gram
          gram_lengths.erase(gram_lengths.begin());
          gram_lengths.push_back(0);

          gram_start_indices.erase(gram_start_indices.begin());
          gram_start_indices.push_back(i + 1);

          // Otherwise, continue building
        } else {
          current_ngram += ' ';
          gram_lengths.push_back(0);
          gram_start_indices.push_back(i + 1);
        }

        started_gram = false;
      }

      // Skip ignored characters
    } else if (ignore.find(ch) != std::string::npos) {
      continue;

      // If it is a non-ignored character, add it to the ngram and update the
      // last gram's length
    } else {
      current_ngram += tolower(ch);
      gram_lengths.back() += 1;
      started_gram = true;
    }
  }

  return ngram_list;
}

PYBIND11_MODULE(janitor_util, m) {
  m.doc() = "pybind11 example plugin"; // optional module docstring
  // m.def("add", &add, "A function which adds two numbers"); // example
  // function
  m.def("clean_ngram", &clean_ngram,
        "Create ngrams of words, ignoring some characters");
  m.def("clean_ngram_with_indices", &clean_ngram_with_indices,
        "Create ngrams of words with indices, ignoring some characters");
}

// Example compile
// c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes)
// janitor_util.cpp -o janitor_util$(python3-config --extension-suffix)
// If python and gcc aren't linked, append to the above: -undefined
// dynamic_lookup
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/process_sorted_buckets.py
ADDED
@@ -0,0 +1,131 @@
"""
Processes each sorted bucket, creating a new file listing all ngrams that matched more than 10
unique documents with their unique document counts. Uses multiprocessing and very little memory
as we stream from presorted buckets. Will use a lot of disk though.

Arguments
---------
--working_directory (-dir)
    Directory containing the sorted buckets, processed files will be deposited here. Default: current directory
--move_dir (-move)
    Directory to move processed 13grams to. Default: Do nothing
--process_count (-procs)
    Number of processes to use. Default: 4
"""

import argparse
import glob
import os
from pathlib import Path
import re
import shutil

from tqdm import tqdm
from tqdm_multiprocess import TqdmMultiProcessPool

from scripts.clean_training_data.archiver import TextReader, TextArchive

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)


# Multiprocessed
def process_bucket(
    bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm
):

    bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path))  # noqa: W605
    done_file = os.path.join(
        processed_directory, f"ngram_bucket_processing_{bucket_id}.done"
    )
    if os.path.exists(done_file):
        logger.info(f"bucket {bucket_id} already processed, skipping")
        return

    # For managing tqdm
    file_size = os.path.getsize(bucket_file_path)
    bucket_progress = tqdm_func(
        total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1
    )
    current_file_position = 0
    update_frequency = 100 * 1000000  # 100mb
    update_counter = 0

    # Iterate through and output ngrams which occur in more than 10 documents
    bucket = TextReader(bucket_file_path)

    output_file_path = bucket_file_path + ".processed"
    output_archive = TextArchive(output_file_path, mode="wb")

    current_ngram = ""
    current_ngram_document_ids = set()
    for line in bucket.read():
        [ngram, document_id] = line.rsplit(" ", 1)

        # Write ngram if more than 10 unique document occurrences
        if ngram != current_ngram:
            if len(current_ngram_document_ids) > 10:
                output_archive.add_data(
                    f"{current_ngram} {len(current_ngram_document_ids)}"
                )
            current_ngram = ngram
            current_ngram_document_ids = set()

        current_ngram_document_ids.add(document_id)

        # Update tqdm
        update_counter += bucket.fh.tell() - current_file_position
        current_file_position = bucket.fh.tell()
        if update_counter > update_frequency:
            bucket_progress.update(update_counter)
            update_counter = 0

    # Remainder
    if len(current_ngram_document_ids) > 10:
        output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}")

    output_archive.commit()
    Path(done_file).touch()

    if move_dir:
        shutil.move(output_file_path, move_dir)

    global_tqdm.update()


def process_sorted_buckets(working_directory, move_dir, process_count):
    bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted"))
    processed_directory = os.path.join(working_directory, "processed")
    os.makedirs(processed_directory, exist_ok=True)

    pool = TqdmMultiProcessPool(process_count)
    tasks = [
        (process_bucket, (bucket_file, processed_directory, move_dir))
        for bucket_file in bucket_file_paths
    ]

    global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket")

    def on_done(_):
        return None

    def on_error(_):
        return None

    _ = pool.map(global_tqdm, tasks, on_error, on_done)


parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.")
parser.add_argument("-dir", "--working_directory", default="")
parser.add_argument("-move", "--move_dir", default="")
parser.add_argument("-procs", "--process_count", type=int, default=4)

if __name__ == "__main__":

    logfile_path = "process13grams.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    process_sorted_buckets(args.working_directory, args.move_dir, args.process_count)
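To make the bucket format concrete: `generate_13_grams.py` writes one `<ngram> <document_offset>` line per occurrence, the sort stage groups identical ngrams together, and the loop above then counts distinct document offsets per ngram and keeps those seen in more than 10 documents. A tiny illustration of that reduction on a few already-sorted lines:

# Illustrative reduction over a few pre-sorted "<ngram> <document_id>" lines,
# mirroring the counting logic in process_bucket (the threshold of 10 is
# lowered to 2 here just so the toy example produces output).
sorted_lines = [
    "a b c d e f g h i j k l m 17",
    "a b c d e f g h i j k l m 42",
    "a b c d e f g h i j k l m 42",
    "a b c d e f g h i j k l m 99",
    "x y z q w e r t y u i o p 7",
]
threshold = 2
current_ngram, doc_ids = None, set()
for line in sorted_lines + [None]:  # None acts as an end-of-stream sentinel
    ngram, doc_id = line.rsplit(" ", 1) if line else (None, None)
    if ngram != current_ngram:
        if current_ngram is not None and len(doc_ids) > threshold:
            print(f"{current_ngram} {len(doc_ids)}")
        current_ngram, doc_ids = ngram, set()
    if doc_id is not None:
        doc_ids.add(doc_id)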
scripts/yans/eval/lm-evaluation-harness/scripts/clean_training_data/sort_13_gram_buckets.py
ADDED
@@ -0,0 +1,63 @@
"""
Iteratively runs gnu sort on each bucket, uses up to 8 cores.

Arguments
---------
--working_directory (-dir)
    Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same
    directory and the unsorted buckets are removed after.
"""

import glob
import argparse
import os
import signal
from signal import SIGINT
import subprocess

from tqdm import tqdm

import logging
from tqdm_multiprocess.logger import setup_logger_tqdm

logger = logging.getLogger(__name__)

terminate = False


def handler(signal_received, frame):
    global terminate
    terminate = True


def sort_13_gram_buckets(working_directory):
    bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt"))

    for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True):
        sorted_file_path = bucket_file_path + ".sorted"
        command = f"sort {bucket_file_path} > {sorted_file_path}"
        logger.info(command)
        subprocess.call(command, shell=True)

        if terminate:
            return

        os.remove(bucket_file_path)


parser = argparse.ArgumentParser(description="sort 13gram buckets")
parser.add_argument("-dir", "--working_directory", default="")

if __name__ == "__main__":

    version = 1.00
    print(f"Running version {version}")

    # Handle sigint (ctrl-c) cleanly
    previous_signal_int = signal.signal(SIGINT, handler)

    logfile_path = "sort13grambuckets.log"
    setup_logger_tqdm(logfile_path)

    args = parser.parse_args()
    sort_13_gram_buckets(args.working_directory)
scripts/yans/eval/lm-evaluation-harness/scripts/cost_estimate.py
ADDED
@@ -0,0 +1,98 @@
import random
import transformers
from lm_eval import tasks, evaluator
from lm_eval.base import LM


class DryrunLM(LM):
    def __init__(self):
        self.tokencost = 0
        self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2")
        self.tokenizer.pad_token = "<|endoftext|>"

    @classmethod
    def create_from_arg_string(cls, arg_string):
        return cls()

    def loglikelihood(self, requests):
        res = []

        for ctx, cont in requests:
            res.append((-random.random(), False))
            self.tokencost += len(self.tokenizer.tokenize(ctx + cont))

        return res

    def greedy_until(self, requests):
        res = []

        for ctx, until in requests:
            res.append("lol")

            # assume worst case - generates until 256
            self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256

        return res

    def loglikelihood_rolling(self, requests):
        res = []

        for (s,) in requests:
            # assume worst case: extra full context
            self.tokencost += len(self.tokenizer.tokenize(s)) + 2048

        return res


def main():
    lm = DryrunLM()

    task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc"
    values = []
    for taskname in task_list.split(","):
        lm.tokencost = 0
        evaluator.evaluate(
            lm=lm,
            task_dict={taskname: tasks.get_task(taskname)()},
            num_fewshot=0,
            limit=None,
            bootstrap_iters=10,
            description_dict=None,
        )

        print(taskname, lm.tokencost)
        values.append(
            [
                taskname,
                lm.tokencost,
                lm.tokencost / 1000 * 0.0008,
                lm.tokencost / 1000 * 0.0012,
                lm.tokencost / 1000 * 0.006,
                lm.tokencost / 1000 * 0.06,
            ]
        )
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append(
        [
            "**Total**",
            totcost,
            totcost / 1000 * 0.0008,
            totcost / 1000 * 0.0012,
            totcost / 1000 * 0.006,
            totcost / 1000 * 0.06,
        ]
    )

    writer.value_matrix = values

    print(writer.dumps())


if __name__ == "__main__":
    main()
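The Ada/Babbage/Curie/Davinci columns produced above are simply `tokencost / 1000 * rate`, with the hard-coded rates 0.0008, 0.0012, 0.006 and 0.06 (USD per 1,000 tokens, matching OpenAI's old per-model pricing). A quick worked example of that arithmetic:

# Worked example of the cost formula used in cost_estimate.py.
tokens = 2_500_000  # hypothetical token count for one task
rates = {"Ada": 0.0008, "Babbage": 0.0012, "Curie": 0.006, "Davinci": 0.06}
for model, rate in rates.items():
    # e.g. Davinci: 2,500,000 / 1000 * 0.06 = 150.00
    print(f"{model}: ${tokens / 1000 * rate:.2f}")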
scripts/yans/eval/lm-evaluation-harness/scripts/harness_example.py
ADDED
@@ -0,0 +1,38 @@
#!/usr/bin/env python
# This script runs eval in the cluster. Use it as a basis for your own harnesses.
from run_eval import build_executor, run_job
from run_eval import JAEVAL8_TASKS, JAEVAL8_FEWSHOT
from main_eval import main as main_eval


def build_task_list(tasks, prompt):
    out = []
    # Some tasks don't have a prompt version
    promptless = ["xwinograd_ja"]
    for task in tasks:
        if task not in promptless:
            out.append(f"{task}-{prompt}")
        else:
            out.append(task)
    return out


def main():
    executor = build_executor("eval", gpus_per_task=8, cpus_per_gpu=12)

    tasks = build_task_list(JAEVAL8_TASKS, "0.3")
    eval_args = {
        "tasks": tasks,
        "num_fewshot": JAEVAL8_FEWSHOT,
        "model": "hf-causal",
        "model_args": "pretrained=rinna/japanese-gpt-1b,use_fast=False",
        "device": "cuda",
        "limit": 100,
        "verbose": True,
    }

    run_job(executor, main_eval, eval_args=eval_args, output_path="./check.json")


if __name__ == "__main__":
    main()
scripts/yans/eval/lm-evaluation-harness/scripts/main_eval.py
ADDED
@@ -0,0 +1,127 @@
import os
import argparse
import json
import logging
import fnmatch

from lm_eval import tasks, evaluator

logging.getLogger("openai").setLevel(logging.WARNING)


class MultiChoice:
    def __init__(self, choices):
        self.choices = choices

    # Simple wildcard support (linux filename patterns)
    def __contains__(self, values):
        for value in values.split(","):
            if len(fnmatch.filter(self.choices, value)) == 0:
                return False

        return True

    def __iter__(self):
        for choice in self.choices:
            yield choice


def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
    parser.add_argument("--tasks", default=None, choices=MultiChoice(tasks.ALL_TASKS))
    parser.add_argument("--num_fewshot", type=str, default="0")
    parser.add_argument("--batch_size", type=int, default=None)
    parser.add_argument("--device", type=str, default=None)
    parser.add_argument("--output_path", default=None)
    parser.add_argument("--limit", type=str, default=None)
    parser.add_argument("--no_cache", action="store_true")
    parser.add_argument("--decontamination_ngrams_path", default=None)
    parser.add_argument("--description_dict_path", default=None)
    parser.add_argument("--check_integrity", action="store_true")
    parser.add_argument("--verbose", action="store_true")
    # TODO This is deprecated and throws an error, remove it
    parser.add_argument("--provide_description", action="store_true")

    return parser.parse_args()


def clean_args(args) -> dict:
    """Handle conversion to lists etc. for args"""

    assert not args.provide_description, "provide-description is not implemented"

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks is None:
        args.tasks = tasks.ALL_TASKS
    else:
        args.tasks = pattern_match(args.tasks.split(","), tasks.ALL_TASKS)

    print(f"Selected Tasks: {args.tasks}")
    if args.num_fewshot is not None:
        args.num_fewshot = [int(n) for n in args.num_fewshot.split(",")]

    if args.limit is not None:
        args.limit = [
            int(n) if n.isdigit() else float(n) for n in args.limit.split(",")
        ]

    return vars(args)


# Returns a list containing all values of the source_list that
# match at least one of the patterns
def pattern_match(patterns, source_list):
    task_names = []
    for pattern in patterns:
        for matching in fnmatch.filter(source_list, pattern):
            task_names.append(matching)
    return task_names


def main(eval_args: dict, description_dict_path: str = None, output_path: str = None):
    """Run evaluation and optionally save output.

    For a description of eval args, see `simple_evaluate`.
    """
    if description_dict_path:
        with open(description_dict_path, "r") as f:
            eval_args["description_dict"] = json.load(f)

    results = evaluator.simple_evaluate(**eval_args)

    dumped = json.dumps(results, indent=2, ensure_ascii=False)
    print(dumped)

    if output_path:
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        with open(output_path, "w") as f:
            f.write(dumped)

    return results


if __name__ == "__main__":
    args = parse_args()
    args = clean_args(args)

    # This is not used
    args.pop("provide_description", None)
    # treat non-eval args separately
    description_dict_path = args.get("description_dict_path", None)
    args.pop("description_dict_path", None)
    output_path = args.get("output_path", None)
    args.pop("output_path", None)

    results = main(args, description_dict_path, output_path)

    print(
        f"{args['model']} ({args['model_args']}), limit: {args['limit']}, "
        f"num_fewshot: {args['num_fewshot']}, batch_size: {args['batch_size']}"
    )
    print(evaluator.make_table(results))
scripts/yans/eval/lm-evaluation-harness/scripts/make_table_tasks.py
ADDED
@@ -0,0 +1,52 @@
"""
Usage:
    python make_table_tasks.py --output <markdown_filename>
"""
import argparse
import logging
from lm_eval import tasks
from pytablewriter import MarkdownTableWriter


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def check(tf):
    if tf:
        return "✓"
    else:
        return " "


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--output", type=str, default="task_table.md")
    args = parser.parse_args()

    writer = MarkdownTableWriter()
    writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
    values = []

    tasks = tasks.TASK_REGISTRY.items()
    tasks = sorted(tasks, key=lambda x: x[0])
    for tname, Task in tasks:
        task = Task()
        v = [
            tname,
            check(task.has_training_docs()),
            check(task.has_validation_docs()),
            check(task.has_test_docs()),
            len(
                list(
                    task.test_docs() if task.has_test_docs() else task.validation_docs()
                )
            ),
            ", ".join(task.aggregation().keys()),
        ]
        logger.info(v)
        values.append(v)
    writer.value_matrix = values
    table = writer.dumps()
    with open(args.output, "w") as f:
        f.write(table)
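The script above leans on `pytablewriter`'s `MarkdownTableWriter`, using only the `headers`, `value_matrix`, and `dumps()` members seen in the code. A minimal sketch with placeholder rows (the task names, counts, and metrics are illustrative, not taken from the real task registry):

```python
from pytablewriter import MarkdownTableWriter

writer = MarkdownTableWriter()
writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"]
# Placeholder rows standing in for entries built from lm_eval's TASK_REGISTRY.
writer.value_matrix = [
    ["example_task_a", "✓", "✓", " ", 1000, "acc, acc_norm"],
    ["example_task_b", " ", " ", "✓", 500, "ppl"],
]
print(writer.dumps())  # returns the Markdown table as a string
```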
scripts/yans/eval/lm-evaluation-harness/scripts/merge_json.py
ADDED
@@ -0,0 +1,48 @@
from glob import glob
from pathlib import Path
import json
import sys

# Find task-specific json files and add them to result json files.

# task name, ex: xwinograd_ja
try:
    task = sys.argv[1]
except IndexError:
    print("Give task name as first argument, like: xwinograd_ja")
    sys.exit(1)

# task-specific result files have names like result.TASK.json
task_results = glob(f"models/**/result.{task}.json", recursive=True)

# given a task-specific file, the result.json file always exists
for tres in task_results:
    tres = Path(tres)
    res = tres.parent / "result.json"

    with open(res) as resfile:
        res_data = json.loads(resfile.read())

    with open(tres) as resfile:
        tres_data = json.loads(resfile.read())

    if task in res_data["results"]:
        # Ideally we would overwrite these, but it can be tricky to get the few
        # shot order correct, so adding that later.
        # TODO overwrite
        print(f"Not updating {tres.parent.name} because results already present")
        continue

    # update the relevant keys
    for key in ("results", "versions"):
        res_data[key][task] = tres_data[key][task]

    # because the result is new, fewshot goes at the end
    # for a single task, fewshot is a scalar and not an array
    # XXX is the type change a bug?
    tres_fewshot = tres_data["config"]["num_fewshot"]
    res_data["config"]["num_fewshot"].append(tres_fewshot)

    with open(res, "w") as resfile:
        out = json.dumps(res_data, indent=2)
        resfile.write(out)
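The merge logic above assumes a particular shape for `result.json` and the per-task `result.TASK.json` files. A sketch of that assumed structure, with illustrative values only:

```python
# Merged result.json, as merge_json.py expects to find it.
res_data = {
    "results": {"some_task": {"acc": 0.5}},   # per-task metrics
    "versions": {"some_task": 1},             # per-task version numbers
    "config": {"num_fewshot": [0]},           # a list in the merged file
}

# Per-task result.xwinograd_ja.json produced by a single run.
tres_data = {
    "results": {"xwinograd_ja": {"acc": 0.7}},
    "versions": {"xwinograd_ja": 1},
    "config": {"num_fewshot": 2},             # a scalar in the per-task file
}

# The script copies the task's "results" and "versions" entries across and
# appends the scalar num_fewshot onto the merged list.
for key in ("results", "versions"):
    res_data[key]["xwinograd_ja"] = tres_data[key]["xwinograd_ja"]
res_data["config"]["num_fewshot"].append(tres_data["config"]["num_fewshot"])
```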
scripts/yans/eval/lm-evaluation-harness/scripts/run_task_for_models.sh
ADDED
@@ -0,0 +1,28 @@
#!/bin/bash
# Given a task, run it on all relevant models and update their results.
# See run_task_batch.sh for project, job name, and other batch settings.
set -eou pipefail

task=$1
fewshot=$2
# cd to script dir and then go up one so all paths work
cd $(dirname -- "$0")/..

# models.txt is a space-separated file. To generate it, use the code in this
# function from the project root, then edit the file to remove models you don't
# want to use (like community).
function generate_models_txt() {
    find models/ -name harness.sh \
        | xargs grep MODEL_ARGS= \
        | sed -e 's:^models/::' -e 's:/harness.sh.MODEL_ARGS=: :' -e 's:"::g' \
        > scripts/models.txt
}

cat scripts/models.txt | while read model_path args; do
    # The echo is just for debugging
    echo sbatch scripts/run_task_batch.sh $task $fewshot $args $model_path
    sbatch scripts/run_task_batch.sh $task $fewshot $args $model_path
done

# after the batches have finished, use the following command to update results.json
# python scripts/merge_json.py
scripts/yans/lm-evaluation-harness/.coveragerc
ADDED
@@ -0,0 +1,28 @@
[run]

# tasks that aren't wired up.
omit =
    lm_eval/tasks/quac.py
    lm_eval/tasks/storycloze.py
    lm_eval/tasks/cbt.py
    lm_eval/tasks/sat.py
    lm_eval/tasks/triviaqa.py
    lm_eval/tasks/naturalqs.py
    lm_eval/models/dummy.py

[report]
exclude_lines =
    # Skip any pass lines such as may be used for @abstractmethod
    pass

    # Have to re-enable the standard pragma
    pragma: no cover

    # Don't complain about missing debug-only code:
    def __repr__
    if self\.debug

    # Don't complain if tests don't hit defensive assertion code:
    raise AssertionError
    raise NotImplementedError
    return NotImplemented
scripts/yans/lm-evaluation-harness/.flake8
ADDED
@@ -0,0 +1,5 @@
[flake8]
ignore = E203, E266, E501, W503, F403, F401, C901
max-line-length = 127
max-complexity = 10
select = B,C,E,F,W,T4,B9
scripts/yans/lm-evaluation-harness/.gitignore
ADDED
@@ -0,0 +1,24 @@
env
*.pyc
output/
data/
lm_cache
.idea
build
dist
*.egg-info
venv
.vscode/
temp
__pycache__
.ipynb_checkpoints
temp
test_logs/
# IPython
profile_default/
ipython_config.py
# don't track (the default location of) the cached requests
lm_eval/caching/.cache
# don't track files created by wandb
wandb
examples/wandb
scripts/yans/lm-evaluation-harness/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,54 @@
# Ignore test linting to avoid conflicting changes to version stability.
exclude: ^tests/testdata/
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.5.0
    hooks:
      - id: check-added-large-files
      - id: check-ast
      - id: check-byte-order-marker
      - id: check-case-conflict
      - id: check-json
      - id: check-merge-conflict
        args: [--assume-in-merge]
      - id: check-symlinks
      - id: check-yaml
        args: ["--unsafe"]
      - id: destroyed-symlinks
      - id: detect-private-key
      - id: end-of-file-fixer
      - id: no-commit-to-branch
        always_run: false
      - id: requirements-txt-fixer
      - id: trailing-whitespace
        args: [--markdown-linebreak-ext=md]
      - id: fix-byte-order-marker
        exclude: docs/CNAME
      - id: fix-encoding-pragma
        args: [--remove]
      - id: mixed-line-ending
        args: [--fix=lf]
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.4.8
    hooks:
      # Run the linter.
      - id: ruff
        args:
          - --fix
      # Run the formatter.
      - id: ruff-format
  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
        exclude: >
          (?x)^(
              .*\.json|ignore.txt|lm_eval/tasks/.*|.*yaml|.*\.ipynb
          )$
        args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt]
  # - repo: https://github.com/pre-commit/mirrors-mypy
  #   rev: v1.5.1
  #   hooks:
  #     - id: mypy
  #       additional_dependencies: [".[sentencepiece,multilingual,promptsource,gptq]", "types-PyYAML", "types-requests"]
  #       exclude: ^tests/.*$
scripts/yans/lm-evaluation-harness/CITATION.bib
ADDED
@@ -0,0 +1,10 @@
@misc{eval-harness,
  author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title = {A framework for few-shot language model evaluation},
  month = 12,
  year = 2023,
  publisher = {Zenodo},
  version = {v0.4.0},
  doi = {10.5281/zenodo.10256836},
  url = {https://zenodo.org/records/10256836}
}
scripts/yans/lm-evaluation-harness/CODEOWNERS
ADDED
@@ -0,0 +1 @@
* @haileyschoelkopf @lintangsutawika
scripts/yans/lm-evaluation-harness/LICENSE.md
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 EleutherAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
scripts/yans/lm-evaluation-harness/README.md
ADDED
@@ -0,0 +1,497 @@
# Language Model Evaluation Harness

[](https://doi.org/10.5281/zenodo.10256836)

---

*Latest News 📣*

- [2024/07] [API model](docs/API_guide.md) support has been updated and refactored, introducing support for batched and async requests, and making it significantly easier to customize and use for your own purposes. **To run Llama 405B, we recommend using VLLM's OpenAI-compliant API to host the model, and use the `local-completions` model type to evaluate the model.**
- [2024/07] New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.

---

## Announcement
**A new v0.4.0 release of lm-evaluation-harness is available!**

New updates and features include:

- **New Open LLM Leaderboard tasks have been added! You can find them under the [leaderboard](lm_eval/tasks/leaderboard/README.md) task group.**
- Internal refactoring
- Config-based task creation and configuration
- Easier import and sharing of externally-defined task config YAMLs
- Support for Jinja2 prompt design, easy modification of prompts + prompt imports from Promptsource
- More advanced configuration options, including output post-processing, answer extraction, and multiple LM generations per document, configurable fewshot settings, and more
- Speedups and new modeling libraries supported, including: faster data-parallel HF model usage, vLLM support, MPS support with HuggingFace, and more
- Logging and usability changes
- New tasks including CoT BIG-Bench-Hard, Belebele, user-defined task groupings, and more

Please see our updated documentation pages in `docs/` for more details.

Development will be continuing on the `main` branch, and we encourage you to give us feedback on what features are desired and how to improve the library further, or ask questions, either in issues or PRs on GitHub, or in the [EleutherAI discord](https://discord.gg/eleutherai)!

---

## Overview

This project provides a unified framework to test generative language models on a large number of different evaluation tasks.

**Features:**
- Over 60 standard academic benchmarks for LLMs, with hundreds of subtasks and variants implemented.
- Support for models loaded via [transformers](https://github.com/huggingface/transformers/) (including quantization via [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ)), [GPT-NeoX](https://github.com/EleutherAI/gpt-neox), and [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/), with a flexible tokenization-agnostic interface.
- Support for fast and memory-efficient inference with [vLLM](https://github.com/vllm-project/vllm).
- Support for commercial APIs including [OpenAI](https://openai.com) and [TextSynth](https://textsynth.com/).
- Support for evaluation on adapters (e.g. LoRA) supported in [HuggingFace's PEFT library](https://github.com/huggingface/peft).
- Support for local models and benchmarks.
- Evaluation with publicly available prompts ensures reproducibility and comparability between papers.
- Easy support for custom prompts and evaluation metrics.

The Language Model Evaluation Harness is the backend for 🤗 Hugging Face's popular [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), has been used in [hundreds of papers](https://scholar.google.com/scholar?oi=bibs&hl=en&authuser=2&cites=15052937328817631261,4097184744846514103,1520777361382155671,17476825572045927382,18443729326628441434,14801318227356878622,7890865700763267262,12854182577605049984,15641002901115500560,5104500764547628290), and is used internally by dozens of organizations including NVIDIA, Cohere, BigScience, BigCode, Nous Research, and Mosaic ML.

## Install

To install the `lm-eval` package from the github repository, run:

```bash
git clone https://github.com/EleutherAI/lm-evaluation-harness
cd lm-evaluation-harness
pip install -e .
```

We also provide a number of optional dependencies for extended functionality. A detailed table is available at the end of this document.

## Basic Usage
### User Guide

A user guide detailing the full list of supported arguments is provided [here](./docs/interface.md), and on the terminal by calling `lm_eval -h`. Alternatively, you can use `lm-eval` instead of `lm_eval`.

A list of supported tasks (or groupings of tasks) can be viewed with `lm-eval --tasks list`. Task descriptions and links to corresponding subfolders are provided [here](./lm_eval/tasks/README.md).

### Hugging Face `transformers`

To evaluate a model hosted on the [HuggingFace Hub](https://huggingface.co/models) (e.g. GPT-J-6B) on `hellaswag` you can use the following command (this assumes you are using a CUDA-compatible GPU):

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks hellaswag \
    --device cuda:0 \
    --batch_size 8
```

Additional arguments can be provided to the model constructor using the `--model_args` flag. Most notably, this supports the common practice of using the `revisions` feature on the Hub to store partially trained checkpoints, or to specify the datatype for running a model:

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
    --device cuda:0 \
    --batch_size 8
```

Models that are loaded via both `transformers.AutoModelForCausalLM` (autoregressive, decoder-only GPT style models) and `transformers.AutoModelForSeq2SeqLM` (such as encoder-decoder models like T5) in Huggingface are supported.

Batch size selection can be automated by setting the `--batch_size` flag to `auto`. This will perform automatic detection of the largest batch size that will fit on your device. On tasks where there is a large difference between the longest and shortest example, it can be helpful to periodically recompute the largest batch size, to gain a further speedup. To do this, append `:N` to the above flag to automatically recompute the largest batch size `N` times. For example, to recompute the batch size 4 times, the command would be:

```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/pythia-160m,revision=step100000,dtype="float" \
    --tasks lambada_openai,hellaswag \
    --device cuda:0 \
    --batch_size auto:4
```

> [!Note]
> Just like you can provide a local path to `transformers.AutoModel`, you can also provide a local path to `lm_eval` via `--model_args pretrained=/path/to/model`

#### Multi-GPU Evaluation with Hugging Face `accelerate`

We support three main ways of using Hugging Face's [accelerate 🚀](https://github.com/huggingface/accelerate) library for multi-GPU evaluation.

To perform *data-parallel evaluation* (where each GPU loads a **separate full copy** of the model), we leverage the `accelerate` launcher as follows:

```
accelerate launch -m lm_eval --model hf \
    --tasks lambada_openai,arc_easy \
    --batch_size 16
```
(or via `accelerate launch --no-python lm_eval`).

For cases where your model can fit on a single GPU, this allows you to evaluate on K GPUs K times faster than on one.

**WARNING**: This setup does not work with FSDP model sharding, so in `accelerate config` FSDP must be disabled, or the NO_SHARD FSDP option must be used.

The second way of using `accelerate` for multi-GPU evaluation is when your model is *too large to fit on a single GPU.*

In this setting, run the library *outside the `accelerate` launcher*, but passing `parallelize=True` to `--model_args` as follows:

```
lm_eval --model hf \
    --tasks lambada_openai,arc_easy \
    --model_args parallelize=True \
    --batch_size 16
```

This means that your model's weights will be split across all available GPUs.

For more advanced users or even larger models, we allow for the following arguments when `parallelize=True` as well:
- `device_map_option`: How to split model weights across available GPUs. Defaults to "auto".
- `max_memory_per_gpu`: the max GPU memory to use per GPU in loading the model.
- `max_cpu_memory`: the max amount of CPU memory to use when offloading the model weights to RAM.
- `offload_folder`: a folder where model weights will be offloaded to disk if needed.

The third option is to use both at the same time. This will allow you to take advantage of both data parallelism and model sharding, and is especially useful for models that are too large to fit on a single GPU.

```
accelerate launch --multi_gpu --num_processes {nb_of_copies_of_your_model} \
    -m lm_eval --model hf \
    --tasks lambada_openai,arc_easy \
    --model_args parallelize=True \
    --batch_size 16
```

To learn more about model parallelism and how to use it with the `accelerate` library, see the [accelerate documentation](https://huggingface.co/docs/transformers/v4.15.0/en/parallelism).

**Warning: We do not natively support multi-node evaluation using the `hf` model type! Please reference [our GPT-NeoX library integration](https://github.com/EleutherAI/gpt-neox/blob/main/eval.py) for an example of code in which a custom multi-machine evaluation script is written.**

**Note: we do not currently support multi-node evaluations natively, and advise using either an externally hosted server to run inference requests against, or creating a custom integration with your distributed framework [as is done for the GPT-NeoX library](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py).**

### NVIDIA `nemo` models

[NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo) is a generative AI framework built for researchers and pytorch developers working on language models.

To evaluate a `nemo` model, start by installing NeMo following [the documentation](https://github.com/NVIDIA/NeMo?tab=readme-ov-file#installation). We highly recommend using the NVIDIA PyTorch or NeMo container, especially if you have issues installing Apex or any other dependencies (see [latest released containers](https://github.com/NVIDIA/NeMo/releases)). Please also install the lm evaluation harness library following the instructions in [the Install section](https://github.com/EleutherAI/lm-evaluation-harness/tree/main?tab=readme-ov-file#install).

NeMo models can be obtained through the [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/models) or on [NVIDIA's Hugging Face page](https://huggingface.co/nvidia). In the [NVIDIA NeMo Framework](https://github.com/NVIDIA/NeMo/tree/main/scripts/nlp_language_modeling) there are conversion scripts to convert the `hf` checkpoints of popular models like llama, falcon, mixtral or mpt to `nemo`.

Run a `nemo` model on one GPU:
```bash
lm_eval --model nemo_lm \
    --model_args path=<path_to_nemo_model> \
    --tasks hellaswag \
    --batch_size 32
```

It is recommended to unpack the `nemo` model to avoid the unpacking inside the docker container - it may overflow disk space. For that you can run:

```
mkdir MY_MODEL
tar -xvf MY_MODEL.nemo -c MY_MODEL
```

#### Multi-GPU evaluation with NVIDIA `nemo` models

By default, only one GPU is used. But we do support either data replication or tensor/pipeline parallelism during evaluation, on one node.

1) To enable data replication, set the `model_args` of `devices` to the number of data replicas to run. For example, the command to run 8 data replicas over 8 GPUs is:
```bash
torchrun --nproc-per-node=8 --no-python lm_eval \
    --model nemo_lm \
    --model_args path=<path_to_nemo_model>,devices=8 \
    --tasks hellaswag \
    --batch_size 32
```

2) To enable tensor and/or pipeline parallelism, set the `model_args` of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. In addition, you also have to set up `devices` to be equal to the product of `tensor_model_parallel_size` and/or `pipeline_model_parallel_size`. For example, the command to use one node of 4 GPUs with tensor parallelism of 2 and pipeline parallelism of 2 is:
```bash
torchrun --nproc-per-node=4 --no-python lm_eval \
    --model nemo_lm \
    --model_args path=<path_to_nemo_model>,devices=4,tensor_model_parallel_size=2,pipeline_model_parallel_size=2 \
    --tasks hellaswag \
    --batch_size 32
```
Note that it is recommended to substitute the `python` command by `torchrun --nproc-per-node=<number of devices> --no-python` to facilitate loading the model into the GPUs. This is especially important for large checkpoints loaded into multiple GPUs.

Not supported yet: multi-node evaluation and combinations of data replication with tensor or pipeline parallelism.

### Tensor + Data Parallel and Optimized Inference with `vLLM`

We also support vLLM for faster inference on [supported model types](https://docs.vllm.ai/en/latest/models/supported_models.html), especially faster when splitting a model across multiple GPUs. For single-GPU or multi-GPU (tensor parallel, data parallel, or a combination of both) inference, for example:

```bash
lm_eval --model vllm \
    --model_args pretrained={model_name},tensor_parallel_size={GPUs_per_model},dtype=auto,gpu_memory_utilization=0.8,data_parallel_size={model_replicas} \
    --tasks lambada_openai \
    --batch_size auto
```
To use vllm, do `pip install lm_eval[vllm]`. For a full list of supported vLLM configurations, please reference our [vLLM integration](https://github.com/EleutherAI/lm-evaluation-harness/blob/e74ec966556253fbe3d8ecba9de675c77c075bce/lm_eval/models/vllm_causallms.py) and the vLLM documentation.

vLLM occasionally differs in output from Huggingface. We treat Huggingface as the reference implementation, and provide a [script](./scripts/model_comparator.py) for checking the validity of vllm results against HF.

> [!Tip]
> For fastest performance, we recommend using `--batch_size auto` for vLLM whenever possible, to leverage its continuous batching functionality!

> [!Tip]
> Passing `max_model_len=4096` or some other reasonable default to vLLM through model args may cause speedups or prevent out-of-memory errors when trying to use auto batch size, such as for Mistral-7B-v0.1 which defaults to a maximum length of 32k.

### Model APIs and Inference Servers

Our library also supports the evaluation of models served via several commercial APIs, and we hope to implement support for the most commonly used performant local/self-hosted inference servers.

To call a hosted model, use:

```bash
export OPENAI_API_KEY=YOUR_KEY_HERE
lm_eval --model openai-completions \
    --model_args model=davinci \
    --tasks lambada_openai,hellaswag
```

We also support using your own local inference server with servers that mirror the OpenAI Completions and ChatCompletions APIs.

```bash
lm_eval --model local-completions --tasks gsm8k --model_args model=facebook/opt-125m,base_url=http://{yourip}:8000/v1/completions,num_concurrent=1,max_retries=3,tokenized_requests=False,batch_size=16
```
Note that for externally hosted models, configs such as `--device` which relate to where to place a local model should not be used and do not function. Just like you can use `--model_args` to pass arbitrary arguments to the model constructor for local models, you can use it to pass arbitrary arguments to the model API for hosted models. See the documentation of the hosting service for information on what arguments they support.

| API or Inference Server | Implemented? | `--model <xxx>` name | Models supported: | Request Types: |
|---|---|---|---|---|
| OpenAI Completions | :heavy_check_mark: | `openai-completions`, `local-completions` | All OpenAI Completions API models | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| OpenAI ChatCompletions | :heavy_check_mark: | `openai-chat-completions`, `local-chat-completions` | [All ChatCompletions API models](https://platform.openai.com/docs/guides/gpt) | `generate_until` (no logprobs) |
| Anthropic | :heavy_check_mark: | `anthropic` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/reference/selecting-a-model) | `generate_until` (no logprobs) |
| Anthropic Chat | :heavy_check_mark: | `anthropic-chat`, `anthropic-chat-completions` | [Supported Anthropic Engines](https://docs.anthropic.com/claude/docs/models-overview) | `generate_until` (no logprobs) |
| Textsynth | :heavy_check_mark: | `textsynth` | [All supported engines](https://textsynth.com/documentation.html#engines) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Cohere | [:hourglass: - blocked on Cohere API bug](https://github.com/EleutherAI/lm-evaluation-harness/pull/395) | N/A | [All `cohere.generate()` engines](https://docs.cohere.com/docs/models) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Llama.cpp](https://github.com/ggerganov/llama.cpp) (via [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)) | :heavy_check_mark: | `gguf`, `ggml` | [All models supported by llama.cpp](https://github.com/ggerganov/llama.cpp) | `generate_until`, `loglikelihood` (perplexity evaluation not yet implemented) |
| vLLM | :heavy_check_mark: | `vllm` | [Most HF Causal Language Models](https://docs.vllm.ai/en/latest/models/supported_models.html) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Mamba | :heavy_check_mark: | `mamba_ssm` | [Mamba architecture Language Models via the `mamba_ssm` package](https://huggingface.co/state-spaces) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Huggingface Optimum (Causal LMs) | ✔️ | `openvino` | Any decoder-only AutoModelForCausalLM converted with Huggingface Optimum into OpenVINO™ Intermediate Representation (IR) format | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Neuron via AWS Inf2 (Causal LMs) | ✔️ | `neuronx` | Any decoder-only AutoModelForCausalLM supported to run on [huggingface-ami image for inferentia2](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| [Neural Magic DeepSparse](https://github.com/neuralmagic/deepsparse) | ✔️ | `deepsparse` | Any LM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub with the "deepsparse" tag](https://huggingface.co/models?other=deepsparse) | `generate_until`, `loglikelihood` |
| [Neural Magic SparseML](https://github.com/neuralmagic/sparseml) | ✔️ | `sparseml` | Any decoder-only AutoModelForCausalLM from [SparseZoo](https://sparsezoo.neuralmagic.com/) or on [HF Hub](https://huggingface.co/neuralmagic). Especially useful for models with quantization like [`zoo:llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized`](https://sparsezoo.neuralmagic.com/models/llama2-7b-gsm8k_llama2_pretrain-pruned60_quantized) | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |
| Your local inference server! | :heavy_check_mark: | `local-completions` or `local-chat-completions` | Support for OpenAI API-compatible servers, with easy customization for other APIs. | `generate_until`, `loglikelihood`, `loglikelihood_rolling` |

Models which do not supply logits or logprobs can be used with tasks of type `generate_until` only, while local models, or APIs that supply logprobs/logits of their prompts, can be run on all task types: `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.

For more information on the different task `output_types` and model request types, see [our documentation](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/model_guide.md#interface).

> [!Note]
> For best performance with closed chat model APIs such as Anthropic Claude 3 and GPT-4, we recommend carefully looking at a few sample outputs using `--limit 10` first to confirm answer extraction and scoring on generative tasks is performing as expected. Providing `system="<some system prompt here>"` within `--model_args` for anthropic-chat-completions, to instruct the model what format to respond in, may be useful.

### Other Frameworks

A number of other libraries contain scripts for calling the eval harness through their library. These include [GPT-NeoX](https://github.com/EleutherAI/gpt-neox/blob/main/eval_tasks/eval_adapter.py), [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed/blob/main/examples/MoE/readme_evalharness.md), and [mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/blob/master/eval_harness.py).

To create your own custom integration you can follow instructions from [this tutorial](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md#external-library-usage).

### Additional Features
> [!Note]
> For tasks unsuitable for direct evaluation — either due to risks associated with executing untrusted code or complexities in the evaluation process — the `--predict_only` flag is available to obtain decoded generations for post-hoc evaluation.

If you have a Metal compatible Mac, you can run the eval harness using the MPS back-end by replacing `--device cuda:0` with `--device mps` (requires PyTorch version 2.1 or higher). **Note that the PyTorch MPS backend is still in early stages of development, so correctness issues or unsupported operations may exist. If you observe oddities in model performance on the MPS back-end, we recommend first checking that a forward pass of your model on `--device cpu` and `--device mps` match.**

> [!Note]
> You can inspect what the LM inputs look like by running the following command:
> ```bash
> python write_out.py \
>     --tasks <task1,task2,...> \
>     --num_fewshot 5 \
>     --num_examples 10 \
>     --output_base_path /path/to/output/folder
> ```
> This will write out one text file for each task.

To verify the data integrity of the tasks you're performing in addition to running the tasks themselves, you can use the `--check_integrity` flag:

```bash
lm_eval --model openai \
    --model_args engine=davinci \
    --tasks lambada_openai,hellaswag \
    --check_integrity
```

## Advanced Usage Tips

For models loaded with the HuggingFace `transformers` library, any arguments provided via `--model_args` get passed to the relevant constructor directly. This means that anything you can do with `AutoModel` can be done with our library. For example, you can pass a local path via `pretrained=` or use models finetuned with [PEFT](https://github.com/huggingface/peft) by taking the call you would run to evaluate the base model and add `,peft=PATH` to the `model_args` argument:
```bash
lm_eval --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6b,parallelize=True,load_in_4bit=True,peft=nomic-ai/gpt4all-j-lora \
    --tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
    --device cuda:0
```

Models provided as delta weights can be easily loaded using the Hugging Face transformers library. Within --model_args, set the delta argument to specify the delta weights, and use the pretrained argument to designate the relative base model to which they will be applied:
```bash
lm_eval --model hf \
    --model_args pretrained=Ejafa/llama_7B,delta=lmsys/vicuna-7b-delta-v1.1 \
    --tasks hellaswag
```

[GPTQ](https://github.com/PanQiWei/AutoGPTQ) quantized models can be loaded by specifying their file names in `,autogptq=NAME` (or `,autogptq=True` for default names) in the `model_args` argument:

```bash
lm_eval --model hf \
    --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
    --tasks hellaswag
```

We support wildcards in task names, for example you can run all of the machine-translated lambada tasks via `--task lambada_openai_mt_*`.

## Saving Results

To save evaluation results provide an `--output_path`. We also support logging model responses with the `--log_samples` flag for post-hoc analysis.

Additionally, one can provide a directory with `--use_cache` to cache the results of prior runs. This allows you to avoid repeated execution of the same (model, task) pairs for re-scoring.

To push results and samples to the Hugging Face Hub, first ensure an access token with write access is set in the `HF_TOKEN` environment variable. Then, use the `--hf_hub_log_args` flag to specify the organization, repository name, repository visibility, and whether to push results and samples to the Hub - [example dataset on the HF Hub](https://huggingface.co/datasets/KonradSzafer/lm-eval-results-demo). For instance:

```bash
lm_eval --model hf \
    --model_args pretrained=model-name-or-path,autogptq=model.safetensors,gptq_use_triton=True \
    --tasks hellaswag \
    --log_samples \
    --output_path results \
    --hf_hub_log_args hub_results_org=EleutherAI,hub_repo_name=lm-eval-results,push_results_to_hub=True,push_samples_to_hub=True,public_repo=False \
```

This allows you to easily download the results and samples from the Hub, using:
```python
from datasets import load_dataset

load_dataset("EleutherAI/lm-eval-results-private", "hellaswag", "latest")
```

For a full list of supported arguments, check out the [interface](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/interface.md) guide in our documentation!

## Visualizing Results

You can seamlessly visualize and analyze the results of your evaluation harness runs using both Weights & Biases (W&B) and Zeno.

### Zeno

You can use [Zeno](https://zenoml.com) to visualize the results of your eval harness runs.

First, head to [hub.zenoml.com](https://hub.zenoml.com) to create an account and get an API key [on your account page](https://hub.zenoml.com/account).
Add this key as an environment variable:

```bash
export ZENO_API_KEY=[your api key]
```

You'll also need to install the `lm_eval[zeno]` package extra.

To visualize the results, run the eval harness with the `log_samples` and `output_path` flags.
We expect `output_path` to contain multiple folders that represent individual model names.
You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.

```bash
lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/gpt-j-6B \
    --tasks hellaswag \
    --device cuda:0 \
    --batch_size 8 \
    --log_samples \
    --output_path output/gpt-j-6B
```

Then, you can upload the resulting data using the `zeno_visualize` script:

```bash
python scripts/zeno_visualize.py \
    --data_path output \
    --project_name "Eleuther Project"
```

This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno.
If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.

You can find an example of this workflow in [examples/visualize-zeno.ipynb](examples/visualize-zeno.ipynb).

### Weights and Biases

With the [Weights and Biases](https://wandb.ai/site) integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.

The integration provides functionalities

- to automatically log the evaluation results,
- log the samples as W&B Tables for easy visualization,
- log the `results.json` file as an artifact for version control,
- log the `<task_name>_eval_samples.json` file if the samples are logged,
- generate a comprehensive report for analysis and visualization with all the important metrics,
- log task and cli specific configs,
- and more out of the box like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.

First you'll need to install the lm_eval[wandb] package extra. Do `pip install lm_eval[wandb]`.

Authenticate your machine with your unique W&B token. Visit https://wandb.ai/authorize to get one. Do `wandb login` in your command line terminal.

Run the eval harness as usual with a `wandb_args` flag. Use this flag to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as comma separated string arguments.

```bash
lm_eval \
    --model hf \
    --model_args pretrained=microsoft/phi-2,trust_remote_code=True \
    --tasks hellaswag,mmlu_abstract_algebra \
    --device cuda:0 \
    --batch_size 8 \
    --output_path output/phi-2 \
    --limit 10 \
    --wandb_args project=lm-eval-harness-integration \
    --log_samples
```

In the stdout, you will find the link to the W&B run page as well as a link to the generated report. You can find an example of this workflow in [examples/visualize-wandb.ipynb](examples/visualize-wandb.ipynb), and an example of how to integrate it beyond the CLI.

## How to Contribute or Learn More?

For more information on the library and how everything fits together, check out all of our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)! We plan to post a larger roadmap of desired + planned library improvements soon, with more information on how contributors can help.

### Implementing new tasks

To implement a new task in the eval harness, see [this guide](./docs/new_task_guide.md).

In general, we follow this priority list for addressing concerns about prompting and other eval details:
1. If there is widespread agreement among people who train LLMs, use the agreed upon procedure.
2. If there is a clear and unambiguous official implementation, use that procedure.
3. If there is widespread agreement among people who evaluate LLMs, use the agreed upon procedure.
4. If there are multiple common implementations but not universal or widespread agreement, use our preferred option among the common implementations. As before, prioritize choosing from among the implementations found in LLM training papers.

These are guidelines and not rules, and can be overruled in special circumstances.

We try to prioritize agreement with the procedures used by other groups to decrease the harm when people inevitably compare runs across different papers despite our discouragement of the practice. Historically, we also prioritized the implementation from [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) as our original goal was specifically to compare results with that paper.

### Support

The best way to get support is to open an issue on this repo or join the [EleutherAI Discord server](https://discord.gg/eleutherai). The `#lm-thunderdome` channel is dedicated to developing this project and the `#release-discussion` channel is for receiving support for our releases. If you've used the library and have had a positive (or negative) experience, we'd love to hear from you!

## Optional Extras
Extras dependencies can be installed via `pip install -e ".[NAME]"`

| Name | Use |
|---|---|
| api | For using api models (Anthropic, OpenAI API) |
| deepsparse | For running NM's DeepSparse models |
| dev | For linting PRs and contributions |
| gptq | For loading models with GPTQ |
| hf_transfer | For speeding up HF Hub file downloads |
| ifeval | For running the IFEval task |
| neuronx | For running on AWS inf2 instances |
| mamba | For loading Mamba SSM models |
| math | For running math task answer checking |
| multilingual | For multilingual tokenizers |
| optimum | For running Intel OpenVINO models |
| promptsource | For using PromptSource prompts |
| sentencepiece | For using the sentencepiece tokenizer |
| sparseml | For using NM's SparseML models |
| testing | For running library test suite |
| vllm | For loading models with vLLM |
| zeno | For visualizing results with Zeno |
| all | Loads all extras (not recommended) |

## Cite as

```
@misc{eval-harness,
  author = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title = {A framework for few-shot language model evaluation},
  month = 12,
  year = 2023,
  publisher = {Zenodo},
  version = {v0.4.0},
  doi = {10.5281/zenodo.10256836},
  url = {https://zenodo.org/records/10256836}
}
```
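Beyond the CLI shown in the README, the harness can also be driven from Python, which is how scripts such as `main_eval.py` earlier in this diff use it. A rough sketch, assuming the current `lm_eval.simple_evaluate` entry point and that the argument names mirror the CLI flags (treat the exact signature as an assumption to verify against the installed version):

```python
import json

import lm_eval  # assumes lm_eval.simple_evaluate is exposed as in recent releases

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["hellaswag"],
    num_fewshot=0,
    limit=10,  # small limit: for smoke-testing only, not for reporting metrics
)
print(json.dumps(results["results"], indent=2, ensure_ascii=False))
```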
scripts/yans/lm-evaluation-harness/bin/python
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:45692c3da2492563eabf0a8f5dc18d20dc9c34ffe3a18202563e00bae684be91
size 5904904
scripts/yans/lm-evaluation-harness/bin/python3
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:45692c3da2492563eabf0a8f5dc18d20dc9c34ffe3a18202563e00bae684be91
size 5904904
scripts/yans/lm-evaluation-harness/bin/python3.10
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:45692c3da2492563eabf0a8f5dc18d20dc9c34ffe3a18202563e00bae684be91
size 5904904
scripts/yans/lm-evaluation-harness/docs/API_guide.md
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# TemplateAPI Usage Guide
|
2 |
+
|
3 |
+
The `TemplateAPI` class is a versatile superclass designed to facilitate the integration of various API-based language models into the lm-evaluation-harness framework. This guide will explain how to use and extend the `TemplateAPI` class to implement your own API models. If your API implements the OpenAI API you can use the `local-completions` or the `local-chat-completions` (defined [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) model types, which can also serve as examples of how to effectively subclass this template.
|
4 |
+
|
5 |
+
## Overview
|
6 |
+
|
7 |
+
The `TemplateAPI` class provides a template for creating API-based model implementations. It handles common functionalities such as:
|
8 |
+
|
9 |
+
- Tokenization (optional)
|
10 |
+
- Batch processing
|
11 |
+
- Caching
|
12 |
+
- Retrying failed requests
|
13 |
+
- Parsing API responses
|
14 |
+
|
15 |
+
To use this class, you typically need to subclass it and implement specific methods for your API.
|
16 |
+
|
17 |
+
## Key Methods to Implement
|
18 |
+
|
19 |
+
When subclassing `TemplateAPI`, you need to implement the following methods:
|
20 |
+
|
21 |
+
1. `_create_payload`: Creates the JSON payload for API requests.
|
22 |
+
2. `parse_logprobs`: Parses log probabilities from API responses.
|
23 |
+
3. `parse_generations`: Parses generated text from API responses.
|
24 |
+
4. `headers`: Returns the headers for the API request.
|
25 |
+
|
26 |
+
You may also need to override other methods or properties depending on your API's specific requirements.
|
27 |
+
|
28 |
+
> [!NOTE]
|
29 |
+
> Currently loglikelihood and MCQ based tasks (such as MMLU) are only supported for completion endpoints. Not for chat-completion — those that expect a list of dicts — endpoints! Completion APIs which support instruct tuned models can be evaluated with the `--apply_chat_template` option in order to simultaneously evaluate models using a chat template format while still being able to access the model logits needed for loglikelihood-based tasks.
|
30 |
+
|
31 |
+
# TemplateAPI Usage Guide
|
32 |
+
|
33 |
+
## TemplateAPI Arguments
|
34 |
+
|
35 |
+
When initializing a `TemplateAPI` instance or a subclass, you can provide several arguments to customize its behavior. Here's a detailed explanation of some important arguments:
|
36 |
+
|
37 |
+
- `model` or `pretrained` (str):
|
38 |
+
- The name or identifier of the model to use.
|
39 |
+
- `model` takes precedence over `pretrained` when both are provided.
|
40 |
+
|
41 |
+
- `base_url` (str):
|
42 |
+
- The base URL for the API endpoint.
|
43 |
+
|
44 |
+
- `tokenizer` (str, optional):
|
45 |
+
- The name or path of the tokenizer to use.
|
46 |
+
- If not provided, it defaults to using the same tokenizer name as the model.
|
47 |
+
|
48 |
+
- `num_concurrent` (int):
|
49 |
+
- Number of concurrent requests to make to the API.
|
50 |
+
- Useful for APIs that support parallel processing.
|
51 |
+
- Default is 1 (sequential processing).
|
52 |
+
|
53 |
+
- `tokenized_requests` (bool):
|
54 |
+
- Determines whether the input is pre-tokenized. Defaults to `True`.
|
55 |
+
- Requests can be sent in either tokenized form (`list[list[int]]`) or as text (`list[str]`, or `str` for batch_size=1).
|
56 |
+
- For loglikelihood-based tasks, prompts require tokenization to calculate the context length. If `False` prompts are decoded back to text before being sent to the API.
|
57 |
+
- Not as important for `generate_until` tasks.
|
58 |
+
- Ignored for chat formatted inputs (list[dict...]) or if tokenizer_backend is None.
|
59 |
+
|
60 |
+
- `tokenizer_backend` (str, optional):
|
61 |
+
- Required for loglikelihood-based or MCQ tasks.
|
62 |
+
- Specifies the tokenizer library to use. Options are "tiktoken", "huggingface", or None.
|
63 |
+
- Default is "huggingface".
|
64 |
+
|
65 |
+
- `max_length` (int, optional):
|
66 |
+
- Maximum length of input + output.
|
67 |
+
- Default is 2048.
|
68 |
+
|
69 |
+
- `max_retries` (int, optional):
|
70 |
+
- Maximum number of retries for failed API requests.
|
71 |
+
- Default is 3.
|
72 |
+
|
73 |
+
- `max_gen_toks` (int, optional):
|
74 |
+
- Maximum number of tokens to generate in completion tasks.
|
75 |
+
- Default is 256 or set in task yaml.
|
76 |
+
|
77 |
+
- `batch_size` (int or str, optional):
|
78 |
+
- Number of requests to batch together (if the API supports batching).
|
79 |
+
- Can be an integer or "auto" (which defaults to 1 for API models).
|
80 |
+
- Default is 1.
|
81 |
+
|
82 |
+
- `seed` (int, optional):
|
83 |
+
- Random seed for reproducibility.
|
84 |
+
- Default is 1234.
|
85 |
+
|
86 |
+
- `add_bos_token` (bool, optional):
|
87 |
+
- Whether to add the beginning-of-sequence token to inputs (when tokenizing).
|
88 |
+
- Default is False.
|
89 |
+
|
90 |
+
- `custom_prefix_token_id` (int, optional):
|
91 |
+
- Custom token ID to use as a prefix for inputs.
|
92 |
+
- If not provided, uses the model's default BOS or EOS token (if `add_bos_token` is True).
|
93 |
+
|
94 |
+
|
95 |
+
Example usage:
|
96 |
+
|
97 |
+
```python
|
98 |
+
class MyAPIModel(TemplateAPI):
|
99 |
+
def __init__(self, **kwargs):
|
100 |
+
super().__init__(
|
101 |
+
model="my-model",
|
102 |
+
base_url="https://api.mymodel.com/v1/completions",
|
103 |
+
tokenizer_backend="huggingface",
|
104 |
+
num_concurrent=5,
|
105 |
+
max_retries=5,
|
106 |
+
batch_size=10,
|
107 |
+
**kwargs
|
108 |
+
)
|
109 |
+
|
110 |
+
# Implement other required methods...
|
111 |
+
```
|
112 |
+
|
113 |
+
When subclassing `TemplateAPI`, you can override these arguments in your `__init__` method to set default values specific to your API. You can also add additional (potentially user-specified) arguments as needed for your specific implementation.
|
114 |
+
|
115 |
+
## Example Implementation: OpenAI API
|
116 |
+
|
117 |
+
The `OpenAICompletionsAPI` and `OpenAIChatCompletion` classes ([defined here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/models/openai_completions.py)) demonstrate how to implement API models using the `TemplateAPI` class. Here's a breakdown of the key components:
|
118 |
+
|
119 |
+
### 1. Subclassing and Initialization
|
120 |
+
|
121 |
+
```python
|
122 |
+
@register_model("openai-completions")
|
123 |
+
class OpenAICompletionsAPI(LocalCompletionsAPI):
|
124 |
+
def __init__(
|
125 |
+
self,
|
126 |
+
base_url="https://api.openai.com/v1/completions",
|
127 |
+
tokenizer_backend="tiktoken",
|
128 |
+
**kwargs,
|
129 |
+
):
|
130 |
+
super().__init__(
|
131 |
+
base_url=base_url, tokenizer_backend=tokenizer_backend, **kwargs
|
132 |
+
)
|
133 |
+
```
|
134 |
+
|
135 |
+
### 2. Implementing API Key Retrieval
|
136 |
+
|
137 |
+
```python
|
138 |
+
@cached_property
|
139 |
+
def api_key(self):
|
140 |
+
key = os.environ.get("OPENAI_API_KEY", None)
|
141 |
+
if key is None:
|
142 |
+
raise ValueError(
|
143 |
+
"API key not found. Please set the OPENAI_API_KEY environment variable."
|
144 |
+
)
|
145 |
+
return key
|
146 |
+
```
|
147 |
+
|
148 |
+
### 3. Creating the Payload
|
149 |
+
|
150 |
+
```python
|
151 |
+
def _create_payload(
|
152 |
+
self,
|
153 |
+
messages: Union[List[List[int]], List[dict], List[str], str],
|
154 |
+
generate=False,
|
155 |
+
gen_kwargs: Optional[dict] = None,
|
156 |
+
**kwargs,
|
157 |
+
) -> dict:
|
158 |
+
if generate:
|
159 |
+
# ... (implementation for generation)
|
160 |
+
else:
|
161 |
+
# ... (implementation for log likelihood)
|
162 |
+
```
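
To make the two branches concrete, here is a simplified sketch of what `_create_payload` might return for an OpenAI-style completions endpoint. It is not the exact upstream implementation: attribute names such as `self.model` and `self._max_gen_toks` are illustrative, and the exact payload fields depend on the API being targeted.

```python
def _create_payload(
    self,
    messages,
    generate=False,
    gen_kwargs=None,
    **kwargs,
) -> dict:
    if generate:
        gen_kwargs = dict(gen_kwargs or {})
        return {
            "model": self.model,  # illustrative attribute name
            "prompt": messages,
            "max_tokens": gen_kwargs.pop("max_gen_toks", self._max_gen_toks),
            "temperature": gen_kwargs.pop("temperature", 0),
            "stop": gen_kwargs.pop("until", None),
            **gen_kwargs,
        }
    else:
        # loglikelihood: ask the API to echo the prompt with per-token logprobs
        # and generate no new tokens.
        return {
            "model": self.model,
            "prompt": messages,
            "max_tokens": 0,
            "temperature": 0,
            "logprobs": 1,
            "echo": True,
        }
```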
|
163 |
+
|
164 |
+
### 4. Parsing API Responses
|
165 |
+
|
166 |
+
```python
|
167 |
+
@staticmethod
|
168 |
+
def parse_logprobs(
|
169 |
+
outputs: Union[Dict, List[Dict]],
|
170 |
+
tokens: List[List[int]] = None,
|
171 |
+
ctxlens: List[int] = None,
|
172 |
+
**kwargs,
|
173 |
+
) -> List[Tuple[float, bool]]:
|
174 |
+
# ... (implementation)
|
175 |
+
|
176 |
+
@staticmethod
|
177 |
+
def parse_generations(outputs: Union[Dict, List[Dict]], **kwargs) -> List[str]:
|
178 |
+
# ... (implementation)
|
179 |
+
```
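
As a rough illustration (not the exact upstream code), `parse_generations` for an OpenAI-style completions response, where each output looks like `{"choices": [{"index": 0, "text": ...}, ...]}`, could be implemented along these lines; field names will differ for other APIs.

```python
@staticmethod
def parse_generations(outputs, **kwargs) -> List[str]:
    # accept either a single response dict or a list of them (one per batch)
    if not isinstance(outputs, list):
        outputs = [outputs]
    res = []
    for out in outputs:
        # order completions by their reported index so results line up
        # with the order of the submitted prompts
        choices = sorted(out["choices"], key=lambda c: c.get("index", 0))
        res.extend(choice["text"] for choice in choices)
    return res
```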
|
180 |
+
|
181 |
+
The requests are initiated in the `model_call` or the `amodel_call` methods.
|
182 |
+
|
183 |
+
## Implementing Your Own API Model
|
184 |
+
|
185 |
+
To implement your own API model:
|
186 |
+
|
187 |
+
1. Subclass `TemplateAPI` or one of its subclasses (e.g., `LocalCompletionsAPI`).
|
188 |
+
2. Override the `__init__` method if you need to set specific parameters.
|
189 |
+
3. Implement the `_create_payload` and `header` methods to create the appropriate payload for your API.
|
190 |
+
4. Implement the `parse_logprobs` and `parse_generations` methods to parse your API's responses.
|
191 |
+
5. Override the `api_key` property if your API requires authentication.
|
192 |
+
6. Override any other methods as necessary to match your API's behavior.
|
193 |
+
|
194 |
+
## Best Practices
|
195 |
+
|
196 |
+
1. Use the `@register_model` decorator to register your model with the framework (and import it in `lm_eval/models/__init__.py`!).
2. Use environment variables for sensitive information like API keys.
3. Properly handle batching and concurrent requests if supported by your API.
|
scripts/yans/lm-evaluation-harness/docs/CONTRIBUTING.md
ADDED
@@ -0,0 +1,79 @@
1 |
+
# Contributing to LM Evaluation Harness
|
2 |
+
|
3 |
+
Welcome and thank you for your interest in the LM Evaluation Harness! We welcome contributions and feedback and appreciate your time spent with our library, and hope you find it useful!
|
4 |
+
|
5 |
+
## Important Resources
|
6 |
+
|
7 |
+
There are several places information about LM Evaluation Harness is located:
|
8 |
+
|
9 |
+
- Our [documentation pages](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs)
|
10 |
+
- We occasionally use [GitHub Milestones](https://github.com/EleutherAI/lm-evaluation-harness/milestones) to track progress toward specific near-term version releases.
|
11 |
+
- We maintain a [Project Board](https://github.com/orgs/EleutherAI/projects/25) for tracking current work items and PRs, and for future roadmap items or feature requests.
|
12 |
+
- Further discussion and support conversations are located in the #lm-thunderdome channel of the [EleutherAI discord](https://discord.gg/eleutherai).
|
13 |
+
|
14 |
+
## Code Style
|
15 |
+
|
16 |
+
LM Evaluation Harness uses [ruff](https://github.com/astral-sh/ruff) for linting via [pre-commit](https://pre-commit.com/).
|
17 |
+
|
18 |
+
You can install linters and dev tools via
|
19 |
+
|
20 |
+
```pip install lm_eval[dev]``` or ```pip install -e ".[dev]"```
|
21 |
+
|
22 |
+
Then, run
|
23 |
+
|
24 |
+
```pre-commit install```
|
25 |
+
|
26 |
+
in order to ensure linters and other checks will be run upon committing.
|
27 |
+
|
28 |
+
## Testing
|
29 |
+
|
30 |
+
We use [pytest](https://docs.pytest.org/en/latest/) for running unit tests. All library unit tests can be run via:
|
31 |
+
|
32 |
+
```
|
33 |
+
python -m pytest --showlocals -s -vv -n=auto --ignore=tests/models/test_neuralmagic.py --ignore=tests/models/test_openvino.py
|
34 |
+
```
|
35 |
+
|
36 |
+
## Contributor License Agreement
|
37 |
+
|
38 |
+
We ask that new contributors agree to a Contributor License Agreement affirming that EleutherAI has the rights to use your contribution to our library.
|
39 |
+
First-time pull requests will have a reply added by @CLAassistant containing instructions for how to confirm this, and we require it before merging your PR.
|
40 |
+
|
41 |
+
|
42 |
+
## Contribution Best Practices
|
43 |
+
|
44 |
+
We recommend a few best practices to make your contributions or reported errors easier to assist with.
|
45 |
+
|
46 |
+
**For Pull Requests:**
|
47 |
+
- PRs should be titled descriptively, and be opened with a brief description of the scope and intent of the new contribution.
|
48 |
+
- New features should have appropriate documentation added alongside them.
|
49 |
+
- Aim for code maintainability, and minimize code copying.
|
50 |
+
- If opening a task, try to share test results on the task using a publicly-available model, and if any public results are available on the task, compare to them.
|
51 |
+
|
52 |
+
**For Feature Requests:**
|
53 |
+
- Provide a short paragraph's worth of description. What is the feature you are requesting? What is its motivation, and an example use case of it? How does this differ from what is currently supported?
|
54 |
+
|
55 |
+
**For Bug Reports**:
|
56 |
+
- Provide a short description of the bug.
|
57 |
+
- Provide a *reproducible example*--what is the command you run with our library that results in this error? Have you tried any other steps to resolve it?
|
58 |
+
- Provide a *full error traceback* of the error that occurs, if applicable. A one-line error message or small screenshot snippet is unhelpful without the surrounding context.
|
59 |
+
- Note what version of the codebase you are using, and any specifics of your environment and setup that may be relevant.
|
60 |
+
|
61 |
+
**For Requesting New Tasks**:
|
62 |
+
- Provide a 1-2 sentence description of what the task is and what it evaluates.
|
63 |
+
- Provide a link to the paper introducing the task.
|
64 |
+
- Provide a link to where the dataset can be found.
|
65 |
+
- Provide a link to a paper containing results on an open-source model on the task, for use in comparisons and implementation validation.
|
66 |
+
- If applicable, link to any codebase that has implemented the task (especially the original publication's codebase, if existent).
|
67 |
+
|
68 |
+
## How Can I Get Involved?
|
69 |
+
|
70 |
+
To quickly get started, we maintain a list of good first issues, which can be found [on our project board](https://github.com/orgs/EleutherAI/projects/25/views/8) or by [filtering GH Issues](https://github.com/EleutherAI/lm-evaluation-harness/issues?q=is%3Aopen+label%3A%22good+first+issue%22+label%3A%22help+wanted%22). These are typically smaller code changes or self-contained features which can be added without extensive familiarity with library internals, and we recommend new contributors consider taking a stab at one of these first if they are feeling uncertain where to begin.
|
71 |
+
|
72 |
+
There are a number of distinct ways to contribute to LM Evaluation Harness, and all are extremely helpful! A sampling of ways to contribute include:
|
73 |
+
- **Implementing and verifying new evaluation tasks**: Is there a task you'd like to see LM Evaluation Harness support? Consider opening an issue requesting it, or helping add it! Verifying and cross-checking task implementations with their original versions is also a very valuable form of assistance in ensuring standardized evaluation.
|
74 |
+
- **Improving documentation** - Improvements to the documentation, or noting pain points / gaps in documentation, are helpful in order for us to improve the user experience of the library and clarity + coverage of documentation.
|
75 |
+
- **Testing and devops** - We are very grateful for any assistance in adding tests for the library that can be run for new PRs, and other devops workflows.
|
76 |
+
- **Adding new modeling / inference library integrations** - We hope to support a broad range of commonly-used inference libraries popular among the community, and welcome PRs for new integrations, so long as they are documented properly and maintainable.
|
77 |
+
- **Proposing or Contributing New Features** - We want LM Evaluation Harness to support a broad range of evaluation usecases. If you have a feature that is not currently supported but desired, feel free to open an issue describing the feature and, if applicable, how you intend to implement it. We would be happy to give feedback on the cleanest way to implement new functionalities and are happy to coordinate with interested contributors via GH discussions or via discord.
|
78 |
+
|
79 |
+
We hope that this has been helpful, and appreciate your interest in contributing! Further questions can be directed to [our Discord](https://discord.gg/eleutherai).
|
scripts/yans/lm-evaluation-harness/docs/README.md
ADDED
@@ -0,0 +1,11 @@
1 |
+
# Eval Harness Documentation
|
2 |
+
|
3 |
+
Welcome to the docs for the LM Evaluation Harness!
|
4 |
+
|
5 |
+
## Table of Contents
|
6 |
+
|
7 |
+
* To learn about the public interface of the library, as well as how to evaluate via the command line or as integrated into an external library, see the [Interface](./interface.md).
|
8 |
+
* To learn how to add a new library, API, or model type to the library, as well as a quick explainer on the types of ways to evaluate an LM, see the [Model Guide](./model_guide.md).
|
9 |
+
* For an extended description of how to extend the library to new model classes served over an API, see the [API Guide](./API_guide.md).
|
10 |
+
* For a crash course on adding new tasks to the library, see our [New Task Guide](./new_task_guide.md).
|
11 |
+
* To learn more about pushing the limits of task configuration that the Eval Harness supports, see the [Task Configuration Guide](./task_guide.md).
|
scripts/yans/lm-evaluation-harness/docs/decontamination.md
ADDED
@@ -0,0 +1,71 @@
1 |
+
# Decontamination
|
2 |
+
|
3 |
+
## Usage
|
4 |
+
|
5 |
+
The provided directory should contain the ngram files and `info.json` produced in "Pile Ngram Generation" further down.
|
7 |
+
|
8 |
+
```bash
|
9 |
+
python -m lm_eval \
|
10 |
+
--model gpt2 \
|
11 |
+
--device 0 \
|
12 |
+
--tasks sciq
|
13 |
+
```
|
14 |
+
|
15 |
+
## Background
|
16 |
+
Downstream evaluations test model generalization, and are less useful when test set data also exists in the training set, referred to as leakage or contamination.
|
17 |
+
|
18 |
+
Filtering your training set against the test set is a good first step; however, this isn't always possible, as in the case of a new benchmark or one that wasn't considered prior to model training. When training set filtering isn't possible, it is useful to measure the impact of test set leakage by detecting the contaminated test examples and producing a clean version of the benchmark.
|
19 |
+
|
20 |
+
The basis for our decontamination procedure can be found in Appendix C of "Language Models are Few-Shot Learners". OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. They used a range of N values between 8 and 13 depending on dataset, while we just used 13 for simplicity.
|
21 |
+
|
22 |
+
## Implementation
|
23 |
+
Contamination detection can be found in `lm_eval/decontaminate.py` with supporting code in `lm_eval/decontamination/`.
|
24 |
+
|
25 |
+
decontaminate.py does the following:
|
26 |
+
1. Build dictionaries of all ngrams and their corresponding evaluation/document ids.
|
27 |
+
2. Scan through sorted files containing training set n-grams.
|
28 |
+
3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated.
|
29 |
+
|
30 |
+
`lm_eval/evaluator.py` can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix.
|
31 |
+
|
32 |
+
This is disabled by default for new tasks; to support decontamination on a task, override the `should_decontaminate` and `doc_to_decontamination_query` methods. For more details see the [task guide](task_guide.md).
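
As a minimal sketch, the two overrides named above might look as follows on a Python `Task` subclass (the `doc["question"]` field is purely illustrative; return whatever document text should be checked against the training-set n-grams). YAML-configured tasks should expose equivalent configuration keys.

```python
from lm_eval.api.task import Task


class MyTask(Task):
    # ... the rest of the task definition ...

    def should_decontaminate(self):
        return True

    def doc_to_decontamination_query(self, doc):
        # text from this document to check against training-set 13-grams
        return doc["question"]
```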
|
33 |
+
|
34 |
+
## Pile Ngram Generation
|
35 |
+
The relevant scripts can be found in `scripts/clean_training_data`, which also import from `lm_eval/decontamination/`.
|
37 |
+
|
38 |
+
1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git
|
39 |
+
2. pip install -r requirements.txt
|
40 |
+
3. Download The Pile from [The Eye](https://the-eye.eu/public/AI/pile/train/)
|
41 |
+
4. Place pile files in "pile" directory under "lm-evaluation-harness" (or create a symlink)
|
42 |
+
5. Run generate_13_grams.
|
43 |
+
|
44 |
+
```bash
|
45 |
+
export PYTHONHASHSEED=0
|
46 |
+
python -m scripts.clean_training_data.generate_13_grams \
|
47 |
+
-dir path/to/working/directory \
|
48 |
+
-n 13 \
|
49 |
+
-buckets 500
|
50 |
+
```
|
51 |
+
|
52 |
+
Took approximately 4 days for us. We had the time to wait, but this could be scaled out by doing partial pile scans on multiple instances of this script and merging the relevant buckets. We fixed PYTHONHASHSEED to ensure reproducibility of bucket hashing in case you need to stop and start.
|
53 |
+
|
54 |
+
6. Sort the generated 13-grams.
|
55 |
+
```bash
|
56 |
+
python -m scripts.clean_training_data.sort_13_gram_buckets \
|
57 |
+
-dir path/to/working/directory/output
|
58 |
+
```
|
59 |
+
|
60 |
+
Took approximately 5 days for us. You could speed this up by spreading the files around to different machines and running the sort script before gathering them together.
|
61 |
+
|
62 |
+
7. Compress the sorted 13 grams files and place them together with info.json.
|
63 |
+
|
64 |
+
This step only takes a few hours.
|
65 |
+
|
66 |
+
```bash
|
67 |
+
python -m scripts.clean_training_data.compress_and_package \
|
68 |
+
-dir path/to/working/directory \
|
69 |
+
-output path/to/final/directory \
|
70 |
+
-procs 8
|
71 |
+
```
|
scripts/yans/lm-evaluation-harness/docs/img/fewshot_example_gpt3.png
ADDED
scripts/yans/lm-evaluation-harness/docs/interface.md
ADDED
@@ -0,0 +1,162 @@
1 |
+
# User Guide
|
2 |
+
|
3 |
+
This document details the interface exposed by `lm-eval` and provides details on what flags are available to users.
|
4 |
+
|
5 |
+
## Command-line Interface
|
6 |
+
|
7 |
+
A majority of users run the library by cloning it from Github, installing the package as editable, and running the `python -m lm_eval` script.
|
8 |
+
|
9 |
+
Equivalently, running the library can be done via the `lm-eval` entrypoint at the command line.
|
10 |
+
|
11 |
+
This mode supports a number of command-line arguments, the details of which can also be seen by running with `-h` or `--help`:
|
12 |
+
|
13 |
+
- `--model` : Selects which model type or provider is evaluated. Must be a string corresponding to the name of the model type/provider being used. See [the main README](https://github.com/EleutherAI/lm-evaluation-harness/tree/main#model-apis-and-inference-servers) for a full list of enabled model names and supported libraries or APIs.
|
14 |
+
|
15 |
+
- `--model_args` : Controls parameters passed to the model constructor. Accepts a string containing comma-separated keyword arguments to the model class of the format `"arg1=val1,arg2=val2,..."`, such as, for example, `--model_args pretrained=EleutherAI/pythia-160m,dtype=float32`. For a full list of supported keyword arguments, see the initialization of the `lm_eval.api.model.LM` subclass in question, e.g. [`HFLM`](https://github.com/EleutherAI/lm-evaluation-harness/blob/365fcda9b85bbb6e0572d91976b8daf409164500/lm_eval/models/huggingface.py#L66)
|
16 |
+
|
17 |
+
- `--tasks` : Determines which tasks or task groups are evaluated. Accepts a comma-separated list of task names or task group names. Must be solely comprised of valid tasks/groups. A list of supported tasks can be viewed with `--tasks list`.
|
18 |
+
|
19 |
+
- `--num_fewshot` : Sets the number of few-shot examples to place in context. Must be an integer.
|
20 |
+
|
21 |
+
- `--gen_kwargs` : takes an arg string in same format as `--model_args` and creates a dictionary of keyword arguments. These will be passed to the models for all called `generate_until` (free-form or greedy generation task) tasks, to set options such as the sampling temperature or `top_p` / `top_k`. For a list of what args are supported for each model type, reference the respective library's documentation (for example, the documentation for `transformers.AutoModelForCausalLM.generate()`.) These kwargs will be applied to all `generate_until` tasks called--we do not currently support unique gen_kwargs or batch_size values per task in a single run of the library. To control these on a per-task level, set them in that task's YAML file.
|
22 |
+
|
23 |
+
- `--batch_size` : Sets the batch size used for evaluation. Can be a positive integer or `"auto"` to automatically select the largest batch size that will fit in memory, speeding up evaluation. One can pass `--batch_size auto:N` to re-select the maximum batch size `N` times during evaluation. This can help accelerate evaluation further, since `lm-eval` sorts documents in descending order of context length.
|
24 |
+
|
25 |
+
- `--max_batch_size` : Sets the maximum batch size to try to fit in memory, if `--batch_size auto` is passed.
|
26 |
+
|
27 |
+
- `--device` : Sets which device to place the model onto. Must be a string, for example, `"cuda", "cuda:0", "cpu", "mps"`. Defaults to "cuda", and can be ignored if running multi-GPU or running a non-local model type.
|
28 |
+
|
29 |
+
- `--output_path` : A string of the form `dir/file.jsonl` or `dir/`. Provides a path where high-level results will be saved, either into the file named or into the directory named. If `--log_samples` is passed as well, then per-document outputs and metrics will be saved into the directory as well.
|
30 |
+
|
31 |
+
- `--log_samples` : If this flag is passed, then the model's outputs, and the text fed into the model, will be saved at per-document granularity. Must be used with `--output_path`.
|
32 |
+
|
33 |
+
- `--limit` : Accepts an integer, or a float between 0.0 and 1.0 . If passed, will limit the number of documents to evaluate to the first X documents (if an integer) per task or first X% of documents per task. Useful for debugging, especially on costly API models.
|
34 |
+
|
35 |
+
- `--use_cache` : Should be a path where a sqlite db file can be written to. Takes a string of format `/path/to/sqlite_cache_` in order to create a cache db at `/path/to/sqlite_cache_rank{i}.db` for each process (0-NUM_GPUS). This allows results of prior runs to be cached, so that there is no need to re-run results in order to re-score or re-run a given (model, task) pair again.
|
36 |
+
|
37 |
+
- `--cache_requests` : Can be "true", "refresh", or "delete". "true" means that the cache should be used. "refresh" means that you wish to regenerate the cache, which you should run if you change your dataset configuration for a given task. "delete" will delete the cache. Cached files are stored under lm_eval/cache/.cache unless you specify a different path via the environment variable: `LM_HARNESS_CACHE_PATH`. e.g. `LM_HARNESS_CACHE_PATH=~/Documents/cache_for_lm_harness`.
|
38 |
+
|
39 |
+
- `--check_integrity` : If this flag is used, the library tests for each task selected are run to confirm task integrity.
|
40 |
+
|
41 |
+
- `--write_out` : Used for diagnostic purposes to observe the format of task documents passed to a model. If this flag is used, then prints the prompt and gold target string for the first document of each task.
|
42 |
+
|
43 |
+
- `--show_config` : If used, prints the full `lm_eval.api.task.TaskConfig` contents (non-default settings in the task YAML file) for each task which was run, at the completion of an evaluation. Useful when modifying a task's configuration YAML locally, to record the exact configuration used for debugging or reproducibility purposes.
|
44 |
+
|
45 |
+
- `--include_path` : Accepts a path to a folder. If passed, then all YAML files containing `lm-eval` compatible task configurations will be added to the task registry as available tasks. Used for when one is writing config files for their own task in a folder other than `lm_eval/tasks/`.
|
46 |
+
|
47 |
+
- `--system_instruction`: Specifies a system instruction string to prepend to the prompt.
|
48 |
+
|
49 |
+
- `--apply_chat_template` : If this flag is on, a chat template will be applied to the prompt. For Hugging Face models, the chat template is taken from the tokenizer; if the tokenizer does not have a chat template, a default one will be applied. For other models, chat templating is not currently implemented.
|
50 |
+
|
51 |
+
- `--fewshot_as_multiturn` : If this flag is on, the Fewshot examples are treated as a multi-turn conversation. Questions are provided as user content and answers are provided as assistant responses. Requires `--num_fewshot` to be set to be greater than 0, and `--apply_chat_template` to be on.
|
52 |
+
|
53 |
+
- `--predict_only`: Generates the model outputs without computing metrics. Use with `--log_samples` to retrieve decoded results.
|
54 |
+
|
55 |
+
* `--seed`: Set seed for python's random, numpy and torch. Accepts a comma-separated list of 3 values for python's random, numpy, and torch seeds, respectively, or a single integer to set the same seed for all three. The values are either an integer or 'None' to not set the seed. Default is `0,1234,1234` (for backward compatibility). E.g. `--seed 0,None,8` sets `random.seed(0)` and `torch.manual_seed(8)`. Here numpy's seed is not set since the second value is `None`. E.g, `--seed 42` sets all three seeds to 42.
|
56 |
+
|
57 |
+
* `--wandb_args`: Tracks logging to Weights and Biases for evaluation runs and includes args passed to `wandb.init`, such as `project` and `job_type`. Full list [here](https://docs.wandb.ai/ref/python/init). e.g., ```--wandb_args project=test-project,name=test-run```
|
58 |
+
|
59 |
+
* `--hf_hub_log_args` : Logs evaluation results to Hugging Face Hub. Accepts a string with the arguments separated by commas. Available arguments:
|
60 |
+
* `hub_results_org` - organization name on Hugging Face Hub, e.g., `EleutherAI`. If not provided, the results will be pushed to the owner of the Hugging Face token,
|
61 |
+
* `hub_repo_name` - repository name on Hugging Face Hub (deprecated, `details_repo_name` and `results_repo_name` should be used instead), e.g., `lm-eval-results`,
|
62 |
+
* `details_repo_name` - repository name on Hugging Face Hub to store details, e.g., `lm-eval-results`,
|
63 |
+
* `results_repo_name` - repository name on Hugging Face Hub to store results, e.g., `lm-eval-results`,
|
64 |
+
* `push_results_to_hub` - whether to push results to Hugging Face Hub, can be `True` or `False`,
|
65 |
+
* `push_samples_to_hub` - whether to push samples results to Hugging Face Hub, can be `True` or `False`. Requires `--log_samples` to be set,
|
66 |
+
* `public_repo` - whether the repository is public, can be `True` or `False`,
|
67 |
+
* `leaderboard_url` - URL to the leaderboard, e.g., `https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard`.
|
68 |
+
* `point_of_contact` - Point of contact for the results dataset, e.g., `[email protected]`.
|
69 |
+
* `gated` - whether to gate the details dataset, can be `True` or `False`.
|
70 |
+
|
71 |
+
## External Library Usage
|
72 |
+
|
73 |
+
We also support using the library's external API for use within model training loops or other scripts.
|
74 |
+
|
75 |
+
`lm_eval` supplies two functions for external import and use: `lm_eval.evaluate()` and `lm_eval.simple_evaluate()`.
|
76 |
+
|
77 |
+
`simple_evaluate()` can be used by simply creating an `lm_eval.api.model.LM` subclass that implements the methods described in the [Model Guide](https://github.com/EleutherAI/lm-evaluation-harness/tree/main/docs/model_guide.md), and wrapping your custom model in that class as follows:
|
78 |
+
|
79 |
+
```python
|
80 |
+
import lm_eval
|
81 |
+
...
|
82 |
+
|
83 |
+
my_model = initialize_my_model() # create your model (could be running finetuning with some custom modeling code)
|
84 |
+
...
|
85 |
+
# instantiate an LM subclass that takes your initialized model and can run
|
86 |
+
# - `Your_LM.loglikelihood()`
|
87 |
+
# - `Your_LM.loglikelihood_rolling()`
|
88 |
+
# - `Your_LM.generate_until()`
|
89 |
+
lm_obj = Your_LM(model=my_model, batch_size=16)
|
90 |
+
|
91 |
+
# indexes all tasks from the `lm_eval/tasks` subdirectory.
|
92 |
+
# Alternatively, you can set `TaskManager(include_path="path/to/my/custom/task/configs")`
|
93 |
+
# to include a set of tasks in a separate directory.
|
94 |
+
task_manager = lm_eval.tasks.TaskManager()
|
95 |
+
|
96 |
+
# Setting `task_manager` to the one above is optional and should generally be done
|
97 |
+
# if you want to include tasks from paths other than ones in `lm_eval/tasks`.
|
98 |
+
# `simple_evaluate` will instantiate its own task_manager if it is set to None here.
|
99 |
+
results = lm_eval.simple_evaluate( # call simple_evaluate
|
100 |
+
model=lm_obj,
|
101 |
+
tasks=["taskname1", "taskname2"],
|
102 |
+
num_fewshot=0,
|
103 |
+
task_manager=task_manager,
|
104 |
+
...
|
105 |
+
)
|
106 |
+
```
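
The return value is a dictionary of results. As a small, hedged sketch of how one might inspect it inside a training loop (the key layout shown assumes the standard output format, with per-task metrics nested under `results["results"]`; `simple_evaluate` may also return `None` on non-primary ranks in distributed settings):

```python
if results is not None:
    for task_name, metrics in results["results"].items():
        print(task_name, metrics)
```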
|
107 |
+
|
108 |
+
See the `simple_evaluate()` and `evaluate()` functions in [lm_eval/evaluator.py](../lm_eval/evaluator.py#:~:text=simple_evaluate) for a full description of all arguments available. All keyword arguments to simple_evaluate share the same role as the command-line flags described previously.
|
109 |
+
|
110 |
+
Additionally, the `evaluate()` function offers the core evaluation functionality provided by the library, but without some of the special handling and simplification + abstraction provided by `simple_evaluate()`.
|
111 |
+
|
112 |
+
As a brief example usage of `evaluate()`:
|
113 |
+
|
114 |
+
```python
|
115 |
+
import lm_eval
|
116 |
+
|
117 |
+
# suppose you've defined a custom lm_eval.api.Task subclass in your own external codebase
|
118 |
+
from my_tasks import MyTask1
|
119 |
+
...
|
120 |
+
|
121 |
+
# create your model (could be running finetuning with some custom modeling code)
|
122 |
+
my_model = initialize_my_model()
|
123 |
+
...
|
124 |
+
|
125 |
+
# instantiate an LM subclass that takes your initialized model and can run
|
126 |
+
# - `Your_LM.loglikelihood()`
|
127 |
+
# - `Your_LM.loglikelihood_rolling()`
|
128 |
+
# - `Your_LM.generate_until()`
|
129 |
+
lm_obj = Your_LM(model=my_model, batch_size=16)
|
130 |
+
|
131 |
+
# optional: the task_manager indexes tasks including ones
|
132 |
+
# specified by the user through `include_path`.
|
133 |
+
task_manager = lm_eval.tasks.TaskManager(
|
134 |
+
include_path="/path/to/custom/yaml"
|
135 |
+
)
|
136 |
+
|
137 |
+
# To get a task dict for `evaluate`
|
138 |
+
task_dict = lm_eval.tasks.get_task_dict(
|
139 |
+
[
|
140 |
+
"mmlu", # A stock task
|
141 |
+
"my_custom_task", # A custom task
|
142 |
+
{
|
143 |
+
"task": ..., # A dict that configures a task
|
144 |
+
"doc_to_text": ...,
|
145 |
+
},
|
146 |
+
MyTask1 # A task object from `lm_eval.task.Task`
|
147 |
+
],
|
148 |
+
task_manager # A task manager that allows lm_eval to
|
149 |
+
# load the task during evaluation.
|
150 |
+
# If none is provided, `get_task_dict`
|
151 |
+
# will instantiate one itself, but this
|
152 |
+
# only includes the stock tasks so users
|
153 |
+
# will need to set this if including
|
154 |
+
# custom paths is required.
|
155 |
+
)
|
156 |
+
|
157 |
+
results = evaluate(
|
158 |
+
lm=lm_obj,
|
159 |
+
task_dict=task_dict,
|
160 |
+
...
|
161 |
+
)
|
162 |
+
```
|
scripts/yans/lm-evaluation-harness/docs/model_guide.md
ADDED
@@ -0,0 +1,163 @@
1 |
+
# New Model Guide
|
2 |
+
|
3 |
+
This guide may be of special interest to users who are using the library outside of the repository, via installing the library via pypi and calling `lm_eval.evaluator.evaluate()` to evaluate an existing model.
|
4 |
+
|
5 |
+
In order to properly evaluate a given LM, we require implementation of a wrapper class subclassing the `lm_eval.api.model.LM` class, that defines how the Evaluation Harness should interface with your model. This guide walks through how to write this `LM` subclass via adding it to the library!
|
6 |
+
|
7 |
+
## Setup
|
8 |
+
|
9 |
+
To get started contributing, go ahead and fork the main repo, clone it, create a branch with the name of your model, and install the project requirements in your environment:
|
10 |
+
|
11 |
+
```sh
|
12 |
+
# After forking...
|
13 |
+
git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git
|
14 |
+
cd lm-evaluation-harness
|
15 |
+
git checkout -b <model-type>
|
16 |
+
pip install -e ".[dev]"
|
17 |
+
```
|
18 |
+
|
19 |
+
Now, we'll create a new file where we'll be adding our model:
|
20 |
+
|
21 |
+
```sh
|
22 |
+
touch lm_eval/models/<my_model_filename>.py
|
23 |
+
```
|
24 |
+
|
25 |
+
**Tip: this filename should not shadow package names! For example, naming your file `anthropic.py` is disallowed since the API's name on pypi is `anthropic`, but naming it `anthropic_llms.py` works with no problems.**
|
26 |
+
|
27 |
+
## Interface
|
28 |
+
|
29 |
+
All models must subclass the `lm_eval.api.model.LM` class.
|
30 |
+
|
31 |
+
The LM class enforces a common interface via which we can extract responses from a model:
|
32 |
+
|
33 |
+
```python
|
34 |
+
class MyCustomLM(LM):
|
35 |
+
#...
|
36 |
+
def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
|
37 |
+
#...
|
38 |
+
|
39 |
+
|
40 |
+
def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
|
41 |
+
#...
|
42 |
+
|
43 |
+
|
44 |
+
def generate_until(self, requests: list[Instance]) -> list[str]:
|
45 |
+
#...
|
46 |
+
#...
|
47 |
+
```
|
48 |
+
Where `Instance` is a dataclass defined in [`lm_eval.api.instance`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/api/instance.py) with property `args` of request-dependent type signature described below.
|
49 |
+
|
50 |
+
We support three types of requests, consisting of different interactions / measurements with an autoregressive LM.
|
51 |
+
|
52 |
+
All three request types take as input `requests` of type `list[Instance]` that have a matching `Instance.request_type` to the method name.
|
53 |
+
|
54 |
+
- `generate_until`
|
55 |
+
- Each request contains `Instance.args : Tuple[str, dict]` containing 1. an input string to the LM and 2. a dictionary of keyword arguments used to control generation parameters.
|
56 |
+
- Using this input and these generation parameters, text will be sampled from the language model (typically until a maximum output length or specific stopping string sequences--for example, `{"until": ["\n\n", "."], "max_gen_toks": 128}`).
|
57 |
+
- The generated input+output text from the model will then be returned.
|
58 |
+
|
59 |
+
- `loglikelihood`
|
60 |
+
- Each request contains `Instance.args : Tuple[str, str]` containing 1. an input string to the LM and 2. a target string on which the loglikelihood of the LM producing this target, conditioned on the input, will be returned.
|
61 |
+
- Each request will have, as result, `(ll, is_greedy): Tuple[float, int]` returned, where `ll` is a floating point number representing the log probability of generating the target string conditioned on the input, and `is_greedy` being either `0` or `1`, with it being `1` if and only if the target string *would be generated by greedy sampling from the LM* (that is, if the target string is the *most likely* N-token string to be output by the LM given the input).
|
62 |
+
|
63 |
+
- `loglikelihood_rolling`
|
64 |
+
- Each request contains `Instance.args : Tuple[str]`, which is an input string to the model whose *entire* loglikelihood, conditioned on purely the EOT token, will be calculated.
|
65 |
+
- This is used to evaluate *perplexity* on a data distribution.
|
66 |
+
- It should return `(ll,) : Tuple[float]` , a.k.a. solely the *loglikelihood* of producing each piece of text given no starting input.
|
67 |
+
|
68 |
+
|
69 |
+
To allow a model to be evaluated on all types of tasks, you will need to implement these three types of measurements (note that `loglikelihood_rolling` is a special case of `loglikelihood`). For a reference implementation, check out `lm_eval/models/huggingface.py` ! Additionally, check out `lm_eval.api.model.TemplateLM` for a class that abstracts away some commonly used functions across LM subclasses, or see if your model would lend itself well to subclassing the `lm_eval.models.huggingface.HFLM` class and overriding just the initialization or a couple methods!
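
To make the required signatures concrete, below is a deliberately trivial sketch of an `LM` subclass that just returns placeholder values whose signatures mirror the skeleton above; a real implementation would query an actual model, as `huggingface.py` does.

```python
from lm_eval.api.instance import Instance
from lm_eval.api.model import LM


class MyPlaceholderLM(LM):
    def loglikelihood(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # each request.args is (context, continuation)
        return [(0.0, False) for _ in requests]

    def loglikelihood_rolling(self, requests: list[Instance]) -> list[tuple[float, bool]]:
        # each request.args is (text,)
        return [(0.0, False) for _ in requests]

    def generate_until(self, requests: list[Instance]) -> list[str]:
        # each request.args is (context, gen_kwargs)
        return ["" for _ in requests]
```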
|
70 |
+
|
71 |
+
**Tip: be careful of indexing in loglikelihood!**
|
72 |
+
|
73 |
+
|
74 |
+
LMs take in tokens in position `[0 1 2 ... N]` and output a probability distribution for token position `N+1`. We provide a simplified graphic here, excerpted from `huggingface.py`:
|
75 |
+
|
76 |
+
```
|
77 |
+
# how this all works (illustrated on a causal decoder-only setup):
|
78 |
+
# CTX CONT
|
79 |
+
# inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1]
|
80 |
+
# model \ \
|
81 |
+
# logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the
|
82 |
+
# cont_toks 4 5 6 7 8 9 [:, -len(continuation_enc):, :self.vocab_size] slice
|
83 |
+
```
|
84 |
+
|
85 |
+
The final token of the target is not passed into the LM, because we want the LM's predictions *up to but not past* that final target token. For more information, check out https://github.com/EleutherAI/lm-evaluation-harness/issues/942 .
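
The following is a simplified, illustrative sketch of that slicing with a Hugging Face causal LM. It is not the `HFLM` implementation: real code also has to batch requests and handle tokenizer boundary effects when splitting the context from the continuation.

```python
import torch
import torch.nn.functional as F


def score_continuation(model, tokenizer, context: str, continuation: str):
    ctx_enc = tokenizer.encode(context)
    cont_enc = tokenizer.encode(continuation)
    inp = torch.tensor([ctx_enc + cont_enc])

    with torch.no_grad():
        # drop the final token: we want predictions up to, but not past, it
        logits = model(inp[:, :-1]).logits  # [1, seq_len - 1, vocab]

    logprobs = F.log_softmax(logits, dim=-1)
    # keep only the positions whose predictions correspond to the continuation
    cont_logprobs = logprobs[0, -len(cont_enc):, :]
    targets = torch.tensor(cont_enc)

    ll = cont_logprobs.gather(1, targets.unsqueeze(-1)).sum().item()
    is_greedy = bool((cont_logprobs.argmax(dim=-1) == targets).all())
    return ll, is_greedy
```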
|
86 |
+
|
87 |
+
## Registration
|
88 |
+
|
89 |
+
Congrats on implementing your model! Now it's time to test it out.
|
90 |
+
|
91 |
+
To make your model usable via the command line interface to `lm-eval` using `python -m lm_eval`, you'll need to tell `lm-eval` what your model's name is.
|
92 |
+
|
93 |
+
This is done via a *decorator*, `lm_eval.api.registry.register_model`. Using `register_model()`, one can both tell the package what the model's name(s) to be used are when invoking it with `python -m lm_eval --model <name>` and alert `lm-eval` to the model's existence.
|
94 |
+
|
95 |
+
```python
|
96 |
+
from lm_eval.api.registry import register_model
|
97 |
+
|
98 |
+
@register_model("<name1>", "<name2>")
|
99 |
+
class MyCustomLM(LM):
|
100 |
+
```
|
101 |
+
|
102 |
+
Using this decorator results in the class being added to an accounting of the usable LM types maintained internally to the library at `lm_eval.api.registry.MODEL_REGISTRY`. See `lm_eval.api.registry` for more detail on what sorts of registries and decorators exist in the library!
|
103 |
+
|
104 |
+
**Tip: be sure to import your model in `lm_eval/models/__init__.py!`**
|
105 |
+
|
106 |
+
## Testing
|
107 |
+
|
108 |
+
We also recommend that new model contributions be accompanied by short tests of their 3 core functionalities, at minimum. To see an example of such tests, look at https://github.com/EleutherAI/lm-evaluation-harness/blob/35bdecd379c0cefad6897e67db892f4a6026a128/tests/test_ggml.py .
|
109 |
+
|
110 |
+
## Chat Templating
|
111 |
+
|
112 |
+
Many models are fine-tuned with a [Chat Template](https://huggingface.co/docs/transformers/main/en/chat_templating) in order to enable back-and-forth interaction between a "User"'s queries and the model (often called "Assistant")'s responses. It can be desirable to evaluate fine-tuned models on evaluation tasks while wrapped in the conversational format they expect.
|
113 |
+
|
114 |
+
In order to make your model optionally compatible with a chat format, three additional methods must be implemented:
|
115 |
+
|
116 |
+
```python
|
117 |
+
class MyCustomLM(LM):
|
118 |
+
#...
|
119 |
+
@property
|
120 |
+
def tokenizer_name(self) -> str:
|
121 |
+
# should return a string denoting the name of the model's tokenizer and/or the accompanying chat template.
|
122 |
+
|
123 |
+
@property
|
124 |
+
def chat_template(self) -> str:
|
125 |
+
# should return a chat template formatting string that is used to build prompt from a user/assistant chat history.
|
126 |
+
# this will be saved in the evaluation results for reproducibility.
|
127 |
+
|
128 |
+
def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
|
129 |
+
# responsible for taking as input a chat history that would be fed into the model, and
|
130 |
+
# rendering it as a string that can be then tokenized and input into the model.
|
131 |
+
#...
|
132 |
+
```
|
133 |
+
|
134 |
+
- `apply_chat_template`
|
135 |
+
- This method performs the bulk of the work required for chat-formatting.
|
136 |
+
- As input, a `chat_history: List[Dict[str, str]]` is passed in. This is a transcript of a conversation of a form similar to
|
137 |
+
```
|
138 |
+
[
|
139 |
+
{"system": <user-provided system message such as "You are a helpful math-focused chatbot">},
|
140 |
+
{"user": <task example - a few-shot example 'input'>}
|
141 |
+
{"assistant": <correct response to the above example>},
|
142 |
+
# ... more few-shot examples, potentially
|
143 |
+
{"user": <test set query--response on which we will evaluate>},
|
144 |
+
]
|
145 |
+
```
|
146 |
+
which can then be converted into a string input.
|
147 |
+
- The output is a string representing this conversation that can be fed into the model.
|
148 |
+
- For example, this consists of simply calling `tokenizer.apply_chat_template` for HFLM--see the implementation there for reference.
|
149 |
+
- `tokenizer_name`
|
150 |
+
- LM Eval Harness supports [caching requests](https://github.com/EleutherAI/lm-evaluation-harness/blob/4902aaaf1f374682f95ac25fe2e13b23faddc91a/lm_eval/__main__.py#L140) that are sent to a model, for faster setup when repeating an already-performed evaluation.
|
151 |
+
- However, we don't want to use the cache of chat transcripts rendered using one chat template or system prompt to send to a model with a different template! So, we use this `lm.tokenizer_name` string to distinguish caches for a given model (and chat template) from one another.
|
152 |
+
- `chat_template`
|
153 |
+
- Chat templates are typically provided as a Jinja template string or a string formatted with str.format to include user and assistant messages in a single prompt. This template string is saved in the evaluation results to ensure reproducibility.
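
For models backed by a Hugging Face tokenizer, all three methods can often be thin wrappers around the tokenizer. The sketch below is illustrative rather than a drop-in implementation (attribute names such as `self.tokenizer` are assumptions about your class):

```python
from typing import Dict, List

class MyCustomLM(LM):
    # ... existing methods ...

    @property
    def tokenizer_name(self) -> str:
        # a stable identifier used to keep caches for different templates separate
        return self.tokenizer.name_or_path.replace("/", "__")

    @property
    def chat_template(self) -> str:
        return self.tokenizer.chat_template or ""

    def apply_chat_template(self, chat_history: List[Dict[str, str]]) -> str:
        return self.tokenizer.apply_chat_template(
            chat_history, tokenize=False, add_generation_prompt=True
        )
```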
|
154 |
+
|
155 |
+
If not implemented for a given model type, the flags `--apply_chat_template` , `--fewshot_as_multiturn`, and `--system_instruction` cannot be used.
|
156 |
+
|
157 |
+
## Other
|
158 |
+
|
159 |
+
**Pro tip**: In order to make the Evaluation Harness overestimate total runtimes rather than underestimate it, HuggingFace models come in-built with the ability to provide responses on data points in *descending order by total input length* via `lm_eval.utils.Reorderer`. Take a look at `lm_eval.models.hf_causal.HFLM` to see how this is done, and see if you can implement it in your own model!
|
160 |
+
|
161 |
+
## Conclusion
|
162 |
+
|
163 |
+
After reading this guide, you should be able to add new model APIs or implementations to the Eval Harness library!
|
scripts/yans/lm-evaluation-harness/docs/new_task_guide.md
ADDED
@@ -0,0 +1,492 @@
1 |
+
# New Task Guide
|
2 |
+
|
3 |
+
`lm-evaluation-harness` is a framework that strives to support a wide range of zero- and few-shot evaluation tasks on autoregressive language models (LMs).
|
4 |
+
|
5 |
+
This documentation page provides a walkthrough to get started creating your own task, in `lm-eval` versions v0.4.0 and later.
|
6 |
+
|
7 |
+
A more interactive tutorial is available as a Jupyter notebook [here](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/examples/lm-eval-overview.ipynb).
|
8 |
+
|
9 |
+
## Setup
|
10 |
+
|
11 |
+
If you haven't already, go ahead and fork the main repo, clone it, create a branch with the name of your task, and install the project requirements in your environment:
|
12 |
+
|
13 |
+
```sh
|
14 |
+
# After forking...
|
15 |
+
git clone https://github.com/<YOUR-USERNAME>/lm-evaluation-harness.git
|
16 |
+
cd lm-evaluation-harness
|
17 |
+
git checkout -b <task-name>
|
18 |
+
pip install -e ".[dev]"
|
19 |
+
```
|
20 |
+
|
21 |
+
In this document, we'll walk through the basics of implementing a static benchmark evaluation in two formats: a *generative* task which requires sampling text from a model, such as [`gsm8k`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k.yaml), and a *discriminative*, or *multiple choice*, task where the model picks the most likely of several fixed answer choices, such as [`sciq`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/sciq/sciq.yaml).
|
22 |
+
|
23 |
+
## Creating a YAML file
|
24 |
+
|
25 |
+
To implement a new standard task, we'll need to write a YAML file which configures our task logic. We start by making a new empty YAML file. This file can have any name, but we recommend placing it in a subfolder of `lm_eval/tasks` titled by the dataset or task's shorthand name: for example,
|
26 |
+
|
27 |
+
```sh
|
28 |
+
touch lm_eval/tasks/<dataset_name>/<my_new_task_name>.yaml
|
29 |
+
```
|
30 |
+
Or, copy the template subfolder we provide from `templates/new_yaml_task`:
|
31 |
+
```sh
|
32 |
+
cp -r templates/new_yaml_task lm_eval/tasks/
|
33 |
+
```
|
34 |
+
and rename the folders and YAML file(s) as desired.
|
35 |
+
|
36 |
+
### Selecting and configuring a dataset
|
37 |
+
|
38 |
+
All data downloading and management is handled through the HuggingFace (**HF**) [`datasets`](https://github.com/huggingface/datasets) API. So, the first thing you should do is check to see if your task's dataset is already provided in their catalog [here](https://huggingface.co/datasets). If it's not in there, please consider adding it to their Hub to make it accessible to a wider user base by following their [new dataset guide](https://github.com/huggingface/datasets/blob/main/ADD_NEW_DATASET.md).
|
40 |
+
|
41 |
+
Once you have a HuggingFace dataset prepared for your task, we want to assign our new YAML to use this dataset:
|
42 |
+
|
43 |
+
```yaml
|
44 |
+
dataset_path: ... # the name of the dataset on the HF Hub.
|
45 |
+
dataset_name: ... # the dataset configuration to use. Leave `null` if your dataset does not require a config to be passed. See https://huggingface.co/docs/datasets/load_hub#configurations for more info.
|
46 |
+
dataset_kwargs: null # any extra keyword arguments that should be passed to the dataset constructor, e.g. `data_dir`.
|
47 |
+
```
|
48 |
+
|
49 |
+
Next, we'd like to tell our task what the dataset's train, validation, and test splits are named, if they exist:
|
50 |
+
|
51 |
+
```yaml
|
52 |
+
training_split: <split name of training set, or `null`>
|
53 |
+
validation_split: <split name of val. set, or `null`>
|
54 |
+
test_split: <split name of test set, or `null`>
|
55 |
+
```
|
56 |
+
Tests will run on the `test_split` if it is available, and otherwise evaluate on the `validation_split`.
|
57 |
+
|
58 |
+
We can also specify from which split the task should retrieve few-shot examples via:
|
59 |
+
```yaml
|
60 |
+
fewshot_split: <split name to draw fewshot examples from, or `null`>
|
61 |
+
```
|
62 |
+
or by hardcoding them, either using the following in the yaml file:
|
63 |
+
```yaml
|
64 |
+
fewshot_config:
|
65 |
+
sampler: first_n
|
66 |
+
samples: [
|
67 |
+
{<sample 1>},
|
68 |
+
{<sample 2>},
|
69 |
+
]
|
70 |
+
```
|
71 |
+
or by adding the function `list_fewshot_samples` in the associated utils.py file:
|
72 |
+
```python
|
73 |
+
def list_fewshot_samples() -> list[dict]:
|
74 |
+
return [{<sample 1>}, {<sample 2>}]
|
75 |
+
```
|
76 |
+
See `lm_eval/tasks/minerva_math/minerva_math_algebra.yaml` for an example of the latter, and `lm_eval/tasks/gsm8k/gsm8k-cot.yaml` for an example of the former.
|
77 |
+
|
78 |
+
In this case, each sample must contain the same fields as the samples in the above sets--for example, if `doc_to_text` expects an `input` field when rendering input prompts, these provided samples must include an `input` key.
|
79 |
+
|
80 |
+
If neither of the above options is set, we will default to the train/validation/test sets, in that order.
|
81 |
+
|
82 |
+
|
83 |
+
Finally, our dataset may not be already in the exact format we want. Maybe we have to strip whitespace and special characters via a regex from our dataset's "question" field! Or maybe we just want to rename its columns to match a convention we'll be using for our prompts.
|
84 |
+
|
85 |
+
Let's create a python file in the directory where we're writing our YAML file:
|
86 |
+
```bash
|
87 |
+
touch lm_eval/tasks/<dataset_name>/utils.py
|
88 |
+
```
|
89 |
+
Now, in `utils.py` we'll write a function to process each split of our dataset:
|
90 |
+
|
91 |
+
TODO: Change the example to one that's in the tasks/
|
92 |
+
|
93 |
+
```python
|
94 |
+
def process_docs(dataset: datasets.Dataset):
|
95 |
+
def _helper(doc):
|
96 |
+
# modifies the contents of a single
|
97 |
+
# document in our dataset.
|
98 |
+
doc["choices"] = [doc["choice1"], doc["choice2"], doc["wrong_answer"]]
|
99 |
+
doc["gold"] = doc["label"]
|
100 |
+
return doc
|
101 |
+
|
102 |
+
return dataset.map(_helper) # returns back a datasets.Dataset object
|
103 |
+
```
|
104 |
+
|
105 |
+
Now, in our YAML config file we'll use the `!function` constructor, and tell the config where our imported Python function will come from. At runtime, before doing anything else we will preprocess our dataset according to this function!
|
106 |
+
```yaml
|
107 |
+
process_docs: !function utils.process_docs
|
108 |
+
```
|
109 |
+
|
110 |
+
### Using Local Datasets
|
111 |
+
|
112 |
+
To load a local dataset for evaluation, you can specify data files in the `dataset_kwargs` field, such as the following for JSON files:
|
113 |
+
|
114 |
+
```
|
115 |
+
dataset_path: json
|
116 |
+
dataset_name: null
|
117 |
+
dataset_kwargs:
|
118 |
+
data_files: /path/to/my/json
|
119 |
+
```
|
120 |
+
Or with files already split into separate directories:
|
121 |
+
|
122 |
+
```
|
123 |
+
dataset_path: arrow
|
124 |
+
dataset_kwargs:
|
125 |
+
data_files:
|
126 |
+
train: /path/to/arrow/train/data-00000-of-00001.arrow
|
127 |
+
validation: /path/to/arrow/validation/data-00000-of-00001.arrow
|
128 |
+
```
|
129 |
+
|
130 |
+
Alternatively, if you have previously downloaded a dataset from huggingface hub (using `save_to_disk()`) and wish to use the local files, you will need to use `data_dir` under `dataset_kwargs` to point to where the directory is.
|
131 |
+
|
132 |
+
```
|
133 |
+
dataset_path: hellaswag
|
134 |
+
dataset_kwargs:
|
135 |
+
data_dir: hellaswag_local/
|
136 |
+
```
|
137 |
+
|
138 |
+
You can also set `dataset_path` as a directory path in your local system. This will assume that there is a loading script with the same name as the directory. [See datasets docs](https://huggingface.co/docs/datasets/loading#local-loading-script).
|
139 |
+
|
140 |
+
## Writing a Prompt Template
|
141 |
+
|
142 |
+
The next thing we need to do is decide what format to use when presenting the data to the LM. This is our **prompt**, where we'll define both an input and output format.
|
143 |
+
|
144 |
+
To write a prompt, users will use `doc_to_text`, `doc_to_target`, and `doc_to_choice` (Optional when certain conditions are met).
|
145 |
+
|
146 |
+
`doc_to_text` defines the input string a model will be given, while `doc_to_target` and `doc_to_choice` will be used to generate the target text. `doc_to_target` can be either a text string that refers to the target string or an integer that refers to the index of the correct label. When it is set as an index, `doc_to_choice` must also be set with the appropriate list of possible choice strings.
|
147 |
+
|
148 |
+
### Basic prompts
|
149 |
+
|
150 |
+
If a dataset is straightforward enough, users can enter the feature name directly. This assumes that no preprocessing is required. For example, in [Swag](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/swag/swag.yaml#L10-L11), `doc_to_text` and `doc_to_target` are each given the name of a dataset feature.
|
151 |
+
```yaml
|
152 |
+
doc_to_text: startphrase
|
153 |
+
doc_to_target: label
|
154 |
+
```
|
155 |
+
Hard-coding is also possible as is the case in [SciQ](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/sciq/sciq.yaml#L11).
|
156 |
+
```yaml
|
157 |
+
doc_to_target: 3
|
158 |
+
```
|
159 |
+
`doc_to_choice` can be directly given a list of text as option (See [Toxigen](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/toxigen/toxigen.yaml#L11))
|
160 |
+
```yaml
|
161 |
+
doc_to_choice: ['No', 'Yes']
|
162 |
+
```
|
163 |
+
|
164 |
+
If a dataset feature is already a list, you can set `doc_to_choice` to the name of that feature (see [Hellaswag](https://github.com/EleutherAI/lm-evaluation-harness/blob/e0eda4d3ffa10e5f65e0976161cd134bec61983a/lm_eval/tasks/hellaswag/hellaswag.yaml#L13))
|
165 |
+
```
|
166 |
+
doc_to_choice: choices
|
167 |
+
```
|
168 |
+
|
169 |
+
|
170 |
+
|
171 |
+
### Writing a prompt with Jinja 2
|
172 |
+
|
173 |
+
We support the [Jinja 2](https://jinja.palletsprojects.com/en/3.1.x/) templating language for writing prompts. In practice, this means you can take your dataset's columns and do many basic string manipulations to place each document into prompted format.
|
174 |
+
|
175 |
+
Take for example the dataset `super_glue/boolq`. As input, we'd like to use the features `passage` and `question` and string them together so that for a sample line `doc`, the model sees something in the format of:
|
176 |
+
```
|
177 |
+
doc["passage"]
|
178 |
+
Question: doc["question"]?
|
179 |
+
Answer:
|
180 |
+
```
|
181 |
+
We do this by [writing](https://github.com/EleutherAI/lm-evaluation-harness/blob/1710b42d52d0f327cb0eb3cb1bfbbeca992836ca/lm_eval/tasks/super_glue/boolq/default.yaml#L9C1-L9C61)
|
182 |
+
```yaml
|
183 |
+
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
|
184 |
+
```
|
185 |
+
Such that `{{passage}}` will be replaced by `doc["passage"]` and `{{question}}` with `doc["question"]` when rendering the prompt template.
|
186 |
+
|
187 |
+
Our intended output is for the model to predict a single whitespace, and then the answer to the question. We do this via:
|
188 |
+
```yaml
|
189 |
+
doc_to_target: "{{answer}}"
|
190 |
+
```
|
191 |
+
|
192 |
+
|
193 |
+
**Important**: we now add `target_delimiter` between input and target, which defaults to " ", such that the full input-output string is `doc_to_text(doc) + target_delimiter + doc_to_target(doc)`. `doc_to_text` and `doc_to_target` should not have trailing right or leading left whitespace, respectively.
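
As a small illustration of how the pieces are joined (the strings here are made up for the example):

```python
doc_to_text = "The sky appears blue because...\nQuestion: is the sky blue?\nAnswer:"
target_delimiter = " "
doc_to_target = "yes"

full_string = doc_to_text + target_delimiter + doc_to_target
# -> "The sky appears blue because...\nQuestion: is the sky blue?\nAnswer: yes"
```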
|
194 |
+
|
195 |
+
|
196 |
+
#### Multiple choice format
|
197 |
+
|
198 |
+
For tasks which are multiple choice (a fixed, finite set of label words per document) and evaluated via comparing loglikelihoods of all label words (the `multiple_choice` task output type), we enforce a particular convention on prompt format.
|
199 |
+
|
200 |
+
An annotated example in the case of SciQ is as follows:
|
201 |
+
|
202 |
+
```yaml
|
203 |
+
doc_to_text: "{{support.lstrip()}}\nQuestion: {{question}}\nAnswer:" # This is the input portion of the prompt for this doc. It will have " {{choice}}" appended to it as target for each choice in answer_choices.
|
204 |
+
doc_to_target: 3 # this contains the index into the answer choice list of the correct answer.
|
205 |
+
doc_to_choice: "{{[distractor1, distractor2, distractor3, correct_answer]}}"
|
206 |
+
```
|
207 |
+
Task implementers are thus able to decide what the answer choices should be for a document, and what prompt format to use.
|
208 |
+
|
209 |
+
The label index can also be sourced from a feature directly. For example, in `super_glue/boolq`, the label index is defined in the feature `label`, so we can set `doc_to_target` simply to `label`. The options or verbalizers can be written in the form of a list `["no", "yes"]` that corresponds to the label index.
|
210 |
+
|
211 |
+
```yaml
|
212 |
+
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
|
213 |
+
doc_to_target: label
|
214 |
+
doc_to_choice: ["no", "yes"]
|
215 |
+
```
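Conceptually, the harness scores each entry of `doc_to_choice` as a continuation of the rendered input, and the gold answer is the choice at the index given by `doc_to_target`. A rough, illustrative Python sketch of that convention (not harness internals; the `doc` is invented):

```python
# Illustrative only: how the multiple_choice convention pairs context and choices.
doc = {"passage": "Some passage.", "question": "Is this statement supported", "label": 1}

context = f"{doc['passage']}\nQuestion: {doc['question']}?\nAnswer:"   # doc_to_text
choices = ["no", "yes"]                                                # doc_to_choice
gold_index = doc["label"]                                              # doc_to_target

# One loglikelihood comparison per choice: context + target_delimiter + choice.
requests = [(context, " " + choice) for choice in choices]
# Accuracy asks whether the highest-loglikelihood continuation is choices[gold_index].
```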
|
216 |
+
|
217 |
+
### Using Python Functions for Prompts
|
218 |
+
|
219 |
+
There may be cases where the prompt we want to implement is more easily expressed in Python than in Jinja 2. For this, we can use Python helper functions that are defined in the YAML config. Note that the function's script must be in the same directory as the YAML file.
|
220 |
+
|
221 |
+
A good example is WikiText, which requires a number of regex rules to clean the samples.
|
222 |
+
```python
|
223 |
+
def wikitext_detokenizer(doc):
|
224 |
+
string = doc["page"]
|
225 |
+
# contractions
|
226 |
+
string = string.replace("s '", "s'")
|
227 |
+
string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string)
|
228 |
+
...
|
229 |
+
string = string.replace(" 's", "'s")
|
230 |
+
|
231 |
+
return string
|
232 |
+
```
|
233 |
+
|
234 |
+
We can load this function in `doc_to_target` by using a `!function` operator after `doc_to_target`, followed by `<file name>.<function name>`. In the file [wikitext.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/wikitext/wikitext.yaml) we write:
|
235 |
+
```yaml
|
236 |
+
doc_to_target: !function preprocess_wikitext.wikitext_detokenizer
|
237 |
+
```
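The same mechanism works for the other prompt fields. As a rough illustration (the file name, function name, and dataset fields below are hypothetical, not taken from the harness), a `doc_to_text` helper simply receives the document dict and returns a string:

```python
# Hypothetical contents of utils.py, kept next to the task's YAML file.
# Referenced from the YAML as:  doc_to_text: !function utils.format_question
def format_question(doc: dict) -> str:
    # Field names here are made up for illustration.
    return f"{doc['context'].strip()}\nQuestion: {doc['question']}\nAnswer:"
```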
|
238 |
+
|
239 |
+
### Importing a Prompt from Promptsource
|
240 |
+
|
241 |
+
[Promptsource](https://github.com/bigscience-workshop/promptsource/tree/main/promptsource) is a great repository for crowdsourced prompts for many datasets. We can load these prompts easily by using the `use_prompt` argument and filling it with the format `"promptsource:<name of prompt template>"`. To use this, `doc_to_text` and `doc_to_target` should be left undefined. This will fetch the template of the dataset defined in the YAML file.
|
242 |
+
|
243 |
+
For example, for SuperGLUE BoolQ, if we want to use the prompt template `GPT-3 Style`, we can add this to the YAML file:
|
244 |
+
```yaml
|
245 |
+
use_prompt: "promptsource:GPT-3 Style"
|
246 |
+
```
|
247 |
+
|
248 |
+
If you would like to run the evaluation on all prompt templates, you can specify it this way:
|
249 |
+
```yaml
|
250 |
+
use_prompt: "promptsource:*"
|
251 |
+
```
|
252 |
+
|
253 |
+
### Setting metrics
|
254 |
+
|
255 |
+
You're almost done! Now we need to choose how to score our task.
|
256 |
+
- *If this is a multiple choice task:* do you just want to check your model's accuracy in choosing the correct answer choice?
|
257 |
+
- *If this is a generation task:* do you just want to check how often your model outputs *exactly the ground-truth output string provided*?
|
258 |
+
|
259 |
+
|
260 |
+
If the answer to the above is no: you'll need to record what scoring metrics to use! Metrics can be listed in the following format:
|
261 |
+
|
262 |
+
```yaml
|
263 |
+
metric_list:
|
264 |
+
- metric: <name of the metric here>
|
265 |
+
aggregation: <name of the aggregation fn here>
|
266 |
+
higher_is_better: <true or false>
|
267 |
+
- metric: !function script.function
|
268 |
+
aggregation: ...
|
269 |
+
higher_is_better: ...
|
270 |
+
```
|
271 |
+
`aggregation` and `higher_is_better` can optionally be left out to default to the manually-set defaults if using a natively supported metric; otherwise they must be defined explicitly (for example, when using a custom metric implemented as a function).
|
272 |
+
|
273 |
+
For a full list of natively supported metrics and aggregation functions see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md). All metrics supported in [HuggingFace Evaluate](https://github.com/huggingface/evaluate/tree/main/metrics) can also be used, and will be loaded if a given metric name is not natively supported in `lm-eval` or if `hf_evaluate` is set to `true`.
|
274 |
+
|
275 |
+
### Optional, More Advanced Setup
|
276 |
+
|
277 |
+
Some tasks may require more advanced processing logic than is described in this guide.
|
278 |
+
|
279 |
+
As a heuristic check:
|
280 |
+
* Does your task require generating multiple free-form outputs per input document?
|
281 |
+
* Does your task require complex, multi-step post-processing of generated model outputs?
|
282 |
+
* Does your task require subsetting documents on the fly based on their content?
|
283 |
+
* Do you expect to compute metrics after applying multiple such processing steps on your model outputs?
|
284 |
+
* Does your task rely on metrics that need a custom implementation?
|
285 |
+
|
286 |
+
For more detail on the task system and advanced features, see [`docs/task_guide.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/docs/task_guide.md) . If none of the above sound like they apply to your task, it's time to continue onto checking your task performance!
|
287 |
+
|
288 |
+
### Task name + tags (registering a task)
|
289 |
+
|
290 |
+
To test a task conveniently, it helps to *register* the task--that is, to give it a name and make the `lm-eval` library aware it exists!
|
291 |
+
|
292 |
+
If you're writing your YAML file inside the `lm_eval/tasks` folder, you just need to give your task a name! You can do this inside your YAML file:
|
293 |
+
|
294 |
+
```yaml
|
295 |
+
task: <name of the task>
|
296 |
+
```
|
297 |
+
Including a task name is mandatory.
|
298 |
+
|
299 |
+
It is often also convenient to label your task with several `tag` values, though this field is optional:
|
300 |
+
|
301 |
+
```yaml
|
302 |
+
tag:
|
303 |
+
- tag1
|
304 |
+
- tag2
|
305 |
+
```
|
306 |
+
This will add your task to the `tag1` and `tag2` tags, letting users see how your task is categorized and, if desired, run all tasks under one of these tags at once, yours included.
|
307 |
+
|
308 |
+
|
309 |
+
If your task is not in the `lm_eval/tasks` folder, you'll need to tell the Eval Harness where to look for YAML files.
|
310 |
+
|
311 |
+
You can do this via the `--include_path` argument in `__main__.py`. This argument is used to initialize the `TaskManager` object, which you can also use in your custom scripts.
|
312 |
+
|
313 |
+
```python
|
314 |
+
task_manager = TaskManager(args.verbosity, include_path=args.include_path)
|
315 |
+
```
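For instance, a custom evaluation script might look roughly like the sketch below. The argument names follow our reading of the v0.4 Python API; the model, task name, and include path are placeholders:

```python
import lm_eval
from lm_eval.tasks import TaskManager

# Point the harness at a directory containing your custom YAML files (placeholder path).
task_manager = TaskManager(include_path="/path/to/my/custom/tasks")

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=EleutherAI/pythia-160m",
    tasks=["my_new_task"],      # the `task:` name from your YAML
    num_fewshot=0,
    task_manager=task_manager,
)
```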
|
316 |
+
|
317 |
+
Passing `--tasks /path/to/yaml/file` is also accepted.
|
318 |
+
|
319 |
+
|
320 |
+
### Advanced Group Configs
|
321 |
+
|
322 |
+
While `tag` values are helpful when you want to be able to quickly and conveniently run a set of related tasks via `--tasks my_tag_name`, often, we wish to implement more complex logic. For example, the MMLU benchmark contains 57 *subtasks* that must all be *averaged* together in order to report a final 'MMLU score'.
|
323 |
+
|
324 |
+
Groupings of tasks might also use particular variants of a task--for example, we might want to default to evaluating a task as 5-shot when called as part of a given grouping, but not have a preference for number of shots when evaluating it as a standalone.
|
325 |
+
|
326 |
+
We implement this via **groups**, which are distinct from tags. Groups can be implemented via *group config* YAML files, which are laid out similarly but slightly differently to tasks' YAML configs.
|
327 |
+
|
328 |
+
The most basic form of group can be defined via a YAML config similar to the following:
|
329 |
+
|
330 |
+
```yaml
|
331 |
+
group: nli_tasks
|
332 |
+
task:
|
333 |
+
- cb
|
334 |
+
- anli_r1
|
335 |
+
- rte
|
336 |
+
metadata:
|
337 |
+
version: 1.0
|
338 |
+
```
|
339 |
+
|
340 |
+
This will behave almost identically to a `tag` that includes these 3 tasks, but with one key distinction: we'll print the `nli_tasks` group as a row (with no associated metrics) in our table of outputs, and visually show that these 3 tasks appear under its subheader.
|
341 |
+
|
342 |
+
|
343 |
+
Now, let's assume we actually want to report an aggregate score for `nli_tasks`. We would instead use a YAML config like the following:
|
344 |
+
|
345 |
+
```yaml
|
346 |
+
group: nli_tasks
|
347 |
+
task:
|
348 |
+
- cb
|
349 |
+
- anli_r1
|
350 |
+
- rte
|
351 |
+
aggregate_metric_list:
|
352 |
+
- metric: acc
|
353 |
+
aggregation: mean
|
354 |
+
weight_by_size: true # defaults to `true`. Set this to `false` to do a "macro" average (taking each subtask's average accuracy, and summing those accuracies and dividing by 3)--by default we do a "micro" average (retain all subtasks' per-document accuracies, and take the mean over all documents' accuracies to get our aggregate mean).
|
355 |
+
metadata:
|
356 |
+
version: 1.0
|
357 |
+
```
|
358 |
+
|
359 |
+
Similar to our `metric_list` for listing out the metrics we want to calculate for a given task, we use an `aggregate_metric_list` field to specify which metric name to aggregate across subtasks, what aggregation function to use, and whether we should micro- or macro- average these metrics. See [./task_guide.md](./task_guide.md) for a full list of related sub-keys.
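To make the micro/macro distinction concrete, here is a small self-contained Python sketch; the subtask sizes and accuracies are invented:

```python
# Invented per-subtask results: {name: (number of documents, accuracy)}
subtasks = {"cb": (56, 0.50), "anli_r1": (1000, 0.35), "rte": (277, 0.60)}

# Micro average (weight_by_size: true): weight each subtask by its document count.
total_docs = sum(n for n, _ in subtasks.values())
micro = sum(n * acc for n, acc in subtasks.values()) / total_docs

# Macro average (weight_by_size: false): every subtask counts equally.
macro = sum(acc for _, acc in subtasks.values()) / len(subtasks)

print(f"micro={micro:.4f}, macro={macro:.4f}")
```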
|
360 |
+
|
361 |
+
**[!Tip]: currently, we only support the aggregation of group metrics that use `mean` (either micro- or macro-averaged) over their subtasks. If you require more complex aggregation rules, you may want to perform aggregation offline.**
|
362 |
+
|
363 |
+
Group configs can be fairly complex! We can do various operations, such as defining new subtask(s) inline in our group YAML, overriding an existing task's specific config value, or nesting existing groups within our new group.
|
364 |
+
|
365 |
+
For example, let's build a config for evaluating MMLU and a few natural language inference tasks. For MMLU, we can write the name of the benchmark as a subtask under `task`. You can also configure parameters such as `num_fewshot`. If the task being configured is a group such as `mmlu` or `super_glue`, the parameter set will be applied to all of its subtasks.
|
366 |
+
|
367 |
+
```yaml
|
368 |
+
group: nli_and_mmlu
|
369 |
+
task:
|
370 |
+
- group: nli_tasks
|
371 |
+
task:
|
372 |
+
- cb
|
373 |
+
- anli_r1
|
374 |
+
- rte
|
375 |
+
aggregate_metric_list:
|
376 |
+
- metric: acc
|
377 |
+
aggregation: mean
|
378 |
+
higher_is_better: true
|
379 |
+
- task: mmlu
|
380 |
+
num_fewshot: 2
|
381 |
+
```
|
382 |
+
|
383 |
+
### Configuring python classes
|
384 |
+
|
385 |
+
There can be occasions when YAML-based tasks cannot accommodate how a task needs to be handled. LM-Eval supports manually implementing tasks, as was done before `0.4.x`. To register such a task, you can simply make a YAML with the name of the task in `task` and the class object in `class` using the `!function` prefix.
|
386 |
+
|
387 |
+
```yaml
|
388 |
+
task: squadv2
|
389 |
+
class: !function task.SQuAD2
|
390 |
+
```
|
391 |
+
|
392 |
+
This also applies to building group configurations with subtasks that are python classes.
|
393 |
+
|
394 |
+
```yaml
|
395 |
+
group: scrolls
|
396 |
+
task:
|
397 |
+
- task: scrolls_qasper
|
398 |
+
class: !function task.Qasper
|
399 |
+
- task: scrolls_quality
|
400 |
+
class: !function task.QuALITY
|
401 |
+
- task: scrolls_narrativeqa
|
402 |
+
class: !function task.NarrativeQA
|
403 |
+
...
|
404 |
+
```
|
405 |
+
|
406 |
+
You can also pass a custom argument to your class by accepting `config` in the custom class constructor.
|
407 |
+
Here's how to do it:
|
408 |
+
|
409 |
+
```yaml
|
410 |
+
task: 20_newsgroups
|
411 |
+
class: !function task.Unitxt
|
412 |
+
recipe: card=cards.20_newsgroups,template=templates.classification.multi_class.title
|
413 |
+
```
|
414 |
+
|
415 |
+
In this example, `recipe` is the custom argument for the `Unitxt` class.
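A rough sketch of what such a class might look like is below. The base class name and constructor contract are assumptions based on `lm_eval.api.task`, and the attribute handling is illustrative rather than the actual `Unitxt` implementation:

```python
# Hypothetical sketch: a task class whose constructor accepts extra YAML keys via `config`.
from lm_eval.api.task import ConfigurableTask  # base class assumed


class Unitxt(ConfigurableTask):
    def __init__(self, config=None):
        config = config or {}
        # `recipe` comes straight from the task's YAML file.
        self.recipe = config.get("recipe")
        super().__init__(config=config)
```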
|
416 |
+
|
417 |
+
## Beautifying Table Display
|
418 |
+
|
419 |
+
To avoid conflicts, each task needs to be registered with a unique name. Because of this, slight variations of a task are still counted as unique tasks and need to be named uniquely. This can be done by appending a suffix that refers to the variation, as in MMLU, where the templates used for the Flan evaluation are differentiated from the default by the prefix `mmlu_flan_*`. Printing the full task names can easily clutter the results table at the end of the evaluation, especially when you have a long list of tasks or are using a benchmark that comprises many tasks. To make the table more legible, you can use `task_alias` and `group_alias` to provide alternative task and group names to be printed. For example, in `mmlu_abstract_algebra.yaml` we set `task_alias` to `abstract_algebra`. In group configs, a `group_alias` for a group can also be set.
|
420 |
+
|
421 |
+
```yaml
|
422 |
+
"dataset_name": "abstract_algebra"
|
423 |
+
"description": "The following are multiple choice questions (with answers) about abstract\
|
424 |
+
\ algebra.\n\n"
|
425 |
+
"include": "_default_template_yaml"
|
426 |
+
"task": "mmlu_abstract_algebra"
|
427 |
+
"task_alias": "abstract_algebra"
|
428 |
+
```
|
429 |
+
|
430 |
+
## Checking validity
|
431 |
+
|
432 |
+
After registering your task, you can check that the data downloads correctly and verify that the few-shot samples look as intended. Run the following command with your desired args:
|
433 |
+
|
434 |
+
```bash
|
435 |
+
python -m scripts.write_out \
|
436 |
+
--output_base_path <path> \
|
437 |
+
--tasks <your-task-name> \
|
438 |
+
--sets <train | val | test> \
|
439 |
+
--num_fewshot K \
|
440 |
+
--num_examples N \
|
441 |
+
```
|
442 |
+
|
443 |
+
Open the file specified at the `--output_base_path <path>` and ensure it passes
|
444 |
+
a simple eye test.
|
445 |
+
|
446 |
+
## Versioning
|
447 |
+
|
448 |
+
One key feature in LM Evaluation Harness is the ability to version tasks and groups--that is, mark them with a specific version number that can be bumped whenever a breaking change is made.
|
449 |
+
|
450 |
+
This version info can be provided by adding the following to your new task or group config file:
|
451 |
+
|
452 |
+
```yaml
|
453 |
+
metadata:
|
454 |
+
version: 0
|
455 |
+
```
|
456 |
+
|
457 |
+
Now, whenever a change needs to be made to your task in the future, please increase the version number by 1 so that users can distinguish between task iterations and versions.
|
458 |
+
|
459 |
+
If you are incrementing a task's version, please also consider adding a changelog to the task's README.md noting the date, PR number, what version you have updated to, and a one-liner describing the change.
|
460 |
+
|
461 |
+
For example:
|
462 |
+
|
463 |
+
* \[Dec 25, 2023\] (PR #999) Version 0.0 -> 1.0: Fixed a bug with answer extraction that led to underestimated performance.
|
464 |
+
|
465 |
+
## Checking performance + equivalence
|
466 |
+
|
467 |
+
It's now time to check models' performance on your task! In the evaluation harness, we intend to support a wide range of evaluation tasks and setups, but prioritize the inclusion of already-proven benchmarks following the precise evaluation setups in the literature where possible.
|
468 |
+
|
469 |
+
To enable this, we provide a checklist that should be completed when contributing a new task, to enable accurate book-keeping and to ensure that tasks added to the library are well-tested and, where applicable, precedented.
|
470 |
+
|
471 |
+
### Task Validity Checklist
|
472 |
+
|
473 |
+
The checklist is the following:
|
474 |
+
|
475 |
+
For adding novel benchmarks/datasets to the library:
|
476 |
+
* [ ] Is the task an existing benchmark in the literature?
|
477 |
+
* [ ] Have you referenced the original paper that introduced the task?
|
478 |
+
* [ ] If yes, does the original paper provide a reference implementation? If so, have you checked against the reference implementation and documented how to run such a test?
|
479 |
+
|
480 |
+
|
481 |
+
If other tasks on this dataset are already supported:
|
482 |
+
* [ ] Is the "Main" variant of this task clearly denoted?
|
483 |
+
* [ ] Have you provided a short sentence in a README on what each new variant adds / evaluates?
|
484 |
+
* [ ] Have you noted which, if any, published evaluation setups are matched by this variant?
|
485 |
+
|
486 |
+
It is recommended to include a filled-out copy of this checklist in the README.md for the subfolder you are creating, if you have created a new subfolder in `lm_eval/tasks`.
|
487 |
+
|
488 |
+
**Finally, please add a short description of your task(s), along with a link to its subfolder in lm_eval/tasks , to [`lm_eval/tasks/README.md`](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/README.md) so that users can discover your task in the library, and follow the link to your README for more information about the variants supported, their task names, and the original source of the dataset and/or evaluation setup.**
|
489 |
+
|
490 |
+
## Submitting your task
|
491 |
+
|
492 |
+
You're all set! Now push your work and make a pull request to the `main` branch! Thanks for the contribution :). If there are any questions, please leave a message in the `#lm-thunderdome` channel on the EAI discord!
|
scripts/yans/lm-evaluation-harness/docs/task_guide.md
ADDED
@@ -0,0 +1,317 @@
1 |
+
# Task Configuration
|
2 |
+
|
3 |
+
The `lm-evaluation-harness` is meant to be an extensible and flexible framework within which many different evaluation tasks can be defined. All tasks in the new version of the harness are built around a YAML configuration file format.
|
4 |
+
|
5 |
+
These YAML configuration files, along with the current codebase commit hash, are intended to be shareable such that providing the YAML config enables another researcher to precisely replicate the evaluation setup used by another, in the case that the prompt or setup differs from standard `lm-eval` task implementations.
|
6 |
+
|
7 |
+
While adding a standard evaluation task on a new dataset can occasionally be as simple as swapping out a Hugging Face dataset path in an existing file, more specialized evaluation setups also exist. Here we'll provide a crash course on the more advanced logic implementable in YAML form available to users.
|
8 |
+
|
9 |
+
If your intended task relies on features beyond what are described in this guide, we'd love to hear about it! Feel free to open an issue describing the scenario on Github, create a PR to the project with a proposed implementation, or ask in the `#lm-thunderdome` channel on the EleutherAI discord.
|
10 |
+
|
11 |
+
## Configurations
|
12 |
+
|
13 |
+
Tasks are configured via the `TaskConfig` object. Below, we describe all fields usable within the object, and their role in defining a task.
|
14 |
+
|
15 |
+
### Parameters
|
16 |
+
|
17 |
+
Task naming + registration:
|
18 |
+
- **task** (`str`, defaults to None) — name of the task.
|
19 |
+
- **task_alias** (`str`, defaults to None) - Alias of the task name that will be printed in the final table results.
|
20 |
+
- **tag** (`str`, *optional*) — name of the task tag(s) a task belongs to. Enables one to run all tasks with a specified tag name at once.
|
21 |
+
|
22 |
+
Dataset configuration options:
|
23 |
+
- **dataset_path** (`str`) — The name of the dataset as listed by HF in the datasets Hub.
|
24 |
+
- **dataset_name** (`str`, *optional*, defaults to None) — The name of what HF calls a “data instance” or sub-task of the benchmark. If your task does not contain any data instances, just leave this to default to None. (If you're familiar with the HF `datasets.load_dataset` function, these are just the first 2 arguments to it.)
|
25 |
+
- **dataset_kwargs** (`dict`, *optional*) — Auxiliary arguments that `datasets.load_dataset` accepts. This can be used to specify arguments such as `data_files` or `data_dir` if you want to use local datafiles such as json or csv.
|
26 |
+
- **training_split** (`str`, *optional*) — Split in the dataset to use as the training split.
|
27 |
+
- **validation_split** (`str`, *optional*) — Split in the dataset to use as the validation split.
|
28 |
+
- **test_split** (`str`, *optional*) — Split in the dataset to use as the test split.
|
29 |
+
- **fewshot_split** (`str`, *optional*) — Split in the dataset to draw few-shot exemplars from. Must not be None if `num_fewshot` > 0.
|
30 |
+
- **process_docs** (`Callable`, *optional*) — Optionally define a function to apply to each HF dataset split, to preprocess all documents before being fed into prompt template rendering or other evaluation steps. Can be used to rename dataset columns, or to process documents into a format closer to that expected by a prompt template.
|
31 |
+
|
32 |
+
Prompting / in-context formatting options:
|
33 |
+
- **use_prompt** (`str`, *optional*) — Name of prompt in promptsource to use. if defined, will overwrite doc_to_text, doc_to_target, and doc_to_choice.
|
34 |
+
- **description** (`str`, *optional*) — An optional Jinja2 template or string which will be prepended to the few-shot examples passed into the model, often describing the task or providing instructions to a model, such as `"The following are questions (with answers) about {{subject}}.\n\n"`. No delimiters or spacing are inserted between the description and the first few-shot example.
|
35 |
+
- **doc_to_text** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate input for the model.
|
36 |
+
- **doc_to_target** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into the appropriate target output for the model. For multiple choice tasks, this should return an index into the answer choice list of the correct answer.
|
37 |
+
- **doc_to_choice** (`Union[Callable, str]`, *optional*) — Jinja2 template, string, or function to process a sample into a list of possible string choices for `multiple_choice` tasks. Left undefined for `generate_until` tasks.
|
38 |
+
- **fewshot_delimiter** (`str`, *optional*, defaults to "\n\n") — String to insert between few-shot examples.
|
39 |
+
- **target_delimiter** (`str`, *optional*, defaults to `" "`) — String to insert between input and target output for the datapoint being tested.
|
40 |
+
|
41 |
+
Runtime configuration options:
|
42 |
+
- **num_fewshot** (`int`, *optional*, defaults to 0) — Number of few-shot examples before the input.
|
43 |
+
- **batch_size** (`int`, *optional*, defaults to 1) — Batch size.
|
44 |
+
|
45 |
+
Scoring details:
|
46 |
+
- **metric_list** (`str`, *optional*, defaults to None) — A list of metrics to use for evaluation. See docs for expected format.
|
47 |
+
- **output_type** (`str`, *optional*, defaults to "generate_until") — Selects the type of model output for the given task. Options are `generate_until`, `loglikelihood`, `loglikelihood_rolling`, and `multiple_choice`.
|
48 |
+
- **generation_kwargs** (`dict`, *optional*) — Auxiliary arguments for the `generate` function from HF transformers library. Advanced keyword arguments may not be supported for non-HF LM classes.
|
49 |
+
- **repeats** (`int`, *optional*, defaults to 1) — Number of repeated runs through the model for each sample. Can be used for cases such as self-consistency.
|
50 |
+
- **filter_list** (`Union[str, list]`, *optional*) — List of filters to postprocess model outputs. See below for further detail on the filter API.
|
51 |
+
- **should_decontaminate** (`bool`, *optional*, defaults to False) - Whether to decontaminate or not.
|
52 |
+
- **doc_to_decontamination_query** (`str`, *optional*) — Query for decontamination if `should_decontaminate` is True. If `should_decontaminate` is True but `doc_to_decontamination_query` is `None`, `doc_to_decontamination_query` will follow `doc_to_text`.
|
53 |
+
|
54 |
+
Other:
|
55 |
+
- **metadata** (`dict`, *optional*) — An optional field where arbitrary metadata can be passed. Most tasks should include a `version` key in this field that is used to denote the version of the yaml config. Other special metadata keys are: `num_fewshot`, to override the printed `n-shot` table column for a task.
|
56 |
+
|
57 |
+
## Filters
|
58 |
+
|
59 |
+
A key component of the `lm-evaluation-harness` library is the `Filter` object. In a typical evaluation run of the harness, we take the formatted inputs and run them through our LM, with the appropriate output type (greedy or free-form generation, or loglikelihood-based comparative scoring).
|
60 |
+
|
61 |
+
After getting scores or output text from our LM on each `Instance` or document in the dataset, we then need to feed these responses into a metric or scoring function to return scores to a user.
|
62 |
+
|
63 |
+
However, certain tasks may require more complex behavior than directly turning over model outputs to a metric function. For example, we may want to post-process our output text by truncating it or extracting a model's answer, we may want to ensemble over multiple "takes" on the same document, et cetera.
|
64 |
+
|
65 |
+
**Detailed Aside**:
|
66 |
+
We do such post-processing by operating on *responses*, which are stored after running an LM on an `Instance` from the task in `Instance.resps`.
|
67 |
+
|
68 |
+
`resps` is a `List[str]` for each instance, and we pass a `List[List[<expected return type from model>]]` to our filters that is a list of `[instance.resps for instance in instances]`.
|
69 |
+
|
70 |
+
Our filters, after completing a pipeline, must return a `List[<expected return type from model>]` which we then unpack and store each element of in `Instance.filtered_resps` for the corresponding instance. Thus, we take as input a list of returns from our model for each doc, and must return a return from our model *without it being wrapped in a list* for each doc.
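As a plain-Python illustration of that list bookkeeping (this is not the harness's `Filter` API, just the shapes described above):

```python
# Two documents, three sampled responses each: List[List[str]]
resps = [
    ["The answer is 6", "The answer is 6", "The answer is 7"],
    ["The answer is 42", "no idea", "The answer is 42"],
]

def take_first(resps):
    # A completed pipeline collapses each document's list of responses into a single
    # filtered response, which is stored in Instance.filtered_resps.
    return [doc_resps[0] for doc_resps in resps]

filtered = take_first(resps)  # ["The answer is 6", "The answer is 42"]
```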
|
71 |
+
|
72 |
+
**End Aside**
|
73 |
+
|
74 |
+
|
75 |
+
A full list of supported filter operations can be found in `lm_eval/filters/__init__.py`. Contributions of new filter types are welcome!
|
76 |
+
|
77 |
+
### Multiple Filter Pipelines
|
78 |
+
|
79 |
+
Tasks need not be limited to a single filter pipeline. We enable users to run multiple, distinct, filter pipelines on *the same model outputs* generated in one run on a task.
|
80 |
+
|
81 |
+
As a case study, let's look at an implementation of solving the Gsm8k math word problem benchmark in `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`. Here, we are emulating the setup used by [Self-Consistency Improves Chain of Thought Prompting](https://arxiv.org/abs/2203.11171), in which evaluation is performed by generating N chain-of-thought outputs from a model via temperature-based sampling, then selecting the answers output by the model at the end of the chains of thought, then majority voting across all those numeric answers.
|
82 |
+
|
83 |
+
Within our YAML file:
|
84 |
+
|
85 |
+
```yaml
|
86 |
+
...
|
87 |
+
repeats: 64
|
88 |
+
filter_list:
|
89 |
+
- name: "score-first"
|
90 |
+
filter:
|
91 |
+
- function: "regex"
|
92 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
93 |
+
- function: "take_first"
|
94 |
+
- name: "maj@64"
|
95 |
+
filter:
|
96 |
+
- function: "regex"
|
97 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
98 |
+
- function: "majority_vote"
|
99 |
+
- function: "take_first"
|
100 |
+
- name: "maj@8"
|
101 |
+
filter:
|
102 |
+
- function: "take_first_k"
|
103 |
+
k: 8
|
104 |
+
- function: "regex"
|
105 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
106 |
+
- function: "majority_vote"
|
107 |
+
- function: "take_first"
|
108 |
+
```
|
109 |
+
|
110 |
+
We are able to provide multiple different filter pipelines, each with their own name and list of filters to apply in sequence.
|
111 |
+
|
112 |
+
Our first filter pipeline implements
|
113 |
+
- applying a regex to the model generations (extracting the number within the phrase "The answer is (number)")
|
114 |
+
- selecting only the first out of the 64 model answers
|
115 |
+
|
116 |
+
Then scoring this single answer.
|
117 |
+
|
118 |
+
```yaml
|
119 |
+
- name: "score-first"
|
120 |
+
filter:
|
121 |
+
- function: "regex"
|
122 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
123 |
+
- function: "take_first"
|
124 |
+
```
|
125 |
+
|
126 |
+
Our second filter pipeline, "maj@64", does majority voting across all 64 answers via:
|
127 |
+
- applying the same regex to all responses, to get the numerical answer from the model for each of the 64 responses per problem
|
128 |
+
- applying majority voting to all responses, which then returns a length-1 `[<majority answer>]` list for each
|
129 |
+
- taking the first element of this length-1 list, to then score the sole response `<majority answer>` for each document.
|
130 |
+
|
131 |
+
```yaml
|
132 |
+
- name: "maj@64"
|
133 |
+
filter:
|
134 |
+
- function: "regex"
|
135 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
136 |
+
- function: "majority_vote"
|
137 |
+
- function: "take_first"
|
138 |
+
```
|
139 |
+
|
140 |
+
Our final filter pipeline, "maj@8", does majority voting across the first 8 of the model's responses per document via:
|
141 |
+
- subsetting the len-64 list of responses `[answer1, answer2, ..., answer64]` to `[answer1, answer2, ..., answer8]` for each document
|
142 |
+
- performing the same sequence of filters on these new sets of 8 responses, for each document.
|
143 |
+
```yaml
|
144 |
+
- name: "maj@8"
|
145 |
+
filter:
|
146 |
+
- function: "take_first_k"
|
147 |
+
k: 8
|
148 |
+
- function: "regex"
|
149 |
+
regex_pattern: "The answer is (\\-?[0-9\\.\\,]*[0-9]+)"
|
150 |
+
- function: "majority_vote"
|
151 |
+
- function: "take_first"
|
152 |
+
```
|
153 |
+
|
154 |
+
Thus, given the 64 responses from our LM on each document, we can report metrics on these responses in these 3 different ways, as defined by our filter pipelines.
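For intuition, the regex-then-majority-vote step in those pipelines amounts to something like the following self-contained sketch (the responses are invented; the regex mirrors the one in the YAML above):

```python
import re
from collections import Counter

responses = ["The answer is 18", "The answer is 18", "The answer is 7", "I don't know"]
pattern = re.compile(r"The answer is (\-?[0-9\.\,]*[0-9]+)")

extracted = [m.group(1) for r in responses if (m := pattern.search(r))]
majority_answer = Counter(extracted).most_common(1)[0][0]  # "18"
```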
|
155 |
+
|
156 |
+
|
157 |
+
### Adding a custom filter
|
158 |
+
|
159 |
+
Just as you can add a custom model with the `register_model` decorator, you can do the same with filters, for example:
|
160 |
+
|
161 |
+
```python
|
162 |
+
from lm_eval.api.filter import Filter
|
163 |
+
from lm_eval.api.registry import register_filter
|
164 |
+
|
165 |
+
@register_filter("new_filter")
|
166 |
+
class NewFilter(Filter):
|
167 |
+
...
|
168 |
+
```
|
169 |
+
|
170 |
+
|
171 |
+
|
172 |
+
## Embedded Python Code
|
173 |
+
|
174 |
+
You can use Python functions for certain arguments by using the `!function` operator after the argument name, followed by `<filename>.<pythonfunctionname>`. This feature can be used for the following arguments (a minimal sketch of an aggregation helper follows the list below):
|
175 |
+
1. `doc_to_text`
|
176 |
+
2. `doc_to_target`
|
177 |
+
3. `doc_to_choice`
|
178 |
+
4. `aggregation` for a `metric` in `metric_list`
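For example, an aggregation helper referenced as `aggregation: !function utils.median_agg` might look like the following (the file and function names are hypothetical):

```python
# Hypothetical utils.py kept next to the task YAML.
import statistics


def median_agg(items):
    # `items` is the list of per-document values collected for this metric.
    return statistics.median(items)
```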
|
179 |
+
|
180 |
+
## (No Longer Recommended) Direct `Task` Subclassing
|
181 |
+
|
182 |
+
The prior implementation method of new tasks was to subclass `Task`. While we intend to migrate all tasks to the new YAML implementation option going forward, it remains possible to subclass the Task class and implement custom logic. For more information, see `docs/task_guide.md` in v0.3.0 of the `lm-evaluation-harness`.
|
183 |
+
|
184 |
+
|
185 |
+
## Including a Base YAML
|
186 |
+
|
187 |
+
You can base a YAML on another YAML file as a template. This can be handy when you just need to change the prompt for `doc_to_text` but keep the rest the same, or change `filters` to compare which is better. Simply use `include` in the YAML file and write the name of the template you want to base it on. This assumes that the base template is in the same directory. Otherwise, you will need to define the full path.
|
188 |
+
```yaml
|
189 |
+
include: <YAML filename or with full path>
|
190 |
+
...
|
191 |
+
```
|
192 |
+
You can find an example of how to use this feature at [gsm8k-cot-self-consistency.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml), which is based on [gsm8k-cot.yaml](https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml).
|
193 |
+
|
194 |
+
|
195 |
+
## Passing Arguments to Metrics
|
196 |
+
|
197 |
+
Metrics can be defined in the `metric_list` argument when building the YAML config. Multiple metrics can be listed along with any auxiliary arguments. For example, when setting the [`exact_match` metric](https://github.com/huggingface/evaluate/tree/main/metrics/exact_match), auxiliary arguments such as `ignore_case`, `ignore_punctuation`, and `regexes_to_ignore` can be listed as well. They will be passed to the metric function as `kwargs`. Some metrics have predefined values for `aggregation` and `higher_is_better`, so listing only the metric name can be sufficient.
|
198 |
+
|
199 |
+
```yaml
|
200 |
+
metric_list:
|
201 |
+
- metric: acc
|
202 |
+
- metric: exact_match
|
203 |
+
aggregation: mean
|
204 |
+
higher_is_better: true
|
205 |
+
ignore_case: true
|
206 |
+
ignore_punctuation: false
|
207 |
+
regexes_to_ignore:
|
208 |
+
- ","
|
209 |
+
- "\\$"
|
210 |
+
```
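Since the extra keys are forwarded as keyword arguments, the configuration above is roughly equivalent to calling the HF Evaluate metric like this (predictions and references invented for illustration):

```python
import evaluate

exact_match = evaluate.load("exact_match")
score = exact_match.compute(
    predictions=["The answer is $1,234"],
    references=["the answer is 1234"],
    ignore_case=True,
    ignore_punctuation=False,
    regexes_to_ignore=[",", "\\$"],
)
# score is a dict like {"exact_match": ...}
```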
|
211 |
+
|
212 |
+
### Natively Supported Metrics
|
213 |
+
|
214 |
+
Here we list all metrics currently supported natively in `lm-eval`:
|
215 |
+
|
216 |
+
Metrics:
|
217 |
+
* `acc` (accuracy)
|
218 |
+
* `acc_norm` (length-normalized accuracy)
|
219 |
+
* `acc_mutual_info` (baseline loglikelihood - normalized accuracy)
|
220 |
+
* `perplexity`
|
221 |
+
* `word_perplexity` (perplexity per word)
|
222 |
+
* `byte_perplexity` (perplexity per byte)
|
223 |
+
* `bits_per_byte`
|
224 |
+
* `matthews_corrcoef` (Matthews correlation coefficient)
|
225 |
+
* `f1` (F1 score)
|
226 |
+
* `bleu`
|
227 |
+
* `chrf`
|
228 |
+
* `ter`
|
229 |
+
|
230 |
+
Aggregation functions:
|
231 |
+
* `mean`
|
232 |
+
* `median`
|
233 |
+
* `perplexity`
|
234 |
+
* `weighted_perplexity`
|
235 |
+
* `bits_per_byte`
|
236 |
+
|
237 |
+
### Adding a Multiple Choice Metric
|
238 |
+
|
239 |
+
Adding a multiple choice metric has a few steps. To get it working you need to:
|
240 |
+
|
241 |
+
1. register a metric function
|
242 |
+
2. register an aggregation function
|
243 |
+
3. update the `Task` definition to make sure the correct arguments are passed
|
244 |
+
|
245 |
+
The default metric and aggregation functions are in `lm_eval/api/metrics.py`, and you can add a function there if it's for general use. The metrics are towards the bottom of the file and look like this:
|
246 |
+
|
247 |
+
|
248 |
+
    @register_metric(
        metric="mcc",
        higher_is_better=True,
        output_type="multiple_choice",
        aggregation="matthews_corrcoef",
    )
    def mcc_fn(items):  # This is a passthrough function
        return items
|
256 |
+
|
257 |
+
Note that many of these are passthrough functions, and for multiple choice (at least) this function is never actually called.
|
258 |
+
|
259 |
+
Aggregation functions are defined towards the top of the file, here's an example:
|
260 |
+
|
261 |
+
    @register_aggregation("matthews_corrcoef")
    def matthews_corrcoef(items):
        unzipped_list = list(zip(*items))
        golds = unzipped_list[0]
        preds = unzipped_list[1]
        return sklearn.metrics.matthews_corrcoef(golds, preds)
|
267 |
+
|
268 |
+
This function returns a single numeric value. The input is defined in `Task.process_results` in `lm_eval/api/task.py`. There's a section that looks like this:
|
269 |
+
|
270 |
+
|
271 |
+
    result_dict = {
        **({"acc": acc} if "acc" in use_metric else {}),
        **({"f1": (gold, pred)} if "f1" in use_metric else {}),
        **({"mcc": (gold, pred)} if "mcc" in use_metric else {}),
        **({"acc_norm": acc_norm} if "acc_norm" in use_metric else {}),
        **({"exact_match": exact_match} if "exact_match" in use_metric else {}),
    }
|
278 |
+
|
279 |
+
The value here determines the input to the aggregation function, though the name used matches the metric function. These metrics all have simple needs and just need the accuracy or gold and predicted values, but immediately below this there are examples of metrics with more complicated needs you can use as reference.
|
280 |
+
|
281 |
+
## Good Reference Tasks
|
282 |
+
|
283 |
+
Contributing a new task can be daunting! Luckily, much of the work has often been done for you in a different, similarly evaluated task. Good examples of task implementations to study include:
|
284 |
+
|
285 |
+
Multiple choice tasks:
|
286 |
+
- SciQ (`lm_eval/tasks/sciq/sciq.yaml`)
|
287 |
+
|
288 |
+
Corpus perplexity evaluations:
|
289 |
+
- Wikitext (`lm_eval/tasks/wikitext/wikitext.yaml`)
|
290 |
+
|
291 |
+
Generative tasks:
|
292 |
+
- GSM8k (`lm_eval/tasks/gsm8k/gsm8k.yaml`)
|
293 |
+
|
294 |
+
Tasks using complex filtering:
|
295 |
+
- GSM8k with CoT (+ with Self-Consistency): (`lm_eval/tasks/gsm8k/gsm8k-cot.yaml` ; `lm_eval/tasks/gsm8k/gsm8k-cot-self-consistency.yaml`)
|
296 |
+
|
297 |
+
# Group Configuration
|
298 |
+
|
299 |
+
When evaluating a language model, it is not unusual to test across a number of tasks that may not be related to one another in order to assess a variety of capabilities. To this end, it may be cumbersome to have to list the set of tasks or add a new group name to each YAML of each individual task.
|
300 |
+
|
301 |
+
To solve this, we can create a **group** yaml config. This is a config that contains the names of the tasks that should be included in a particular group. The config consists of two main keys: a `group` key which denotes the name of the group (as it would be called from the command line, e.g. `mmlu`) and a `task` key which is where we can list the tasks. The tasks listed in `task` are the task names that have been registered. A good example of a group yaml config can be found at [../lm_eval/tasks/mmlu/default/_mmlu.yaml]. See also the [New Task Guide](./new_task_guide.md) for a more in-depth and tutorial-esque explanation of how to write complex GroupConfigs.
|
302 |
+
|
303 |
+
## Configurations
|
304 |
+
|
305 |
+
Groups are configured via the `GroupConfig` object. Below, we describe all fields usable within the object, and their role in defining a group.
|
306 |
+
|
307 |
+
### Parameters
|
308 |
+
|
309 |
+
- **group** (`str`, defaults to `None`) — name of the group. Used to invoke it from the command line.
|
310 |
+
- **group_alias** (`str`, defaults to `None`) - Alternative name for the group that will be printed in the table output.
|
311 |
+
- **task** (`Union[str, list]`, defaults to `None`) - List of tasks that constitute the group.
|
312 |
+
- **aggregate_metric_list** (`list`, defaults to `None`) - similar to `metric_list` in TaskConfigs, provide a list of configurations for metrics that should be aggregated across subtasks. Leaving empty will result in no aggregation being performed for this group. Keys for each list entry are:
|
313 |
+
- `metric: str` - the name of the metric to aggregate over (all subtasks must report a metric holding this name.)
|
314 |
+
- `aggregation: str` - what aggregation function to apply to aggregate these per-subtask metrics. **currently, only `mean` is supported.**
|
315 |
+
- `weight_by_size: bool = True` whether to perform micro-averaging (`True`) or macro-averaging (`False`) of subtasks' accuracy scores when reporting the group's metric. MMLU, for example, averages over per-document accuracies (the *micro average*), resulting in the same accuracy as if one simply concatenated all 57 subjects into a single dataset and evaluated accuracy on that dataset.
|
316 |
+
- `filter_list: Union[str, List[str]] = "none"` - what filter keys one should match on to aggregate results. For example, if trying to aggregate over the `exact_match` metric using `strict-match` filter for `bbh_cot_zeroshot`, then set this to be `filter_list: "strict-match"`.
|
317 |
+
- **metadata** (`dict`, *optional*) - As with TaskConfigs, a field where extra config metadata can be passed. set the `num_fewshot` key within this to override the printed n_shot value in a results table for your group, for example.
|
scripts/yans/lm-evaluation-harness/eval.sh
ADDED
@@ -0,0 +1,5 @@
1 |
+
lm_eval --model hf \
|
2 |
+
--model_args pretrained=/project/models/yans-quen2-0.5B/iter_0000990 \
|
3 |
+
--tasks hellaswag \
|
4 |
+
--device cuda:0 \
|
5 |
+
--batch_size 8
|
scripts/yans/lm-evaluation-harness/eval2.sh
ADDED
@@ -0,0 +1,5 @@
1 |
+
lm_eval --model hf \
|
2 |
+
--model_args pretrained=/share/pretrained_lm/Qwen/Qwen2-0.5B \
|
3 |
+
--tasks hellaswag,openbookqa,arc_easy,winogrande,arc_challenge,piqa,boolq \
|
4 |
+
--device cuda:0 \
|
5 |
+
--batch_size 8
|
scripts/yans/lm-evaluation-harness/eval3.sh
ADDED
@@ -0,0 +1,5 @@
1 |
+
lm_eval --model hf \
|
2 |
+
--model_args pretrained=/project/models/1_qwen2-0.5B \
|
3 |
+
--tasks hellaswag \
|
4 |
+
--device cuda:0 \
|
5 |
+
--batch_size 8
|
scripts/yans/lm-evaluation-harness/eval4.sh
ADDED
@@ -0,0 +1,5 @@
1 |
+
lm_eval --model hf \
|
2 |
+
--model_args pretrained=/project/models/1_qwen2-0.5B \
|
3 |
+
--tasks openbookqa,arc_easy,winogrande,hellaswag,arc_challenge,piqa,boolq \
|
4 |
+
--device cuda:0 \
|
5 |
+
--batch_size 8
|
scripts/yans/lm-evaluation-harness/examples/lm-eval-overview.ipynb
ADDED
@@ -0,0 +1,1230 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {
|
6 |
+
"id": "Qw83KAePAhaS"
|
7 |
+
},
|
8 |
+
"source": [
|
9 |
+
"# Releasing LM-Evaluation-Harness v0.4.0"
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "markdown",
|
14 |
+
"metadata": {
|
15 |
+
"id": "Z7k2vq1iAdqr"
|
16 |
+
},
|
17 |
+
"source": [
|
18 |
+
"With the vast amount of work done in the field today, it helps to have a tool that people can use easily to share their results and use to check others to ensure reported numbers are valid. The LM Evaluation Harness is one such tool the community has used extensively. We want to continue to support the community and with that in mind, we’re excited to announce a major update on the LM Evaluation Harness to further our goal for open and accessible AI research."
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "markdown",
|
23 |
+
"metadata": {
|
24 |
+
"id": "0gDoM0AJAvEc"
|
25 |
+
},
|
26 |
+
"source": [
|
27 |
+
"Our refactor stems from our desires to make the following believed best practices easier to carry out. \n",
|
28 |
+
"\n",
|
29 |
+
"1. Never copy results from other papers\n",
|
30 |
+
"2. Always share your exact prompts\n",
|
31 |
+
"3. Always provide model outputs\n",
|
32 |
+
"4. Qualitatively review a small batch of outputs before running evaluation jobs at scale\n",
|
33 |
+
"\n",
|
34 |
+
"We also wanted to make the library a better experience to use and to contribute or design evaluations within. New features in the new release that serve this purpose include:\n",
|
35 |
+
"\n",
|
36 |
+
"1. Faster Evaluation Runtimes (accelerated data-parallel inference with HF Transformers + Accelerate, and commonly used or faster inference libraries such as vLLM and Llama-CPP)\n",
|
37 |
+
"2. Easier addition and sharing of new tasks (YAML-based task config formats, allowing single-file sharing of custom tasks)\n",
|
38 |
+
"3. More configurability, for more advanced workflows and easier operation with modifying prompts\n",
|
39 |
+
"4. Better logging of data at runtime and post-hoc"
|
40 |
+
]
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"cell_type": "markdown",
|
44 |
+
"metadata": {
|
45 |
+
"id": "nnwsOpjda_YW"
|
46 |
+
},
|
47 |
+
"source": [
|
48 |
+
"In this notebook we will be going through a short tutorial on how things work."
|
49 |
+
]
|
50 |
+
},
|
51 |
+
{
|
52 |
+
"cell_type": "markdown",
|
53 |
+
"metadata": {
|
54 |
+
"id": "zAov81vTbL2K"
|
55 |
+
},
|
56 |
+
"source": [
|
57 |
+
"## Install LM-Eval"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 1,
|
63 |
+
"metadata": {
|
64 |
+
"colab": {
|
65 |
+
"base_uri": "https://localhost:8080/"
|
66 |
+
},
|
67 |
+
"id": "8hiosGzq_qZg",
|
68 |
+
"outputId": "6ab73e5e-1f54-417e-a388-07e0d870b132"
|
69 |
+
},
|
70 |
+
"outputs": [
|
71 |
+
{
|
72 |
+
"name": "stdout",
|
73 |
+
"output_type": "stream",
|
74 |
+
"text": [
|
75 |
+
"Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git@big-refactor\n",
|
76 |
+
" Cloning https://github.com/EleutherAI/lm-evaluation-harness.git (to revision big-refactor) to /tmp/pip-req-build-tnssql5s\n",
|
77 |
+
" Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-tnssql5s\n",
|
78 |
+
" Running command git checkout -b big-refactor --track origin/big-refactor\n",
|
79 |
+
" Switched to a new branch 'big-refactor'\n",
|
80 |
+
" Branch 'big-refactor' set up to track remote branch 'big-refactor' from 'origin'.\n",
|
81 |
+
" Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit 42f486ee49b65926a444cb0620870a39a5b4b0a8\n",
|
82 |
+
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
|
83 |
+
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
|
84 |
+
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
|
85 |
+
"Collecting accelerate>=0.21.0 (from lm-eval==1.0.0)\n",
|
86 |
+
" Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
|
87 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m4.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
88 |
+
"\u001b[?25hCollecting evaluate (from lm-eval==1.0.0)\n",
|
89 |
+
" Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)\n",
|
90 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.1/84.1 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
91 |
+
"\u001b[?25hCollecting datasets>=2.0.0 (from lm-eval==1.0.0)\n",
|
92 |
+
" Downloading datasets-2.15.0-py3-none-any.whl (521 kB)\n",
|
93 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m521.2/521.2 kB\u001b[0m \u001b[31m9.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
94 |
+
"\u001b[?25hCollecting jsonlines (from lm-eval==1.0.0)\n",
|
95 |
+
" Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n",
|
96 |
+
"Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.8.7)\n",
|
97 |
+
"Collecting peft>=0.2.0 (from lm-eval==1.0.0)\n",
|
98 |
+
" Downloading peft-0.6.2-py3-none-any.whl (174 kB)\n",
|
99 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m174.7/174.7 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
100 |
+
"\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==1.0.0)\n",
|
101 |
+
" Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n",
|
102 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
103 |
+
"\u001b[?25hCollecting pytablewriter (from lm-eval==1.0.0)\n",
|
104 |
+
" Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n",
|
105 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m8.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
106 |
+
"\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==1.0.0)\n",
|
107 |
+
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
|
108 |
+
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
109 |
+
"Collecting sacrebleu>=1.5.0 (from lm-eval==1.0.0)\n",
|
110 |
+
" Downloading sacrebleu-2.3.2-py3-none-any.whl (119 kB)\n",
|
111 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m119.7/119.7 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
112 |
+
"\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (1.2.2)\n",
|
113 |
+
"Collecting sqlitedict (from lm-eval==1.0.0)\n",
|
114 |
+
" Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n",
|
115 |
+
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
116 |
+
"Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (2.1.0+cu118)\n",
|
117 |
+
"Collecting tqdm-multiprocess (from lm-eval==1.0.0)\n",
|
118 |
+
" Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n",
|
119 |
+
"Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==1.0.0) (4.35.2)\n",
|
120 |
+
"Collecting zstandard (from lm-eval==1.0.0)\n",
|
121 |
+
" Downloading zstandard-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
|
122 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m29.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
123 |
+
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (1.23.5)\n",
|
124 |
+
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (23.2)\n",
|
125 |
+
"Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (5.9.5)\n",
|
126 |
+
"Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (6.0.1)\n",
|
127 |
+
"Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm-eval==1.0.0) (0.19.4)\n",
|
128 |
+
"Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (9.0.0)\n",
|
129 |
+
"Collecting pyarrow-hotfix (from datasets>=2.0.0->lm-eval==1.0.0)\n",
|
130 |
+
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
|
131 |
+
"Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==1.0.0)\n",
|
132 |
+
" Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
|
133 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m14.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
134 |
+
"\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (1.5.3)\n",
|
135 |
+
"Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2.31.0)\n",
|
136 |
+
"Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (4.66.1)\n",
|
137 |
+
"Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.4.1)\n",
|
138 |
+
"Collecting multiprocess (from datasets>=2.0.0->lm-eval==1.0.0)\n",
|
139 |
+
" Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
|
140 |
+
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m19.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
141 |
+
"\u001b[?25hRequirement already satisfied: fsspec[http]<=2023.10.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (2023.6.0)\n",
|
142 |
+
"Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==1.0.0) (3.8.6)\n",
|
143 |
+
"Collecting responses<0.19 (from evaluate->lm-eval==1.0.0)\n",
|
144 |
+
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
|
145 |
+
"Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from peft>=0.2.0->lm-eval==1.0.0) (0.4.0)\n",
|
146 |
+
"Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.4.0)\n",
|
147 |
+
"Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (3.8.1)\n",
|
148 |
+
"Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==1.0.0) (1.16.0)\n",
|
149 |
+
"Collecting portalocker (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
|
150 |
+
" Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n",
|
151 |
+
"Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (2023.6.3)\n",
|
152 |
+
"Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (0.9.0)\n",
|
153 |
+
"Collecting colorama (from sacrebleu>=1.5.0->lm-eval==1.0.0)\n",
|
154 |
+
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
|
155 |
+
"Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm-eval==1.0.0) (4.9.3)\n",
|
156 |
+
"Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.11.3)\n",
|
157 |
+
"Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (1.3.2)\n",
|
158 |
+
"Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==1.0.0) (3.2.0)\n",
|
159 |
+
"Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.13.1)\n",
|
160 |
+
"Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (4.5.0)\n",
|
161 |
+
"Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (1.12)\n",
|
162 |
+
"Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.2.1)\n",
|
163 |
+
"Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (3.1.2)\n",
|
164 |
+
"Requirement already satisfied: triton==2.1.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm-eval==1.0.0) (2.1.0)\n",
|
165 |
+
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==1.0.0) (0.15.0)\n",
|
166 |
+
"Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==1.0.0) (23.1.0)\n",
|
167 |
+
"Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm-eval==1.0.0) (67.7.2)\n",
|
168 |
+
"Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==1.0.0)\n",
|
169 |
+
" Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n",
|
170 |
+
"Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==1.0.0)\n",
|
171 |
+
" Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n",
|
172 |
+
"Collecting pathvalidate<4,>=2.3.0 (from pytablewriter->lm-eval==1.0.0)\n",
|
173 |
+
" Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n",
|
174 |
+
"Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==1.0.0)\n",
|
175 |
+
" Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n",
|
176 |
+
"Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==1.0.0)\n",
|
177 |
+
" Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n",
|
178 |
+
"Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==1.0.0)\n",
|
179 |
+
" Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n",
|
180 |
+
"Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (3.3.2)\n",
|
181 |
+
"Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (6.0.4)\n",
|
182 |
+
"Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (4.0.3)\n",
|
183 |
+
"Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.9.2)\n",
|
184 |
+
"Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.4.0)\n",
|
185 |
+
"Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==1.0.0) (1.3.1)\n",
|
186 |
+
"Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==1.0.0) (5.2.0)\n",
|
187 |
+
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (3.4)\n",
|
188 |
+
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2.0.7)\n",
|
189 |
+
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==1.0.0) (2023.7.22)\n",
|
190 |
+
"Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2.8.2)\n",
|
191 |
+
"Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==1.0.0) (2023.3.post1)\n",
|
192 |
+
"Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm-eval==1.0.0) (2.1.3)\n",
|
193 |
+
"Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==1.0.0) (8.1.7)\n",
|
194 |
+
"Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm-eval==1.0.0) (1.3.0)\n",
|
195 |
+
"Building wheels for collected packages: lm-eval, rouge-score, sqlitedict\n",
|
196 |
+
" Building wheel for lm-eval (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
|
197 |
+
" Created wheel for lm-eval: filename=lm_eval-1.0.0-py3-none-any.whl size=994254 sha256=88356155b19f2891981ecef948326ad6ce8ca40a6009378410ec20d0e225995a\n",
|
198 |
+
" Stored in directory: /tmp/pip-ephem-wheel-cache-9v6ye7h3/wheels/17/01/26/599c0779e9858a70a73fa8a306699b5b9a868f820c225457b0\n",
|
199 |
+
" Building wheel for rouge-score (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
200 |
+
" Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=6bb0d44e4881972c43ce194e7cb65233d309758cb15f0dec54590d3d2efcfc36\n",
|
201 |
+
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
|
202 |
+
" Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
|
203 |
+
" Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16863 sha256=5747f7dd73ddf3d8fbcebf51b5e4f718fabe1e94bccdf16d2f22a2e65ee7fdf4\n",
|
204 |
+
" Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n",
|
205 |
+
"Successfully built lm-eval rouge-score sqlitedict\n",
|
206 |
+
"Installing collected packages: sqlitedict, zstandard, tcolorpy, pybind11, pyarrow-hotfix, portalocker, pathvalidate, mbstrdecoder, jsonlines, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, responses, multiprocess, accelerate, datasets, DataProperty, tabledata, peft, evaluate, pytablewriter, lm-eval\n",
|
207 |
+
"Successfully installed DataProperty-1.0.1 accelerate-0.24.1 colorama-0.4.6 datasets-2.15.0 dill-0.3.7 evaluate-0.4.1 jsonlines-4.0.0 lm-eval-1.0.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 pathvalidate-3.2.0 peft-0.6.2 portalocker-2.8.2 pyarrow-hotfix-0.6 pybind11-2.11.1 pytablewriter-1.2.0 responses-0.18.0 rouge-score-0.1.2 sacrebleu-2.3.2 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tqdm-multiprocess-0.0.11 typepy-1.3.2 zstandard-0.22.0\n"
|
208 |
+
]
|
209 |
+
}
|
210 |
+
],
|
211 |
+
"source": [
|
212 |
+
"# Install LM-Eval\n",
|
213 |
+
"!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git"
|
214 |
+
]
|
215 |
+
},
|
216 |
+
{
|
217 |
+
"cell_type": "code",
|
218 |
+
"execution_count": 2,
|
219 |
+
"metadata": {
|
220 |
+
"colab": {
|
221 |
+
"base_uri": "https://localhost:8080/",
|
222 |
+
"height": 0,
|
223 |
+
"referenced_widgets": [
|
224 |
+
"a1d3a8aa016544a78e8821c8f6199e06",
|
225 |
+
"f61ed33fad754146bdd2ac9db1ba1c48",
|
226 |
+
"bfa0af6aeff344c6845e1080a878e92e",
|
227 |
+
"fd1ad9e0367d4004aae853b91c3a7617",
|
228 |
+
"6b2d90209ec14230b3d58a74ac9b83bf",
|
229 |
+
"a73f357065d34d7baf0453ae4a8d75e2",
|
230 |
+
"46f521b73fd943c081c648fd873ebc0a",
|
231 |
+
"7c5689bc13684db8a22681f41863dddd",
|
232 |
+
"48763b6233374554ae76035c0483066f",
|
233 |
+
"4986a21eb560448fa79f4b25cde48951",
|
234 |
+
"aed3acd2f2d74003b44079c333a0698e"
|
235 |
+
]
|
236 |
+
},
|
237 |
+
"id": "uyO5MaKkZyah",
|
238 |
+
"outputId": "d46e8096-5086-4e49-967e-ea33d4a2a335"
|
239 |
+
},
|
240 |
+
"outputs": [
|
241 |
+
{
|
242 |
+
"data": {
|
243 |
+
"application/vnd.jupyter.widget-view+json": {
|
244 |
+
"model_id": "a1d3a8aa016544a78e8821c8f6199e06",
|
245 |
+
"version_major": 2,
|
246 |
+
"version_minor": 0
|
247 |
+
},
|
248 |
+
"text/plain": [
|
249 |
+
"Downloading builder script: 0%| | 0.00/5.67k [00:00<?, ?B/s]"
|
250 |
+
]
|
251 |
+
},
|
252 |
+
"metadata": {},
|
253 |
+
"output_type": "display_data"
|
254 |
+
}
|
255 |
+
],
|
256 |
+
"source": [
|
257 |
+
"from lm_eval import api"
|
258 |
+
]
|
259 |
+
},
|
260 |
+
{
|
261 |
+
"cell_type": "markdown",
|
262 |
+
"metadata": {
|
263 |
+
"id": "8rfUeX6n_wkK"
|
264 |
+
},
|
265 |
+
"source": [
|
266 |
+
"## Create new evaluation tasks with config-based tasks\n",
|
267 |
+
"\n",
|
268 |
+
"Even within the same task, many works have reported numbers based on different choices of evaluation. Some report on the test sets, validation sets, or even subset of the training sets. Others have specialized prompts and verbalizers. We introduce YAMLs to allow users to easily make different variations. By leveraging the YAML configs to configure evaluations, the refactored LM-Eval takes the methods of the `Task` object and makes them configurable by setting the appropriate attributes in the config file. There, users can set the tasks they want by setting the name of the HF dataset (local tasks are also possible), the dataset splits used, and much more. Key configurations relating to prompting, such as `doc_to_text`, previously implemented as a method of the same name, are now configurable with jinja2 to allow high-level scripting to transform a HF dataset to text string as input to the model.\n",
|
269 |
+
"\n"
|
270 |
+
]
|
271 |
+
},
|
272 |
+
{
|
273 |
+
"cell_type": "markdown",
|
274 |
+
"metadata": {
|
275 |
+
"id": "HYFUhhfOSJKe"
|
276 |
+
},
|
277 |
+
"source": [
|
278 |
+
"A core-feature to LM-Eval is to configure tasks with YAML configs. With configs, you can fill preset fields to easily set up a task.\n",
|
279 |
+
"\n",
|
280 |
+
"Here, we write a demo YAML config for a multiple-choice evaluation of BoolQ:"
|
281 |
+
]
|
282 |
+
},
|
283 |
+
{
|
284 |
+
"cell_type": "code",
|
285 |
+
"execution_count": 3,
|
286 |
+
"metadata": {
|
287 |
+
"id": "bg3dGROW-V39"
|
288 |
+
},
|
289 |
+
"outputs": [],
|
290 |
+
"source": [
|
291 |
+
"YAML_boolq_string = '''\n",
|
292 |
+
"task: demo_boolq\n",
|
293 |
+
"dataset_path: super_glue\n",
|
294 |
+
"dataset_name: boolq\n",
|
295 |
+
"output_type: multiple_choice\n",
|
296 |
+
"training_split: train\n",
|
297 |
+
"validation_split: validation\n",
|
298 |
+
"doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n",
|
299 |
+
"doc_to_target: label\n",
|
300 |
+
"doc_to_choice: [\"no\", \"yes\"]\n",
|
301 |
+
"should_decontaminate: true\n",
|
302 |
+
"doc_to_decontamination_query: passage\n",
|
303 |
+
"metric_list:\n",
|
304 |
+
" - metric: acc\n",
|
305 |
+
"'''\n",
|
306 |
+
"with open('boolq.yaml', 'w') as f:\n",
|
307 |
+
" f.write(YAML_boolq_string)"
|
308 |
+
]
|
309 |
+
},
|
310 |
+
{
|
311 |
+
"cell_type": "markdown",
|
312 |
+
"metadata": {},
|
313 |
+
"source": [
|
314 |
+
"And we can now run evaluation on this task, by pointing to the config file we've just created:"
|
315 |
+
]
|
316 |
+
},
|
317 |
+
{
|
318 |
+
"cell_type": "code",
|
319 |
+
"execution_count": 4,
|
320 |
+
"metadata": {
|
321 |
+
"id": "LOUHK7PtQfq4"
|
322 |
+
},
|
323 |
+
"outputs": [
|
324 |
+
{
|
325 |
+
"name": "stdout",
|
326 |
+
"output_type": "stream",
|
327 |
+
"text": [
|
328 |
+
"2023-11-29:11:54:55,156 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
|
329 |
+
"2023-11-29 11:54:55.942051: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
330 |
+
"2023-11-29 11:54:55.942108: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
331 |
+
"2023-11-29 11:54:55.942142: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
332 |
+
"2023-11-29 11:54:57.066802: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
333 |
+
"2023-11-29:11:55:00,954 INFO [__main__.py:132] Verbosity set to INFO\n",
|
334 |
+
"2023-11-29:11:55:11,038 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
|
335 |
+
"2023-11-29:11:55:11,038 INFO [__main__.py:143] Including path: ./\n",
|
336 |
+
"2023-11-29:11:55:11,046 INFO [__main__.py:205] Selected Tasks: ['demo_boolq']\n",
|
337 |
+
"2023-11-29:11:55:11,047 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
|
338 |
+
"2023-11-29:11:55:11,110 INFO [huggingface.py:120] Using device 'cuda'\n",
|
339 |
+
"config.json: 100% 571/571 [00:00<00:00, 2.87MB/s]\n",
|
340 |
+
"model.safetensors: 100% 5.68G/5.68G [00:32<00:00, 173MB/s]\n",
|
341 |
+
"tokenizer_config.json: 100% 396/396 [00:00<00:00, 2.06MB/s]\n",
|
342 |
+
"tokenizer.json: 100% 2.11M/2.11M [00:00<00:00, 11.6MB/s]\n",
|
343 |
+
"special_tokens_map.json: 100% 99.0/99.0 [00:00<00:00, 555kB/s]\n",
|
344 |
+
"2023-11-29:11:56:18,658 WARNING [task.py:614] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
|
345 |
+
"2023-11-29:11:56:18,658 WARNING [task.py:626] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
|
346 |
+
"Downloading builder script: 100% 30.7k/30.7k [00:00<00:00, 59.0MB/s]\n",
|
347 |
+
"Downloading metadata: 100% 38.7k/38.7k [00:00<00:00, 651kB/s]\n",
|
348 |
+
"Downloading readme: 100% 14.8k/14.8k [00:00<00:00, 37.3MB/s]\n",
|
349 |
+
"Downloading data: 100% 4.12M/4.12M [00:00<00:00, 55.1MB/s]\n",
|
350 |
+
"Generating train split: 100% 9427/9427 [00:00<00:00, 15630.89 examples/s]\n",
|
351 |
+
"Generating validation split: 100% 3270/3270 [00:00<00:00, 20002.56 examples/s]\n",
|
352 |
+
"Generating test split: 100% 3245/3245 [00:00<00:00, 20866.19 examples/s]\n",
|
353 |
+
"2023-11-29:11:56:22,315 INFO [task.py:355] Building contexts for task on rank 0...\n",
|
354 |
+
"2023-11-29:11:56:22,322 INFO [evaluator.py:319] Running loglikelihood requests\n",
|
355 |
+
"100% 20/20 [00:04<00:00, 4.37it/s]\n",
|
356 |
+
"fatal: not a git repository (or any of the parent directories): .git\n",
|
357 |
+
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
|
358 |
+
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
|
359 |
+
"|----------|-------|------|-----:|------|----:|---|-----:|\n",
|
360 |
+
"|demo_boolq|Yaml |none | 0|acc | 1|± | 0|\n",
|
361 |
+
"\n"
|
362 |
+
]
|
363 |
+
}
|
364 |
+
],
|
365 |
+
"source": [
|
366 |
+
"!lm_eval \\\n",
|
367 |
+
" --model hf \\\n",
|
368 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
369 |
+
" --include_path ./ \\\n",
|
370 |
+
" --tasks demo_boolq \\\n",
|
371 |
+
" --limit 10\n"
|
372 |
+
]
|
373 |
+
},
|
374 |
+
{
|
375 |
+
"cell_type": "markdown",
|
376 |
+
"metadata": {
|
377 |
+
"id": "LOUHK7PtQfq4"
|
378 |
+
},
|
379 |
+
"source": [
|
380 |
+
"Often, tasks are part of a larger group used to measure different capabilities. The dynamism of the field today means new dimensions of evaluation can come about which would mix and match new and older tasks alike. In LM-Eval, We can also group tasks and call that the group name to evaluate on a set of tasks easily. In this instance, let's evaluate the tag `yes_or_no_tasks` which comprise of the tasks `demo_boolq` and `demo_cola`; tasks which are multiple choice tasks with options `yes` and `no` as the name suggests.\n",
|
381 |
+
"\n",
|
382 |
+
"<!-- making new groups is easier than ever, allowing user to work bottom-up by makiing individual tasks and linking them to a group or Top-Down, making a new group by listing existing tasks.\n",
|
383 |
+
"\n",
|
384 |
+
"We also show the aggregate across samples besides only showing the aggregation between subtasks. This may come in handy when certain groups want to be aggregated as a single task. -->\n",
|
385 |
+
"\n",
|
386 |
+
"\n"
|
387 |
+
]
|
388 |
+
},
|
389 |
+
{
|
390 |
+
"cell_type": "code",
|
391 |
+
"execution_count": 5,
|
392 |
+
"metadata": {
|
393 |
+
"id": "fthNg3ywO-kA"
|
394 |
+
},
|
395 |
+
"outputs": [],
|
396 |
+
"source": [
|
397 |
+
"YAML_cola_string = '''\n",
|
398 |
+
"tag: yes_or_no_tasks\n",
|
399 |
+
"task: demo_cola\n",
|
400 |
+
"dataset_path: glue\n",
|
401 |
+
"dataset_name: cola\n",
|
402 |
+
"output_type: multiple_choice\n",
|
403 |
+
"training_split: train\n",
|
404 |
+
"validation_split: validation\n",
|
405 |
+
"doc_to_text: \"{{sentence}}\\nQuestion: Does this sentence make sense?\\nAnswer:\"\n",
|
406 |
+
"doc_to_target: label\n",
|
407 |
+
"doc_to_choice: [\"no\", \"yes\"]\n",
|
408 |
+
"should_decontaminate: true\n",
|
409 |
+
"doc_to_decontamination_query: sentence\n",
|
410 |
+
"metric_list:\n",
|
411 |
+
" - metric: acc\n",
|
412 |
+
"'''\n",
|
413 |
+
"with open('cola.yaml', 'w') as f:\n",
|
414 |
+
" f.write(YAML_cola_string)"
|
415 |
+
]
|
416 |
+
},
|
417 |
+
{
|
418 |
+
"cell_type": "code",
|
419 |
+
"execution_count": 6,
|
420 |
+
"metadata": {
|
421 |
+
"id": "XceRKCuuDtbn"
|
422 |
+
},
|
423 |
+
"outputs": [
|
424 |
+
{
|
425 |
+
"name": "stdout",
|
426 |
+
"output_type": "stream",
|
427 |
+
"text": [
|
428 |
+
"2023-11-29:11:56:33,016 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
|
429 |
+
"2023-11-29 11:56:33.852995: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
430 |
+
"2023-11-29 11:56:33.853050: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
431 |
+
"2023-11-29 11:56:33.853087: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
432 |
+
"2023-11-29 11:56:35.129047: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
433 |
+
"2023-11-29:11:56:38,546 INFO [__main__.py:132] Verbosity set to INFO\n",
|
434 |
+
"2023-11-29:11:56:47,509 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
|
435 |
+
"2023-11-29:11:56:47,509 INFO [__main__.py:143] Including path: ./\n",
|
436 |
+
"2023-11-29:11:56:47,517 INFO [__main__.py:205] Selected Tasks: ['yes_or_no_tasks']\n",
|
437 |
+
"2023-11-29:11:56:47,520 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
|
438 |
+
"2023-11-29:11:56:47,550 INFO [huggingface.py:120] Using device 'cuda'\n",
|
439 |
+
"2023-11-29:11:57:08,743 WARNING [task.py:614] [Task: demo_cola] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
|
440 |
+
"2023-11-29:11:57:08,743 WARNING [task.py:626] [Task: demo_cola] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
|
441 |
+
"Downloading builder script: 100% 28.8k/28.8k [00:00<00:00, 52.7MB/s]\n",
|
442 |
+
"Downloading metadata: 100% 28.7k/28.7k [00:00<00:00, 51.9MB/s]\n",
|
443 |
+
"Downloading readme: 100% 27.9k/27.9k [00:00<00:00, 48.0MB/s]\n",
|
444 |
+
"Downloading data: 100% 377k/377k [00:00<00:00, 12.0MB/s]\n",
|
445 |
+
"Generating train split: 100% 8551/8551 [00:00<00:00, 19744.58 examples/s]\n",
|
446 |
+
"Generating validation split: 100% 1043/1043 [00:00<00:00, 27057.01 examples/s]\n",
|
447 |
+
"Generating test split: 100% 1063/1063 [00:00<00:00, 22705.17 examples/s]\n",
|
448 |
+
"2023-11-29:11:57:11,698 INFO [task.py:355] Building contexts for task on rank 0...\n",
|
449 |
+
"2023-11-29:11:57:11,704 INFO [evaluator.py:319] Running loglikelihood requests\n",
|
450 |
+
"100% 20/20 [00:03<00:00, 5.15it/s]\n",
|
451 |
+
"fatal: not a git repository (or any of the parent directories): .git\n",
|
452 |
+
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
|
453 |
+
"| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
|
454 |
+
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
|
455 |
+
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
|
456 |
+
"| - demo_cola |Yaml |none | 0|acc | 0.7|± |0.1528|\n",
|
457 |
+
"\n",
|
458 |
+
"| Groups |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
|
459 |
+
"|---------------|-------|------|-----:|------|----:|---|-----:|\n",
|
460 |
+
"|yes_or_no_tasks|N/A |none | 0|acc | 0.7|± |0.1528|\n",
|
461 |
+
"\n"
|
462 |
+
]
|
463 |
+
}
|
464 |
+
],
|
465 |
+
"source": [
|
466 |
+
"# !accelerate launch --no_python\n",
|
467 |
+
"!lm_eval \\\n",
|
468 |
+
" --model hf \\\n",
|
469 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
470 |
+
" --include_path ./ \\\n",
|
471 |
+
" --tasks yes_or_no_tasks \\\n",
|
472 |
+
" --limit 10 \\\n",
|
473 |
+
" --output output/yes_or_no_tasks/ \\\n",
|
474 |
+
" --log_samples\n"
|
475 |
+
]
|
476 |
+
},
|
477 |
+
{
|
478 |
+
"cell_type": "markdown",
|
479 |
+
"metadata": {
|
480 |
+
"id": "XceRKCuuDtbn"
|
481 |
+
},
|
482 |
+
"source": [
|
483 |
+
"## Edit Prompt Templates Quickly\n",
|
484 |
+
"\n",
|
485 |
+
"The following is a yaml made to evaluate the specific subtask of `high_school_geography` from MMLU. It uses the standard prompt where the we choose the letters from the options with most likelihood as the model's prediction."
|
486 |
+
]
|
487 |
+
},
|
488 |
+
{
|
489 |
+
"cell_type": "code",
|
490 |
+
"execution_count": 7,
|
491 |
+
"metadata": {
|
492 |
+
"id": "GTFvdt9kSlBG"
|
493 |
+
},
|
494 |
+
"outputs": [],
|
495 |
+
"source": [
|
496 |
+
"YAML_mmlu_geo_string = '''\n",
|
497 |
+
"task: demo_mmlu_high_school_geography\n",
|
498 |
+
"dataset_path: cais/mmlu\n",
|
499 |
+
"dataset_name: high_school_geography\n",
|
500 |
+
"description: \"The following are multiple choice questions (with answers) about high school geography.\\n\\n\"\n",
|
501 |
+
"test_split: test\n",
|
502 |
+
"fewshot_split: dev\n",
|
503 |
+
"fewshot_config:\n",
|
504 |
+
" sampler: first_n\n",
|
505 |
+
"output_type: multiple_choice\n",
|
506 |
+
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
|
507 |
+
"doc_to_choice: [\"A\", \"B\", \"C\", \"D\"]\n",
|
508 |
+
"doc_to_target: answer\n",
|
509 |
+
"metric_list:\n",
|
510 |
+
" - metric: acc\n",
|
511 |
+
" aggregation: mean\n",
|
512 |
+
" higher_is_better: true\n",
|
513 |
+
" - metric: acc_norm\n",
|
514 |
+
" aggregation: mean\n",
|
515 |
+
" higher_is_better: true\n",
|
516 |
+
"'''\n",
|
517 |
+
"with open('mmlu_high_school_geography.yaml', 'w') as f:\n",
|
518 |
+
" f.write(YAML_mmlu_geo_string)\n"
|
519 |
+
]
|
520 |
+
},
|
521 |
+
{
|
522 |
+
"cell_type": "code",
|
523 |
+
"execution_count": 8,
|
524 |
+
"metadata": {
|
525 |
+
"id": "jyKOfCsKb-xy"
|
526 |
+
},
|
527 |
+
"outputs": [
|
528 |
+
{
|
529 |
+
"name": "stdout",
|
530 |
+
"output_type": "stream",
|
531 |
+
"text": [
|
532 |
+
"2023-11-29:11:57:23,598 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
|
533 |
+
"2023-11-29 11:57:24.719750: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
534 |
+
"2023-11-29 11:57:24.719806: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
535 |
+
"2023-11-29 11:57:24.719847: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
536 |
+
"2023-11-29 11:57:26.656125: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
537 |
+
"2023-11-29:11:57:31,563 INFO [__main__.py:132] Verbosity set to INFO\n",
|
538 |
+
"2023-11-29:11:57:40,541 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
|
539 |
+
"2023-11-29:11:57:40,541 INFO [__main__.py:143] Including path: ./\n",
|
540 |
+
"2023-11-29:11:57:40,558 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography']\n",
|
541 |
+
"2023-11-29:11:57:40,559 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
|
542 |
+
"2023-11-29:11:57:40,589 INFO [huggingface.py:120] Using device 'cuda'\n",
|
543 |
+
"Downloading builder script: 100% 5.84k/5.84k [00:00<00:00, 17.7MB/s]\n",
|
544 |
+
"Downloading metadata: 100% 106k/106k [00:00<00:00, 892kB/s] \n",
|
545 |
+
"Downloading readme: 100% 39.7k/39.7k [00:00<00:00, 631kB/s]\n",
|
546 |
+
"Downloading data: 100% 166M/166M [00:01<00:00, 89.0MB/s]\n",
|
547 |
+
"Generating auxiliary_train split: 100% 99842/99842 [00:07<00:00, 12536.83 examples/s]\n",
|
548 |
+
"Generating test split: 100% 198/198 [00:00<00:00, 1439.20 examples/s]\n",
|
549 |
+
"Generating validation split: 100% 22/22 [00:00<00:00, 4181.76 examples/s]\n",
|
550 |
+
"Generating dev split: 100% 5/5 [00:00<00:00, 36.25 examples/s]\n",
|
551 |
+
"2023-11-29:11:58:09,798 INFO [task.py:355] Building contexts for task on rank 0...\n",
|
552 |
+
"2023-11-29:11:58:09,822 INFO [evaluator.py:319] Running loglikelihood requests\n",
|
553 |
+
"100% 40/40 [00:05<00:00, 7.86it/s]\n",
|
554 |
+
"fatal: not a git repository (or any of the parent directories): .git\n",
|
555 |
+
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
|
556 |
+
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
|
557 |
+
"|-------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
|
558 |
+
"|demo_mmlu_high_school_geography|Yaml |none | 0|acc | 0.3|± |0.1528|\n",
|
559 |
+
"| | |none | 0|acc_norm| 0.3|± |0.1528|\n",
|
560 |
+
"\n"
|
561 |
+
]
|
562 |
+
}
|
563 |
+
],
|
564 |
+
"source": [
|
565 |
+
"# !accelerate launch --no_python\n",
|
566 |
+
"!lm_eval \\\n",
|
567 |
+
" --model hf \\\n",
|
568 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
569 |
+
" --include_path ./ \\\n",
|
570 |
+
" --tasks demo_mmlu_high_school_geography \\\n",
|
571 |
+
" --limit 10 \\\n",
|
572 |
+
" --output output/mmlu_high_school_geography/ \\\n",
|
573 |
+
" --log_samples"
|
574 |
+
]
|
575 |
+
},
|
576 |
+
{
|
577 |
+
"cell_type": "markdown",
|
578 |
+
"metadata": {
|
579 |
+
"id": "jyKOfCsKb-xy"
|
580 |
+
},
|
581 |
+
"source": [
|
582 |
+
"We could also evaluate this task in a different way. For example, instead of observing the loglikelihood of the letters, we can instead evaluate on the choices themselves as the continuation. This is done by simply changing `doc_to_choice` from a list of letters to the corresponding `choices` field from the HF dataset. We write `\"{{choices}}\"` so that the string field is interpreted as jinja string that acquires the list from the HF dataset directly.\n",
|
583 |
+
"\n",
|
584 |
+
"Another convenient feature here is since we're only modifying the `doc_to_choice` and the rest of config is the same as the task above, we can use the above configuration as a template by using `include: mmlu_high_school_geography.yaml` to load the config from that file. We'll need to add a unique task name as to not colide with the existing yaml config we're including. For this case we'll simply name this one `mmlu_high_school_geography_continuation`. `doc_to_text` is added here just for sake of clarity."
|
585 |
+
]
|
586 |
+
},
|
587 |
+
{
|
588 |
+
"cell_type": "code",
|
589 |
+
"execution_count": 9,
|
590 |
+
"metadata": {
|
591 |
+
"id": "lqElwU54TaK-"
|
592 |
+
},
|
593 |
+
"outputs": [],
|
594 |
+
"source": [
|
595 |
+
"YAML_mmlu_geo_string = '''\n",
|
596 |
+
"include: mmlu_high_school_geography.yaml\n",
|
597 |
+
"task: demo_mmlu_high_school_geography_continuation\n",
|
598 |
+
"doc_to_text: \"{{question.strip()}}\\nA. {{choices[0]}}\\nB. {{choices[1]}}\\nC. {{choices[2]}}\\nD. {{choices[3]}}\\nAnswer:\"\n",
|
599 |
+
"doc_to_choice: \"{{choices}}\"\n",
|
600 |
+
"'''\n",
|
601 |
+
"with open('mmlu_high_school_geography_continuation.yaml', 'w') as f:\n",
|
602 |
+
" f.write(YAML_mmlu_geo_string)\n"
|
603 |
+
]
|
604 |
+
},
|
605 |
+
{
|
606 |
+
"cell_type": "code",
|
607 |
+
"execution_count": 10,
|
608 |
+
"metadata": {
|
609 |
+
"id": "-_CVnDirdy7j"
|
610 |
+
},
|
611 |
+
"outputs": [
|
612 |
+
{
|
613 |
+
"name": "stdout",
|
614 |
+
"output_type": "stream",
|
615 |
+
"text": [
|
616 |
+
"2023-11-29:11:58:21,284 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
|
617 |
+
"2023-11-29 11:58:22.850159: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
618 |
+
"2023-11-29 11:58:22.850219: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
619 |
+
"2023-11-29 11:58:22.850254: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
620 |
+
"2023-11-29 11:58:24.948103: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
621 |
+
"2023-11-29:11:58:28,460 INFO [__main__.py:132] Verbosity set to INFO\n",
|
622 |
+
"2023-11-29:11:58:37,935 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
|
623 |
+
"2023-11-29:11:58:37,935 INFO [__main__.py:143] Including path: ./\n",
|
624 |
+
"2023-11-29:11:58:37,969 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_continuation']\n",
|
625 |
+
"2023-11-29:11:58:37,972 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
|
626 |
+
"2023-11-29:11:58:38,008 INFO [huggingface.py:120] Using device 'cuda'\n",
|
627 |
+
"2023-11-29:11:58:59,758 INFO [task.py:355] Building contexts for task on rank 0...\n",
|
628 |
+
"2023-11-29:11:58:59,777 INFO [evaluator.py:319] Running loglikelihood requests\n",
|
629 |
+
"100% 40/40 [00:02<00:00, 16.23it/s]\n",
|
630 |
+
"fatal: not a git repository (or any of the parent directories): .git\n",
|
631 |
+
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
|
632 |
+
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
|
633 |
+
"|--------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
|
634 |
+
"|demo_mmlu_high_school_geography_continuation|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
|
635 |
+
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
|
636 |
+
"\n"
|
637 |
+
]
|
638 |
+
}
|
639 |
+
],
|
640 |
+
"source": [
|
641 |
+
"# !accelerate launch --no_python\n",
|
642 |
+
"!lm_eval \\\n",
|
643 |
+
" --model hf \\\n",
|
644 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
645 |
+
" --include_path ./ \\\n",
|
646 |
+
" --tasks demo_mmlu_high_school_geography_continuation \\\n",
|
647 |
+
" --limit 10 \\\n",
|
648 |
+
" --output output/mmlu_high_school_geography_continuation/ \\\n",
|
649 |
+
" --log_samples\n"
|
650 |
+
]
|
651 |
+
},
|
652 |
+
{
|
653 |
+
"cell_type": "markdown",
|
654 |
+
"metadata": {
|
655 |
+
"id": "-_CVnDirdy7j"
|
656 |
+
},
|
657 |
+
"source": [
|
658 |
+
"If we take a look at the samples, we can see that it is in fact evaluating the continuation based on the choices rather than the letters."
|
659 |
+
]
|
660 |
+
},
|
661 |
+
{
|
662 |
+
"cell_type": "code",
|
663 |
+
"execution_count": 11,
|
664 |
+
"metadata": {
|
665 |
+
"id": "duBDqC6PAdjL"
|
666 |
+
},
|
667 |
+
"outputs": [
|
668 |
+
{
|
669 |
+
"data": {
|
670 |
+
"application/javascript": "\n ((filepath) => {{\n if (!google.colab.kernel.accessAllowed) {{\n return;\n }}\n google.colab.files.view(filepath);\n }})(\"/content/output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")",
|
671 |
+
"text/plain": [
|
672 |
+
"<IPython.core.display.Javascript object>"
|
673 |
+
]
|
674 |
+
},
|
675 |
+
"metadata": {},
|
676 |
+
"output_type": "display_data"
|
677 |
+
}
|
678 |
+
],
|
679 |
+
"source": [
|
680 |
+
"from google.colab import files\n",
|
681 |
+
"files.view(\"output/mmlu_high_school_geography_continuation/pretrained__EleutherAI__pythia-2.8b_demo_mmlu_high_school_geography_continuation.jsonl\")\n"
|
682 |
+
]
|
683 |
+
},
|
684 |
+
{
|
685 |
+
"cell_type": "markdown",
|
686 |
+
"metadata": {
|
687 |
+
"id": "6p0-KPwAgK5j"
|
688 |
+
},
|
689 |
+
"source": [
|
690 |
+
"## Closer Look at YAML Fields\n",
|
691 |
+
"\n",
|
692 |
+
"To prepare a task we can simply fill in a YAML config with the relevant information.\n",
|
693 |
+
"\n",
|
694 |
+
"`output_type`\n",
|
695 |
+
"The current provided evaluation types comprise of the following:\n",
|
696 |
+
"1. `loglikelihood`: Evaluates the loglikelihood of a continuation, conditioned on some input string.\n",
|
697 |
+
"2. `loglikelihood_rolling`: evaluate the loglikelihood of producing a string, conditioned on the empty string. (Used for perplexity evaluations)\n",
|
698 |
+
"3. `multiple_choice`: Evaluates loglikelihood among the a number of choices predicted by the model.\n",
|
699 |
+
"4. `greedy_until`: Model outputs greedy generation (can be configured to to use beam search and other generation-related parameters)\n",
|
700 |
+
"\n",
|
701 |
+
"The core prompt revolves around 3 fields.\n",
|
702 |
+
"1. `doc_to_text`: Denotes the prompt template that will be used as input to the model.\n",
|
703 |
+
"2. `doc_to_choice`: Available choices that will be used as continuation for the model. This is used when the `output_type` is `multiple_choice`, and otherwise can be left as `None`.\n",
|
704 |
+
"3. `doc_to_target`: When `output_type` is `multiple_choice`, this can be an index that corresponds to the correct answer, or the answer string itself (must be a subset of `doc_to_choice`). For other tasks, this is expected to be a string. You can fill this field with a feature name from the HF dataset so long as the resulting feature follows the conditioned described.\n",
|
705 |
+
"\n",
|
706 |
+
"These three fields can be expressed as strings, column names from the source dataset, or as Jinja2 templates that can use fields from the source dataset as variables.\n"
|
707 |
+
]
|
708 |
+
},
|
709 |
+
{
|
710 |
+
"cell_type": "markdown",
|
711 |
+
"metadata": {
|
712 |
+
"id": "6p0-KPwAgK5j"
|
713 |
+
},
|
714 |
+
"source": [
|
715 |
+
"## What if Jinja is not Sufficient?\n",
|
716 |
+
"\n",
|
717 |
+
"There can be times where the Jinja2 templating language is not enough to make the prompt we had in mind. There are a few ways to circumvent this limitation:\n",
|
718 |
+
"\n",
|
719 |
+
"1. Use `!function` operator for the prompt-related fields to pass a python function that takes as input the dataset row, and will output the prompt template component.\n",
|
720 |
+
"2. Perform a transformation on the dataset beforehand."
|
721 |
+
]
|
722 |
+
},
|
723 |
+
{
|
724 |
+
"cell_type": "markdown",
|
725 |
+
"metadata": {},
|
726 |
+
"source": [
|
727 |
+
"Below, we show an example of using `!function` to create `doc_to_text` from a python function:"
|
728 |
+
]
|
729 |
+
},
|
730 |
+
{
|
731 |
+
"cell_type": "code",
|
732 |
+
"execution_count": 12,
|
733 |
+
"metadata": {
|
734 |
+
"colab": {
|
735 |
+
"base_uri": "https://localhost:8080/"
|
736 |
+
},
|
737 |
+
"id": "DYZ5c0JhR1lJ",
|
738 |
+
"outputId": "ca945235-fb9e-4f17-8bfa-78e7d6ec1490"
|
739 |
+
},
|
740 |
+
"outputs": [
|
741 |
+
{
|
742 |
+
"name": "stdout",
|
743 |
+
"output_type": "stream",
|
744 |
+
"text": [
|
745 |
+
"2023-11-29:11:59:08,312 INFO [utils.py:160] NumExpr defaulting to 2 threads.\n",
|
746 |
+
"2023-11-29 11:59:09.348327: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
|
747 |
+
"2023-11-29 11:59:09.348387: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
|
748 |
+
"2023-11-29 11:59:09.348421: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
|
749 |
+
"2023-11-29 11:59:10.573752: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
|
750 |
+
"2023-11-29:11:59:14,044 INFO [__main__.py:132] Verbosity set to INFO\n",
|
751 |
+
"2023-11-29:11:59:23,654 WARNING [__main__.py:138] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
|
752 |
+
"2023-11-29:11:59:23,654 INFO [__main__.py:143] Including path: ./\n",
|
753 |
+
"2023-11-29:11:59:23,678 INFO [__main__.py:205] Selected Tasks: ['demo_mmlu_high_school_geography_function_prompt']\n",
|
754 |
+
"2023-11-29:11:59:23,679 WARNING [evaluator.py:93] generation_kwargs specified through cli, these settings will be used over set parameters in yaml tasks.\n",
|
755 |
+
"2023-11-29:11:59:23,708 INFO [huggingface.py:120] Using device 'cuda'\n",
|
756 |
+
"2023-11-29:11:59:44,516 INFO [task.py:355] Building contexts for task on rank 0...\n",
|
757 |
+
"2023-11-29:11:59:44,524 INFO [evaluator.py:319] Running loglikelihood requests\n",
|
758 |
+
"100% 40/40 [00:02<00:00, 15.41it/s]\n",
|
759 |
+
"fatal: not a git repository (or any of the parent directories): .git\n",
|
760 |
+
"hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (), limit: 10.0, num_fewshot: None, batch_size: 1\n",
|
761 |
+
"| Tasks |Version|Filter|n-shot| Metric |Value| |Stderr|\n",
|
762 |
+
"|-----------------------------------------------|-------|------|-----:|--------|----:|---|-----:|\n",
|
763 |
+
"|demo_mmlu_high_school_geography_function_prompt|Yaml |none | 0|acc | 0.1|± |0.1000|\n",
|
764 |
+
"| | |none | 0|acc_norm| 0.2|± |0.1333|\n",
|
765 |
+
"\n"
|
766 |
+
]
|
767 |
+
}
|
768 |
+
],
|
769 |
+
"source": [
|
770 |
+
"YAML_mmlu_geo_string = '''\n",
|
771 |
+
"include: mmlu_high_school_geography.yaml\n",
|
772 |
+
"task: demo_mmlu_high_school_geography_function_prompt\n",
|
773 |
+
"doc_to_text: !function utils.doc_to_text\n",
|
774 |
+
"doc_to_choice: \"{{choices}}\"\n",
|
775 |
+
"'''\n",
|
776 |
+
"with open('demo_mmlu_high_school_geography_function_prompt.yaml', 'w') as f:\n",
|
777 |
+
" f.write(YAML_mmlu_geo_string)\n",
|
778 |
+
"\n",
|
779 |
+
"DOC_TO_TEXT = '''\n",
|
780 |
+
"def doc_to_text(x):\n",
|
781 |
+
" question = x[\"question\"].strip()\n",
|
782 |
+
" choices = x[\"choices\"]\n",
|
783 |
+
" option_a = choices[0]\n",
|
784 |
+
" option_b = choices[1]\n",
|
785 |
+
" option_c = choices[2]\n",
|
786 |
+
" option_d = choices[3]\n",
|
787 |
+
" return f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
|
788 |
+
"'''\n",
|
789 |
+
"with open('utils.py', 'w') as f:\n",
|
790 |
+
" f.write(DOC_TO_TEXT)\n",
|
791 |
+
"\n",
|
792 |
+
"!lm_eval \\\n",
|
793 |
+
" --model hf \\\n",
|
794 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
795 |
+
" --include_path ./ \\\n",
|
796 |
+
" --tasks demo_mmlu_high_school_geography_function_prompt \\\n",
|
797 |
+
" --limit 10 \\\n",
|
798 |
+
" --output output/demo_mmlu_high_school_geography_function_prompt/ \\\n",
|
799 |
+
" --log_samples\n"
|
800 |
+
]
|
801 |
+
},
|
802 |
+
{
|
803 |
+
"cell_type": "markdown",
|
804 |
+
"metadata": {},
|
805 |
+
"source": [
|
806 |
+
"Next, we'll also show how to do this via preprocessing the dataset as necessary using the `process_docs` config field:\n",
|
807 |
+
"\n",
|
808 |
+
"We will write a function that will modify each document in our evaluation dataset's split to add a field that is suitable for us to use in `doc_to_text`."
|
809 |
+
]
|
810 |
+
},
|
811 |
+
{
|
812 |
+
"cell_type": "code",
|
813 |
+
"execution_count": null,
|
814 |
+
"metadata": {},
|
815 |
+
"outputs": [],
|
816 |
+
"source": [
|
817 |
+
"YAML_mmlu_geo_string = '''\n",
|
818 |
+
"include: mmlu_high_school_geography.yaml\n",
|
819 |
+
"task: demo_mmlu_high_school_geography_function_prompt_2\n",
|
820 |
+
"process_docs: !function utils_process_docs.process_docs\n",
|
821 |
+
"doc_to_text: \"{{input}}\"\n",
|
822 |
+
"doc_to_choice: \"{{choices}}\"\n",
|
823 |
+
"'''\n",
|
824 |
+
"with open('demo_mmlu_high_school_geography_process_docs.yaml', 'w') as f:\n",
|
825 |
+
" f.write(YAML_mmlu_geo_string)\n",
|
826 |
+
"\n",
|
827 |
+
"DOC_TO_TEXT = '''\n",
|
828 |
+
"def process_docs(dataset):\n",
|
829 |
+
" def _process_doc(x):\n",
|
830 |
+
" question = x[\"question\"].strip()\n",
|
831 |
+
" choices = x[\"choices\"]\n",
|
832 |
+
" option_a = choices[0]\n",
|
833 |
+
" option_b = choices[1]\n",
|
834 |
+
" option_c = choices[2]\n",
|
835 |
+
" option_d = choices[3]\n",
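" # Attach the formatted prompt as a new 'input' field so the YAML doc_to_text template \"{{input}}\" can use it.\n",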
|
836 |
+
" doc[\"input\"] = f\"{question}\\\\nA. {option_a}\\\\nB. {option_b}\\\\nC. {option_c}\\\\nD. {option_d}\\\\nAnswer:\"\n",
|
837 |
+
 return x\n",
|
838 |
+
"\n",
|
839 |
+
" return dataset.map(_process_doc)\n",
|
840 |
+
"'''\n",
|
841 |
+
"\n",
|
842 |
+
"with open('utils_process_docs.py', 'w') as f:\n",
|
843 |
+
" f.write(DOC_TO_TEXT)\n",
|
844 |
+
"\n",
|
845 |
+
"!lm_eval \\\n",
|
846 |
+
" --model hf \\\n",
|
847 |
+
" --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
|
848 |
+
" --include_path ./ \\\n",
|
849 |
+
" --tasks demo_mmlu_high_school_geography_function_prompt_2 \\\n",
|
850 |
+
" --limit 10 \\\n",
|
851 |
+
" --output output/demo_mmlu_high_school_geography_function_prompt_2/ \\\n",
|
852 |
+
" --log_samples\n"
|
853 |
+
]
|
854 |
+
},
|
855 |
+
{
|
856 |
+
"cell_type": "markdown",
|
857 |
+
"metadata": {},
|
858 |
+
"source": [
|
859 |
+
"We hope that this explainer gives you a sense of what can be done with and how to work with LM-Evaluation-Harnes v0.4.0 ! \n",
|
860 |
+
"\n",
|
861 |
+
"For more information, check out our documentation pages in the `docs/` folder, and if you have questions, please raise them in GitHub issues, or in #lm-thunderdome or #release-discussion on the EleutherAI discord server."
|
862 |
+
]
|
863 |
+
}
|
864 |
+
],
|
865 |
+
"metadata": {
|
866 |
+
"accelerator": "GPU",
|
867 |
+
"colab": {
|
868 |
+
"collapsed_sections": [
|
869 |
+
"zAov81vTbL2K"
|
870 |
+
],
|
871 |
+
"gpuType": "T4",
|
872 |
+
"provenance": []
|
873 |
+
},
|
874 |
+
"kernelspec": {
|
875 |
+
"display_name": "Python 3",
|
876 |
+
"name": "python3"
|
877 |
+
},
|
878 |
+
"language_info": {
|
879 |
+
"name": "python"
|
880 |
+
},
|
881 |
+
"widgets": {
|
882 |
+
"application/vnd.jupyter.widget-state+json": {
|
883 |
+
"46f521b73fd943c081c648fd873ebc0a": {
|
884 |
+
"model_module": "@jupyter-widgets/controls",
|
885 |
+
"model_module_version": "1.5.0",
|
886 |
+
"model_name": "DescriptionStyleModel",
|
887 |
+
"state": {
|
888 |
+
"_model_module": "@jupyter-widgets/controls",
|
889 |
+
"_model_module_version": "1.5.0",
|
890 |
+
"_model_name": "DescriptionStyleModel",
|
891 |
+
"_view_count": null,
|
892 |
+
"_view_module": "@jupyter-widgets/base",
|
893 |
+
"_view_module_version": "1.2.0",
|
894 |
+
"_view_name": "StyleView",
|
895 |
+
"description_width": ""
|
896 |
+
}
|
897 |
+
},
|
898 |
+
"48763b6233374554ae76035c0483066f": {
|
899 |
+
"model_module": "@jupyter-widgets/controls",
|
900 |
+
"model_module_version": "1.5.0",
|
901 |
+
"model_name": "ProgressStyleModel",
|
902 |
+
"state": {
|
903 |
+
"_model_module": "@jupyter-widgets/controls",
|
904 |
+
"_model_module_version": "1.5.0",
|
905 |
+
"_model_name": "ProgressStyleModel",
|
906 |
+
"_view_count": null,
|
907 |
+
"_view_module": "@jupyter-widgets/base",
|
908 |
+
"_view_module_version": "1.2.0",
|
909 |
+
"_view_name": "StyleView",
|
910 |
+
"bar_color": null,
|
911 |
+
"description_width": ""
|
912 |
+
}
|
913 |
+
},
|
914 |
+
"4986a21eb560448fa79f4b25cde48951": {
|
915 |
+
"model_module": "@jupyter-widgets/base",
|
916 |
+
"model_module_version": "1.2.0",
|
917 |
+
"model_name": "LayoutModel",
|
918 |
+
"state": {
|
919 |
+
"_model_module": "@jupyter-widgets/base",
|
920 |
+
"_model_module_version": "1.2.0",
|
921 |
+
"_model_name": "LayoutModel",
|
922 |
+
"_view_count": null,
|
923 |
+
"_view_module": "@jupyter-widgets/base",
|
924 |
+
"_view_module_version": "1.2.0",
|
925 |
+
"_view_name": "LayoutView",
|
926 |
+
"align_content": null,
|
927 |
+
"align_items": null,
|
928 |
+
"align_self": null,
|
929 |
+
"border": null,
|
930 |
+
"bottom": null,
|
931 |
+
"display": null,
|
932 |
+
"flex": null,
|
933 |
+
"flex_flow": null,
|
934 |
+
"grid_area": null,
|
935 |
+
"grid_auto_columns": null,
|
936 |
+
"grid_auto_flow": null,
|
937 |
+
"grid_auto_rows": null,
|
938 |
+
"grid_column": null,
|
939 |
+
"grid_gap": null,
|
940 |
+
"grid_row": null,
|
941 |
+
"grid_template_areas": null,
|
942 |
+
"grid_template_columns": null,
|
943 |
+
"grid_template_rows": null,
|
944 |
+
"height": null,
|
945 |
+
"justify_content": null,
|
946 |
+
"justify_items": null,
|
947 |
+
"left": null,
|
948 |
+
"margin": null,
|
949 |
+
"max_height": null,
|
950 |
+
"max_width": null,
|
951 |
+
"min_height": null,
|
952 |
+
"min_width": null,
|
953 |
+
"object_fit": null,
|
954 |
+
"object_position": null,
|
955 |
+
"order": null,
|
956 |
+
"overflow": null,
|
957 |
+
"overflow_x": null,
|
958 |
+
"overflow_y": null,
|
959 |
+
"padding": null,
|
960 |
+
"right": null,
|
961 |
+
"top": null,
|
962 |
+
"visibility": null,
|
963 |
+
"width": null
|
964 |
+
}
|
965 |
+
},
|
966 |
+
"6b2d90209ec14230b3d58a74ac9b83bf": {
|
967 |
+
"model_module": "@jupyter-widgets/base",
|
968 |
+
"model_module_version": "1.2.0",
|
969 |
+
"model_name": "LayoutModel",
|
970 |
+
"state": {
|
971 |
+
"_model_module": "@jupyter-widgets/base",
|
972 |
+
"_model_module_version": "1.2.0",
|
973 |
+
"_model_name": "LayoutModel",
|
974 |
+
"_view_count": null,
|
975 |
+
"_view_module": "@jupyter-widgets/base",
|
976 |
+
"_view_module_version": "1.2.0",
|
977 |
+
"_view_name": "LayoutView",
|
978 |
+
"align_content": null,
|
979 |
+
"align_items": null,
|
980 |
+
"align_self": null,
|
981 |
+
"border": null,
|
982 |
+
"bottom": null,
|
983 |
+
"display": null,
|
984 |
+
"flex": null,
|
985 |
+
"flex_flow": null,
|
986 |
+
"grid_area": null,
|
987 |
+
"grid_auto_columns": null,
|
988 |
+
"grid_auto_flow": null,
|
989 |
+
"grid_auto_rows": null,
|
990 |
+
"grid_column": null,
|
991 |
+
"grid_gap": null,
|
992 |
+
"grid_row": null,
|
993 |
+
"grid_template_areas": null,
|
994 |
+
"grid_template_columns": null,
|
995 |
+
"grid_template_rows": null,
|
996 |
+
"height": null,
|
997 |
+
"justify_content": null,
|
998 |
+
"justify_items": null,
|
999 |
+
"left": null,
|
1000 |
+
"margin": null,
|
1001 |
+
"max_height": null,
|
1002 |
+
"max_width": null,
|
1003 |
+
"min_height": null,
|
1004 |
+
"min_width": null,
|
1005 |
+
"object_fit": null,
|
1006 |
+
"object_position": null,
|
1007 |
+
"order": null,
|
1008 |
+
"overflow": null,
|
1009 |
+
"overflow_x": null,
|
1010 |
+
"overflow_y": null,
|
1011 |
+
"padding": null,
|
1012 |
+
"right": null,
|
1013 |
+
"top": null,
|
1014 |
+
"visibility": null,
|
1015 |
+
"width": null
|
1016 |
+
}
|
1017 |
+
},
|
1018 |
+
"7c5689bc13684db8a22681f41863dddd": {
|
1019 |
+
"model_module": "@jupyter-widgets/base",
|
1020 |
+
"model_module_version": "1.2.0",
|
1021 |
+
"model_name": "LayoutModel",
|
1022 |
+
"state": {
|
1023 |
+
"_model_module": "@jupyter-widgets/base",
|
1024 |
+
"_model_module_version": "1.2.0",
|
1025 |
+
"_model_name": "LayoutModel",
|
1026 |
+
"_view_count": null,
|
1027 |
+
"_view_module": "@jupyter-widgets/base",
|
1028 |
+
"_view_module_version": "1.2.0",
|
1029 |
+
"_view_name": "LayoutView",
|
1030 |
+
"align_content": null,
|
1031 |
+
"align_items": null,
|
1032 |
+
"align_self": null,
|
1033 |
+
"border": null,
|
1034 |
+
"bottom": null,
|
1035 |
+
"display": null,
|
1036 |
+
"flex": null,
|
1037 |
+
"flex_flow": null,
|
1038 |
+
"grid_area": null,
|
1039 |
+
"grid_auto_columns": null,
|
1040 |
+
"grid_auto_flow": null,
|
1041 |
+
"grid_auto_rows": null,
|
1042 |
+
"grid_column": null,
|
1043 |
+
"grid_gap": null,
|
1044 |
+
"grid_row": null,
|
1045 |
+
"grid_template_areas": null,
|
1046 |
+
"grid_template_columns": null,
|
1047 |
+
"grid_template_rows": null,
|
1048 |
+
"height": null,
|
1049 |
+
"justify_content": null,
|
1050 |
+
"justify_items": null,
|
1051 |
+
"left": null,
|
1052 |
+
"margin": null,
|
1053 |
+
"max_height": null,
|
1054 |
+
"max_width": null,
|
1055 |
+
"min_height": null,
|
1056 |
+
"min_width": null,
|
1057 |
+
"object_fit": null,
|
1058 |
+
"object_position": null,
|
1059 |
+
"order": null,
|
1060 |
+
"overflow": null,
|
1061 |
+
"overflow_x": null,
|
1062 |
+
"overflow_y": null,
|
1063 |
+
"padding": null,
|
1064 |
+
"right": null,
|
1065 |
+
"top": null,
|
1066 |
+
"visibility": null,
|
1067 |
+
"width": null
|
1068 |
+
}
|
1069 |
+
},
|
1070 |
+
"a1d3a8aa016544a78e8821c8f6199e06": {
|
1071 |
+
"model_module": "@jupyter-widgets/controls",
|
1072 |
+
"model_module_version": "1.5.0",
|
1073 |
+
"model_name": "HBoxModel",
|
1074 |
+
"state": {
|
1075 |
+
"_dom_classes": [],
|
1076 |
+
"_model_module": "@jupyter-widgets/controls",
|
1077 |
+
"_model_module_version": "1.5.0",
|
1078 |
+
"_model_name": "HBoxModel",
|
1079 |
+
"_view_count": null,
|
1080 |
+
"_view_module": "@jupyter-widgets/controls",
|
1081 |
+
"_view_module_version": "1.5.0",
|
1082 |
+
"_view_name": "HBoxView",
|
1083 |
+
"box_style": "",
|
1084 |
+
"children": [
|
1085 |
+
"IPY_MODEL_f61ed33fad754146bdd2ac9db1ba1c48",
|
1086 |
+
"IPY_MODEL_bfa0af6aeff344c6845e1080a878e92e",
|
1087 |
+
"IPY_MODEL_fd1ad9e0367d4004aae853b91c3a7617"
|
1088 |
+
],
|
1089 |
+
"layout": "IPY_MODEL_6b2d90209ec14230b3d58a74ac9b83bf"
|
1090 |
+
}
|
1091 |
+
},
|
1092 |
+
"a73f357065d34d7baf0453ae4a8d75e2": {
|
1093 |
+
"model_module": "@jupyter-widgets/base",
|
1094 |
+
"model_module_version": "1.2.0",
|
1095 |
+
"model_name": "LayoutModel",
|
1096 |
+
"state": {
|
1097 |
+
"_model_module": "@jupyter-widgets/base",
|
1098 |
+
"_model_module_version": "1.2.0",
|
1099 |
+
"_model_name": "LayoutModel",
|
1100 |
+
"_view_count": null,
|
1101 |
+
"_view_module": "@jupyter-widgets/base",
|
1102 |
+
"_view_module_version": "1.2.0",
|
1103 |
+
"_view_name": "LayoutView",
|
1104 |
+
"align_content": null,
|
1105 |
+
"align_items": null,
|
1106 |
+
"align_self": null,
|
1107 |
+
"border": null,
|
1108 |
+
"bottom": null,
|
1109 |
+
"display": null,
|
1110 |
+
"flex": null,
|
1111 |
+
"flex_flow": null,
|
1112 |
+
"grid_area": null,
|
1113 |
+
"grid_auto_columns": null,
|
1114 |
+
"grid_auto_flow": null,
|
1115 |
+
"grid_auto_rows": null,
|
1116 |
+
"grid_column": null,
|
1117 |
+
"grid_gap": null,
|
1118 |
+
"grid_row": null,
|
1119 |
+
"grid_template_areas": null,
|
1120 |
+
"grid_template_columns": null,
|
1121 |
+
"grid_template_rows": null,
|
1122 |
+
"height": null,
|
1123 |
+
"justify_content": null,
|
1124 |
+
"justify_items": null,
|
1125 |
+
"left": null,
|
1126 |
+
"margin": null,
|
1127 |
+
"max_height": null,
|
1128 |
+
"max_width": null,
|
1129 |
+
"min_height": null,
|
1130 |
+
"min_width": null,
|
1131 |
+
"object_fit": null,
|
1132 |
+
"object_position": null,
|
1133 |
+
"order": null,
|
1134 |
+
"overflow": null,
|
1135 |
+
"overflow_x": null,
|
1136 |
+
"overflow_y": null,
|
1137 |
+
"padding": null,
|
1138 |
+
"right": null,
|
1139 |
+
"top": null,
|
1140 |
+
"visibility": null,
|
1141 |
+
"width": null
|
1142 |
+
}
|
1143 |
+
},
|
1144 |
+
"aed3acd2f2d74003b44079c333a0698e": {
|
1145 |
+
"model_module": "@jupyter-widgets/controls",
|
1146 |
+
"model_module_version": "1.5.0",
|
1147 |
+
"model_name": "DescriptionStyleModel",
|
1148 |
+
"state": {
|
1149 |
+
"_model_module": "@jupyter-widgets/controls",
|
1150 |
+
"_model_module_version": "1.5.0",
|
1151 |
+
"_model_name": "DescriptionStyleModel",
|
1152 |
+
"_view_count": null,
|
1153 |
+
"_view_module": "@jupyter-widgets/base",
|
1154 |
+
"_view_module_version": "1.2.0",
|
1155 |
+
"_view_name": "StyleView",
|
1156 |
+
"description_width": ""
|
1157 |
+
}
|
1158 |
+
},
|
1159 |
+
"bfa0af6aeff344c6845e1080a878e92e": {
|
1160 |
+
"model_module": "@jupyter-widgets/controls",
|
1161 |
+
"model_module_version": "1.5.0",
|
1162 |
+
"model_name": "FloatProgressModel",
|
1163 |
+
"state": {
|
1164 |
+
"_dom_classes": [],
|
1165 |
+
"_model_module": "@jupyter-widgets/controls",
|
1166 |
+
"_model_module_version": "1.5.0",
|
1167 |
+
"_model_name": "FloatProgressModel",
|
1168 |
+
"_view_count": null,
|
1169 |
+
"_view_module": "@jupyter-widgets/controls",
|
1170 |
+
"_view_module_version": "1.5.0",
|
1171 |
+
"_view_name": "ProgressView",
|
1172 |
+
"bar_style": "success",
|
1173 |
+
"description": "",
|
1174 |
+
"description_tooltip": null,
|
1175 |
+
"layout": "IPY_MODEL_7c5689bc13684db8a22681f41863dddd",
|
1176 |
+
"max": 5669,
|
1177 |
+
"min": 0,
|
1178 |
+
"orientation": "horizontal",
|
1179 |
+
"style": "IPY_MODEL_48763b6233374554ae76035c0483066f",
|
1180 |
+
"value": 5669
|
1181 |
+
}
|
1182 |
+
},
|
1183 |
+
"f61ed33fad754146bdd2ac9db1ba1c48": {
|
1184 |
+
"model_module": "@jupyter-widgets/controls",
|
1185 |
+
"model_module_version": "1.5.0",
|
1186 |
+
"model_name": "HTMLModel",
|
1187 |
+
"state": {
|
1188 |
+
"_dom_classes": [],
|
1189 |
+
"_model_module": "@jupyter-widgets/controls",
|
1190 |
+
"_model_module_version": "1.5.0",
|
1191 |
+
"_model_name": "HTMLModel",
|
1192 |
+
"_view_count": null,
|
1193 |
+
"_view_module": "@jupyter-widgets/controls",
|
1194 |
+
"_view_module_version": "1.5.0",
|
1195 |
+
"_view_name": "HTMLView",
|
1196 |
+
"description": "",
|
1197 |
+
"description_tooltip": null,
|
1198 |
+
"layout": "IPY_MODEL_a73f357065d34d7baf0453ae4a8d75e2",
|
1199 |
+
"placeholder": "",
|
1200 |
+
"style": "IPY_MODEL_46f521b73fd943c081c648fd873ebc0a",
|
1201 |
+
"value": "Downloading builder script: 100%"
|
1202 |
+
}
|
1203 |
+
},
|
1204 |
+
"fd1ad9e0367d4004aae853b91c3a7617": {
|
1205 |
+
"model_module": "@jupyter-widgets/controls",
|
1206 |
+
"model_module_version": "1.5.0",
|
1207 |
+
"model_name": "HTMLModel",
|
1208 |
+
"state": {
|
1209 |
+
"_dom_classes": [],
|
1210 |
+
"_model_module": "@jupyter-widgets/controls",
|
1211 |
+
"_model_module_version": "1.5.0",
|
1212 |
+
"_model_name": "HTMLModel",
|
1213 |
+
"_view_count": null,
|
1214 |
+
"_view_module": "@jupyter-widgets/controls",
|
1215 |
+
"_view_module_version": "1.5.0",
|
1216 |
+
"_view_name": "HTMLView",
|
1217 |
+
"description": "",
|
1218 |
+
"description_tooltip": null,
|
1219 |
+
"layout": "IPY_MODEL_4986a21eb560448fa79f4b25cde48951",
|
1220 |
+
"placeholder": "",
|
1221 |
+
"style": "IPY_MODEL_aed3acd2f2d74003b44079c333a0698e",
|
1222 |
+
"value": " 5.67k/5.67k [00:00<00:00, 205kB/s]"
|
1223 |
+
}
|
1224 |
+
}
|
1225 |
+
}
|
1226 |
+
}
|
1227 |
+
},
|
1228 |
+
"nbformat": 4,
|
1229 |
+
"nbformat_minor": 0
|
1230 |
+
}
|
scripts/yans/lm-evaluation-harness/examples/visualize-wandb.ipynb
ADDED
@@ -0,0 +1,170 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fc477b96-adee-4829-a9d7-a5eb990df358",
+   "metadata": {},
+   "source": [
+    "# Visualizing Results in Weights and Biases\n",
+    "\n",
+    "With the Weights and Biases integration, you can now spend more time extracting deeper insights into your evaluation results. The integration is designed to streamline the process of logging and visualizing experiment results using the Weights & Biases (W&B) platform.\n",
+    "\n",
+    "The integration provides functionality\n",
+    "\n",
+    "- to automatically log the evaluation results,\n",
+    "- log the samples as W&B Tables for easy visualization,\n",
+    "- log the `results.json` file as an artifact for version control,\n",
+    "- log the `<task_name>_eval_samples.json` file if the samples are logged,\n",
+    "- generate a comprehensive report for analysis and visualization with all the important metrics,\n",
+    "- log task and cli configs,\n",
+    "- and more out of the box, like the command used to run the evaluation, GPU/CPU counts, timestamp, etc.\n",
+    "\n",
+    "The integration is super easy to use with the eval harness. Let's see how!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3851439a-bff4-41f2-bf21-1b3d8704913b",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Install this project if you have not already done so.\n",
+    "# This is all that needs to be installed to start using Weights and Biases.\n",
+    "\n",
+    "!pip -qq install -e ..[wandb]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8507fd7e-3b99-4a92-89fa-9eaada74ba91",
+   "metadata": {},
+   "source": [
+    "# Run the Eval Harness\n",
+    "\n",
+    "Run the eval harness as usual with a `wandb_args` flag. This flag is used to provide arguments for initializing a wandb run ([wandb.init](https://docs.wandb.ai/ref/python/init)) as a comma-separated string of arguments.\n",
+    "\n",
+    "If the `wandb_args` flag is used, the metrics and all other goodness will be automatically logged to Weights and Biases. In the stdout, you will find the link to the W&B run page as well as a link to the generated report."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "eec5866e-f01e-42f8-8803-9d77472ef991",
+   "metadata": {},
+   "source": [
+    "## Set your API Key\n",
+    "\n",
+    "Before you can use W&B, you need to authenticate your machine with an API key. Visit https://wandb.ai/authorize to get one."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d824d163-71a9-4313-935d-f1d56397841c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wandb\n",
+    "\n",
+    "wandb.login()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "124e4a34-1547-4bed-bc09-db012bacbda6",
+   "metadata": {},
+   "source": [
+    "> Note that if you are using the command line, you can simply authenticate your machine by running `wandb login` in your terminal. For more info, check out the [documentation](https://docs.wandb.ai/quickstart#2-log-in-to-wb)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "abc6f6b6-179a-4aff-ada9-f380fb74df6e",
+   "metadata": {},
+   "source": [
+    "## Run and log to W&B"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd0a8130-a97b-451a-acd2-3f9885b88643",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!lm_eval \\\n",
+    "    --model hf \\\n",
+    "    --model_args pretrained=microsoft/phi-2,trust_remote_code=True \\\n",
+    "    --tasks hellaswag,mmlu_abstract_algebra \\\n",
+    "    --device cuda:0 \\\n",
+    "    --batch_size 8 \\\n",
+    "    --output_path output/phi-2 \\\n",
+    "    --limit 10 \\\n",
+    "    --wandb_args project=lm-eval-harness-integration \\\n",
+    "    --log_samples"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e974cabdbe70b667",
+   "metadata": {},
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5178ca9445b844e4",
+   "metadata": {},
+   "source": [
+    "W&B can also be initialized programmatically, for use outside the CLI, to parse and log the results."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6a421b2cf3ddac5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import lm_eval\n",
+    "from lm_eval.loggers import WandbLogger\n",
+    "\n",
+    "results = lm_eval.simple_evaluate(\n",
+    "    model=\"hf\",\n",
+    "    model_args=\"pretrained=microsoft/phi-2,trust_remote_code=True\",\n",
+    "    tasks=\"hellaswag,mmlu_abstract_algebra\",\n",
+    "    log_samples=True,\n",
+    ")\n",
+    "\n",
+    "wandb_logger = WandbLogger(\n",
+    "    project=\"lm-eval-harness-integration\", job_type=\"eval\"\n",
+    ")  # or empty if wandb.init(...) was already called before\n",
+    "wandb_logger.post_init(results)\n",
+    "wandb_logger.log_eval_result()\n",
+    "wandb_logger.log_eval_samples(results[\"samples\"])  # if log_samples"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
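The notebook above passes only `project=...` through `--wandb_args`. Since that flag forwards comma-separated key=value pairs to `wandb.init`, a run can also be named and grouped under a team entity. A minimal sketch, assuming a hypothetical entity `my-team` and run name `phi-2-baseline` (placeholders, not defined anywhere in this repo):

    lm_eval \
        --model hf \
        --model_args pretrained=microsoft/phi-2,trust_remote_code=True \
        --tasks hellaswag,mmlu_abstract_algebra \
        --device cuda:0 \
        --batch_size 8 \
        --output_path output/phi-2 \
        --limit 10 \
        --wandb_args project=lm-eval-harness-integration,entity=my-team,name=phi-2-baseline \
        --log_samples

Any keyword accepted by `wandb.init` (project, entity, name, job_type, ...) should be usable this way, per the wandb.init documentation linked in the notebook.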
scripts/yans/lm-evaluation-harness/examples/visualize-zeno.ipynb
ADDED
@@ -0,0 +1,115 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Visualizing Results in Zeno\n",
+    "\n",
+    "Benchmarking your models is the first step towards making sure your model performs well.\n",
+    "However, looking at the data behind the benchmark, slicing the data into subsets, and comparing models on individual instances can help you even more in evaluating and quantifying the behavior of your AI system.\n",
+    "\n",
+    "All of this can be done in [Zeno](https://zenoml.com)!\n",
+    "Zeno is super easy to use with the eval harness; let's explore how you can easily upload and visualize your eval results.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Install this project if you have not already done so. This is all that needs to be installed for you to be able to visualize your data in Zeno!\n",
+    "!pip install -e ..\n",
+    "!pip install -e ..[zeno]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Run the Eval Harness\n",
+    "\n",
+    "To visualize the results, run the eval harness with the `log_samples` and `output_path` flags. We expect `output_path` to contain multiple folders that represent individual model names. You can thus run your evaluation on any number of tasks and models and upload all of the results as projects on Zeno.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!lm_eval \\\n",
+    "    --model hf \\\n",
+    "    --model_args pretrained=EleutherAI/gpt-neo-2.7B \\\n",
+    "    --tasks hellaswag,wikitext \\\n",
+    "    --batch_size 8 \\\n",
+    "    --device mps \\\n",
+    "    --log_samples \\\n",
+    "    --output_path output/gpt-neo-2.7B \\\n",
+    "    --limit 10"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Set your API Key\n",
+    "\n",
+    "This is so you can be authenticated with Zeno.\n",
+    "If you don't already have a Zeno account, first create an account on [Zeno Hub](https://hub.zenoml.com).\n",
+    "After logging in to Zeno Hub, generate your API key by clicking on your profile at the bottom left to navigate to your account page.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%env ZENO_API_KEY=YOUR_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Visualize Eval Results\n",
+    "\n",
+    "You can now use the `zeno_visualize` script to upload the results to Zeno.\n",
+    "\n",
+    "This will use all subfolders in `data_path` as different models and upload all tasks within these model folders to Zeno. If you run the eval harness on multiple tasks, the `project_name` will be used as a prefix and one project will be created per task.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!python ../scripts/zeno_visualize.py --data_path output --project_name \"Zeno Upload Test\""
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "zeno_projects",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
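Since `zeno_visualize.py` treats every subfolder of `--data_path` as one model, comparing models just means writing each run into its own folder before uploading. A minimal sketch, assuming a second placeholder model `EleutherAI/pythia-160m`; the folder names are simply whatever you pass to `--output_path`:

    # one run per model, each into its own subfolder of output/
    lm_eval --model hf --model_args pretrained=EleutherAI/gpt-neo-2.7B \
        --tasks hellaswag,wikitext --batch_size 8 --log_samples \
        --output_path output/gpt-neo-2.7B --limit 10

    lm_eval --model hf --model_args pretrained=EleutherAI/pythia-160m \
        --tasks hellaswag,wikitext --batch_size 8 --log_samples \
        --output_path output/pythia-160m --limit 10

    # upload both; per the notebook, one Zeno project is created per task, prefixed with project_name
    python ../scripts/zeno_visualize.py --data_path output --project_name "Zeno Upload Test"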
scripts/yans/lm-evaluation-harness/ignore.txt
ADDED
@@ -0,0 +1,8 @@
+ROUGE
+rouge
+nin
+maka
+mor
+te
+ond
+extraversion
scripts/yans/lm-evaluation-harness/ja_eval.sh
ADDED
@@ -0,0 +1,5 @@
+lm_eval --model hf \
+    --model_args pretrained=/share/pretrained_lm/Qwen/Qwen2-0.5B \
+    --tasks jaqket_v1,jaqket_v2,jaquad,jblimp,jcola,jcommonsenseqa,jnli,jsquad,marc_ja,mgsm,wikilingua_ja,xlsum_ja,xwinograd_ja \
+    --device cuda:0 \
+    --batch_size 8
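Note that the original lines ran the backslash continuation directly against the preceding argument (e.g. `Qwen2-0.5B\`), which can glue `--tasks` onto the `--model_args` value; a space before each backslash avoids that. ja_eval.sh also only prints metrics to stdout. If the results should additionally be written to disk (for example to feed the W&B or Zeno upload flows shown in the notebooks above), the flags already used elsewhere in this repo can be added; a sketch, with `output/Qwen2-0.5B-ja` as a placeholder path:

    lm_eval --model hf \
        --model_args pretrained=/share/pretrained_lm/Qwen/Qwen2-0.5B \
        --tasks jaqket_v1,jaqket_v2,jaquad,jblimp,jcola,jcommonsenseqa,jnli,jsquad,marc_ja,mgsm,wikilingua_ja,xlsum_ja,xwinograd_ja \
        --device cuda:0 \
        --batch_size 8 \
        --output_path output/Qwen2-0.5B-ja \
        --log_samples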
scripts/yans/lm-evaluation-harness/ja_eval2.sh
ADDED
@@ -0,0 +1,5 @@
+lm_eval --model hf \
+    --model_args pretrained=/share/pretrained_lm/Qwen/Qwen2-0.5B \
+    --tasks "ja/jaqket_v1" \
+    --device cuda:0 \
+    --batch_size 8