{ "cells": [ { "cell_type": "code", "execution_count": 17, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, "id": "MCiLSwoWQK7z", "outputId": "5efbb6bc-0e2d-4df7-a5b2-36f4448960a8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git\n", " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to /tmp/pip-req-build-j2xmmhxh\n", " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-j2xmmhxh\n", " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit b4cd85d406938f94ee5d451840a0d69bbda27006\n", " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.30.1)\n", "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.4.2)\n", "Requirement already satisfied: datasets>=2.16.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.19.1)\n", "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (4.0.0)\n", "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.10.0)\n", "Requirement already satisfied: peft>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.11.1)\n", "Requirement already satisfied: pybind11>=2.6.2 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.12.0)\n", "Requirement already satisfied: pytablewriter in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.2.0)\n", "Requirement already satisfied: rouge-score>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.1.2)\n", "Requirement already satisfied: sacrebleu>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.4.2)\n", "Requirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.2.2)\n", "Requirement already satisfied: sqlitedict in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.1.0)\n", "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.3.0+cu121)\n", "Requirement already satisfied: tqdm-multiprocess in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.0.11)\n", "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (4.41.1)\n", "Requirement already satisfied: zstandard in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.22.0)\n", "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.3.8)\n", "Requirement already satisfied: word2number in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.1)\n", "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (10.1.0)\n", "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (1.25.2)\n", "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (24.0)\n", "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (5.9.5)\n", "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (6.0.1)\n", "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (0.23.1)\n", "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (0.4.3)\n", "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.14.0)\n", "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (14.0.2)\n", "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (0.6)\n", "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2.0.3)\n", "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2.31.0)\n", "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (4.66.4)\n", "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.4.1)\n", "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (0.70.16)\n", "Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2023.6.0)\n", "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.9.5)\n", "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (1.4.0)\n", "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (3.8.1)\n", "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (1.16.0)\n", "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (2.8.2)\n", "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (2024.5.15)\n", "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (0.9.0)\n", "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (0.4.6)\n", "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (4.9.4)\n", "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (1.11.4)\n", "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (1.4.2)\n", "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (3.5.0)\n", "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (4.11.0)\n", "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (1.12)\n", "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (3.3)\n", "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (3.1.4)\n", "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n", "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n", "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (8.9.2.26)\n", "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.3.1)\n", "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (11.0.2.54)\n", "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (10.3.2.106)\n", "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (11.4.5.107)\n", "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.0.106)\n", "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (2.20.5)\n", "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n", "Requirement already satisfied: triton==2.3.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (2.3.0)\n", "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.8->lm_eval==0.4.2) (12.5.40)\n", "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm_eval==0.4.2) (0.19.1)\n", "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm_eval==0.4.2) (23.2.0)\n", "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (67.7.2)\n", "Requirement already satisfied: DataProperty<2,>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.0.1)\n", "Requirement already satisfied: mbstrdecoder<2,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.1.3)\n", "Requirement already satisfied: pathvalidate<4,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (3.2.0)\n", "Requirement already satisfied: tabledata<2,>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.3.3)\n", "Requirement already satisfied: tcolorpy<1,>=0.0.5 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (0.1.6)\n", "Requirement already satisfied: typepy[datetime]<2,>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.3.2)\n", "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.3.1)\n", "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.4.1)\n", "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (6.0.5)\n", "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.9.4)\n", "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (4.0.3)\n", "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm_eval==0.4.2) (5.2.0)\n", "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (3.3.2)\n", "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (3.7)\n", "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (2.0.7)\n", "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (2024.2.2)\n", "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.2) (2.8.2)\n", "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.2) (2023.4)\n", "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm_eval==0.4.2) (2.1.5)\n", "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm_eval==0.4.2) (8.1.7)\n", "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.16.0->lm_eval==0.4.2) (2024.1)\n", "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm_eval==0.4.2) (1.3.0)\n" ] } ], "source": [ "# Install LM-Eval\n", "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "id": "JbpEeufJQnTr" }, "outputs": [], "source": [ "from lm_eval import api" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "id": "hgzFSI8hH59H" }, "outputs": [], "source": [ "import os\n", "\n", "HF_TOKEN = \"\" # generate a user access token from https://huggingface.co/settings/tokens and copy it here\n", "os.environ[\"HF_TOKEN\"] = HF_TOKEN" ] }, { "cell_type": "markdown", "metadata": { "id": "Knxt2sGYyBrY" }, "source": [ "# Configure Evaluation\n" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "id": "9WS47SmXyQyC" }, "outputs": [], "source": [ "YAML_boolq_string = \"\"\"\n", "task: demo_boolq\n", "dataset_path: super_glue\n", "dataset_name: boolq\n", "output_type: multiple_choice\n", "training_split: train\n", "validation_split: validation\n", "doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n", "doc_to_target: label\n", "doc_to_choice: [\"no\", \"yes\"]\n", "should_decontaminate: true\n", "doc_to_decontamination_query: passage\n", "metric_list:\n", " - metric: acc\n", " - metric: bleu\n", " - metric: f1\n", "\"\"\"\n", "with open(\"boolq.yaml\", \"w\") as f:\n", " f.write(YAML_boolq_string)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HEqYUlYvGuhd", "outputId": "fd36c9ca-fdc3-4567-cce7-6818f9ff69a8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-05-30 06:24:29.336227: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2024-05-30 06:24:29.336292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2024-05-30 06:24:29.338088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2024-05-30 06:24:30.997165: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "2024-05-30:06:24:35,343 INFO [__main__.py:254] Verbosity set to INFO\n", "2024-05-30:06:24:35,343 INFO [__main__.py:277] Including path: ./\n", "2024-05-30:06:24:43,787 WARNING [__main__.py:293] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", "2024-05-30:06:24:43,788 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']\n", "2024-05-30:06:24:43,790 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n", "2024-05-30:06:24:43,790 INFO [evaluator.py:178] Initializing hf model, with arguments: {'pretrained': 'EleutherAI/pythia-2.8b'}\n", "2024-05-30:06:24:43,812 INFO [huggingface.py:165] Using device 'cuda'\n", "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n", "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n", "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n", "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric bleu is defined, but aggregation is not. using default aggregation=bleu\n", "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric bleu is defined, but higher_is_better is not. using default higher_is_better=True\n", "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric f1 is defined, but aggregation is not. using default aggregation=f1\n", "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric f1 is defined, but higher_is_better is not. using default higher_is_better=True\n", "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1486: FutureWarning: The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", " warnings.warn(\n", "2024-05-30:06:25:06,006 INFO [task.py:398] Building contexts for demo_boolq on rank 0...\n", "100% 20/20 [00:00<00:00, 1266.87it/s]\n", "2024-05-30:06:25:06,024 INFO [evaluator.py:395] Running loglikelihood requests\n", "Running loglikelihood requests: 100% 40/40 [00:02<00:00, 14.95it/s]\n", "/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", " self.pid = os.fork()\n", "bootstrapping for stddev: f1_score\n", "100% 100/100 [01:59<00:00, 1.20s/it]\n", "fatal: not a git repository (or any of the parent directories): .git\n", "2024-05-30:06:27:09,982 INFO [evaluation_tracker.py:132] Saving results aggregated\n", "2024-05-30:06:27:09,983 INFO [evaluation_tracker.py:203] Saving samples results\n", "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (None), limit: 20.0, num_fewshot: None, batch_size: 1\n", "| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|\n", "|----------|-------|------|-----:|------|-----:|---|-----:|\n", "|demo_boolq|Yaml |none | 0|acc |0.7500|± |0.0993|\n", "| | |none | 0|f1 |0.8485|± |0.0690|\n", "\n" ] } ], "source": [ "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=EleutherAI/pythia-2.8b \\\n", " --include_path ./ \\\n", " --tasks demo_boolq \\\n", " --output output/ \\\n", " --limit 20 \\\n", " --log_samples" ] }, { "cell_type": "code", "execution_count": 22, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HDyMUJieyX-S", "outputId": "2307e7c9-fbcc-467e-8780-107924666e54" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "2024-05-30 06:27:14.929536: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", "2024-05-30 06:27:14.929584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", "2024-05-30 06:27:14.930843: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", "2024-05-30 06:27:16.588649: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", "2024-05-30:06:27:23,447 INFO [__main__.py:254] Verbosity set to INFO\n", "2024-05-30:06:27:23,447 INFO [__main__.py:277] Including path: ./\n", "2024-05-30:06:27:29,860 WARNING [__main__.py:293] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n", "2024-05-30:06:27:29,861 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']\n", "2024-05-30:06:27:29,863 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n", "2024-05-30:06:27:29,863 INFO [evaluator.py:178] Initializing hf model, with arguments: {'pretrained': 'mistralai/Mistral-7B-v0.1'}\n", "2024-05-30:06:27:29,885 INFO [huggingface.py:165] Using device 'cuda'\n", "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", " warnings.warn(\n", "Loading checkpoint shards: 100% 2/2 [01:01<00:00, 30.54s/it]\n", "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n", "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n", "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric bleu is defined, but aggregation is not. using default aggregation=bleu\n", "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric bleu is defined, but higher_is_better is not. using default higher_is_better=True\n", "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric f1 is defined, but aggregation is not. using default aggregation=f1\n", "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric f1 is defined, but higher_is_better is not. using default higher_is_better=True\n", "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1486: FutureWarning: The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue\n", "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n", "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n", " warnings.warn(\n", "2024-05-30:06:28:35,330 INFO [task.py:398] Building contexts for demo_boolq on rank 0...\n", "100% 20/20 [00:00<00:00, 1841.06it/s]\n", "2024-05-30:06:28:35,342 INFO [evaluator.py:395] Running loglikelihood requests\n", "Running loglikelihood requests: 100% 40/40 [00:22<00:00, 1.80it/s]\n", "/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n", " self.pid = os.fork()\n", "bootstrapping for stddev: f1_score\n", "100% 100/100 [02:00<00:00, 1.20s/it]\n", "fatal: not a git repository (or any of the parent directories): .git\n", "2024-05-30:06:30:59,045 INFO [evaluation_tracker.py:132] Saving results aggregated\n", "2024-05-30:06:30:59,046 INFO [evaluation_tracker.py:203] Saving samples results\n", "hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: 20.0, num_fewshot: None, batch_size: 1\n", "| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n", "|----------|-------|------|-----:|------|----:|---|-----:|\n", "|demo_boolq|Yaml |none | 0|acc |0.800|± |0.0918|\n", "| | |none | 0|f1 |0.875|± |0.0642|\n", "\n" ] } ], "source": [ "!lm_eval \\\n", " --model hf \\\n", " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n", " --include_path ./ \\\n", " --tasks demo_boolq \\\n", " --output output/ \\\n", " --limit 20 \\\n", " --log_samples" ] }, { "cell_type": "markdown", "metadata": { "id": "ivXfua4qLggD" }, "source": [ "# Convert to Analytics Platform JSON\n" ] }, { "cell_type": "markdown", "metadata": { "id": "qjyOXzRvMBQs" }, "source": [ "### Let's start with defining the `name`, `models`, and `metrics` we used in this demo\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "id": "LNGj4ncsLqVq" }, "outputs": [], "source": [ "name = \"LM Evaluation Harness Demo\"\n", "\n", "# models -> List[dict]\n", "models = [\n", " {\n", " \"model_id\": \"EleutherAI/pythia-2.8b\",\n", " \"name\": \"Pythia-2.9b\",\n", " \"owner\": \"EleutherAI\",\n", " },\n", " {\n", " \"model_id\": \"mistralai/Mistral-7B-v0.1\",\n", " \"name\": \"Mistral-7B-v0.1\",\n", " \"owner\": \"Mistral AI\",\n", " },\n", "]\n", "\n", "# metrics -> List[dict]\n", "all_metrics = [\n", " {\n", " \"name\": \"F1\",\n", " \"display_name\": \"F1\",\n", " \"description\": \"F1 score \",\n", " \"author\": \"algorithm\",\n", " \"type\": \"numerical\",\n", " \"aggregator\": \"average\",\n", " \"range\": [0, 1.0, 0.1],\n", " },\n", " {\n", " \"name\": \"Accuracy\",\n", " \"display_name\": \"Accuracy\",\n", " \"description\": \"Prediction accuracy\",\n", " \"author\": \"algorithm\",\n", " \"type\": \"numerical\",\n", " \"aggregator\": \"average\",\n", " \"range\": [0, 1.0, 0.1],\n", " },\n", "]" ] }, { "cell_type": "markdown", "metadata": { "id": "HntEhvugQt2Y" }, "source": [ "## Now let's define `tasks`, `documents`, and `evaluations`\n" ] }, { "cell_type": "code", "execution_count": 27, "metadata": { "id": "9yRse3PQQsxb" }, "outputs": [], "source": [ "import json\n", "\n", "outputs = []\n", "\n", "# modify output filepath for pythia-2.8b here\n", "with open(\n", " \"output/EleutherAI__pythia-2.8b/samples_demo_boolq_2024-05-30T02-24-44.249027.json\",\n", " \"r\",\n", ") as f:\n", " model_1_samples = json.load(f)\n", "\n", "# modify output filepath for Mistral-7B-v0.1 here\n", "with open(\n", " \"output/mistralai__Mistral-7B-v0.1/samples_demo_boolq_2024-05-30T02-28-34.024454.json\",\n", " \"r\",\n", ") as f:\n", " model_2_samples = json.load(f)\n", "\n", "all_tasks = []\n", "all_documents = []\n", "all_evaluations = []\n", "for model_1_sample, model_2_sample in zip(model_1_samples, model_2_samples):\n", " assert model_1_sample[\"doc_id\"] == model_2_sample[\"doc_id\"]\n", " doc_id = model_1_sample[\"doc_id\"]\n", " content_1 = model_1_sample.get(\"doc\")\n", " content_2 = model_2_sample.get(\"doc\")\n", " passage_text = content_1.get(\"passage\")\n", " document = {\"document_id\": f\"doc_{doc_id}\", \"text\": passage_text}\n", "\n", " all_documents.extend([document])\n", " instance = {\n", " \"task_id\": f\"{doc_id}\",\n", " \"task_type\": \"conversation\",\n", " \"contexts\": [{\"document_id\": document[\"document_id\"]}],\n", " \"input\": [{\"speaker\": \"user\", \"text\": f\"{model_1_sample['doc']['question']}\"}],\n", " \"targets\": [{\"text\": \"yes\" if model_1_sample[\"target\"] else \"no\"}],\n", " }\n", " all_tasks.append(instance)\n", "\n", " for i, pred in enumerate([model_1_sample, model_2_sample]):\n", " model_id = models[i][\"model_id\"]\n", " target = \"yes\" if pred[\"target\"] else \"no\"\n", " prediction = (\n", " \"no\"\n", " if pred[\"filtered_resps\"][0][0] > pred[\"filtered_resps\"][1][0]\n", " else \"yes\"\n", " )\n", " all_evaluations.append(\n", " {\n", " \"task_id\": f\"{doc_id}\",\n", " \"model_id\": model_id,\n", " \"model_response\": prediction,\n", " \"annotations\": {\n", " \"Accuracy\": {\n", " \"system\": {\n", " \"value\": 1 if prediction == target else 0,\n", " \"duration\": 0,\n", " }\n", " },\n", " \"F1\": {\n", " \"system\": {\n", " \"value\": 1 if prediction == target else 0,\n", " \"duration\": 0,\n", " }\n", " },\n", " },\n", " }\n", " )" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "NM_VZxEU5UiX", "outputId": "7cb16261-b0ed-49dd-e2e2-0a1974c17f9f" }, "outputs": [ { "data": { "text/plain": [ "(20, 20, 40)" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(all_tasks), len(all_documents), len(all_evaluations)" ] }, { "cell_type": "markdown", "metadata": { "id": "bekGOYtEcABN" }, "source": [ "## Now we can write the output to file and import it into our dashboard for analysis :D\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "id": "3tjuCibsYzG7" }, "outputs": [], "source": [ "import json\n", "\n", "output = {\n", " \"name\": name,\n", " \"models\": models,\n", " \"metrics\": all_metrics,\n", " \"documents\": all_documents,\n", " \"tasks\": all_tasks,\n", " \"evaluations\": all_evaluations,\n", "}\n", "\n", "with open(\n", " file=\"lm-eval-harness-inspectorraget-demo.json\", mode=\"w\", encoding=\"utf-8\"\n", ") as fp:\n", " json.dump(output, fp, indent=4)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "iIcWaE51cuAh" }, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": { "id": "8BEkotPhx-_w" }, "source": [] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "machine_shape": "hm", "provenance": [] }, "kernelspec": { "display_name": "Python 3", "name": "python3" }, "language_info": { "name": "python" } }, "nbformat": 4, "nbformat_minor": 0 }