Siva Sankalp committed on
Commit
2d6d7db
·
1 Parent(s): d23bbc5

feat: LM Eval Harness Demonstration (#3)

Browse files
Files changed (1) hide show
  1. notebooks/LM_Eval_Demonstration.ipynb +571 -0
notebooks/LM_Eval_Demonstration.ipynb ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 17,
6
+ "metadata": {
7
+ "colab": {
8
+ "base_uri": "https://localhost:8080/"
9
+ },
10
+ "collapsed": true,
11
+ "id": "MCiLSwoWQK7z",
12
+ "outputId": "5efbb6bc-0e2d-4df7-a5b2-36f4448960a8"
13
+ },
14
+ "outputs": [
15
+ {
16
+ "name": "stdout",
17
+ "output_type": "stream",
18
+ "text": [
19
+ "Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git\n",
20
+ " Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to /tmp/pip-req-build-j2xmmhxh\n",
21
+ " Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-j2xmmhxh\n",
22
+ " Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit b4cd85d406938f94ee5d451840a0d69bbda27006\n",
23
+ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
24
+ " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
25
+ " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
26
+ "Requirement already satisfied: accelerate>=0.21.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.30.1)\n",
27
+ "Requirement already satisfied: evaluate in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.4.2)\n",
28
+ "Requirement already satisfied: datasets>=2.16.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.19.1)\n",
29
+ "Requirement already satisfied: jsonlines in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (4.0.0)\n",
30
+ "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.10.0)\n",
31
+ "Requirement already satisfied: peft>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.11.1)\n",
32
+ "Requirement already satisfied: pybind11>=2.6.2 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.12.0)\n",
33
+ "Requirement already satisfied: pytablewriter in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.2.0)\n",
34
+ "Requirement already satisfied: rouge-score>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.1.2)\n",
35
+ "Requirement already satisfied: sacrebleu>=1.5.0 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.4.2)\n",
36
+ "Requirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.2.2)\n",
37
+ "Requirement already satisfied: sqlitedict in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.1.0)\n",
38
+ "Requirement already satisfied: torch>=1.8 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (2.3.0+cu121)\n",
39
+ "Requirement already satisfied: tqdm-multiprocess in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.0.11)\n",
40
+ "Requirement already satisfied: transformers>=4.1 in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (4.41.1)\n",
41
+ "Requirement already satisfied: zstandard in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.22.0)\n",
42
+ "Requirement already satisfied: dill in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (0.3.8)\n",
43
+ "Requirement already satisfied: word2number in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (1.1)\n",
44
+ "Requirement already satisfied: more-itertools in /usr/local/lib/python3.10/dist-packages (from lm_eval==0.4.2) (10.1.0)\n",
45
+ "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (1.25.2)\n",
46
+ "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (24.0)\n",
47
+ "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (5.9.5)\n",
48
+ "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (6.0.1)\n",
49
+ "Requirement already satisfied: huggingface-hub in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (0.23.1)\n",
50
+ "Requirement already satisfied: safetensors>=0.3.1 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.21.0->lm_eval==0.4.2) (0.4.3)\n",
51
+ "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.14.0)\n",
52
+ "Requirement already satisfied: pyarrow>=12.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (14.0.2)\n",
53
+ "Requirement already satisfied: pyarrow-hotfix in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (0.6)\n",
54
+ "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2.0.3)\n",
55
+ "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2.31.0)\n",
56
+ "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (4.66.4)\n",
57
+ "Requirement already satisfied: xxhash in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.4.1)\n",
58
+ "Requirement already satisfied: multiprocess in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (0.70.16)\n",
59
+ "Requirement already satisfied: fsspec[http]<=2024.3.1,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (2023.6.0)\n",
60
+ "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.16.0->lm_eval==0.4.2) (3.9.5)\n",
61
+ "Requirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (1.4.0)\n",
62
+ "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (3.8.1)\n",
63
+ "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm_eval==0.4.2) (1.16.0)\n",
64
+ "Requirement already satisfied: portalocker in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (2.8.2)\n",
65
+ "Requirement already satisfied: regex in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (2024.5.15)\n",
66
+ "Requirement already satisfied: tabulate>=0.8.9 in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (0.9.0)\n",
67
+ "Requirement already satisfied: colorama in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (0.4.6)\n",
68
+ "Requirement already satisfied: lxml in /usr/local/lib/python3.10/dist-packages (from sacrebleu>=1.5.0->lm_eval==0.4.2) (4.9.4)\n",
69
+ "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (1.11.4)\n",
70
+ "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (1.4.2)\n",
71
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm_eval==0.4.2) (3.5.0)\n",
72
+ "Requirement already satisfied: typing-extensions>=4.8.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (4.11.0)\n",
73
+ "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (1.12)\n",
74
+ "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (3.3)\n",
75
+ "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (3.1.4)\n",
76
+ "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n",
77
+ "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n",
78
+ "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n",
79
+ "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (8.9.2.26)\n",
80
+ "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.3.1)\n",
81
+ "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (11.0.2.54)\n",
82
+ "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (10.3.2.106)\n",
83
+ "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (11.4.5.107)\n",
84
+ "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.0.106)\n",
85
+ "Requirement already satisfied: nvidia-nccl-cu12==2.20.5 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (2.20.5)\n",
86
+ "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (12.1.105)\n",
87
+ "Requirement already satisfied: triton==2.3.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.8->lm_eval==0.4.2) (2.3.0)\n",
88
+ "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.8->lm_eval==0.4.2) (12.5.40)\n",
89
+ "Requirement already satisfied: tokenizers<0.20,>=0.19 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm_eval==0.4.2) (0.19.1)\n",
90
+ "Requirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm_eval==0.4.2) (23.2.0)\n",
91
+ "Requirement already satisfied: setuptools>=38.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (67.7.2)\n",
92
+ "Requirement already satisfied: DataProperty<2,>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.0.1)\n",
93
+ "Requirement already satisfied: mbstrdecoder<2,>=1.0.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.1.3)\n",
94
+ "Requirement already satisfied: pathvalidate<4,>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (3.2.0)\n",
95
+ "Requirement already satisfied: tabledata<2,>=1.3.1 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.3.3)\n",
96
+ "Requirement already satisfied: tcolorpy<1,>=0.0.5 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (0.1.6)\n",
97
+ "Requirement already satisfied: typepy[datetime]<2,>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from pytablewriter->lm_eval==0.4.2) (1.3.2)\n",
98
+ "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.3.1)\n",
99
+ "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.4.1)\n",
100
+ "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (6.0.5)\n",
101
+ "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (1.9.4)\n",
102
+ "Requirement already satisfied: async-timeout<5.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.16.0->lm_eval==0.4.2) (4.0.3)\n",
103
+ "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm_eval==0.4.2) (5.2.0)\n",
104
+ "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (3.3.2)\n",
105
+ "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (3.7)\n",
106
+ "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (2.0.7)\n",
107
+ "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.16.0->lm_eval==0.4.2) (2024.2.2)\n",
108
+ "Requirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.2) (2.8.2)\n",
109
+ "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm_eval==0.4.2) (2023.4)\n",
110
+ "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.8->lm_eval==0.4.2) (2.1.5)\n",
111
+ "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm_eval==0.4.2) (8.1.7)\n",
112
+ "Requirement already satisfied: tzdata>=2022.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets>=2.16.0->lm_eval==0.4.2) (2024.1)\n",
113
+ "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.8->lm_eval==0.4.2) (1.3.0)\n"
114
+ ]
115
+ }
116
+ ],
117
+ "source": [
118
+ "# Install LM-Eval\n",
119
+ "!pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 18,
125
+ "metadata": {
126
+ "id": "JbpEeufJQnTr"
127
+ },
128
+ "outputs": [],
129
+ "source": [
130
+ "from lm_eval import api"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 19,
136
+ "metadata": {
137
+ "id": "hgzFSI8hH59H"
138
+ },
139
+ "outputs": [],
140
+ "source": [
141
+ "import os\n",
142
+ "\n",
143
+ "HF_TOKEN = \"\" # generate a user access token from https://huggingface.co/settings/tokens and copy it here\n",
144
+ "os.environ[\"HF_TOKEN\"] = HF_TOKEN"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "markdown",
149
+ "metadata": {
150
+ "id": "Knxt2sGYyBrY"
151
+ },
152
+ "source": [
153
+ "# Configure Evaluation\n"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": 20,
159
+ "metadata": {
160
+ "id": "9WS47SmXyQyC"
161
+ },
162
+ "outputs": [],
163
+ "source": [
164
+ "YAML_boolq_string = \"\"\"\n",
165
+ "task: demo_boolq\n",
166
+ "dataset_path: super_glue\n",
167
+ "dataset_name: boolq\n",
168
+ "output_type: multiple_choice\n",
169
+ "training_split: train\n",
170
+ "validation_split: validation\n",
171
+ "doc_to_text: \"{{passage}}\\nQuestion: {{question}}?\\nAnswer:\"\n",
172
+ "doc_to_target: label\n",
173
+ "doc_to_choice: [\"no\", \"yes\"]\n",
174
+ "should_decontaminate: true\n",
175
+ "doc_to_decontamination_query: passage\n",
176
+ "metric_list:\n",
177
+ " - metric: acc\n",
178
+ " - metric: bleu\n",
179
+ " - metric: f1\n",
180
+ "\"\"\"\n",
181
+ "with open(\"boolq.yaml\", \"w\") as f:\n",
182
+ " f.write(YAML_boolq_string)"
183
+ ]
184
+ },
185
+ {
186
+ "cell_type": "code",
187
+ "execution_count": 21,
188
+ "metadata": {
189
+ "colab": {
190
+ "base_uri": "https://localhost:8080/"
191
+ },
192
+ "id": "HEqYUlYvGuhd",
193
+ "outputId": "fd36c9ca-fdc3-4567-cce7-6818f9ff69a8"
194
+ },
195
+ "outputs": [
196
+ {
197
+ "name": "stdout",
198
+ "output_type": "stream",
199
+ "text": [
200
+ "2024-05-30 06:24:29.336227: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
201
+ "2024-05-30 06:24:29.336292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
202
+ "2024-05-30 06:24:29.338088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
203
+ "2024-05-30 06:24:30.997165: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
204
+ "2024-05-30:06:24:35,343 INFO [__main__.py:254] Verbosity set to INFO\n",
205
+ "2024-05-30:06:24:35,343 INFO [__main__.py:277] Including path: ./\n",
206
+ "2024-05-30:06:24:43,787 WARNING [__main__.py:293] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
207
+ "2024-05-30:06:24:43,788 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']\n",
208
+ "2024-05-30:06:24:43,790 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
209
+ "2024-05-30:06:24:43,790 INFO [evaluator.py:178] Initializing hf model, with arguments: {'pretrained': 'EleutherAI/pythia-2.8b'}\n",
210
+ "2024-05-30:06:24:43,812 INFO [huggingface.py:165] Using device 'cuda'\n",
211
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
212
+ " warnings.warn(\n",
213
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
214
+ "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
215
+ "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
216
+ "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric bleu is defined, but aggregation is not. using default aggregation=bleu\n",
217
+ "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric bleu is defined, but higher_is_better is not. using default higher_is_better=True\n",
218
+ "2024-05-30:06:25:03,269 WARNING [task.py:774] [Task: demo_boolq] metric f1 is defined, but aggregation is not. using default aggregation=f1\n",
219
+ "2024-05-30:06:25:03,269 WARNING [task.py:786] [Task: demo_boolq] metric f1 is defined, but higher_is_better is not. using default higher_is_better=True\n",
220
+ "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1486: FutureWarning: The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue\n",
221
+ "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
222
+ "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
223
+ " warnings.warn(\n",
224
+ "2024-05-30:06:25:06,006 INFO [task.py:398] Building contexts for demo_boolq on rank 0...\n",
225
+ "100% 20/20 [00:00<00:00, 1266.87it/s]\n",
226
+ "2024-05-30:06:25:06,024 INFO [evaluator.py:395] Running loglikelihood requests\n",
227
+ "Running loglikelihood requests: 100% 40/40 [00:02<00:00, 14.95it/s]\n",
228
+ "/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
229
+ " self.pid = os.fork()\n",
230
+ "bootstrapping for stddev: f1_score\n",
231
+ "100% 100/100 [01:59<00:00, 1.20s/it]\n",
232
+ "fatal: not a git repository (or any of the parent directories): .git\n",
233
+ "2024-05-30:06:27:09,982 INFO [evaluation_tracker.py:132] Saving results aggregated\n",
234
+ "2024-05-30:06:27:09,983 INFO [evaluation_tracker.py:203] Saving samples results\n",
235
+ "hf (pretrained=EleutherAI/pythia-2.8b), gen_kwargs: (None), limit: 20.0, num_fewshot: None, batch_size: 1\n",
236
+ "| Tasks |Version|Filter|n-shot|Metric|Value | |Stderr|\n",
237
+ "|----------|-------|------|-----:|------|-----:|---|-----:|\n",
238
+ "|demo_boolq|Yaml |none | 0|acc |0.7500|± |0.0993|\n",
239
+ "| | |none | 0|f1 |0.8485|± |0.0690|\n",
240
+ "\n"
241
+ ]
242
+ }
243
+ ],
244
+ "source": [
245
+ "!lm_eval \\\n",
246
+ " --model hf \\\n",
247
+ " --model_args pretrained=EleutherAI/pythia-2.8b \\\n",
248
+ " --include_path ./ \\\n",
249
+ " --tasks demo_boolq \\\n",
250
+ " --output output/ \\\n",
251
+ " --limit 20 \\\n",
252
+ " --log_samples"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 22,
258
+ "metadata": {
259
+ "colab": {
260
+ "base_uri": "https://localhost:8080/"
261
+ },
262
+ "id": "HDyMUJieyX-S",
263
+ "outputId": "2307e7c9-fbcc-467e-8780-107924666e54"
264
+ },
265
+ "outputs": [
266
+ {
267
+ "name": "stdout",
268
+ "output_type": "stream",
269
+ "text": [
270
+ "2024-05-30 06:27:14.929536: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
271
+ "2024-05-30 06:27:14.929584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
272
+ "2024-05-30 06:27:14.930843: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
273
+ "2024-05-30 06:27:16.588649: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n",
274
+ "2024-05-30:06:27:23,447 INFO [__main__.py:254] Verbosity set to INFO\n",
275
+ "2024-05-30:06:27:23,447 INFO [__main__.py:277] Including path: ./\n",
276
+ "2024-05-30:06:27:29,860 WARNING [__main__.py:293] --limit SHOULD ONLY BE USED FOR TESTING.REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT.\n",
277
+ "2024-05-30:06:27:29,861 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']\n",
278
+ "2024-05-30:06:27:29,863 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234\n",
279
+ "2024-05-30:06:27:29,863 INFO [evaluator.py:178] Initializing hf model, with arguments: {'pretrained': 'mistralai/Mistral-7B-v0.1'}\n",
280
+ "2024-05-30:06:27:29,885 INFO [huggingface.py:165] Using device 'cuda'\n",
281
+ "/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
282
+ " warnings.warn(\n",
283
+ "Loading checkpoint shards: 100% 2/2 [01:01<00:00, 30.54s/it]\n",
284
+ "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric acc is defined, but aggregation is not. using default aggregation=mean\n",
285
+ "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric acc is defined, but higher_is_better is not. using default higher_is_better=True\n",
286
+ "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric bleu is defined, but aggregation is not. using default aggregation=bleu\n",
287
+ "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric bleu is defined, but higher_is_better is not. using default higher_is_better=True\n",
288
+ "2024-05-30:06:28:33,160 WARNING [task.py:774] [Task: demo_boolq] metric f1 is defined, but aggregation is not. using default aggregation=f1\n",
289
+ "2024-05-30:06:28:33,160 WARNING [task.py:786] [Task: demo_boolq] metric f1 is defined, but higher_is_better is not. using default higher_is_better=True\n",
290
+ "/usr/local/lib/python3.10/dist-packages/datasets/load.py:1486: FutureWarning: The repository for super_glue contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/super_glue\n",
291
+ "You can avoid this message in future by passing the argument `trust_remote_code=True`.\n",
292
+ "Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.\n",
293
+ " warnings.warn(\n",
294
+ "2024-05-30:06:28:35,330 INFO [task.py:398] Building contexts for demo_boolq on rank 0...\n",
295
+ "100% 20/20 [00:00<00:00, 1841.06it/s]\n",
296
+ "2024-05-30:06:28:35,342 INFO [evaluator.py:395] Running loglikelihood requests\n",
297
+ "Running loglikelihood requests: 100% 40/40 [00:22<00:00, 1.80it/s]\n",
298
+ "/usr/lib/python3.10/multiprocessing/popen_fork.py:66: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
299
+ " self.pid = os.fork()\n",
300
+ "bootstrapping for stddev: f1_score\n",
301
+ "100% 100/100 [02:00<00:00, 1.20s/it]\n",
302
+ "fatal: not a git repository (or any of the parent directories): .git\n",
303
+ "2024-05-30:06:30:59,045 INFO [evaluation_tracker.py:132] Saving results aggregated\n",
304
+ "2024-05-30:06:30:59,046 INFO [evaluation_tracker.py:203] Saving samples results\n",
305
+ "hf (pretrained=mistralai/Mistral-7B-v0.1), gen_kwargs: (None), limit: 20.0, num_fewshot: None, batch_size: 1\n",
306
+ "| Tasks |Version|Filter|n-shot|Metric|Value| |Stderr|\n",
307
+ "|----------|-------|------|-----:|------|----:|---|-----:|\n",
308
+ "|demo_boolq|Yaml |none | 0|acc |0.800|± |0.0918|\n",
309
+ "| | |none | 0|f1 |0.875|± |0.0642|\n",
310
+ "\n"
311
+ ]
312
+ }
313
+ ],
314
+ "source": [
315
+ "!lm_eval \\\n",
316
+ " --model hf \\\n",
317
+ " --model_args pretrained=mistralai/Mistral-7B-v0.1 \\\n",
318
+ " --include_path ./ \\\n",
319
+ " --tasks demo_boolq \\\n",
320
+ " --output output/ \\\n",
321
+ " --limit 20 \\\n",
322
+ " --log_samples"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "markdown",
327
+ "metadata": {
328
+ "id": "ivXfua4qLggD"
329
+ },
330
+ "source": [
331
+ "# Convert to Analytics Platform JSON\n"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "markdown",
336
+ "metadata": {
337
+ "id": "qjyOXzRvMBQs"
338
+ },
339
+ "source": [
340
+ "## Let's start by defining the `name`, `models`, and `metrics` used in this demo\n"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "code",
345
+ "execution_count": 24,
346
+ "metadata": {
347
+ "id": "LNGj4ncsLqVq"
348
+ },
349
+ "outputs": [],
350
+ "source": [
351
+ "name = \"LM Evaluation Harness Demo\"\n",
352
+ "\n",
353
+ "# models -> List[dict]\n",
354
+ "models = [\n",
355
+ " {\n",
356
+ " \"model_id\": \"EleutherAI/pythia-2.8b\",\n",
357
+ " \"name\": \"Pythia-2.8b\",\n",
358
+ " \"owner\": \"EleutherAI\",\n",
359
+ " },\n",
360
+ " {\n",
361
+ " \"model_id\": \"mistralai/Mistral-7B-v0.1\",\n",
362
+ " \"name\": \"Mistral-7B-v0.1\",\n",
363
+ " \"owner\": \"Mistral AI\",\n",
364
+ " },\n",
365
+ "]\n",
366
+ "\n",
367
+ "# metrics -> List[dict]\n",
368
+ "all_metrics = [\n",
369
+ " {\n",
370
+ " \"name\": \"F1\",\n",
371
+ " \"display_name\": \"F1\",\n",
372
+ " \"description\": \"F1 score \",\n",
373
+ " \"author\": \"algorithm\",\n",
374
+ " \"type\": \"numerical\",\n",
375
+ " \"aggregator\": \"average\",\n",
376
+ " \"range\": [0, 1.0, 0.1],\n",
377
+ " },\n",
378
+ " {\n",
379
+ " \"name\": \"Accuracy\",\n",
380
+ " \"display_name\": \"Accuracy\",\n",
381
+ " \"description\": \"Prediction accuracy\",\n",
382
+ " \"author\": \"algorithm\",\n",
383
+ " \"type\": \"numerical\",\n",
384
+ " \"aggregator\": \"average\",\n",
385
+ " \"range\": [0, 1.0, 0.1],\n",
386
+ " },\n",
387
+ "]"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "markdown",
392
+ "metadata": {
393
+ "id": "HntEhvugQt2Y"
394
+ },
395
+ "source": [
396
+ "## Now let's define `tasks`, `documents`, and `evaluations`\n"
397
+ ]
398
+ },
399
+ {
400
+ "cell_type": "code",
401
+ "execution_count": 27,
402
+ "metadata": {
403
+ "id": "9yRse3PQQsxb"
404
+ },
405
+ "outputs": [],
406
+ "source": [
407
+ "import json\n",
408
+ "\n",
409
+ "outputs = []\n",
410
+ "\n",
411
+ "# modify output filepath for pythia-2.8b here\n",
412
+ "with open(\n",
413
+ " \"output/EleutherAI__pythia-2.8b/samples_demo_boolq_2024-05-30T02-24-44.249027.json\",\n",
414
+ " \"r\",\n",
415
+ ") as f:\n",
416
+ " model_1_samples = json.load(f)\n",
417
+ "\n",
418
+ "# modify output filepath for Mistral-7B-v0.1 here\n",
419
+ "with open(\n",
420
+ " \"output/mistralai__Mistral-7B-v0.1/samples_demo_boolq_2024-05-30T02-28-34.024454.json\",\n",
421
+ " \"r\",\n",
422
+ ") as f:\n",
423
+ " model_2_samples = json.load(f)\n",
424
+ "\n",
425
+ "all_tasks = []\n",
426
+ "all_documents = []\n",
427
+ "all_evaluations = []\n",
428
+ "for model_1_sample, model_2_sample in zip(model_1_samples, model_2_samples):\n",
429
+ " assert model_1_sample[\"doc_id\"] == model_2_sample[\"doc_id\"]\n",
430
+ " doc_id = model_1_sample[\"doc_id\"]\n",
431
+ " content_1 = model_1_sample.get(\"doc\")\n",
432
+ " content_2 = model_2_sample.get(\"doc\")\n",
433
+ " passage_text = content_1.get(\"passage\")\n",
434
+ " document = {\"document_id\": f\"doc_{doc_id}\", \"text\": passage_text}\n",
435
+ "\n",
436
+ " all_documents.extend([document])\n",
437
+ " instance = {\n",
438
+ " \"task_id\": f\"{doc_id}\",\n",
439
+ " \"task_type\": \"conversation\",\n",
440
+ " \"contexts\": [{\"document_id\": document[\"document_id\"]}],\n",
441
+ " \"input\": [{\"speaker\": \"user\", \"text\": f\"{model_1_sample['doc']['question']}\"}],\n",
442
+ " \"targets\": [{\"text\": \"yes\" if model_1_sample[\"target\"] else \"no\"}],\n",
443
+ " }\n",
444
+ " all_tasks.append(instance)\n",
445
+ "\n",
446
+ " for i, pred in enumerate([model_1_sample, model_2_sample]):\n",
447
+ " model_id = models[i][\"model_id\"]\n",
448
+ " target = \"yes\" if pred[\"target\"] else \"no\"\n",
449
+ " prediction = (\n",
450
+ " \"no\"\n",
451
+ " if pred[\"filtered_resps\"][0][0] > pred[\"filtered_resps\"][1][0]\n",
452
+ " else \"yes\"\n",
453
+ " )\n",
454
+ " all_evaluations.append(\n",
455
+ " {\n",
456
+ " \"task_id\": f\"{doc_id}\",\n",
457
+ " \"model_id\": model_id,\n",
458
+ " \"model_response\": prediction,\n",
459
+ " \"annotations\": {\n",
460
+ " \"Accuracy\": {\n",
461
+ " \"system\": {\n",
462
+ " \"value\": 1 if prediction == target else 0,\n",
463
+ " \"duration\": 0,\n",
464
+ " }\n",
465
+ " },\n",
466
+ " \"F1\": {\n",
467
+ " \"system\": {\n",
468
+ " \"value\": 1 if prediction == target else 0,\n",
469
+ " \"duration\": 0,\n",
470
+ " }\n",
471
+ " },\n",
472
+ " },\n",
473
+ " }\n",
474
+ " )"
475
+ ]
476
+ },
477
+ {
478
+ "cell_type": "code",
479
+ "execution_count": 29,
480
+ "metadata": {
481
+ "colab": {
482
+ "base_uri": "https://localhost:8080/"
483
+ },
484
+ "id": "NM_VZxEU5UiX",
485
+ "outputId": "7cb16261-b0ed-49dd-e2e2-0a1974c17f9f"
486
+ },
487
+ "outputs": [
488
+ {
489
+ "data": {
490
+ "text/plain": [
491
+ "(20, 20, 40)"
492
+ ]
493
+ },
494
+ "execution_count": 29,
495
+ "metadata": {},
496
+ "output_type": "execute_result"
497
+ }
498
+ ],
499
+ "source": [
500
+ "len(all_tasks), len(all_documents), len(all_evaluations)"
501
+ ]
502
+ },
503
+ {
504
+ "cell_type": "markdown",
505
+ "metadata": {
506
+ "id": "bekGOYtEcABN"
507
+ },
508
+ "source": [
509
+ "## Now we can write the output to file and import it into our dashboard for analysis :D\n"
510
+ ]
511
+ },
512
+ {
513
+ "cell_type": "code",
514
+ "execution_count": 30,
515
+ "metadata": {
516
+ "id": "3tjuCibsYzG7"
517
+ },
518
+ "outputs": [],
519
+ "source": [
520
+ "import json\n",
521
+ "\n",
522
+ "output = {\n",
523
+ " \"name\": name,\n",
524
+ " \"models\": models,\n",
525
+ " \"metrics\": all_metrics,\n",
526
+ " \"documents\": all_documents,\n",
527
+ " \"tasks\": all_tasks,\n",
528
+ " \"evaluations\": all_evaluations,\n",
529
+ "}\n",
530
+ "\n",
531
+ "with open(\n",
532
+ " file=\"lm-eval-harness-inspectorraget-demo.json\", mode=\"w\", encoding=\"utf-8\"\n",
533
+ ") as fp:\n",
534
+ " json.dump(output, fp, indent=4)"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": null,
540
+ "metadata": {
541
+ "id": "iIcWaE51cuAh"
542
+ },
543
+ "outputs": [],
544
+ "source": []
545
+ },
546
+ {
547
+ "cell_type": "markdown",
548
+ "metadata": {
549
+ "id": "8BEkotPhx-_w"
550
+ },
551
+ "source": []
552
+ }
553
+ ],
554
+ "metadata": {
555
+ "accelerator": "GPU",
556
+ "colab": {
557
+ "gpuType": "T4",
558
+ "machine_shape": "hm",
559
+ "provenance": []
560
+ },
561
+ "kernelspec": {
562
+ "display_name": "Python 3",
563
+ "name": "python3"
564
+ },
565
+ "language_info": {
566
+ "name": "python"
567
+ }
568
+ },
569
+ "nbformat": 4,
570
+ "nbformat_minor": 0
571
+ }