In [17]:
# Install LM-Eval, pinned to the commit this notebook was last executed against
# so the CLI flags and the samples-file layout read later stay reproducible.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@b4cd85d406938f94ee5d451840a0d69bbda27006

Collecting git+https://github.com/EleutherAI/lm-evaluation-harness.git
 Cloning https://github.com/EleutherAI/lm-evaluation-harness.git to /tmp/pip-req-build-j2xmmhxh
 Running command git clone --filter=blob:none --quiet https://github.com/EleutherAI/lm-evaluation-harness.git /tmp/pip-req-build-j2xmmhxh
 Resolved https://github.com/EleutherAI/lm-evaluation-harness.git to commit b4cd85d406938f94ee5d451840a0d69bbda27006
 Installing build dependencies ... [?25l[?25hdone
 Getting requirements to build wheel ... [?25l[?25hdone
 Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [18]:
from lm_eval import api

In [19]:
import os

# Paste a user access token generated at https://huggingface.co/settings/tokens
# here, or leave it empty to reuse an HF_TOKEN already exported in the environment.
HF_TOKEN = ""

if HF_TOKEN:
    os.environ["HF_TOKEN"] = HF_TOKEN
else:
    # Do not export the empty placeholder: that would overwrite a token the
    # user already configured outside the notebook.
    HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Configure Evaluation


In [20]:
# Task definition for the demo BoolQ evaluation, written to boolq.yaml below.
#
# The literal is a *raw* string on purpose: `doc_to_text` must reach the YAML
# file with the two-character escape `\n`, which the YAML double-quoted scalar
# then decodes as a newline. In a regular Python string the `\n` would become
# a real line break inside the quotes, and YAML folds such a break into a
# single space, silently removing the newlines from the prompt.
#
# NOTE(review): `bleu` is normally a text-generation metric; confirm it is
# meaningful for a `multiple_choice` task before relying on it.
YAML_boolq_string = r"""
task: demo_boolq
dataset_path: super_glue
dataset_name: boolq
output_type: multiple_choice
training_split: train
validation_split: validation
doc_to_text: "{{passage}}\nQuestion: {{question}}?\nAnswer:"
doc_to_target: label
doc_to_choice: ["no", "yes"]
should_decontaminate: true
doc_to_decontamination_query: passage
metric_list:
 - metric: acc
 - metric: bleu
 - metric: f1
"""
# Explicit encoding keeps the written file independent of the platform default.
with open("boolq.yaml", "w", encoding="utf-8") as f:
    f.write(YAML_boolq_string)

In [21]:
# Evaluate EleutherAI/pythia-2.8b on demo_boolq (limited to 20 documents) and
# log per-sample results under output/.
# --output_path is written out in full instead of relying on `--output` being
# accepted as an unambiguous prefix abbreviation.
!lm_eval \
    --model hf \
    --model_args pretrained=EleutherAI/pythia-2.8b \
    --include_path ./ \
    --tasks demo_boolq \
    --output_path output/ \
    --limit 20 \
    --log_samples

2024-05-30 06:24:29.336227: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 06:24:29.336292: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 06:24:29.338088: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-30:06:24:35,343 INFO [__main__.py:254] Verbosity set to INFO
2024-05-30:06:24:35,343 INFO [__main__.py:277] Including path: ./
2024-05-30:06:24:43,788 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']
2024-05-30:06:24:43,790 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2024-05-30:06:

In [22]:
# Evaluate mistralai/Mistral-7B-v0.1 on demo_boolq (limited to 20 documents)
# and log per-sample results under output/.
# --output_path is written out in full instead of relying on `--output` being
# accepted as an unambiguous prefix abbreviation.
!lm_eval \
    --model hf \
    --model_args pretrained=mistralai/Mistral-7B-v0.1 \
    --include_path ./ \
    --tasks demo_boolq \
    --output_path output/ \
    --limit 20 \
    --log_samples

2024-05-30 06:27:14.929536: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-30 06:27:14.929584: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-30 06:27:14.930843: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-30:06:27:23,447 INFO [__main__.py:254] Verbosity set to INFO
2024-05-30:06:27:23,447 INFO [__main__.py:277] Including path: ./
2024-05-30:06:27:29,861 INFO [__main__.py:344] Selected Tasks: ['demo_boolq']
2024-05-30:06:27:29,863 INFO [evaluator.py:141] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234
2024-05-30:06:

# Convert to Analytics Platform JSON


### Let's start by defining the `name`, `models`, and `metrics` used in this demo


In [24]:
# Title of the experiment as stored in the exported JSON.
name = "LM Evaluation Harness Demo"

# models -> List[dict]
# One entry per evaluated model, in the order the models were run above.
# The display name of the first entry now agrees with its model_id
# (pythia-2.8b); it previously read "Pythia-2.9b".
models = [
    {
        "model_id": "EleutherAI/pythia-2.8b",
        "name": "Pythia-2.8b",
        "owner": "EleutherAI",
    },
    {
        "model_id": "mistralai/Mistral-7B-v0.1",
        "name": "Mistral-7B-v0.1",
        "owner": "Mistral AI",
    },
]

# metrics -> List[dict]
# Both metrics share the same author/type/aggregator/range settings, so the
# entries are generated from (label, description) pairs. Each entry gets its
# own `range` list.
all_metrics = [
    {
        "name": label,
        "display_name": label,
        "description": blurb,
        "author": "algorithm",
        "type": "numerical",
        "aggregator": "average",
        "range": [0, 1.0, 0.1],
    }
    for label, blurb in (
        ("F1", "F1 score "),
        ("Accuracy", "Prediction accuracy"),
    )
]

## Now let's define `tasks`, `documents`, and `evaluations`


In [27]:
import json
from pathlib import Path

# Not referenced by any cell visible in this notebook; kept so that anything
# relying on the name continues to work.
outputs = []


def _load_latest_samples(model_dir, task_name="demo_boolq"):
    """Load the most recent per-sample log for `task_name` from `model_dir`.

    The sample files carry a run timestamp in their name
    (``samples_<task>_<YYYY-MM-DDTHH-MM-SS.ffffff>.json``), which changes on
    every evaluation run. Discovering the file avoids editing a hard-coded
    path after each re-run; the timestamp format sorts chronologically as
    plain text, so the last name in sorted order is the newest run.

    Args:
        model_dir: Directory holding the sample files of one model.
        task_name: Task whose samples should be loaded.

    Returns:
        The parsed JSON content of the newest matching file.

    Raises:
        FileNotFoundError: If no matching samples file exists in `model_dir`.
    """
    candidates = sorted(Path(model_dir).glob(f"samples_{task_name}_*.json"))
    if not candidates:
        raise FileNotFoundError(
            f"No samples_{task_name}_*.json file found in {model_dir!r}; "
            "run the corresponding evaluation with --log_samples first."
        )
    with candidates[-1].open("r", encoding="utf-8") as f:
        return json.load(f)


# modify output directory for pythia-2.8b here
model_1_samples = _load_latest_samples("output/EleutherAI__pythia-2.8b")

# modify output directory for Mistral-7B-v0.1 here
model_2_samples = _load_latest_samples("output/mistralai__Mistral-7B-v0.1")

# Answer strings by choice index; mirrors `doc_to_choice: ["no", "yes"]` in the
# task YAML so that both the gold label and the best-scoring option can be
# mapped to text by indexing.
CHOICES = ["no", "yes"]

all_tasks = []
all_documents = []
all_evaluations = []

# zip() stops at the shorter input, so a length mismatch would otherwise drop
# samples without raising.
assert len(model_1_samples) == len(model_2_samples), (
    "the two sample files contain a different number of documents"
)

for model_1_sample, model_2_sample in zip(model_1_samples, model_2_samples):
    # Both runs must refer to the same document to be compared side by side.
    assert model_1_sample["doc_id"] == model_2_sample["doc_id"]
    doc_id = model_1_sample["doc_id"]

    # The doc_ids match, so the document is taken from the first model's sample.
    doc = model_1_sample["doc"]
    document = {"document_id": f"doc_{doc_id}", "text": doc.get("passage")}
    all_documents.append(document)

    all_tasks.append(
        {
            "task_id": f"{doc_id}",
            "task_type": "conversation",
            "contexts": [{"document_id": document["document_id"]}],
            "input": [{"speaker": "user", "text": f"{doc['question']}"}],
            # int() accepts 0/1, False/True and "0"/"1"; plain truthiness
            # would map the string "0" to "yes".
            "targets": [{"text": CHOICES[int(model_1_sample["target"])]}],
        }
    )

    # `models` is ordered like (model_1_sample, model_2_sample).
    for i, sample in enumerate([model_1_sample, model_2_sample]):
        model_id = models[i]["model_id"]
        target = CHOICES[int(sample["target"])]

        # The first element of each filtered response is used as that choice's
        # score (presumably its log-likelihood -- TODO confirm). list.index
        # returns the first maximum, so a tie resolves to the earlier choice.
        scores = [resp[0] for resp in sample["filtered_resps"]]
        prediction = CHOICES[scores.index(max(scores))]
        is_correct = 1 if prediction == target else 0

        all_evaluations.append(
            {
                "task_id": f"{doc_id}",
                "model_id": model_id,
                "model_response": prediction,
                "annotations": {
                    "Accuracy": {
                        "system": {
                            "value": is_correct,
                            "duration": 0,
                        }
                    },
                    # NOTE(review): per instance this is the same 0/1 value as
                    # Accuracy; averaging it does not yield a corpus-level F1.
                    "F1": {
                        "system": {
                            "value": is_correct,
                            "duration": 0,
                        }
                    },
                },
            }
        )

In [29]:
# Sanity check: number of tasks, documents, and evaluations built above.
tuple(len(records) for records in (all_tasks, all_documents, all_evaluations))

(20, 20, 40)

## Now we can write the output to file and import it into our dashboard for analysis :D


In [30]:
import json

# Bundle every section built in the previous cells into one JSON document.
output = dict(
    name=name,
    models=models,
    metrics=all_metrics,
    documents=all_documents,
    tasks=all_tasks,
    evaluations=all_evaluations,
)

# Serialize with a 4-space indent and write the result as UTF-8 text.
with open("lm-eval-harness-inspectorraget-demo.json", "w", encoding="utf-8") as fp:
    fp.write(json.dumps(output, indent=4))