File size: 2,023 Bytes
894c4b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6c79b12
894c4b4
 
 
 
 
 
90dff75
 
bcdca08
73d1e6e
bcdca08
 
 
73d1e6e
bcdca08
23a137b
 
73d1e6e
e598f52
 
73d1e6e
e598f52
894c4b4
7e267bf
 
 
5999035
f21645c
9aa52c9
 
53c755d
39b4e9f
62679c8
 
21eac98
 
62ea587
a117804
6c79b12
894c4b4
b1a5839
894c4b4
 
 
7e68bad
894c4b4
f9d415e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import os

import torch

from dataclasses import dataclass
from enum import Enum

from src.envs import CACHE_PATH


@dataclass
class Task:
    """Plain record describing a single evaluation task.

    The four fields are stored exactly as passed in; the dataclass adds no
    validation or conversion of its own.
    """

    # Key identifying the task (e.g. "nq_open").
    benchmark: str
    # Key identifying the metric reported for the task (e.g. "em").
    metric: str
    # Human-readable name used for display (e.g. "NQ Open").
    col_name: str
    # Few-shot setting attached to the task (e.g. 64).
    num_fewshot: int


class Tasks(Enum):
    """Enumeration of the evaluation tasks known to this module.

    Every member's value is a ``Task`` built from four pieces of information:

    * ``benchmark``   -- task key in the json file
    * ``metric``      -- metric key in the json file
    * ``col_name``    -- name to display in the leaderboard
    * ``num_fewshot`` -- few-shot setting used for the task

    Member names are numbered ``task0`` .. ``task18``; ``task2`` is absent
    because the TruthfulQA generation task is currently disabled.
    """

    # --- Open-domain QA: 64 shots, as in the ATLAS paper ---------------------
    task0 = Task(benchmark="nq_open", metric="em", col_name="NQ Open", num_fewshot=64)
    task1 = Task(benchmark="triviaqa", metric="em", col_name="TriviaQA", num_fewshot=64)

    # --- TruthfulQA ----------------------------------------------------------
    # TruthfulQA is intended as a zero-shot benchmark [5, 47].
    # https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # Disabled: task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    task3 = Task(benchmark="truthfulqa_mc1", metric="acc", col_name="TruthfulQA MC1", num_fewshot=0)
    task4 = Task(benchmark="truthfulqa_mc2", metric="acc", col_name="TruthfulQA MC2", num_fewshot=0)

    # --- HaluEval ------------------------------------------------------------
    task5 = Task(benchmark="halueval_qa", metric="acc", col_name="HaluEval QA", num_fewshot=0)
    task6 = Task(benchmark="halueval_dialogue", metric="acc", col_name="HaluEval Dialogue", num_fewshot=0)
    task7 = Task(
        benchmark="halueval_summarization",
        metric="acc",
        col_name="HaluEval Summarization",
        num_fewshot=0,
    )

    # --- Summarization -------------------------------------------------------
    task8 = Task(benchmark="xsum", metric="rougeL", col_name="XSum", num_fewshot=2)
    task9 = Task(benchmark="cnndm", metric="rougeL", col_name="CNN/DM", num_fewshot=2)

    # --- memo-trap -----------------------------------------------------------
    task10 = Task(benchmark="memo-trap", metric="acc", col_name="memo-trap", num_fewshot=0)

    # --- 8-shot variants of the open-domain QA tasks -------------------------
    task11 = Task(benchmark="nq8", metric="em", col_name="NQ Open 8", num_fewshot=8)
    task12 = Task(benchmark="tqa8", metric="em", col_name="TriviaQA 8", num_fewshot=8)

    # --- Remaining tasks -----------------------------------------------------
    task13 = Task(benchmark="ifeval", metric="prompt_level_strict_acc", col_name="IFEval", num_fewshot=0)
    task14 = Task(benchmark="selfcheckgpt", metric="max-selfcheckgpt", col_name="SelfCheckGPT", num_fewshot=0)
    task15 = Task(benchmark="fever10", metric="acc", col_name="FEVER", num_fewshot=16)
    task16 = Task(benchmark="squadv2", metric="exact", col_name="SQuADv2", num_fewshot=4)
    task17 = Task(benchmark="truefalse_cieacf", metric="acc", col_name="TrueFalse", num_fewshot=8)
    task18 = Task(benchmark="faithdial_hallu", metric="acc", col_name="FaithDial", num_fewshot=8)

# NUM_FEWSHOT = 64  # Change with your few shot


# Backend paths for the evaluation queue and the evaluation results; both are
# built by joining a fixed sub-path onto CACHE_PATH.
EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

# Device string: "cuda" when torch reports CUDA as available, otherwise "cpu".
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Testing; needs to be None
LIMIT = None