Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
File size: 3,625 Bytes
f766ce9 9134169 f766ce9 8b7a945 f766ce9 8b7a945 3b83af7 f000c74 3b83af7 1a2dba5 2edd122 df659d0 9400714 3b83af7 df659d0 9134169 8b7a945 df659d0 8b7a945 9134169 df659d0 9134169 1a2dba5 df659d0 1a2dba5 df659d0 1a2dba5 9134169 3b83af7 9134169 2edd122 6d7eea4 df659d0 6d7eea4 2edd122 9400714 df659d0 9400714 df659d0 9134169 f766ce9 9134169 f766ce9 8b7a945 f766ce9 f30cbcc 7ff98ba 9134169 f766ce9 9134169 a0387d8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 |
from dataclasses import dataclass, make_dataclass
from src.benchmarks import BenchmarksQA, BenchmarksLongDoc
def fields(raw_class):
    """Return the values of the non-dunder attributes stored on *raw_class*.

    Iterates over ``raw_class.__dict__`` in the order the mapping yields its
    items and keeps the value of every attribute whose name neither starts
    nor ends with a double underscore.
    """
    kept_values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Skip anything that looks like (half of) a dunder name.
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        kept_values.append(attr_value)
    return kept_values
# These classes hold the user-facing column names in one place,
# so that a rename does not have to be repeated throughout the code
# whenever a modification is needed.
@dataclass(frozen=True)
class ColumnContent:
    """Describe a single leaderboard column.

    The class is frozen so that its instances are hashable. Instances are
    used as field *default values* when the column classes are assembled
    with ``make_dataclass``; from Python 3.11 on, ``dataclasses`` rejects
    unhashable defaults with ``ValueError: mutable default ... is not
    allowed``, which a plain ``@dataclass`` (``eq=True``, not frozen) would
    trigger because it sets ``__hash__`` to ``None``.
    """

    # Header text of the column as shown to the user.
    name: str
    # Datatype label of the column; this module uses "number", "markdown",
    # "date" and "bool".
    type: str
    # Whether the column belongs to the default ("lite") selection.
    displayed_by_default: bool
    # Hidden columns are left out of the column/type lists built in this module.
    hidden: bool = False
    # NOTE(review): not read anywhere in this module; presumably marks columns
    # the user cannot deselect in the UI -- confirm against the caller.
    never_hidden: bool = False
# User-facing header texts of the columns shared by every leaderboard table.

# Ranking and aggregate score.
COL_NAME_RANK = "Rank 🏆"
COL_NAME_AVG = "Average ⬆️"

# Retrieval / reranking model display names and their link columns.
COL_NAME_RETRIEVAL_MODEL = "Retrieval Method"
COL_NAME_RETRIEVAL_MODEL_LINK = "Retrieval Model LINK"
COL_NAME_RERANKING_MODEL = "Reranking Model"
COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"

# Submission metadata.
COL_NAME_REVISION = "Revision"
COL_NAME_TIMESTAMP = "Submission Date"
COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
def get_default_auto_eval_column_dict():
    """Build the field specs for the columns common to every leaderboard.

    Returns:
        A new list on every call. Each entry is a three-item list
        ``[attribute_name, ColumnContent, default_column]`` -- the
        ``(name, type, default)`` shape that ``make_dataclass`` accepts.
        Despite the function's name, the result is a list, not a dict.
    """

    def spec(attr_name, column):
        # Every field is typed as ColumnContent and defaults to its own
        # column description.
        return [attr_name, ColumnContent, column]

    return [
        # Columns shown in the table.
        spec("rank", ColumnContent(COL_NAME_RANK, "number", True)),
        spec(
            "retrieval_model",
            ColumnContent(COL_NAME_RETRIEVAL_MODEL, "markdown", True, hidden=False, never_hidden=True),
        ),
        spec(
            "reranking_model",
            ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", True, hidden=False, never_hidden=True),
        ),
        spec("revision", ColumnContent(COL_NAME_REVISION, "markdown", True, never_hidden=True)),
        spec("timestamp", ColumnContent(COL_NAME_TIMESTAMP, "date", True, never_hidden=True)),
        spec("average", ColumnContent(COL_NAME_AVG, "number", True)),
        # Columns flagged as hidden.
        spec(
            "retrieval_model_link",
            ColumnContent(COL_NAME_RETRIEVAL_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False),
        ),
        spec(
            "reranking_model_link",
            ColumnContent(COL_NAME_RERANKING_MODEL_LINK, "markdown", False, hidden=True, never_hidden=False),
        ),
        spec("is_anonymous", ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)),
    ]
def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
    """Create a frozen dataclass describing all columns of one leaderboard.

    Args:
        cls_name: Name given to the generated dataclass.
        benchmarks: Iterable whose members expose ``.name`` (used as the
            field name) and ``.value.col_name`` (used as the column header).

    Returns:
        A dataclass type built with ``make_dataclass(..., frozen=True)``
        whose fields are the shared default columns followed by one
        "number" column per benchmark, each displayed by default.
    """
    shared_specs = get_default_auto_eval_column_dict()

    # One score column per benchmark, appended after the shared columns.
    benchmark_specs = [
        [bench.name, ColumnContent, ColumnContent(bench.value.col_name, "number", True)]
        for bench in benchmarks
    ]

    # make_dataclass lets the set of score fields follow the benchmark list
    # instead of being spelled out by hand.
    return make_dataclass(cls_name, shared_specs + benchmark_specs, frozen=True)
# Column-definition classes, one per benchmark collection.
AutoEvalColumnQA = make_autoevalcolumn(
    cls_name="AutoEvalColumnQA",
    benchmarks=BenchmarksQA,
)
AutoEvalColumnLongDoc = make_autoevalcolumn(
    cls_name="AutoEvalColumnLongDoc",
    benchmarks=BenchmarksLongDoc,
)
# Column selection
# Columns not flagged as hidden, computed once per leaderboard class.
_visible_qa = [col for col in fields(AutoEvalColumnQA) if not col.hidden]
_visible_long_doc = [col for col in fields(AutoEvalColumnLongDoc) if not col.hidden]

# Header names of the visible columns.
COLS_QA = [col.name for col in _visible_qa]
COLS_LONG_DOC = [col.name for col in _visible_long_doc]

# Datatype labels, aligned index-by-index with the name lists above.
TYPES_QA = [col.type for col in _visible_qa]
TYPES_LONG_DOC = [col.type for col in _visible_long_doc]

# Visible QA columns that are also displayed by default.
COLS_LITE = [col.name for col in _visible_qa if col.displayed_by_default]

# Header names of the per-benchmark score columns.
QA_BENCHMARK_COLS = [bench.value.col_name for bench in BenchmarksQA]
LONG_DOC_BENCHMARK_COLS = [bench.value.col_name for bench in BenchmarksLongDoc]
|