Spaces:

babylm
/

leaderboard-2024

Running

leaderboard-2024 / src /display /utils.py

Aaron Mueller

enumerate blimp tasks

1bbb1d0 2 months ago

10.7 kB

	from dataclasses import dataclass, make_dataclass
	from enum import Enum

	import pandas as pd

	from src.about import Tasks, TasksMultimodal

	def fields(raw_class: type) -> list:
	    """Return the values of the non-dunder attributes declared on ``raw_class``.

	    Only ``raw_class.__dict__`` is inspected, so inherited attributes are not
	    included. Values are returned in the order they appear in that mapping.

	    Args:
	        raw_class: The class whose own attributes should be collected.

	    Returns:
	        A list of attribute values whose names neither start nor end with a
	        double underscore.
	    """
	    return [
	        value
	        for attr_name, value in raw_class.__dict__.items()
	        # A name is skipped as soon as it starts OR ends with "__".
	        if not attr_name.startswith("__") and not attr_name.endswith("__")
	    ]


	# These classes are for user facing column names,
	# to avoid having to change them all around the code
	# when a modification is needed
	#
	# unsafe_hash=True keeps instances mutable (as before) but makes them hashable.
	# dataclasses on Python 3.11+ reject unhashable objects as field defaults, and
	# ColumnContent instances are used as defaults in the make_dataclass calls in
	# this module. NOTE(review): frozen=True would be the stricter alternative if
	# no caller mutates these objects - confirm before switching.
	@dataclass(unsafe_hash=True)
	class ColumnContent:
	    """Description of a single table column.

	    Attributes are documented inline below; ``hidden`` and ``never_hidden``
	    default to ``False``.
	    """

	    # User-facing column name.
	    name: str
	    # Column type label; this module uses "markdown", "str", "number" and "bool".
	    type: str
	    # Presumably whether the column is shown without the user selecting it;
	    # the consumer of this flag is outside this file.
	    displayed_by_default: bool
	    # Columns with hidden=True are left out of COLS / COLS_MULTIMODAL.
	    hidden: bool = False
	    # Presumably marks a column the user cannot deselect; consumer not in this file.
	    never_hidden: bool = False

	## Leaderboard columns
	# Each list collects [attribute_name, annotation, default] triples, the field
	# format consumed by make_dataclass. The default is the ColumnContent that
	# describes the column.
	auto_eval_column_dict = []
	auto_eval_column_dict_multimodal = []

	# --- Text-only leaderboard ---
	# Init
	auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
	auto_eval_column_dict.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
	auto_eval_column_dict.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
	# Scores: one numeric column per entry of Tasks.
	for task in Tasks:
	    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
	# Model information
	auto_eval_column_dict.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
	auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
	auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

	# --- Multimodal leaderboard ---
	auto_eval_column_dict_multimodal.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
	auto_eval_column_dict_multimodal.append(["hf_repo", ColumnContent, ColumnContent("HF Repo", "str", False)])
	auto_eval_column_dict_multimodal.append(["track", ColumnContent, ColumnContent("Track", "markdown", False)])
	for task in TasksMultimodal:
	    auto_eval_column_dict_multimodal.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
	    if task.value.col_name in ("ewok", "EWoK"):  # make sure this appears in the right order
	        # The text average is inserted right after the EWoK column.
	        auto_eval_column_dict_multimodal.append(["text_average", ColumnContent, ColumnContent("Text Average", "number", True)])
	# NOTE(review): the original indentation of this section was lost. The
	# "vision_average" column is assumed to follow the loop: appending it on every
	# iteration would register the same field name repeatedly. Confirm against
	# the upstream file.
	auto_eval_column_dict_multimodal.append(["vision_average", ColumnContent, ColumnContent("Vision Average", "number", True)])
	auto_eval_column_dict_multimodal.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
	auto_eval_column_dict_multimodal.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])

	# The column classes are generated with make_dataclass so that the score
	# fields follow whatever Tasks / TasksMultimodal contain.
	AutoEvalColumn = make_dataclass(
	    cls_name="AutoEvalColumn",
	    fields=auto_eval_column_dict,
	    frozen=True,
	)
	AutoEvalColumnMultimodal = make_dataclass(
	    cls_name="AutoEvalColumnMultimodal",
	    fields=auto_eval_column_dict_multimodal,
	    frozen=True,
	)

	## For the queue columns in the submission tab
	@dataclass(frozen=True)
	class EvalQueueColumn:  # Queue column
	    """Column descriptions for the evaluation queue table.

	    The attributes below are deliberately left unannotated: that makes them
	    plain class attributes rather than dataclass fields, and they are read
	    through the module-level ``fields`` helper (see EVAL_COLS / EVAL_TYPES).
	    """

	    model = ColumnContent("model", "markdown", True)
	    track = ColumnContent("track", "str", True)
	    revision = ColumnContent("revision", "str", True)
	    private = ColumnContent("private", "bool", True)
	    status = ColumnContent("status", "str", True)

	## All the model information that we might need
	@dataclass
	class ModelDetails:
	    """Basic descriptive information about a model.

	    Only ``name`` is required; the other attributes default to empty strings.
	    """

	    name: str
	    display_name: str = ""
	    symbol: str = ""  # emoji

	# Column selection: names of every leaderboard column not flagged as hidden.
	COLS = [column.name for column in fields(AutoEvalColumn) if not column.hidden]
	COLS_MULTIMODAL = [column.name for column in fields(AutoEvalColumnMultimodal) if not column.hidden]

	# Names and type labels of the evaluation queue columns, in declaration order.
	EVAL_COLS = [column.name for column in fields(EvalQueueColumn)]
	EVAL_TYPES = [column.type for column in fields(EvalQueueColumn)]

	# Display names of the benchmark score columns, taken from the task enums.
	BENCHMARK_COLS = [task_entry.value.col_name for task_entry in Tasks]
	BENCHMARK_COLS_MULTIMODAL = [task_entry.value.col_name for task_entry in TasksMultimodal]

	# Text benchmarks mapped to the list of subtask names each one contains.
	TEXT_TASKS = {
	    "glue": [
	        "cola",
	        "sst2",
	        "mrpc",
	        "qqp",
	        "mnli",
	        "mnli-mm",
	        "qnli",
	        "rte",
	        "boolq",
	        "multirc",
	        "wsc",
	    ],
	    # BLiMP has many subtasks. The original comment points to a verifier
	    # function for checking that none is missing; it is not visible in this
	    # section of the file.
	    "blimp": [
	        "adjunct_island",
	        "anaphor_gender_agreement",
	        "anaphor_number_agreement",
	        "animate_subject_passive",
	        "animate_subject_trans",
	        "causative",
	        "complex_NP_island",
	        "coordinate_structure_constraint_complex_left_branch",
	        "coordinate_structure_constraint_object_extraction",
	        "determiner_noun_agreement_1",
	        "determiner_noun_agreement_2",
	        "determiner_noun_agreement_irregular_1",
	        "determiner_noun_agreement_irregular_2",
	        "determiner_noun_agreement_with_adjective_1",
	        "determiner_noun_agreement_with_adj_2",
	        "determiner_noun_agreement_with_adj_irregular_1",
	        "determiner_noun_agreement_with_adj_irregular_2",
	        "distractor_agreement_relational_noun",
	        "distractor_agreement_relative_clause",
	        "drop_argument",
	        "ellipsis_n_bar_1",
	        "ellipsis_n_bar_2",
	        "existential_there_object_raising",
	        "existential_there_quantifiers_1",
	        "existential_there_quantifiers_2",
	        "existential_there_subject_raising",
	        "expletive_it_object_raising",
	        "inchoative",
	        "intransitive",
	        "irregular_past_participle_adjectives",
	        "irregular_past_participle_verbs",
	        "irregular_plural_subject_verb_agreement_1",
	        "irregular_plural_subject_verb_agreement_2",
	        "left_branch_island_echo_question",
	        "left_branch_island_simple_question",
	        "matrix_question_npi_licensor_present",
	        "npi_present_1",
	        "npi_present_2",
	        "only_npi_licensor_present",
	        "only_npi_scope",
	        "passive_1",
	        "passive_2",
	        "principle_A_case_1",
	        "principle_A_case_2",
	        "principle_A_c_command",
	        "principle_A_domain_1",
	        "principle_A_domain_2",
	        "principle_A_domain_3",
	        "principle_A_reconstruction",
	        "regular_plural_subject_verb_agreement_1",
	        "regular_plural_subject_verb_agreement_2",
	        "sentential_negation_npi_licensor_present",
	        "sentential_negation_npi_scope",
	        "sentential_subject_island",
	        "superlative_quantifiers_1",
	        "superlative_quantifiers_2",
	        "tough_vs_raising_1",
	        "tough_vs_raising_2",
	        "transitive",
	        "wh_island",
	        "wh_questions_object_gap",
	        "wh_questions_subject_gap",
	        "wh_questions_subject_gap_long_distance",
	        "wh_vs_that_no_gap",
	        "wh_vs_that_no_gap_long_distance",
	        "wh_vs_that_with_gap",
	        "wh_vs_that_with_gap_long_distance",
	    ],
	    "blimp_supplement": [
	        "hypernym",
	        "qa_congruence_easy",
	        "qa_congruence_tricky",
	        "subject_aux_inversion",
	        "turn_taking",
	    ],
	    "ewok": [
	        "agent-properties",
	        "material-dynamics",
	        "material-properties",
	        "physical-dynamics",
	        "physical-interactions",
	        "physical-relations",
	        "quantitative-properties",
	        "social-interactions",
	        "social-properties",
	        "social-relations",
	        "spatial-relations",
	    ],
	}

	# Vision benchmarks mapped to the list of subtask names each one contains.
	VISION_TASKS = {
	    "vqa": [
	        "vqa",
	    ],
	    "winoground": [
	        "winoground",
	    ],
	    "devbench": [
	        "lex-viz_vocab",
	        "gram-trog",
	        "sem-things",
	    ],
	}

	# Expected number of examples per subtask, grouped by benchmark.
	NUM_EXPECTED_EXAMPLES = {
	    # Text benchmarks
	    "glue": {
	        "cola": 522,
	        "sst2": 436,
	        "mrpc": 204,
	        "qqp": 20215,
	        "mnli": 4908,
	        "mnli-mm": 4916,
	        "qnli": 2732,
	        "rte": 139,
	        "boolq": 1635,
	        "multirc": 2424,
	        "wsc": 52,
	    },
	    "blimp": {
	        "adjunct_island": 928,
	        "anaphor_gender_agreement": 971,
	        "anaphor_number_agreement": 931,
	        "animate_subject_passive": 895,
	        "animate_subject_trans": 923,
	        "causative": 818,
	        "complex_NP_island": 846,
	        "coordinate_structure_constraint_complex_left_branch": 906,
	        "coordinate_structure_constraint_object_extraction": 949,
	        "determiner_noun_agreement_1": 929,
	        "determiner_noun_agreement_2": 931,
	        "determiner_noun_agreement_irregular_1": 681,
	        "determiner_noun_agreement_irregular_2": 820,
	        "determiner_noun_agreement_with_adjective_1": 933,
	        "determiner_noun_agreement_with_adj_2": 941,
	        "determiner_noun_agreement_with_adj_irregular_1": 718,
	        "determiner_noun_agreement_with_adj_irregular_2": 840,
	        "distractor_agreement_relational_noun": 788,
	        "distractor_agreement_relative_clause": 871,
	        "drop_argument": 920,
	        "ellipsis_n_bar_1": 802,
	        "ellipsis_n_bar_2": 828,
	        "existential_there_object_raising": 812,
	        "existential_there_quantifiers_1": 930,
	        "existential_there_quantifiers_2": 911,
	        "existential_there_subject_raising": 924,
	        "expletive_it_object_raising": 759,
	        "inchoative": 855,
	        "intransitive": 868,
	        "irregular_past_participle_adjectives": 961,
	        "irregular_past_participle_verbs": 942,
	        "irregular_plural_subject_verb_agreement_1": 804,
	        "irregular_plural_subject_verb_agreement_2": 892,
	        "left_branch_island_echo_question": 947,
	        "left_branch_island_simple_question": 951,
	        "matrix_question_npi_licensor_present": 929,
	        "npi_present_1": 909,
	        "npi_present_2": 914,
	        "only_npi_licensor_present": 882,
	        "only_npi_scope": 837,
	        "passive_1": 840,
	        "passive_2": 903,
	        "principle_A_case_1": 912,
	        "principle_A_case_2": 915,
	        "principle_A_c_command": 946,
	        "principle_A_domain_1": 914,
	        "principle_A_domain_2": 915,
	        "principle_A_domain_3": 941,
	        "principle_A_reconstruction": 967,
	        "regular_plural_subject_verb_agreement_1": 890,
	        "regular_plural_subject_verb_agreement_2": 945,
	        "sentential_negation_npi_licensor_present": 919,
	        "sentential_negation_npi_scope": 871,
	        "sentential_subject_island": 961,
	        "superlative_quantifiers_1": 979,
	        "superlative_quantifiers_2": 986,
	        "tough_vs_raising_1": 948,
	        "tough_vs_raising_2": 920,
	        "transitive": 868,
	        "wh_island": 960,
	        "wh_questions_object_gap": 859,
	        "wh_questions_subject_gap": 898,
	        "wh_questions_subject_gap_long_distance": 857,
	        "wh_vs_that_no_gap": 861,
	        "wh_vs_that_no_gap_long_distance": 875,
	        "wh_vs_that_with_gap": 919,
	        "wh_vs_that_with_gap_long_distance": 910,
	    },
	    "blimp_supplement": {
	        "hypernym": 842,
	        "qa_congruence_easy": 64,
	        "qa_congruence_tricky": 165,
	        "subject_aux_inversion": 3867,
	        "turn_taking": 280,
	    },
	    "ewok": {
	        "agent-properties": 2210,
	        "material-dynamics": 770,
	        "material-properties": 170,
	        "physical-dynamics": 120,
	        "physical-interactions": 556,
	        "physical-relations": 818,
	        "quantitative-properties": 314,
	        "social-interactions": 294,
	        "social-properties": 328,
	        "social-relations": 1548,
	        "spatial-relations": 490,
	    },
	    # Vision benchmarks
	    "vqa": {
	        "vqa": 25230,
	    },
	    "winoground": {
	        "winoground": 746,
	    },
	    "devbench": {
	        "lex-viz_vocab": 119,
	        "gram-trog": 76,
	        "sem-things": 1854,
	    },
	}