from dataclasses import dataclass from enum import Enum @dataclass(frozen=True) class Task: benchmark: str metric: str col_name: str type: str baseline: float = 0.0 # Select your tasks here # --------------------------------------------------- class Tasks(Enum): # task_key in the json file, metric_key in the json file, name to display in the leaderboard # task2 = Task("belebele_pol_Latn", "acc,none", "belebele_pol_Latn", "multiple_choice", 0.279) # task3 = Task("polemo2_in", "exact_match,score-first", "polemo2-in_g", "generate_until", 0.416) # task4 = Task("polemo2_in_multiple_choice", "acc,none", "polemo2-in_mc", "multiple_choice", 0.416) # task5 = Task("polemo2_out", "exact_match,score-first", "polemo2-out_g", "generate_until", 0.368) # task6 = Task("polemo2_out_multiple_choice", "acc,none", "polemo2-out_mc", "multiple_choice", 0.368) # task7 = Task("polish_8tags_multiple_choice", "acc,none", "8tags_mc", "multiple_choice", 0.143) # task8 = Task("polish_8tags_regex", "exact_match,score-first", "8tags_g", "generate_until", 0.143) # task9a = Task("polish_belebele_mc", "acc,none", "belebele_mc", "multiple_choice", 0.279) # task9 = Task("polish_belebele_regex", "exact_match,score-first", "belebele_g", "generate_until", 0.279) # task10 = Task("polish_dyk_multiple_choice", "f1,none", "dyk_mc", "multiple_choice", 0.289) # task11 = Task("polish_dyk_regex", "f1,score-first", "dyk_g", "generate_until", 0.289) # task12 = Task("polish_ppc_multiple_choice", "acc,none", "ppc_mc", "multiple_choice", 0.419) # task13 = Task("polish_ppc_regex", "exact_match,score-first", "ppc_g", "generate_until", 0.419) # task14 = Task("polish_psc_multiple_choice", "f1,none", "psc_mc", "multiple_choice", 0.466) # task15 = Task("polish_psc_regex", "f1,score-first", "psc_g", "generate_until", 0.466) # task16 = Task("polish_cbd_multiple_choice", "f1,none", "cbd_mc", "multiple_choice", 0.149) # task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149) # task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343) # task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343) # task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice # task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until # task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until # task24 = Task("polish_poquad_open_book", "levenshtein,none", "poquad_open_book", "generate_until", 0.0) # task25 = Task("polish_eq_bench_first_turn", "first_eqbench,none", "eq_bench_first_turn", "generate_until", 0.0) # task26 = Task("polish_eq_bench", "average_eqbench,none", "eq_bench", "generate_until", 0.0) # task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other") # task27 = Task("polish_poquad_reranking", "acc,none", "poquad_reranking", "other", 0.0) # task28 = Task("polish_abstractive_poquad_rag", "levenshtein,none", "abstractive_poquad_rag", "other", 0.0) # task29 = Task("polish_abstractive_poquad_open_book", "levenshtein,none", "abstractive_poquad_open_book", "other", 0.0) task30a = Task("polish_pes", "exact_match,score-first", "pes", "generate_until", 0.0) task60 = Task("polish_pes_medycyna_rodzinna", "exact_match,score-first", "medycyna_rodzinna", "generate_until", 0.0) task73 = Task("polish_pes_pediatria", "exact_match,score-first", "pediatria", "generate_until", 0.0) task30 = Task("polish_pes_alergologia", "exact_match,score-first", "alergologia", "generate_until", 0.0) task31 = Task("polish_pes_anestezjologia", "exact_match,score-first", "anestezjologia", "generate_until", 0.0) task32 = Task("polish_pes_angiologia", "exact_match,score-first", "angiologia", "generate_until", 0.0) task33 = Task("polish_pes_balneologia_i_medycyna_fizykalna", "exact_match,score-first", "balneologia_i_medycyna_fizykalna", "generate_until", 0.0) task34 = Task("polish_pes_chirurgia_dziecieca", "exact_match,score-first", "chirurgia_dziecieca", "generate_until", 0.0) task35 = Task("polish_pes_chirurgia_naczyniowa", "exact_match,score-first", "chirurgia_naczyniowa", "generate_until", 0.0) task36 = Task("polish_pes_chirurgia_ogolna", "exact_match,score-first", "chirurgia_ogolna", "generate_until", 0.0) task37 = Task("polish_pes_chirurgia_onkologiczna", "exact_match,score-first", "chirurgia_onkologiczna", "generate_until", 0.0) task38 = Task("polish_pes_chirurgia_stomatologiczna", "exact_match,score-first", "chirurgia_stomatologiczna", "generate_until", 0.0) task39 = Task("polish_pes_chirurgia_szczekowo-twarzowa", "exact_match,score-first", "chirurgia_szczekowo-twarzowa", "generate_until", 0.0) task40 = Task("polish_pes_choroby_pluc", "exact_match,score-first", "choroby_pluc", "generate_until", 0.0) task41 = Task("polish_pes_choroby_pluc_dzieci", "exact_match,score-first", "choroby_pluc_dzieci", "generate_until", 0.0) task42 = Task("polish_pes_choroby_wewnetrzne", "exact_match,score-first", "choroby_wewnetrzne", "generate_until", 0.0) task43 = Task("polish_pes_choroby_zakazne", "exact_match,score-first", "choroby_zakazne", "generate_until", 0.0) task44 = Task("polish_pes_dermatologia_i_wenerologia", "exact_match,score-first", "dermatologia_i_wenerologia", "generate_until", 0.0) task45 = Task("polish_pes_diabetologia", "exact_match,score-first", "diabetologia", "generate_until", 0.0) task46 = Task("polish_pes_endokrynologia", "exact_match,score-first", "endokrynologia", "generate_until", 0.0) task47 = Task("polish_pes_endokrynologia_ginekologiczna_i_rozrodczosc", "exact_match,score-first", "endokrynologia_ginekologiczna_i_rozrodczosc", "generate_until", 0.0) task48 = Task("polish_pes_endokrynologia_i_diabetologia_dziecieca", "exact_match,score-first", "endokrynologia_i_diabetologia_dziecieca", "generate_until", 0.0) task49 = Task("polish_pes_gastroenterologia", "exact_match,score-first", "gastroenterologia", "generate_until", 0.0) task50 = Task("polish_pes_gastroenterologia_dziecieca", "exact_match,score-first", "gastroenterologia_dziecieca", "generate_until", 0.0) task51 = Task("polish_pes_geriatria", "exact_match,score-first", "geriatria", "generate_until", 0.0) task52 = Task("polish_pes_ginekologia_onkologiczna", "exact_match,score-first", "ginekologia_onkologiczna", "generate_until", 0.0) task53 = Task("polish_pes_hematologia", "exact_match,score-first", "hematologia", "generate_until", 0.0) task54 = Task("polish_pes_hipertensjologia", "exact_match,score-first", "hipertensjologia", "generate_until", 0.0) task55 = Task("polish_pes_kardiochirurgia", "exact_match,score-first", "kardiochirurgia", "generate_until", 0.0) task56 = Task("polish_pes_kardiologia", "exact_match,score-first", "kardiologia", "generate_until", 0.0) task57 = Task("polish_pes_medycyna_pracy", "exact_match,score-first", "medycyna_pracy", "generate_until", 0.0) task58 = Task("polish_pes_medycyna_paliatywna", "exact_match,score-first", "medycyna_paliatywna", "generate_until", 0.0) task59 = Task("polish_pes_medycyna_ratunkowa", "exact_match,score-first", "medycyna_ratunkowa", "generate_until", 0.0) task61 = Task("polish_pes_medycyna_sportowa", "exact_match,score-first", "medycyna_sportowa", "generate_until", 0.0) task62 = Task("polish_pes_nefrologia", "exact_match,score-first", "nefrologia", "generate_until", 0.0) task63 = Task("polish_pes_neonatologia", "exact_match,score-first", "neonatologia", "generate_until", 0.0) task64 = Task("polish_pes_neurochirurgia", "exact_match,score-first", "neurochirurgia", "generate_until", 0.0) task65 = Task("polish_pes_neurologia", "exact_match,score-first", "neurologia", "generate_until", 0.0) task66 = Task("polish_pes_neurologia_dziecieca", "exact_match,score-first", "neurologia_dziecieca", "generate_until", 0.0) task67 = Task("polish_pes_okulistyka", "exact_match,score-first", "okulistyka", "generate_until", 0.0) task68 = Task("polish_pes_onkologia_kliniczna", "exact_match,score-first", "onkologia_kliniczna", "generate_until", 0.0) task69 = Task("polish_pes_ortodoncja", "exact_match,score-first", "ortodoncja", "generate_until", 0.0) task70 = Task("polish_pes_ortopedia", "exact_match,score-first", "ortopedia", "generate_until", 0.0) task71 = Task("polish_pes_otolaryngologia", "exact_match,score-first", "otolaryngologia", "generate_until", 0.0) task72 = Task("polish_pes_patomorfologia", "exact_match,score-first", "patomorfologia", "generate_until", 0.0) task74 = Task("polish_pes_perinatologia", "exact_match,score-first", "perinatologia", "generate_until", 0.0) task75 = Task("polish_pes_periodontologia", "exact_match,score-first", "periodontologia", "generate_until", 0.0) task76 = Task("polish_pes_poloznictwo_i_ginekologia", "exact_match,score-first", "poloznictwo_i_ginekologia", "generate_until", 0.0) task77 = Task("polish_pes_protetyka_stomatologiczna", "exact_match,score-first", "protetyka_stomatologiczna", "generate_until", 0.0) task78 = Task("polish_pes_psychiatria", "exact_match,score-first", "psychiatria", "generate_until", 0.0) task79 = Task("polish_pes_psychiatria_dzieci_i_mlodziezy", "exact_match,score-first", "psychiatria_dzieci_i_mlodziezy", "generate_until", 0.0) task80 = Task("polish_pes_radiologia_i_diagnostyka_obrazowa", "exact_match,score-first", "radiologia_i_diagnostyka_obrazowa", "generate_until", 0.0) task81 = Task("polish_pes_radioterapia_onkologiczna", "exact_match,score-first", "radioterapia_onkologiczna", "generate_until", 0.0) task82 = Task("polish_pes_rehabilitacja_medyczna", "exact_match,score-first", "rehabilitacja_medyczna", "generate_until", 0.0) task83 = Task("polish_pes_reumatologia", "exact_match,score-first", "reumatologia", "generate_until", 0.0) task84 = Task("polish_pes_stomatologia_dziecieca", "exact_match,score-first", "stomatologia_dziecieca", "generate_until", 0.0) task85 = Task("polish_pes_stomatologia_zachowawcza", "exact_match,score-first", "stomatologia_zachowawcza", "generate_until", 0.0) task86 = Task("polish_pes_transplantologia_kliniczna", "exact_match,score-first", "transplantologia_kliniczna", "generate_until", 0.0) g_tasks = [task.value.benchmark for task in Tasks if task.value.type == "generate_until"] mc_tasks = [task.value.benchmark for task in Tasks if task.value.type == "multiple_choice"] rag_tasks = ['polish_polqa_reranking_multiple_choice', 'polish_polqa_open_book', 'polish_poquad_open_book'] all_tasks = g_tasks + mc_tasks NUM_FEWSHOT = 0 # Change with your few shot # --------------------------------------------------- # Your leaderboard name TITLE = """