{ "results": { "assin2_rte": { "f1_macro,all": 0.3333333333333333, "f1_macro_stderr,all": 0.0031730251394380704, "acc,all": 0.5, "acc_stderr,all": 0.007138073526203421, "alias": "assin2_rte" }, "assin2_sts": { "pearson,all": 0.07790738376477561, "pearson_stderr,all": 0.014716684813439029, "mse,all": 2.066646241830065, "mse_stderr,all": "N/A", "alias": "assin2_sts" }, "bluex": { "acc,all": 0.20584144645340752, "acc_stderr,all": 0.008708200533939503, "acc,exam_id__USP_2024": 0.24390243902439024, "acc_stderr,exam_id__USP_2024": 0.038765983941525854, "acc,exam_id__UNICAMP_2018": 0.2222222222222222, "acc_stderr,exam_id__UNICAMP_2018": 0.032540841899570724, "acc,exam_id__USP_2018": 0.16666666666666666, "acc_stderr,exam_id__USP_2018": 0.029320906143906797, "acc,exam_id__USP_2022": 0.20408163265306123, "acc_stderr,exam_id__USP_2022": 0.03318725036103681, "acc,exam_id__USP_2021": 0.1346153846153846, "acc_stderr,exam_id__USP_2021": 0.027253785013691273, "acc,exam_id__UNICAMP_2021_2": 0.27450980392156865, "acc_stderr,exam_id__UNICAMP_2021_2": 0.0360141604917446, "acc,exam_id__UNICAMP_2021_1": 0.2826086956521739, "acc_stderr,exam_id__UNICAMP_2021_1": 0.038353605844743385, "acc,exam_id__USP_2020": 0.08928571428571429, "acc_stderr,exam_id__USP_2020": 0.021921367122397676, "acc,exam_id__UNICAMP_2022": 0.3076923076923077, "acc_stderr,exam_id__UNICAMP_2022": 0.042727899536711536, "acc,exam_id__UNICAMP_2023": 0.16279069767441862, "acc_stderr,exam_id__UNICAMP_2023": 0.03238294764474062, "acc,exam_id__UNICAMP_2024": 0.3111111111111111, "acc_stderr,exam_id__UNICAMP_2024": 0.0398333923513593, "acc,exam_id__UNICAMP_2020": 0.18181818181818182, "acc_stderr,exam_id__UNICAMP_2020": 0.02992924246353086, "acc,exam_id__USP_2023": 0.18181818181818182, "acc_stderr,exam_id__USP_2023": 0.03345273573325805, "acc,exam_id__UNICAMP_2019": 0.2, "acc_stderr,exam_id__UNICAMP_2019": 0.03269771596389771, "acc,exam_id__USP_2019": 0.175, "acc_stderr,exam_id__USP_2019": 0.0348121283420538, "alias": "bluex" }, "enem_challenge": { "alias": "enem", "acc,all": 0.2092372288313506, "acc_stderr,all": 0.006229253057208555, "acc,exam_id__2016_2": 0.21138211382113822, "acc_stderr,exam_id__2016_2": 0.02123691588211098, "acc,exam_id__2023": 0.1925925925925926, "acc_stderr,exam_id__2023": 0.019583861187616968, "acc,exam_id__2013": 0.2037037037037037, "acc_stderr,exam_id__2013": 0.02238599079325693, "acc,exam_id__2012": 0.12931034482758622, "acc_stderr,exam_id__2012": 0.01803175905554286, "acc,exam_id__2009": 0.23478260869565218, "acc_stderr,exam_id__2009": 0.022814803582640184, "acc,exam_id__2022": 0.17293233082706766, "acc_stderr,exam_id__2022": 0.01889827424607104, "acc,exam_id__2015": 0.31932773109243695, "acc_stderr,exam_id__2015": 0.02472561230954832, "acc,exam_id__2014": 0.21100917431192662, "acc_stderr,exam_id__2014": 0.022513127008089658, "acc,exam_id__2016": 0.18181818181818182, "acc_stderr,exam_id__2016": 0.020248897347876847, "acc,exam_id__2010": 0.23076923076923078, "acc_stderr,exam_id__2010": 0.02246971773699712, "acc,exam_id__2017": 0.27586206896551724, "acc_stderr,exam_id__2017": 0.023994074423977864, "acc,exam_id__2011": 0.15384615384615385, "acc_stderr,exam_id__2011": 0.019264278502154123 }, "faquad_nli": { "f1_macro,all": 0.4396551724137931, "f1_macro_stderr,all": 0.0035796984729087084, "acc,all": 0.7846153846153846, "acc_stderr,all": 0.011396120309131327, "alias": "faquad_nli" }, "hatebr_offensive": { "alias": "hatebr_offensive_binary", "f1_macro,all": 0.43054708155379295, "f1_macro_stderr,all": 0.009093679844467082, "acc,all": 0.4742857142857143, "acc_stderr,all": 0.009437507998400261 }, "oab_exams": { "acc,all": 0.25968109339407747, "acc_stderr,all": 0.005403181658894358, "acc,exam_id__2017-22": 0.2375, "acc_stderr,exam_id__2017-22": 0.027511429390216682, "acc,exam_id__2016-20a": 0.2, "acc_stderr,exam_id__2016-20a": 0.02584175311098727, "acc,exam_id__2011-04": 0.2875, "acc_stderr,exam_id__2011-04": 0.02919454405528515, "acc,exam_id__2013-12": 0.2875, "acc_stderr,exam_id__2013-12": 0.029277381115049662, "acc,exam_id__2013-11": 0.325, "acc_stderr,exam_id__2013-11": 0.030286419424458838, "acc,exam_id__2010-02": 0.32, "acc_stderr,exam_id__2010-02": 0.026888774775418785, "acc,exam_id__2012-07": 0.2375, "acc_stderr,exam_id__2012-07": 0.02737558649609428, "acc,exam_id__2016-21": 0.25, "acc_stderr,exam_id__2016-21": 0.027994547544285982, "acc,exam_id__2015-17": 0.24358974358974358, "acc_stderr,exam_id__2015-17": 0.02793267139214751, "acc,exam_id__2015-18": 0.2625, "acc_stderr,exam_id__2015-18": 0.028396710161944567, "acc,exam_id__2014-14": 0.225, "acc_stderr,exam_id__2014-14": 0.026939185801353988, "acc,exam_id__2015-16": 0.3375, "acc_stderr,exam_id__2015-16": 0.030631826713546063, "acc,exam_id__2012-09": 0.23376623376623376, "acc_stderr,exam_id__2012-09": 0.02787359925121907, "acc,exam_id__2011-03": 0.26262626262626265, "acc_stderr,exam_id__2011-03": 0.025505720074946718, "acc,exam_id__2016-19": 0.2692307692307692, "acc_stderr,exam_id__2016-19": 0.028948751914583667, "acc,exam_id__2012-06a": 0.2375, "acc_stderr,exam_id__2012-06a": 0.027440075549438697, "acc,exam_id__2011-05": 0.2625, "acc_stderr,exam_id__2011-05": 0.02835789202564455, "acc,exam_id__2013-10": 0.2125, "acc_stderr,exam_id__2013-10": 0.026367641247603036, "acc,exam_id__2017-24": 0.25, "acc_stderr,exam_id__2017-24": 0.028053164455460838, "acc,exam_id__2016-20": 0.2625, "acc_stderr,exam_id__2016-20": 0.0283327789711091, "acc,exam_id__2012-08": 0.275, "acc_stderr,exam_id__2012-08": 0.02893752928626648, "acc,exam_id__2014-13": 0.2, "acc_stderr,exam_id__2014-13": 0.025784866156114444, "acc,exam_id__2018-25": 0.25, "acc_stderr,exam_id__2018-25": 0.027961840366717016, "acc,exam_id__2017-23": 0.3, "acc_stderr,exam_id__2017-23": 0.02953160157687412, "acc,exam_id__2014-15": 0.2564102564102564, "acc_stderr,exam_id__2014-15": 0.028456647275964232, "acc,exam_id__2012-06": 0.275, "acc_stderr,exam_id__2012-06": 0.028790584320040398, "acc,exam_id__2010-01": 0.23529411764705882, "acc_stderr,exam_id__2010-01": 0.026470002521428834, "alias": "oab_exams" }, "portuguese_hate_speech": { "alias": "portuguese_hate_speech_binary", "f1_macro,all": 0.35895915678524376, "f1_macro_stderr,all": 0.011204300451999685, "acc,all": 0.381903642773208, "acc_stderr,all": 0.011741654959752653 }, "tweetsentbr": { "f1_macro,all": 0.2114730555936155, "f1_macro_stderr,all": 0.0056538657419370805, "acc,all": 0.3169154228855721, "acc_stderr,all": 0.007343236186351586, "alias": "tweetsentbr" } }, "configs": { "assin2_rte": { "task": "assin2_rte", "group": [ "pt_benchmark", "assin2" ], "dataset_path": "assin2", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Premissa: {{premise}}\nHipótese: {{hypothesis}}\nPergunta: A hipótese pode ser inferida pela premissa? Sim ou Não?\nResposta:", "doc_to_target": "{{['Não', 'Sim'][entailment_judgment]}}", "description": "Abaixo estão pares de premissa e hipótese. Para cada par, indique se a hipótese pode ser inferida a partir da premissa, responda apenas com \"Sim\" ou \"Não\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ 1, 3251, 2, 3252, 3, 4, 5, 6, 3253, 7, 3254, 3255, 3256, 8, 9, 10, 3257, 11, 3258, 12, 13, 14, 15, 3259, 3260, 3261, 3262, 3263, 16, 17, 3264, 18, 3265, 3266, 3267, 19, 20, 3268, 3269, 21, 3270, 3271, 22, 3272, 3273, 23, 3274, 24, 25, 3275 ], "id_column": "sentence_pair_id" } }, "num_fewshot": 15, "metric_list": [ { "metric": "f1_macro", "aggregation": "f1_macro", "higher_is_better": true }, { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "find_similar_label", "labels": [ "Sim", "Não" ] }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.1 } }, "assin2_sts": { "task": "assin2_sts", "group": [ "pt_benchmark", "assin2" ], "dataset_path": "assin2", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Frase 1: {{premise}}\nFrase 2: {{hypothesis}}\nPergunta: Quão similares são as duas frases? Dê uma pontuação entre 1,0 a 5,0.\nResposta:", "doc_to_target": "", "description": "Abaixo estão pares de frases que você deve avaliar o grau de similaridade. Dê uma pontuação entre 1,0 e 5,0, sendo 1,0 pouco similar e 5,0 muito similar.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ 1, 3251, 2, 3252, 3, 4, 5, 6, 3253, 7, 3254, 3255, 3256, 8, 9, 10, 3257, 11, 3258, 12, 13, 14, 15, 3259, 3260, 3261, 3262, 3263, 16, 17, 3264, 18, 3265, 3266, 3267, 19, 20, 3268, 3269, 21, 3270, 3271, 22, 3272, 3273, 23, 3274, 24, 25, 3275 ], "id_column": "sentence_pair_id" } }, "num_fewshot": 10, "metric_list": [ { "metric": "pearson", "aggregation": "pearsonr", "higher_is_better": true }, { "metric": "mse", "aggregation": "mean_squared_error", "higher_is_better": false } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "number_filter", "type": "float", "range_min": 1.0, "range_max": 5.0, "on_outside_range": "clip", "fallback": 5.0 }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.1 } }, "bluex": { "task": "bluex", "group": [ "pt_benchmark", "vestibular" ], "dataset_path": "eduagarcia-temp/BLUEX_without_images", "test_split": "train", "fewshot_split": "train", "doc_to_text": "", "doc_to_target": "{{answerKey}}", "description": "As perguntas a seguir são questões de múltipla escolha de provas de vestibular de universidades brasileiras, selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ "USP_2018_3", "UNICAMP_2018_2", "USP_2018_35", "UNICAMP_2018_16", "USP_2018_89" ], "id_column": "id", "exclude_from_task": true } }, "num_fewshot": 3, "metric_list": [ { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "normalize_spaces" }, { "function": "remove_accents" }, { "function": "find_choices", "choices": [ "A", "B", "C", "D", "E" ], "regex_patterns": [ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b", "\\b([ABCDE])\\.", "\\b([ABCDE]) ?[.):-]", "\\b([ABCDE])$", "\\b([ABCDE])\\b" ] }, { "function": "take_first" } ], "group_by": { "column": "exam_id" } } ], "should_decontaminate": true, "doc_to_decontamination_query": "", "metadata": { "version": 1.1 } }, "enem_challenge": { "task": "enem_challenge", "task_alias": "enem", "group": [ "pt_benchmark", "vestibular" ], "dataset_path": "eduagarcia/enem_challenge", "test_split": "train", "fewshot_split": "train", "doc_to_text": "", "doc_to_target": "{{answerKey}}", "description": "As perguntas a seguir são questões de múltipla escolha do Exame Nacional do Ensino Médio (ENEM), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\", \"D\" ou \"E\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ "2022_21", "2022_88", "2022_143" ], "id_column": "id", "exclude_from_task": true } }, "num_fewshot": 3, "metric_list": [ { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "normalize_spaces" }, { "function": "remove_accents" }, { "function": "find_choices", "choices": [ "A", "B", "C", "D", "E" ], "regex_patterns": [ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCDE])\\b", "\\b([ABCDE])\\.", "\\b([ABCDE]) ?[.):-]", "\\b([ABCDE])$", "\\b([ABCDE])\\b" ] }, { "function": "take_first" } ], "group_by": { "column": "exam_id" } } ], "should_decontaminate": true, "doc_to_decontamination_query": "", "metadata": { "version": 1.1 } }, "faquad_nli": { "task": "faquad_nli", "group": [ "pt_benchmark" ], "dataset_path": "ruanchaves/faquad-nli", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Pergunta: {{question}}\nResposta: {{answer}}\nA resposta dada satisfaz à pergunta? Sim ou Não?", "doc_to_target": "{{['Não', 'Sim'][label]}}", "description": "Abaixo estão pares de pergunta e resposta. Para cada par, você deve julgar se a resposta responde à pergunta de maneira satisfatória e aparenta estar correta. Escreva apenas \"Sim\" ou \"Não\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n", "sampler_config": { "fewshot_indices": [ 1893, 949, 663, 105, 1169, 2910, 2227, 2813, 974, 558, 1503, 1958, 2918, 601, 1560, 984, 2388, 995, 2233, 1982, 165, 2788, 1312, 2285, 522, 1113, 1670, 323, 236, 1263, 1562, 2519, 1049, 432, 1167, 1394, 2022, 2551, 2194, 2187, 2282, 2816, 108, 301, 1185, 1315, 1420, 2436, 2322, 766 ] } }, "num_fewshot": 15, "metric_list": [ { "metric": "f1_macro", "aggregation": "f1_macro", "higher_is_better": true }, { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "find_similar_label", "labels": [ "Sim", "Não" ] }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.1 } }, "hatebr_offensive": { "task": "hatebr_offensive", "task_alias": "hatebr_offensive_binary", "group": [ "pt_benchmark" ], "dataset_path": "eduagarcia/portuguese_benchmark", "dataset_name": "HateBR_offensive_binary", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:", "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}", "description": "Abaixo contém o texto de comentários de usuários do Instagram em português, sua tarefa é classificar se o texto é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ 48, 44, 36, 20, 3511, 88, 3555, 16, 56, 3535, 60, 40, 3527, 4, 76, 3579, 3523, 3551, 68, 3503, 84, 3539, 64, 3599, 80, 3563, 3559, 3543, 3547, 3587, 3595, 3575, 3567, 3591, 24, 96, 92, 3507, 52, 72, 8, 3571, 3515, 3519, 3531, 28, 32, 0, 12, 3583 ], "id_column": "idx" } }, "num_fewshot": 25, "metric_list": [ { "metric": "f1_macro", "aggregation": "f1_macro", "higher_is_better": true }, { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "find_similar_label", "labels": [ "Sim", "Não" ] }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.0 } }, "oab_exams": { "task": "oab_exams", "group": [ "legal_benchmark", "pt_benchmark" ], "dataset_path": "eduagarcia/oab_exams", "test_split": "train", "fewshot_split": "train", "doc_to_text": "", "doc_to_target": "{{answerKey}}", "description": "As perguntas a seguir são questões de múltipla escolha do Exame de Ordem da Ordem dos Advogados do Brasil (OAB), selecione a única alternativa correta e responda apenas com as letras \"A\", \"B\", \"C\" ou \"D\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ "2010-01_1", "2010-01_11", "2010-01_13", "2010-01_23", "2010-01_26", "2010-01_28", "2010-01_38", "2010-01_48", "2010-01_58", "2010-01_68", "2010-01_76", "2010-01_83", "2010-01_85", "2010-01_91", "2010-01_99" ], "id_column": "id", "exclude_from_task": true } }, "num_fewshot": 3, "metric_list": [ { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "normalize_spaces" }, { "function": "remove_accents" }, { "function": "find_choices", "choices": [ "A", "B", "C", "D" ], "regex_patterns": [ "(?:[Ll]etra|[Aa]lternativa|[Rr]esposta|[Rr]esposta [Cc]orreta|[Rr]esposta [Cc]orreta e|[Oo]pcao):? ([ABCD])\\b", "\\b([ABCD])\\.", "\\b([ABCD]) ?[.):-]", "\\b([ABCD])$", "\\b([ABCD])\\b" ] }, { "function": "take_first" } ], "group_by": { "column": "exam_id" } } ], "should_decontaminate": true, "doc_to_decontamination_query": "", "metadata": { "version": 1.5 } }, "portuguese_hate_speech": { "task": "portuguese_hate_speech", "task_alias": "portuguese_hate_speech_binary", "group": [ "pt_benchmark" ], "dataset_path": "eduagarcia/portuguese_benchmark", "dataset_name": "Portuguese_Hate_Speech_binary", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Texto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:", "doc_to_target": "{{'Sim' if label == 1 else 'Não'}}", "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o texto contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "id_sampler", "sampler_config": { "id_list": [ 52, 50, 39, 28, 3, 105, 22, 25, 60, 11, 66, 41, 9, 4, 91, 42, 7, 20, 76, 1, 104, 13, 67, 54, 97, 27, 24, 14, 16, 48, 53, 40, 34, 49, 32, 119, 114, 2, 58, 83, 18, 36, 5, 6, 10, 35, 38, 0, 21, 46 ], "id_column": "idx" } }, "num_fewshot": 25, "metric_list": [ { "metric": "f1_macro", "aggregation": "f1_macro", "higher_is_better": true }, { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "find_similar_label", "labels": [ "Sim", "Não" ] }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.0 } }, "tweetsentbr": { "task": "tweetsentbr", "group": [ "pt_benchmark" ], "dataset_path": "eduagarcia/tweetsentbr_fewshot", "test_split": "test", "fewshot_split": "train", "doc_to_text": "Texto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:", "doc_to_target": "{{'Positivo' if label == 'Positive' else ('Negativo' if label == 'Negative' else 'Neutro')}}", "description": "Abaixo contém o texto de tweets de usuários do Twitter em português, sua tarefa é classificar se o sentimento do texto é Positivo, Neutro ou Negativo. Responda apenas com uma das opções.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 25, "metric_list": [ { "metric": "f1_macro", "aggregation": "f1_macro", "higher_is_better": true }, { "metric": "acc", "aggregation": "acc", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "max_gen_toks": 32, "do_sample": false, "temperature": 0.0, "top_k": null, "top_p": null, "until": [ "\n\n" ] }, "repeats": 1, "filter_list": [ { "name": "all", "filter": [ { "function": "find_similar_label", "labels": [ "Positivo", "Neutro", "Negativo" ] }, { "function": "take_first" } ] } ], "should_decontaminate": false, "metadata": { "version": 1.0 } } }, "versions": { "assin2_rte": 1.1, "assin2_sts": 1.1, "bluex": 1.1, "enem_challenge": 1.1, "faquad_nli": 1.1, "hatebr_offensive": 1.0, "oab_exams": 1.5, "portuguese_hate_speech": 1.0, "tweetsentbr": 1.0 }, "n-shot": { "assin2_rte": 15, "assin2_sts": 10, "bluex": 3, "enem_challenge": 3, "faquad_nli": 15, "hatebr_offensive": 25, "oab_exams": 3, "portuguese_hate_speech": 25, "tweetsentbr": 25 }, "model_meta": { "truncated": 0, "non_truncated": 14150, "padded": 0, "non_padded": 14150, "fewshots_truncated": 0, "has_chat_template": true, "chat_type": "user_assistant", "n_gpus": 1, "accelerate_num_process": null, "model_sha": "None", "model_dtype": "torch.bfloat16", "model_memory_footprint": 4889264960, "model_num_parameters": 2444628480, "model_is_loaded_in_4bit": null, "model_is_loaded_in_8bit": null, "model_is_quantized": null, "model_device": "cuda:0", "batch_size": 16, "max_length": 4096, "max_ctx_length": 4064, "max_gen_toks": 32 }, "task_model_meta": { "assin2_rte": { "sample_size": 2448, "truncated": 0, "non_truncated": 2448, "padded": 0, "non_padded": 2448, "fewshots_truncated": 0, "mean_seq_length": 1061.423202614379, "min_seq_length": 1046, "max_seq_length": 1100, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 15.0, "mean_effective_fewshot_size": 15.0 }, "assin2_sts": { "sample_size": 2448, "truncated": 0, "non_truncated": 2448, "padded": 0, "non_padded": 2448, "fewshots_truncated": 0, "mean_seq_length": 747.4232026143791, "min_seq_length": 732, "max_seq_length": 786, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 10.0, "mean_effective_fewshot_size": 10.0 }, "bluex": { "sample_size": 719, "truncated": 0, "non_truncated": 719, "padded": 0, "non_padded": 719, "fewshots_truncated": 0, "mean_seq_length": 1198.817802503477, "min_seq_length": 932, "max_seq_length": 1829, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 3.0, "mean_effective_fewshot_size": 3.0 }, "enem_challenge": { "sample_size": 1429, "truncated": 0, "non_truncated": 1429, "padded": 0, "non_padded": 1429, "fewshots_truncated": 0, "mean_seq_length": 1035.4177746675998, "min_seq_length": 857, "max_seq_length": 2512, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 3.0, "mean_effective_fewshot_size": 3.0 }, "faquad_nli": { "sample_size": 650, "truncated": 0, "non_truncated": 650, "padded": 0, "non_padded": 650, "fewshots_truncated": 0, "mean_seq_length": 1083.1338461538462, "min_seq_length": 1051, "max_seq_length": 1149, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 15.0, "mean_effective_fewshot_size": 15.0 }, "hatebr_offensive": { "sample_size": 1400, "truncated": 0, "non_truncated": 1400, "padded": 0, "non_padded": 1400, "fewshots_truncated": 0, "mean_seq_length": 1090.4407142857142, "min_seq_length": 1075, "max_seq_length": 1284, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 25.0, "mean_effective_fewshot_size": 25.0 }, "oab_exams": { "sample_size": 2195, "truncated": 0, "non_truncated": 2195, "padded": 0, "non_padded": 2195, "fewshots_truncated": 0, "mean_seq_length": 863.024145785877, "min_seq_length": 690, "max_seq_length": 1139, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 3.0, "mean_effective_fewshot_size": 3.0 }, "portuguese_hate_speech": { "sample_size": 851, "truncated": 0, "non_truncated": 851, "padded": 0, "non_padded": 851, "fewshots_truncated": 0, "mean_seq_length": 1442.021151586369, "min_seq_length": 1415, "max_seq_length": 1478, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 25.0, "mean_effective_fewshot_size": 25.0 }, "tweetsentbr": { "sample_size": 2010, "truncated": 0, "non_truncated": 2010, "padded": 0, "non_padded": 2010, "fewshots_truncated": 0, "mean_seq_length": 1370.4194029850746, "min_seq_length": 1353, "max_seq_length": 1427, "max_ctx_length": 4064, "max_gen_toks": 32, "mean_original_fewshots_size": 25.0, "mean_effective_fewshot_size": 25.0 } }, "config": { "model": "huggingface", "model_args": "pretrained=/lustre/mlnvme/data/asen_hpc-mula/checkpoints-llama/slurm_job_17782345/step_42164", "batch_size": "auto", "batch_sizes": [], "device": "cuda:0", "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null }, "git_hash": null }