from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    task0 = Task("Math", "acc", "Math")
    task1 = Task("Chemistry", "acc", "Chemistry")
    task2 = Task("Physics", "acc", "Physics")
    task3 = Task("Arabic", "acc", "Arabic")
    task4 = Task("English", "acc", "English")
    task5 = Task("Religion", "acc", "Religion")
    task6 = Task("Persian Literature", "acc", "Persian Literature")


NUM_FEWSHOT = 0  # Change to match your few-shot setting
# ---------------------------------------------------
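
# Illustrative sketch (not part of the original template): how a Tasks entry is
# typically consumed. `col_name` labels a leaderboard column, while `benchmark`
# and `metric` index into a results JSON; the shape of `results` below is an
# assumption about the evaluation output.
COLS = [task.value.col_name for task in Tasks]


def get_task_score(results: dict, task: Task) -> float:
    """Look up one Task's score, e.g. results["Math"]["acc"]."""
    return results[task.benchmark][task.metric]


# Usage: get_task_score({"Math": {"acc": 0.42}}, Tasks.task0.value) == 0.42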
""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ Intro text """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## How it works ## Reproducibility To reproduce our results, here is the commands you can run: """ # Your leaderboard name TITLE = """

IRUEX Leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ **Welcome to the IRUEX Leaderboard!** This platform evaluates large language models based on Iran's University Entrance Exam subjects. [Explore the IRUEX Dataset](https://github.com/hamedkhaledi/IRUEX-dataset) on GitHub. """ # Which evaluations are you running? How can people reproduce them? LLM_BENCHMARKS_TEXT = """ ## Evaluation Process We assess models across various subjects, including Math, Chemistry, Physics, Arabic, English, Religion, and Persian Literature. Each model's performance is measured using accuracy metrics specific to each subject. ## Reproducibility To reproduce our results, execute the following commands: ```bash # Example command to run evaluations python evaluate_model.py --model_name your_model_name --task Math --num_fewshot 0 ``` """ EVALUATION_QUEUE_TEXT = "" CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r""" @inproceedings{khademi-khaledi-faili-2025-iruex, title = "{IRUEX}: A Study on Large Language Models' Problem-Solving Skills in Iran's University Entrance Exam", author = "Khademi Khaledi, Hamed and Faili, Heshaam", editor = "Rambow, Owen and Wanner, Leo and Apidianaki, Marianna and Al-Khalifa, Hend and Di Eugenio, Barbara and Schockaert, Steven", booktitle = "Proceedings of the 31st International Conference on Computational Linguistics", month = jan, year = "2025", address = "Abu Dhabi, UAE", publisher = "Association for Computational Linguistics", url = "https://aclanthology.org/2025.coling-main.434/", pages = "6505--6519", } """