Spaces:

osunlp
/

Online_Mind2Web_Leaderboard

Running

App Files Files Community

Online_Mind2Web_Leaderboard / content.py

WeijianQi1999

update m2w citation

3621285 3 months ago

raw

history blame

4.33 kB

	# HTML heading: a centered <h1> (id "space-title") carrying the leaderboard title.
	TITLE = """<h1 align="center" id="space-title">🏆 Online Mind2Web Leaderboard</h1>"""
	# HTML snippet: a centered row of project links (Blog, Paper, Code, Data).
	# The separators are plain "|" characters. Do not backslash-escape them:
	# "\|" is not a valid escape in a non-raw string literal, so it triggers an
	# invalid-escape warning and puts a literal backslash on the rendered page.
	# NOTE(review): the "Paper" link still points at the "#" placeholder.
	LINKS = """
	<div align="center">
	<a href="https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4">Blog</a> |
	<a href="#">Paper</a> |
	<a href="https://github.com/OSU-NLP-Group/Online-Mind2Web">Code</a> |
	<a href="https://huggingface.co/datasets/osunlp/Online-Mind2Web">Data</a>
	</div>
	"""

	# Short benchmark description: the task and site counts, and the three
	# difficulty levels defined by the number of human-annotated steps.
	INTRODUCTION_TEXT = """
	Online Mind2Web is a benchmark designed to evaluate the real-world performance of web agents on live websites, featuring 300 tasks across 136 popular sites in diverse domains.
	Based on the number of steps required by human annotators, tasks are divided into three difficulty levels: Easy (1–5 steps), Medium (6–10 steps), and Hard (11+ steps).
	"""

	# Markdown section ("### Leaderboard") explaining that there are two
	# leaderboards (automated and human evaluation) and how submissions are checked.
	LEADERBOARD_TEXT = """
	### Leaderboard
	We maintain two leaderboards—one for automated evaluation and another for human evaluation.
	All submissions will be auto-evaluated internally, and if human evaluation results are provided, a subset will be selected for rigorous spot-check verification.
	"""

	# Markdown section describing what a submission must contain.
	# The JSON field names listed here are spelled exactly as in the result.json
	# example embedded in SUBMIT_INTRODUCTION ("task_id", "task",
	# "action_history"), so the two pieces of guidance agree with each other.
	# NOTE(review): confirm against the evaluation script that the lowercase
	# keys are the ones actually read.
	SUBMISSION_TEXT = """
	## Submissions
	Participants are invited to submit your agent's trajectory to test. The submissions will be evaluated based on our auto-eval.

	### Format of submission
	Submissions must include a sequence of images (i.e., screenshots in the trajectory) and a result.json file for each task. The JSON file should contain the fields: "task_id", "task", and "action_history". You can refer to an example of the submission files.
	"""

	# Caption text inviting the reader to copy the citation snippet below.
	CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
	# Two BibTeX entries: the "An Illusion of Progress?" blog article and the
	# Mind2Web NeurIPS 2023 paper. Declared as a raw string, so any backslashes
	# added to the BibTeX later are kept verbatim instead of being read as
	# Python escape sequences.
	CITATION_BUTTON_TEXT = r"""
	@article{xue2025webagents,
	title = "An Illusion of Progress? Assessing the Current State of Web Agents",
	author = "Xue, Tianci and Qi, Weijian and Shi, Tianneng and Song, Chan Hee and Gou, Boyu and Song, Dawn and Sun, Huan and Su, Yu",
	journal = "OSU NLP Blog",
	year = "2025",
	month = "Mar",
	url = "https://tiancixue.notion.site/An-Illusion-of-Progress-Assessing-the-Current-State-of-Web-Agents-1ac6cd2b9aac80719cd6f68374aaf4b4?pvs=4"
	}

	@inproceedings{NEURIPS2023_5950bf29,
	author = {Deng, Xiang and Gu, Yu and Zheng, Boyuan and Chen, Shijie and Stevens, Sam and Wang, Boshi and Sun, Huan and Su, Yu},
	booktitle = {Advances in Neural Information Processing Systems},
	editor = {A. Oh and T. Naumann and A. Globerson and K. Saenko and M. Hardt and S. Levine},
	pages = {28091--28114},
	publisher = {Curran Associates, Inc.},
	title = {Mind2Web: Towards a Generalist Agent for the Web},
	url = {https://proceedings.neurips.cc/paper_files/paper/2023/file/5950bf290a1570ea401bf98882128160-Paper-Datasets_and_Benchmarks.pdf},
	volume = {36},
	year = {2023}
	}
	"""

	# Markdown instructions for preparing and sending a submission: the
	# per-task folder layout, the result.json schema, and where to send it.
	# The directory tree lives inside a fenced code block, where whitespace is
	# rendered literally, so its nesting must be kept: result.json and
	# trajectory/ sit inside task_id/, and the screenshots inside trajectory/.
	# NOTE(review): "[email protected]" looks like an e-mail obfuscation
	# placeholder rather than a real address; replace it with the actual
	# contact address before publishing.
	SUBMIT_INTRODUCTION = """
	## ⚠ Please submit the trajectory file with the following format:
	Each task is stored in a folder named after its `task_id`, containing:

	- `trajectory/`: Stores screenshots of each step.
	- `result.json`: Task metadata and action history.

	Structure:
	```
	main_directory/
	└── task_id/
	    ├── result.json
	    └── trajectory/
	        ├── 0_screenshot.png
	        ├── 1_screenshot.png
	        └── ...
	```

	`result.json` format:
	```json
	{
	"task_id": 123,
	"task": "abc",
	"action_history": ["abc", "xyz", "..."]
	}
	```
	Please send your agent's name, model family, and organization via email to [email protected], along with the trajectory directory attached.

	We will run the auto-evaluation. If you have conducted your own human evaluation, please also attach your human eval results—we will spot-check these before adding them to the human-eval table.

	"""
	# Markdown level-2 heading introducing additional benchmark statistics;
	# the string ends with a line break after the heading text.
	DATA_DATASET = """## More Statistics for Online Mind2Web Benchmark
	"""


	def format_error(msg: str) -> str:
	    """Return *msg* wrapped in a centered, red, 20px HTML paragraph.

	    Args:
	        msg: Text to display. It is interpolated verbatim; no HTML
	            escaping is applied, so any markup in it is passed through.

	    Returns:
	        A ``<p>`` element string styled as an error message.
	    """
	    return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"

	def format_warning(msg: str) -> str:
	    """Return *msg* wrapped in a centered, orange, 20px HTML paragraph.

	    Args:
	        msg: Text to display. It is interpolated verbatim; no HTML
	            escaping is applied, so any markup in it is passed through.

	    Returns:
	        A ``<p>`` element string styled as a warning message.
	    """
	    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"

	def format_log(msg: str) -> str:
	    """Return *msg* wrapped in a centered, green, 20px HTML paragraph.

	    Args:
	        msg: Text to display. It is interpolated verbatim; no HTML
	            escaping is applied, so any markup in it is passed through.

	    Returns:
	        A ``<p>`` element string styled as an informational message.
	    """
	    return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"

	def model_hyperlink(link: str, model_name: str) -> str:
	    """Return an HTML anchor that shows *model_name* and points at *link*.

	    The anchor opens in a new tab (``target="_blank"``) and is drawn with a
	    dotted underline in the theme's link colour.

	    Args:
	        link: URL placed in the ``href`` attribute, inserted verbatim.
	        model_name: Visible link text, inserted verbatim.

	    Returns:
	        The ``<a>`` element as a string. Neither argument is HTML-escaped.
	    """
	    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'