import gradio as gr
from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
from fuzzywuzzy import fuzz
HEADER = """
<h1>The Arabic RAG Leaderboard</h1>
<p>The only leaderboard you will need for Arabic RAG 🚀</p>
"""
RETRIEVAL_ABOUT_SECTION = """
## About Retrieval Evaluation
The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on:
### Web Search Dataset Metrics
- **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result
- **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results
- **Recall@5**: Measures the proportion of relevant documents found in the top 5 results
- **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5 (a simplified computation sketch is shown below)
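
Below is a simplified, illustrative sketch of how these metrics can be computed for a single query, given a ranked list of retrieved document ids and the set of relevant ids. The official evaluation harness may differ in details such as cutoffs and tie handling.

```python
import math

def mrr(ranked_ids, relevant_ids):
    # Reciprocal rank of the first relevant document (0 if none is retrieved)
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_ids:
            return 1.0 / rank
    return 0.0

def ndcg(ranked_ids, relevant_ids):
    # Binary-relevance nDCG: DCG of the ranking divided by the ideal DCG
    dcg = sum(1.0 / math.log2(rank + 1) for rank, doc_id in enumerate(ranked_ids, start=1) if doc_id in relevant_ids)
    ideal = sum(1.0 / math.log2(rank + 1) for rank in range(1, min(len(relevant_ids), len(ranked_ids)) + 1))
    return dcg / ideal if ideal > 0 else 0.0

def recall_at_k(ranked_ids, relevant_ids, k=5):
    # Fraction of all relevant documents that appear in the top-k results
    return len(set(ranked_ids[:k]) & set(relevant_ids)) / len(relevant_ids)

ranked = ["d3", "d7", "d1", "d9", "d2"]  # hypothetical system ranking
relevant = {"d1", "d2"}                  # hypothetical gold labels
overall = (mrr(ranked, relevant) + ndcg(ranked, relevant) + recall_at_k(ranked, relevant)) / 3
print(round(overall, 4))
```
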
### Model Requirements
- Must support Arabic text embeddings
- Should handle queries of at least 512 tokens
- Must work with the `sentence-transformers` library (a quick check is shown below)
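
As a quick sanity check against these requirements, you can load the model with `sentence-transformers` and inspect it (the model id below is a placeholder):

```python
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("your-org/your-arabic-embedding-model")  # placeholder model id
print(model.max_seq_length)                      # should be at least 512
print(model.get_sentence_embedding_dimension())  # fixed embedding dimension
print(model.encode("ما هي عاصمة مصر؟").shape)     # a single Arabic query encodes to one vector
```
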
### Evaluation Process
1. Models process Arabic web search queries
2. Retrieved documents are evaluated using:
- MRR for first relevant result positioning
- nDCG for overall ranking quality
- Recall@5 for top results accuracy
3. Metrics are averaged to calculate the overall score
4. Models are ranked based on their overall performance (a conceptual sketch of this process is shown below)
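
Conceptually, the retrieval step looks like the sketch below (the model id, corpus, and query are placeholders; the actual datasets and harness are managed by the leaderboard backend):

```python
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("your-org/your-arabic-embedding-model")  # placeholder model id
corpus = ["القاهرة هي عاصمة جمهورية مصر العربية.", "تقع جبال الألب في أوروبا."]  # placeholder documents
queries = ["ما هي عاصمة مصر؟"]  # placeholder web search query

corpus_embeddings = model.encode(corpus, convert_to_tensor=True)
query_embeddings = model.encode(queries, convert_to_tensor=True)

# Retrieve the top-5 documents per query by cosine similarity;
# the returned rankings are then scored with MRR, nDCG, and Recall@5
hits = util.semantic_search(query_embeddings, corpus_embeddings, top_k=5)
print(hits[0])  # list of {"corpus_id": ..., "score": ...} for the first query
```
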
### How to Prepare Your Model
- Ensure your model is publicly available on the HuggingFace Hub (we don't support private model evaluations yet; a reachability check is shown after this list)
- Model should output fixed-dimension embeddings for text
- Support batch processing for efficient evaluation (this is the default if you use `sentence-transformers`)
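
For example, you can confirm the repository is publicly reachable before submitting (the repo id is a placeholder):

```python
from huggingface_hub import model_info

info = model_info("your-org/your-arabic-embedding-model")  # raises an error if the repo is private or missing
print(info.id, info.likes)
```
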
"""
RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation
The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.
### Evaluation Metrics
- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10
- **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10
- **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents
All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.
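
MRR@10 and NDCG@10 are the standard MRR and nDCG restricted to the top-10 reranked results. MAP can be sketched for a single query with binary relevance labels as follows; the reported MAP averages this value over all queries, and the official harness may differ in details:

```python
def average_precision(ranked_ids, relevant_ids):
    # Average of precision@k over the ranks k at which a relevant document appears
    hits, precisions = 0, []
    for rank, doc_id in enumerate(ranked_ids, start=1):
        if doc_id in relevant_ids:
            hits += 1
            precisions.append(hits / rank)
    return sum(precisions) / len(relevant_ids) if relevant_ids else 0.0

print(average_precision(["d2", "d5", "d1"], {"d1", "d2"}))  # (1/1 + 2/3) / 2 ≈ 0.833
```
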
### Model Requirements
- Must accept query-document pairs as input
- Should output relevance scores for reranking, i.e. use cross-attention or a similar mechanism for query-document matching (see the example after this list)
- Support for Arabic text processing
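
In practice this usually means the model can be loaded as a cross-encoder that scores (query, document) pairs, for example with `sentence-transformers` (the model id is a placeholder):

```python
from sentence_transformers import CrossEncoder

reranker = CrossEncoder("your-org/your-arabic-reranker")  # placeholder model id
pairs = [
    ("ما هي عاصمة مصر؟", "القاهرة هي عاصمة جمهورية مصر العربية."),
    ("ما هي عاصمة مصر؟", "تقع جبال الألب في أوروبا."),
]
scores = reranker.predict(pairs)  # one relevance score per (query, document) pair
print(scores)  # the first pair should receive the higher score
```
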
### Evaluation Process
1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
- Initial candidate documents are provided
- Model reranks the candidates
- MRR@10, NDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance (a schematic per-dataset sketch is shown below)
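
Schematically, the per-dataset loop can be pictured as below; the dataset layout and the use of MRR@10 alone are simplifications, not the leaderboard's actual harness:

```python
def evaluate_reranker(reranker, datasets):
    # `datasets` is a hypothetical list of datasets, each a list of (query, candidates, relevant) examples
    dataset_scores = []
    for examples in datasets:
        per_query = []
        for query, candidates, relevant in examples:
            scores = reranker.predict([(query, doc) for doc in candidates])
            reranked = [doc for _, doc in sorted(zip(scores, candidates), key=lambda x: x[0], reverse=True)]
            top10 = reranked[:10]
            # MRR@10 for this query (NDCG@10 and MAP are computed analogously)
            per_query.append(next((1.0 / rank for rank, doc in enumerate(top10, start=1) if doc in relevant), 0.0))
        dataset_scores.append(sum(per_query) / len(per_query))
    # Final score: average across all evaluation datasets
    return sum(dataset_scores) / len(dataset_scores)
```
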
### How to Prepare Your Model
- Model should be public on HuggingFace Hub (private models are not supported yet)
- Make sure it works correctly with the `sentence-transformers` library
"""
CITATION_BUTTON_LABEL = """
Copy the following snippet to cite these results
"""
CITATION_BUTTON_TEXT = r"""
@misc{TARL,
  author = {Mohaned A. Rashad and Hamza Shahid},
  title = {The Arabic RAG Leaderboard},
  year = {2025},
  publisher = {Navid-AI},
  howpublished = {\url{https://huggingface.co/spaces/Navid-AI/The-Arabic-Rag-Leaderboard}}
}
"""
retrieval_df = None
reranking_df = None
def search_leaderboard(df, model_name, columns_to_show, threshold=95):
    """Fuzzy-match `model_name` against the Model column and return the matching rows with the selected columns."""
    if not model_name.strip():
        return df.loc[:, columns_to_show]
    search_name = model_name.lower()  # compute once for efficiency

    def calculate_similarity(row):
        return fuzz.partial_ratio(search_name, row["Model"].lower())

    filtered_df = df.copy()
    filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
    filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values("similarity", ascending=False)
    filtered_df = filtered_df.drop("similarity", axis=1).loc[:, columns_to_show]
    return filtered_df
def retrieval_search_leaderboard(model_name, columns_to_show):
return search_leaderboard(retrieval_df, model_name, columns_to_show)
def reranking_search_leaderboard(model_name, columns_to_show):
return search_leaderboard(reranking_df, model_name, columns_to_show)
def main():
global retrieval_df, reranking_df
# Prepare retrieval dataframe
retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
retrieval_columns_to_show = ["Model", "Web Search Dataset (Overall Score)", "Model Parameters (in Millions)", "Embedding Dimension", "Max Tokens", "Num Likes"]
retrieval_cols = retrieval_df.columns.tolist() # cache columns
# Prepare reranking dataframe
reranking_df = load_reranking_results(True, sort_col="Overall Score")
reranking_columns_to_show = ["Model", "Overall Score", "Model Parameters (in Millions)", "Num Downloads", "MRR@10", "NDCG@10", "MAP"]
reranking_cols = reranking_df.columns.tolist() # cache columns
with gr.Blocks() as demo:
gr.HTML(HEADER)
with gr.Tabs():
            with gr.Tab("🕵️‍♂️ Retrieval"):
with gr.Tabs():
                    with gr.Tab("🏆 Leaderboard"):
with gr.Row():
search_box_retrieval = gr.Textbox(
placeholder="Search for models...",
label="Search",
scale=5
)
retrieval_columns_to_show_input = gr.CheckboxGroup(
label="Columns to Show",
choices=retrieval_cols, # use cached list
value=retrieval_columns_to_show,
scale=4
)
retrieval_leaderboard = gr.Dataframe(
value=retrieval_df[retrieval_columns_to_show],
datatype="markdown",
wrap=True,
show_fullscreen_button=True,
interactive=False
)
                        # Wire the search box and column selector to update the leaderboard
search_box_retrieval.input(
retrieval_search_leaderboard,
inputs=[search_box_retrieval, retrieval_columns_to_show_input],
outputs=retrieval_leaderboard
)
retrieval_columns_to_show_input.select(
lambda columns: retrieval_df.loc[:, columns],
inputs=retrieval_columns_to_show_input,
outputs=retrieval_leaderboard
)
                    with gr.Tab("🕵️ Submit Retriever"):
submit_gradio_module("Retriever")
                    with gr.Tab("ℹ️ About"):
gr.Markdown(RETRIEVAL_ABOUT_SECTION)
            with gr.Tab("🔄 Reranking"):
with gr.Tabs():
                    with gr.Tab("🏆 Leaderboard"):
with gr.Row():
search_box_reranker = gr.Textbox(
placeholder="Search for models...",
label="Search",
scale=5
)
reranking_columns_to_show_input = gr.CheckboxGroup(
label="Columns to Show",
choices=reranking_cols, # use cached list
value=reranking_columns_to_show,
scale=4
)
reranker_leaderboard = gr.Dataframe(
value=reranking_df[reranking_columns_to_show],
datatype="markdown",
wrap=False,
show_fullscreen_button=True,
interactive=False,
)
                        # Wire the search box and column selector to update the leaderboard
search_box_reranker.input(
reranking_search_leaderboard,
inputs=[search_box_reranker, reranking_columns_to_show_input],
outputs=reranker_leaderboard
)
reranking_columns_to_show_input.select(
lambda columns: reranking_df.loc[:, columns],
inputs=reranking_columns_to_show_input,
outputs=reranker_leaderboard
)
                    with gr.Tab("🕵️ Submit Reranker"):
submit_gradio_module("Reranker")
                    with gr.Tab("ℹ️ About"):
gr.Markdown(RERANKER_ABOUT_SECTION)
with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
demo.launch()
if __name__ == "__main__":
main()