import gradio as gr

from utils import submit_gradio_module, load_retrieval_results, load_reranking_results
from fuzzywuzzy import fuzz

HEADER = """

The Arabic RAG Leaderboard

The only leaderboard you will need for your Arabic RAG pipeline 🏆

""" RETRIEVAL_ABOUT_SECTION = """ ## About Retrieval Evaluation The retrieval evaluation assesses a model's ability to find and retrieve relevant information from a large corpus of Arabic text. Models are evaluated on: ### Web Search Dataset Metrics - **MRR (Mean Reciprocal Rank)**: Measures the ranking quality by focusing on the position of the first relevant result - **nDCG (Normalized Discounted Cumulative Gain)**: Evaluates the ranking quality considering all relevant results - **Recall@5**: Measures the proportion of relevant documents found in the top 5 results - **Overall Score**: Combined score calculated as the average of MRR, nDCG, and Recall@5 ### Model Requirements - Must support Arabic text embeddings - Should handle queries of at least 512 tokens - Must work with `sentence-transformers` library ### Evaluation Process 1. Models process Arabic web search queries 2. Retrieved documents are evaluated using: - MRR for first relevant result positioning - nDCG for overall ranking quality - Recall@5 for top results accuracy 3. Metrics are averaged to calculate the overall score 4. Models are ranked based on their overall performance ### How to Prepare Your Model - Ensure your model is publicly available on HuggingFace Hub (We don't support private model evaluations yet) - Model should output fixed-dimension embeddings for text - Support batch processing for efficient evaluation (this is default if you use `sentence-transformers`) """ RERANKER_ABOUT_SECTION = """ ## About Reranking Evaluation The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance. ### Evaluation Metrics - **MRR@10 (Mean Reciprocal Rank at 10)**: Measures the ranking quality focusing on the first relevant result in top-10 - **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in top-10 - **MAP (Mean Average Precision)**: Measures the overall precision across all relevant documents All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance. ### Model Requirements - Must accept query-document pairs as input - Should output relevance scores for reranking (has cross-attention or similar mechanism for query-document matching) - Support for Arabic text processing ### Evaluation Process 1. Models are tested on multiple unseen Arabic datasets 2. For each dataset: - Initial candidate documents are provided - Model reranks the candidates - MRR@10, NDCG@10, and MAP are calculated 3. Final scores are averaged across all datasets 4. Models are ranked based on overall performance ### How to Prepare Your Model - Model should be public on HuggingFace Hub (private models are not supported yet) - Make sure it works coherently with `sentence-transformers` library """ CITATION_BUTTON_LABEL = """ Copy the following snippet to cite these results """ CITATION_BUTTON_TEXT = """ @misc{TARL, author = {Mohaned A. 
"""

RERANKER_ABOUT_SECTION = """
## About Reranking Evaluation

The reranking evaluation assesses a model's ability to improve search quality by reordering initially retrieved results. Models are evaluated across multiple unseen Arabic datasets to ensure robust performance.

### Evaluation Metrics
- **MRR@10 (Mean Reciprocal Rank at 10)**: Measures ranking quality, focusing on the first relevant result in the top 10
- **NDCG@10 (Normalized DCG at 10)**: Evaluates the ranking quality of all relevant results in the top 10
- **MAP (Mean Average Precision)**: Measures overall precision across all relevant documents

All metrics are averaged across multiple evaluation datasets to provide a comprehensive assessment of model performance.

### Model Requirements
- Must accept query-document pairs as input
- Should output relevance scores for reranking (i.e. has cross-attention or a similar mechanism for query-document matching)
- Must support Arabic text processing

### Evaluation Process
1. Models are tested on multiple unseen Arabic datasets
2. For each dataset:
   - Initial candidate documents are provided
   - The model reranks the candidates
   - MRR@10, NDCG@10, and MAP are calculated
3. Final scores are averaged across all datasets
4. Models are ranked based on overall performance

### How to Prepare Your Model
- Model should be public on the HuggingFace Hub (private models are not supported yet)
- Make sure it works correctly with the `sentence-transformers` library (see the sanity check below)
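
A minimal local check, assuming your reranker exposes a cross-encoder interface through `sentence-transformers` (the model id is a placeholder and this is not the evaluation pipeline itself):

```python
from sentence_transformers import CrossEncoder

# Placeholder id; replace with your own public reranker on the HuggingFace Hub
model = CrossEncoder("your-username/your-arabic-reranker")

query = "ما هي عاصمة المملكة العربية السعودية؟"
candidates = [
    "الرياض هي عاصمة المملكة العربية السعودية.",
    "القاهرة هي عاصمة جمهورية مصر العربية.",
]

# The model scores each (query, document) pair; higher means more relevant
scores = model.predict([(query, doc) for doc in candidates])
ranked = sorted(zip(candidates, scores), key=lambda pair: pair[1], reverse=True)
print(ranked)
```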
"""

CITATION_BUTTON_LABEL = """Copy the following snippet to cite these results"""

CITATION_BUTTON_TEXT = """
@misc{TARL,
  author = {Mohaned A. Rashad and Hamza Shahid},
  title = {The Arabic RAG Leaderboard},
  year = {2025},
  publisher = {Navid-AI},
  howpublished = "\\url{https://huggingface.co/spaces/Navid-AI/The-Arabic-Rag-Leaderboard}"
}
"""

retrieval_df = None
reranking_df = None


def search_leaderboard(df, model_name, columns_to_show, threshold=95):
    """Fuzzy-filter a leaderboard dataframe by model name and keep the selected columns."""
    if not model_name.strip():
        return df.loc[:, columns_to_show]

    search_name = model_name.lower()  # compute once for efficiency

    def calculate_similarity(row):
        return fuzz.partial_ratio(search_name, row["Model"].lower())

    filtered_df = df.copy()
    filtered_df["similarity"] = filtered_df.apply(calculate_similarity, axis=1)
    filtered_df = filtered_df[filtered_df["similarity"] >= threshold].sort_values("similarity", ascending=False)
    filtered_df = filtered_df.drop("similarity", axis=1).loc[:, columns_to_show]
    return filtered_df


def retrieval_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(retrieval_df, model_name, columns_to_show)


def reranking_search_leaderboard(model_name, columns_to_show):
    return search_leaderboard(reranking_df, model_name, columns_to_show)


def main():
    global retrieval_df, reranking_df

    # Prepare the retrieval dataframe
    retrieval_df = load_retrieval_results(True, "Web Search Dataset (Overall Score)", ["Revision", "Precision", "Task"])
    retrieval_columns_to_show = [
        "Model",
        "Web Search Dataset (Overall Score)",
        "Model Parameters (in Millions)",
        "Embedding Dimension",
        "Max Tokens",
        "Num Likes",
    ]
    retrieval_cols = retrieval_df.columns.tolist()  # cache columns

    # Prepare the reranking dataframe
    reranking_df = load_reranking_results(True, sort_col="Overall Score")
    reranking_columns_to_show = [
        "Model",
        "Overall Score",
        "Model Parameters (in Millions)",
        "Num Downloads",
        "MRR@10",
        "NDCG@10",
        "MAP",
    ]
    reranking_cols = reranking_df.columns.tolist()  # cache columns

    with gr.Blocks() as demo:
        gr.HTML(HEADER)
        with gr.Tabs():
            with gr.Tab("🕵️‍♂️ Retrieval"):
                with gr.Tabs():
                    with gr.Tab("👑 Leaderboard"):
                        with gr.Row():
                            search_box_retrieval = gr.Textbox(
                                placeholder="Search for models...",
                                label="Search",
                                scale=5,
                            )
                            retrieval_columns_to_show_input = gr.CheckboxGroup(
                                label="Columns to Show",
                                choices=retrieval_cols,  # use cached column list
                                value=retrieval_columns_to_show,
                                scale=4,
                            )
                        retrieval_leaderboard = gr.Dataframe(
                            value=retrieval_df[retrieval_columns_to_show],
                            datatype="markdown",
                            wrap=True,
                            show_fullscreen_button=True,
                            interactive=False,
                        )

                        # Wire the search box and the column selector to the leaderboard
                        search_box_retrieval.input(
                            retrieval_search_leaderboard,
                            inputs=[search_box_retrieval, retrieval_columns_to_show_input],
                            outputs=retrieval_leaderboard,
                        )
                        retrieval_columns_to_show_input.select(
                            lambda columns: retrieval_df.loc[:, columns],
                            inputs=retrieval_columns_to_show_input,
                            outputs=retrieval_leaderboard,
                        )

                    with gr.Tab("🏵️ Submit Retriever"):
                        submit_gradio_module("Retriever")

                    with gr.Tab("ℹ️ About"):
                        gr.Markdown(RETRIEVAL_ABOUT_SECTION)

            with gr.Tab("📊 Reranking"):
                with gr.Tabs():
                    with gr.Tab("👑 Leaderboard"):
                        with gr.Row():
                            search_box_reranker = gr.Textbox(
                                placeholder="Search for models...",
                                label="Search",
                                scale=5,
                            )
                            reranking_columns_to_show_input = gr.CheckboxGroup(
                                label="Columns to Show",
                                choices=reranking_cols,  # use cached column list
                                value=reranking_columns_to_show,
                                scale=4,
                            )
                        reranker_leaderboard = gr.Dataframe(
                            value=reranking_df[reranking_columns_to_show],
                            datatype="markdown",
                            wrap=False,
                            show_fullscreen_button=True,
                            interactive=False,
                        )

                        # Wire the search box and the column selector to the leaderboard
                        search_box_reranker.input(
                            reranking_search_leaderboard,
                            inputs=[search_box_reranker, reranking_columns_to_show_input],
                            outputs=reranker_leaderboard,
                        )
                        reranking_columns_to_show_input.select(
                            lambda columns: reranking_df.loc[:, columns],
                            inputs=reranking_columns_to_show_input,
                            outputs=reranker_leaderboard,
                        )

                    with gr.Tab("🏵️ Submit Reranker"):
                        submit_gradio_module("Reranker")

                    with gr.Tab("ℹ️ About"):
                        gr.Markdown(RERANKER_ABOUT_SECTION)

        with gr.Row():
            with gr.Accordion("📙 Citation", open=False):
                gr.Textbox(
                    value=CITATION_BUTTON_TEXT,
                    label=CITATION_BUTTON_LABEL,
                    lines=20,
                    elem_id="citation-button",
                    show_copy_button=True,
                )

    demo.launch()


if __name__ == "__main__":
    main()