import streamlit as st

from draw_utils import PAGE_MARKDOWN, PAGE_INFO, LENGTHS
from draw_utils import load_results, style_dataframe

st.set_page_config(layout="wide", page_title="Leaderboard App")
st.markdown(PAGE_MARKDOWN, unsafe_allow_html=True)


def _select_task_rows(df, task, columns, search_term):
    """Return the rows of *df* to display in the tab for *task*.

    Args:
        df: Results frame with a ``task`` column plus every name in *columns*.
        task: Task identifier shown by the tab (``'avg'`` or ``'qaN'``).
        columns: Ordered column names to keep; ``model_name`` comes first.
        search_term: Text typed by the user; an empty string disables
            the model-name filter.

    Returns:
        A new frame restricted to *task* and *columns*, filtered by
        *search_term*, with a fresh 0-based index.
    """
    task_df = df[df.task == task][columns]

    if task == 'avg':
        # Do not display models that lack a value in any of the first five
        # columns (model_name, the two aggregate columns and the first two
        # LENGTHS entries) for the avg task.
        # NOTE(review): presumably these are the evals up to 1k - confirm
        # against the contents of LENGTHS.
        has_missing = task_df[task_df.columns[:5]].isna().any(axis=1)
        task_df = task_df.loc[~has_missing]

    if search_term:
        # regex=False: the term is raw user input, so characters such as "("
        # or "+" must be matched literally rather than raising re.error.
        # na=False: rows without a model name simply do not match.
        matches = task_df['model_name'].str.contains(
            search_term, case=False, regex=False, na=False
        )
        task_df = task_df[matches]

    # Assign instead of inplace=True to avoid mutating a derived slice.
    return task_df.reset_index(drop=True)


def draw_leaderboard():
    """Render the BABILong leaderboard page: one tab of results per task."""
    df = load_results()
    tasks = ['avg'] + [f"qa{i}" for i in range(1, 11)]
    columns = ["model_name", "≤32k", "≤128k"] + LENGTHS

    st.title("🔎📚🪡📚❓ BABILong Leaderboard 🏆")
    st.markdown(PAGE_INFO)
    st.subheader("Evaluation results:")
    st.text('Each tab corresponds to a task, avg - averaged scores over qa1-5 tasks.')
    st.markdown('Predictions of all evaluated models: '
                '[BABILong evals](https://huggingface.co/datasets/RMT-team/babilong_evals)')

    search_term = st.text_input("Search models:", "")

    row_height = 35
    tabs = st.tabs([str(task) for task in tasks])
    for task, tab in zip(tasks, tabs):
        with tab:
            task_df = _select_task_rows(df, task, columns, search_term)

            # One extra row of height for the table header.
            height = (len(task_df) + 1) * row_height

            styled_df = style_dataframe(task_df).format(precision=1)
            st.dataframe(
                styled_df,
                width=1100,
                height=height,
            )


if __name__ == "__main__":
    draw_leaderboard()