"""Gradio leaderboard: filter, transform and aggregate per-model scores."""

from typing import Literal
from functools import partial

# NOTE(review): wildcard import kept as-is. The names this module uses from
# `content` are MARKDOWN_HEADER, MARKDOWN_MAP_FUNCTION,
# MARKDOWN_REDUCE_FUNCTION and CITATION.
from content import *

import gradio as gr
import numpy as np
import pandas as pd

# This dataframe must contain the following columns:
# - model: the name of the model
# - language: the language of the model
# - dataset: the dataset used to evaluate the model
# - score: the score of the model on the (language, dataset) pair
# - model_type: the type of the model (e.g. "Chat Model", "Base Model")
df = pd.read_csv("data/raw_scores.csv")

choices_language = list(df["language"].unique())
choices_dataset = list(df["dataset"].unique())
choices_model_type = list(df["model_type"].unique())

# Utility functions for data processing

# Aggregations applied to a group of scores to collapse it into one value.
reduce_functions = {
    "Mean": lambda x: np.mean(x),
    "Median": lambda x: np.median(x),
    "Max": lambda x: np.max(x),
    "Min": lambda x: np.min(x),
}

# Transformations applied to the scores of each (language, dataset) group
# before any aggregation.
map_functions = {
    "Raw": lambda x: x,
    # Dense rank where the highest score gets rank 1.
    "Rank": partial(pd.Series.rank, ascending=False, method="dense"),
    # Min-max scaling. NOTE: yields NaN when every score in the group is
    # identical, because the range in the denominator is zero.
    "Normalize": lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)),
}

# Sort direction of the final table per map function: ranks are "lower is
# better", raw and normalized scores are "higher is better".
score_ascending = {
    "Raw": False,
    "Rank": True,
    "Normalize": False,
}


def prepare_dataframe(
    df: pd.DataFrame,
    filters: dict[str, list[str]],
    group_by: Literal["language", "dataset"],
    map_function: str,
    reduce_function: str,
) -> "pd.DataFrame | pd.io.formats.style.Styler":
    """Build the leaderboard table for the current selection.

    Args:
        df: Raw scores with at least the columns ``model``, ``language``,
            ``dataset`` and ``score``. Any further column is carried through
            as a per-model attribute.
        filters: Allowed values per column name. Must contain the keys
            ``"language"`` and ``"dataset"``; every key is applied as an
            ``isin`` filter on the column of the same name.
        group_by: Which of ``language`` / ``dataset`` becomes the set of
            score columns in the output.
        map_function: Key of ``map_functions``; applied to the scores of each
            (language, dataset) group.
        reduce_function: Key of ``reduce_functions``; used both to collapse
            the non-``group_by`` axis and to compute ``Overall Score``.

    Returns:
        A styled table sorted by ``Overall Score`` with the best value of
        each score column highlighted, or an empty ``DataFrame`` when no
        model survives the filters and the completeness check.
    """
    # Filters contains a value subset for each column
    language = filters["language"]
    dataset = filters["dataset"]

    # Columns are divided into two groups: other_columns and
    # group_by_columns, apart from `model` which should be the index.
    # Built from df.columns (not a set) so the column order is deterministic.
    score_axes = {"language", "dataset", "score"}
    other_columns = [column for column in df.columns if column not in score_axes]
    group_by_columns = list(filters[group_by])

    # Step 1: Filter the dataframe based on the selected values
    for column, allowed_values in filters.items():
        df = df[df[column].isin(allowed_values)]

    # If dataframe is empty, return an empty dataframe
    if df.empty:
        gr.Warning(
            "No scores remain after the filter application. Please verify the checkboxes."
        )
        return pd.DataFrame(columns=other_columns)

    # Sanity check: a score must exist for each selected (language, dataset)
    # pair; models with fewer distinct pairs are hidden.
    score_count = (
        df.drop_duplicates(subset=["model", "language", "dataset"])
        .groupby(["model"])["score"]
        .count()
    )
    expected_count = len(language) * len(dataset)
    invalid_models = score_count[score_count < expected_count].index.tolist()
    df = df[~df["model"].isin(invalid_models)]

    # Send a warning message if there are any invalid models
    for model in invalid_models:
        gr.Warning(
            f"{model} is lacking some scores thus hidden. Please report to the maintainers."
        )

    # Every model may have been hidden above; without this guard the column
    # selection in step 3 fails because the pivot produces no score columns.
    if df.empty:
        return pd.DataFrame(columns=other_columns)

    # Step 2: Process Scores
    # Step 2.0: Map the scores along each (language, dataset) pair.
    # `assign` returns a new frame, so the filtered slice is never written to.
    df = df.assign(
        score=df.groupby(["language", "dataset"])["score"].transform(
            map_functions[map_function]
        )
    )

    # Step 2.1: Reduce the scores along the column other than `group_by`
    reducer = reduce_functions[reduce_function]
    df = df.groupby(other_columns + [group_by]).agg({"score": reducer}).reset_index()

    # Step 2.2: Reduce the scores along `group_by` to get the overall score
    # of each model
    reduced_col = df.groupby(other_columns).agg({"score": reducer})["score"]

    # Step 2.3: Pivot the dataframe, then concat the overall score
    df = df.pivot(index=other_columns, columns=group_by, values=["score"]).droplevel(
        0, 1
    )
    df["Overall Score"] = reduced_col

    # Step 3: Styling for display
    # - Sort the dataframe by the reduced score
    # - Sort the columns for better readability
    # - Highlight the best value in each score column
    # - Format the score to 2 decimal places if it is a float
    attribute_columns = [column for column in other_columns if column != "model"]
    score_columns = ["Overall Score"] + group_by_columns
    ascending = score_ascending[map_function]

    styled = (
        df.reset_index()[["model"] + attribute_columns + score_columns]
        .sort_values(by="Overall Score", ascending=ascending)
        .style.format(precision=2)
    )

    # When lower is better (ranks) the minimum is the winner, else the maximum.
    highlight = styled.highlight_min if ascending else styled.highlight_max
    return highlight(axis=0, color="#18864B", subset=score_columns)


def update_dataframe(
    model_type: list[str],
    language: list[str],
    dataset: list[str],
    group_by: str,
    map_function: str,
    reduce_function: str,
):
    """Recompute the leaderboard from the current widget values.

    The parameter order matches the ``inputs`` list of the event listener
    registered below.
    """
    return prepare_dataframe(
        df=df,
        filters={
            "language": language,
            "dataset": dataset,
            "model_type": model_type,
        },
        group_by=group_by,
        map_function=map_function,
        reduce_function=reduce_function,
    )


with gr.Blocks(theme=gr.themes.Base()) as demo:
    # UI definition
    # NOTE(review): the nesting of Row/Column below is reconstructed; the
    # original indentation was not available. Confirm the intended layout.
    with gr.Row():
        with gr.Column():
            gr.Markdown(MARKDOWN_HEADER)
            checkbox_language = gr.CheckboxGroup(
                choices=choices_language,
                value=choices_language,
                label="Language(s)",
                interactive=True,
            )
            checkbox_dataset = gr.CheckboxGroup(
                choices=choices_dataset,
                value=choices_dataset,
                label="Dataset(s)",
                interactive=True,
            )
            checkbox_model_type = gr.CheckboxGroup(
                choices=choices_model_type,
                value=choices_model_type,
                label="Model Type(s)",
                interactive=True,
            )
            dropdown_map_function = gr.Dropdown(
                choices=list(map_functions),
                value="Raw",
                label="Map Function",
                interactive=True,
                info=MARKDOWN_MAP_FUNCTION,
            )
            dropdown_reduce_function = gr.Dropdown(
                choices=list(reduce_functions),
                value="Mean",
                label="Reduce Function",
                interactive=True,
                info=MARKDOWN_REDUCE_FUNCTION,
            )
            ratio_group_by = gr.Radio(
                choices=["language", "dataset"],
                value="language",
                label="Group by",
                interactive=True,
            )
            # Initial table: every language and dataset selected, no
            # model_type filter applied.
            dataframe = gr.DataFrame(
                prepare_dataframe(
                    df=df,
                    filters={
                        "language": choices_language,
                        "dataset": choices_dataset,
                    },
                    group_by="language",
                    map_function="Raw",
                    reduce_function="Mean",
                ),
                interactive=False,
            )
            gr.Code(
                language="markdown",
                label="Citation",
                value=CITATION,
            )

    # Event listeners
    gr.on(
        triggers=[
            checkbox_model_type.change,
            checkbox_language.change,
            checkbox_dataset.change,
            ratio_group_by.change,
            dropdown_reduce_function.change,
            dropdown_map_function.change,
        ],
        fn=update_dataframe,
        inputs=[
            checkbox_model_type,
            checkbox_language,
            checkbox_dataset,
            ratio_group_by,
            dropdown_map_function,
            dropdown_reduce_function,
        ],
        outputs=[dataframe],
    )

if __name__ == "__main__":
    demo.launch()