Spaces:
Sleeping
Sleeping
import json | |
from dataclasses import dataclass, field, fields | |
from functools import cached_property | |
from pathlib import Path | |
from typing import Literal | |
from functools import partial | |
from content import * | |
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
import plotly.graph_objects as go | |
import pandas as pd | |
# The leaderboard is driven by one CSV of raw scores, loaded at import time.
# This dataframe must contain the following columns:
# - model: the name of the model
# - language: the language of the model
# - dataset: the dataset used to evaluate the model
# - score: the score of the model on the (language, dataset) pair
# - model_type: the type of the model (e.g. "Chat Model", "Base Model")
df = pd.read_csv("data/raw_scores.csv")

# Distinct values of each filterable column, in order of first appearance.
choices_language, choices_dataset, choices_model_type = (
    list(df[column].unique()) for column in ("language", "dataset", "model_type")
)
# Utility functions for data processing


def _normalize(scores):
    """Min-max scale ``scores`` so the smallest maps to 0 and the largest to 1.

    When every score is identical (including a group with a single score) the
    spread is zero; dividing by it would turn the whole group into NaN. In
    that case the spread is treated as 1, so the group maps to 0.0 instead.
    """
    lowest = np.min(scores)
    spread = np.max(scores) - lowest
    if spread == 0:
        spread = 1
    return (scores - lowest) / spread


# Aggregations that collapse a collection of scores into one number.
reduce_functions = {
    "Mean": lambda x: np.mean(x),
    "Median": lambda x: np.median(x),
    "Max": lambda x: np.max(x),
    "Min": lambda x: np.min(x),
}

# Element-wise transformations applied to a series of scores.
# "Rank" gives rank 1 to the highest score (dense ranking, no gaps on ties).
map_functions = {
    "Raw": lambda x: x,
    "Rank": partial(pd.Series.rank, ascending=False, method="dense"),
    "Normalize": _normalize,
}

# Sort direction to use for each map function: ranks are "smaller is better",
# raw and normalized scores are "larger is better".
score_ascending = {
    "Raw": False,
    "Rank": True,
    "Normalize": False,
}
def prepare_dataframe(
    df: pd.DataFrame,
    filters: dict[str, list[str]],
    group_by: Literal["language", "dataset"],
    map_function: str,
    reduce_function: str,
) -> pd.DataFrame:
    """Filter, aggregate and style the raw scores for display.

    Args:
        df: Long-format scores with at least the columns ``model``,
            ``language``, ``dataset`` and ``score``. Every other column is
            treated as a per-model attribute and kept in the output.
        filters: Maps a column name to the values to keep. Must contain the
            keys ``"language"`` and ``"dataset"``; further keys (such as
            ``"model_type"``) are applied as additional filters.
        group_by: The axis that becomes one output column per selected value.
        map_function: Key of ``map_functions``; applied to the scores of each
            (language, dataset) pair before aggregation.
        reduce_function: Key of ``reduce_functions``; used both to collapse
            the axis that is not ``group_by`` and to compute "Overall Score".

    Returns:
        A pandas ``Styler`` (despite the annotation) holding one row per
        model, sorted by "Overall Score" with the best value of each score
        column highlighted. If no rows survive filtering, an empty
        ``DataFrame`` with the non-score columns is returned instead.
    """
    # Filters contains a value subset for each column
    language = filters["language"]
    dataset = filters["dataset"]

    # Every column outside the (language, dataset, score) triple describes the
    # model itself. Walking `df.columns` instead of taking a set difference
    # keeps the column order identical from one run to the next.
    other_columns = [
        column
        for column in df.columns
        if column not in ("language", "dataset", "score")
    ]
    group_by_columns = list(filters[group_by])

    # Step 1: Filter the dataframe based on the selected values
    for column, allowed_values in filters.items():
        df = df[df[column].isin(allowed_values)]

    # If dataframe is empty, return an empty dataframe
    if df.empty:
        gr.Warning(
            "No scores remain after the filter application. Please verify the checkboxes."
        )
        return pd.DataFrame(columns=other_columns)

    # Sanity check: a model must have a score for every selected
    # (language, dataset) pair, otherwise its aggregates are not comparable.
    score_count = (
        df.drop_duplicates(subset=["model", "language", "dataset"])
        .groupby(["model"])["score"]
        .count()
    )
    invalid_models = score_count[
        score_count < len(language) * len(dataset)
    ].index.tolist()
    # Copy so that the column assignment in step 2.0 works on an independent
    # frame and never on a slice of the caller's data.
    df = df[~df["model"].isin(invalid_models)].copy()

    # Send a warning message if there are any invalid models
    for model in invalid_models:
        gr.Warning(
            f"<strong>{model}</strong> is lacking some scores thus hidden. Please report to the maintainers."
        )

    # Hiding incomplete models can remove every row. The pivot below would then
    # lack the `group_by` columns and the final column selection would raise,
    # so return an empty table; the per-model warnings above explain why.
    if df.empty:
        return pd.DataFrame(columns=other_columns)

    # Step 2: Process Scores
    # Step 2.0: Map the scores along each (language, dataset) pair
    df["score"] = df.groupby(["language", "dataset"])["score"].transform(
        map_functions[map_function]
    )

    # Step 2.1: Reduce the scores along the column other than `group_by`
    reducer = reduce_functions[reduce_function]
    df = df.groupby(other_columns + [group_by]).agg({"score": reducer}).reset_index()

    # Step 2.2: Reduce the scores along `group_by` to get the overall score of each model
    reduced_col = df.groupby(other_columns).agg({"score": reducer})["score"]

    # Step 2.3: Pivot the dataframe, then concat the overall score.
    # `droplevel` removes the outer "score" level that `values=["score"]` adds.
    df = df.pivot(index=other_columns, columns=group_by, values=["score"]).droplevel(
        0, axis=1
    )
    df["Overall Score"] = reduced_col

    # Step 3: Styling for display
    # - Sort the dataframe by the reduced score
    # - Sort the columns for better readability
    # - Highlight the best value in each column
    # - Format the score to 2 decimal places if it is a float
    attribute_columns = [column for column in other_columns if column != "model"]
    score_columns = ["Overall Score"] + group_by_columns
    ascending = score_ascending[map_function]
    styled = (
        df.reset_index()[["model"] + attribute_columns + score_columns]
        .sort_values(by="Overall Score", ascending=ascending)
        .style.format(precision=2)
    )
    # When sorting ascending (ranks) the best value is the minimum, otherwise
    # it is the maximum.
    if ascending:
        return styled.highlight_min(axis=0, color="#18864B", subset=score_columns)
    return styled.highlight_max(axis=0, color="#18864B", subset=score_columns)
def _all_selected_checkboxes(label, options):
    """Create an interactive checkbox group with every option ticked."""
    return gr.CheckboxGroup(
        choices=options,
        value=options,
        label=label,
        interactive=True,
    )


with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(MARKDOWN_HEADER)
    # NOTE(review): the nesting under the column is reconstructed; confirm
    # against the deployed layout.
    with gr.Column():
        # UI definition: row filters first, then score post-processing options.
        checkbox_language = _all_selected_checkboxes("Language(s)", choices_language)
        checkbox_dataset = _all_selected_checkboxes("Dataset(s)", choices_dataset)
        checkbox_model_type = _all_selected_checkboxes(
            "Model Type(s)", choices_model_type
        )
        dropdown_map_function = gr.Dropdown(
            label="Map Function",
            info=MARKDOWN_MAP_FUNCTION,
            choices=map_functions.keys(),
            value="Raw",
            interactive=True,
        )
        dropdown_reduce_function = gr.Dropdown(
            label="Reduce Function",
            info=MARKDOWN_REDUCE_FUNCTION,
            choices=reduce_functions.keys(),
            value="Mean",
            interactive=True,
        )
        ratio_group_by = gr.Radio(
            label="Group by",
            choices=["language", "dataset"],
            value="language",
            interactive=True,
        )

        # Initial table: everything selected, raw scores averaged per language.
        dataframe = gr.DataFrame(
            prepare_dataframe(
                df=df,
                filters={"language": choices_language, "dataset": choices_dataset},
                group_by="language",
                map_function="Raw",
                reduce_function="Mean",
            )
        )

        # Event listeners: any control change recomputes the whole table.
        # The lambda's positional parameters follow the order of `inputs`.
        gr.on(
            triggers=[
                checkbox_model_type.change,
                checkbox_language.change,
                checkbox_dataset.change,
                ratio_group_by.change,
                dropdown_reduce_function.change,
                dropdown_map_function.change,
            ],
            fn=lambda model_type, language, dataset, group_by, map_function, reduce_function: prepare_dataframe(
                df=df,
                filters={
                    "language": language,
                    "dataset": dataset,
                    "model_type": model_type,
                },
                group_by=group_by,
                map_function=map_function,
                reduce_function=reduce_function,
            ),
            inputs=[
                checkbox_model_type,
                checkbox_language,
                checkbox_dataset,
                ratio_group_by,
                dropdown_map_function,
                dropdown_reduce_function,
            ],
            outputs=[dataframe],
        )
if __name__ == "__main__":
    # Only start the server when executed as a script, not on import.
    launch_options = {"debug": True, "server_port": 7899}
    demo.launch(**launch_options)