import json
from dataclasses import dataclass, field, fields
from functools import cached_property, partial
from pathlib import Path
from typing import Literal

from content import *
import gradio as gr
import numpy as np
import pandas as pd
import plotly.graph_objects as go
# This dataframe must contain the following columns:
# - model: the name of the model
# - language: the evaluation language
# - dataset: the dataset used to evaluate the model
# - score: the model's score on that (language, dataset) pair
# - model_type: the type of the model (e.g. "Chat Model", "Base Model")
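# An illustrative row (hypothetical values, not taken from the actual CSV):
#   model="Foo-7B-Chat", language="en", dataset="MMLU", score=67.3, model_type="Chat Model"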
df = pd.read_csv("data/raw_scores.csv")
choices_language = list(df["language"].unique())
choices_dataset = list(df["dataset"].unique())
choices_model_type = list(df["model_type"].unique())
# Utility functions for data processing
reduce_functions = {
"Mean": lambda x: np.mean(x),
"Median": lambda x: np.median(x),
"Max": lambda x: np.max(x),
"Min": lambda x: np.min(x),
}
map_functions = {
"Raw": lambda x: x,
"Rank": partial(pd.Series.rank, ascending=False, method="dense"),
"Normalize": lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)),
}
score_ascending = {
"Raw": False,
"Rank": True,
"Normalize": False,
}
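# Illustrative pipeline (hypothetical scores): with map_function="Rank", the per-(language, dataset)
# scores [80.0, 75.0, 90.0] become dense ranks [2.0, 3.0, 1.0]; reduce_function="Mean" then averages
# the mapped scores, so for "Rank" a lower overall value is better (hence score_ascending["Rank"] is True).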
def prepare_dataframe(
df: pd.DataFrame,
filters: dict[str, list[str]],
group_by: Literal["language", "dataset"],
map_function: str,
reduce_function: str,
) -> pd.DataFrame:
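    """Filter, map, reduce, and pivot the raw scores into a styled leaderboard table.

    Args:
        df: the raw score dataframe described above.
        filters: for each filterable column ("language", "dataset", "model_type"),
            the subset of values to keep.
        group_by: the dimension whose values become the pivoted score columns.
        map_function: key into `map_functions`, applied within each (language, dataset) group.
        reduce_function: key into `reduce_functions`, used to collapse the other dimension
            and to compute the "Overall Score" column.

    Returns:
        A pandas Styler ready for display in a gr.DataFrame, or an empty DataFrame
        if no rows survive the filters.
    """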
# Filters contains a value subset for each column
language = filters["language"]
dataset = filters["dataset"]
    # Columns other than `language`, `dataset`, and `score` (i.e. `model` and `model_type`)
    # identify each row; the selected `group_by` values become the pivoted score columns.
    other_columns = list(set(df.columns) - {"language", "dataset", "score"})
    group_by_columns = filters[group_by]
    # Step 1: Filter the dataframe, keeping only rows that match every selected filter value
    for k, v in filters.items():
        df = df[df[k].isin(v)]
    # If nothing survives the filters, warn and return an empty dataframe
    if len(df) == 0:
        gr.Warning(
            "No scores remain after applying the filters. Please check the selected options."
        )
        return pd.DataFrame(columns=other_columns)
    # Sanity check: every model must have a score for each selected (language, dataset) pair
score_count = (
df.drop_duplicates(subset=["model", "language", "dataset"])
.groupby(["model"])["score"]
.count()
)
invalid_models = score_count[
score_count < len(language) * len(dataset)
].index.tolist()
df = df[~df["model"].isin(invalid_models)]
    # Warn about each model that was hidden because of missing scores
    for model in invalid_models:
        gr.Warning(
            f"<strong>{model}</strong> is missing some scores and has been hidden. Please report this to the maintainers."
        )
# Step 2: Process Scores
# Step 2.0: Map the scores along each (language, dataset) pair
df["score"] = df.groupby(["language", "dataset"])["score"].transform(
map_functions[map_function]
)
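    # e.g. with map_function="Normalize", scores [80.0, 75.0, 90.0] within one
    # (language, dataset) group become [0.33, 0.0, 1.0] (hypothetical values)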
    # Step 2.1: Reduce the scores along the dimension other than `group_by` (collapsing either `language` or `dataset`)
df = (
df.groupby(other_columns + [group_by])
.agg({"score": reduce_functions[reduce_function]})
.reset_index()
)
# Step 2.2: Reduce the scores along `group_by` to get the overall score of each model
reduced_col = df.groupby(other_columns).agg(
{"score": reduce_functions[reduce_function]}
)["score"]
# Step 2.3: Pivot the dataframe, then concat the overall score
    df = df.pivot(index=other_columns, columns=group_by, values=["score"]).droplevel(
        0, axis=1
    )
df["Overall Score"] = reduced_col
    # Step 3: Styling for display
    # - Sort the dataframe by the overall score
    # - Order the columns for better readability
    # - Highlight the best value in each score column (min for "Rank", max otherwise)
    # - Format scores to 2 decimal places
other_columns.remove("model")
df = (
df.reset_index()[
["model"] + other_columns + ["Overall Score"] + group_by_columns
]
.sort_values(by="Overall Score", ascending=score_ascending[map_function])
.style.format(precision=2)
)
if score_ascending[map_function]:
df = df.highlight_min(
axis=0, color="#18864B", subset=["Overall Score"] + group_by_columns
)
else:
df = df.highlight_max(
axis=0, color="#18864B", subset=["Overall Score"] + group_by_columns
)
return df
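# Standalone usage sketch (outside the Gradio UI), e.g. for quick debugging in a REPL:
#   styled = prepare_dataframe(
#       df=df,
#       filters={"language": choices_language, "dataset": choices_dataset,
#                "model_type": choices_model_type},
#       group_by="dataset",
#       map_function="Rank",
#       reduce_function="Mean",
#   )
#   styled.data  # the plain DataFrame behind the returned Styler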
with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(MARKDOWN_HEADER)
with gr.Column():
# UI definition
checkbox_language = gr.CheckboxGroup(
choices=choices_language,
value=choices_language,
label="Language(s)",
interactive=True,
)
checkbox_dataset = gr.CheckboxGroup(
choices=choices_dataset,
value=choices_dataset,
label="Dataset(s)",
interactive=True,
)
checkbox_model_type = gr.CheckboxGroup(
choices=choices_model_type,
value=choices_model_type,
label="Model Type(s)",
interactive=True,
)
        dropdown_map_function = gr.Dropdown(
            choices=list(map_functions.keys()),
            value="Raw",
            label="Map Function",
            interactive=True,
            info=MARKDOWN_MAP_FUNCTION,
        )
        dropdown_reduce_function = gr.Dropdown(
            choices=list(reduce_functions.keys()),
            value="Mean",
            label="Reduce Function",
            interactive=True,
            info=MARKDOWN_REDUCE_FUNCTION,
        )
        radio_group_by = gr.Radio(
            choices=["language", "dataset"],
            value="language",
            label="Group by",
            interactive=True,
        )
        dataframe = gr.DataFrame(
            prepare_dataframe(
                df=df,
                filters={
                    "language": choices_language,
                    "dataset": choices_dataset,
                    "model_type": choices_model_type,
                },
                group_by="language",
                map_function="Raw",
                reduce_function="Mean",
            )
        )
# Event listeners
gr.on(
triggers=[
checkbox_model_type.change,
checkbox_language.change,
checkbox_dataset.change,
            radio_group_by.change,
dropdown_reduce_function.change,
dropdown_map_function.change,
],
fn=lambda model_type, language, dataset, group_by, map_function, reduce_function: prepare_dataframe(
df=df,
filters={
"language": language,
"dataset": dataset,
"model_type": model_type,
},
group_by=group_by,
map_function=map_function,
reduce_function=reduce_function,
),
inputs=[
checkbox_model_type,
checkbox_language,
checkbox_dataset,
            radio_group_by,
dropdown_map_function,
dropdown_reduce_function,
],
outputs=[dataframe],
)
if __name__ == "__main__":
demo.launch(debug=True, server_port=7899)