Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	| import json | |
| from dataclasses import dataclass, field, fields | |
| from functools import cached_property | |
| from pathlib import Path | |
| from typing import Literal | |
| from functools import partial | |
| from content import * | |
| import gradio as gr | |
| import numpy as np | |
| import pandas as pd | |
| import plotly.graph_objects as go | |
| import pandas as pd | |
# Score table backing the leaderboard. The CSV is expected to provide:
# - model: the name of the model
# - language: the language of the model
# - dataset: the dataset used to evaluate the model
# - score: the score of the model on the (language, dataset) pair
# - model_type: the type of the model (e.g. "Chat Model", "Base Model")
df = pd.read_csv("data/raw_scores.csv")

# Distinct values of each filterable column, in order of first appearance;
# they populate (and pre-select) the checkbox groups of the UI.
choices_language, choices_dataset, choices_model_type = (
    list(df[column].unique()) for column in ("language", "dataset", "model_type")
)
# Utility functions for data processing


def _normalize_scores(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    When every value is identical the range is zero; dividing by it would turn
    the whole group into NaN, so a zero range is treated as 1 and the result
    is all zeros instead.
    """
    lowest = np.min(x)
    span = np.max(x) - lowest
    # Only a scalar range can be compared directly; anything else keeps the
    # plain division.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (x - lowest) / span


# Aggregations collapsing several scores into one value. They are kept as
# thin wrappers instead of passing the numpy functions themselves.
reduce_functions = {
    "Mean": lambda x: np.mean(x),
    "Median": lambda x: np.median(x),
    "Max": lambda x: np.max(x),
    "Min": lambda x: np.min(x),
}

# Element-wise transformations applied to a group of scores.
map_functions = {
    "Raw": lambda x: x,
    # Dense ranking where the highest score receives rank 1.
    "Rank": partial(pd.Series.rank, ascending=False, method="dense"),
    "Normalize": _normalize_scores,
}

# Sort direction matching each map function: ranks are best when small,
# raw and normalized scores are best when large.
score_ascending = {
    "Raw": False,
    "Rank": True,
    "Normalize": False,
}
def prepare_dataframe(
    df: pd.DataFrame,
    filters: dict[str, list[str]],
    group_by: Literal["language", "dataset"],
    map_function: str,
    reduce_function: str,
) -> pd.DataFrame:
    """Filter, transform and aggregate the scores into a display table.

    Args:
        df: Score table with at least the columns ``model``, ``language``,
            ``dataset`` and ``score``; any further column is carried along
            as an identifying column of the model.
        filters: Accepted values per column. Must contain the keys
            ``"language"`` and ``"dataset"``; every key is applied as an
            ``isin`` filter on the column of the same name.
        group_by: Column whose selected values become the score columns of
            the result.
        map_function: Key into ``map_functions``; applied to the scores of
            each (language, dataset) pair.
        reduce_function: Key into ``reduce_functions``; used both to collapse
            the non-``group_by`` axis and to compute ``Overall Score``.

    Returns:
        On success, a pandas ``Styler`` wrapping the sorted table (``model``,
        remaining identifying columns, ``Overall Score``, then one column per
        selected ``group_by`` value) with the best value of each score column
        highlighted. If no row survives the filters or the completeness
        check, an empty ``DataFrame`` with the identifying columns.

    Raises:
        KeyError: If ``filters`` lacks ``"language"``/``"dataset"`` or the
            map/reduce function name is unknown.
        ValueError: If ``df`` has no ``model`` column.
    """
    # Filters contains a value subset for each column
    language = filters["language"]
    dataset = filters["dataset"]

    # Every column except the pivot axes and the score identifies a model.
    # Keeping the frame's own column order makes the output deterministic.
    excluded = {"language", "dataset", "score"}
    other_columns = [column for column in df.columns if column not in excluded]
    group_by_columns = list(filters[group_by])

    # Step 1: Filter the dataframe based on the selected values
    for column, allowed in filters.items():
        df = df[df[column].isin(allowed)]

    if df.empty:
        gr.Warning(
            "No scores remain after the filter application. Please verify the checkboxes."
        )
        return pd.DataFrame(columns=other_columns)

    # Sanity check: a model must have a score for every selected
    # (language, dataset) pair, otherwise its aggregates are not comparable.
    score_count = (
        df.drop_duplicates(subset=["model", "language", "dataset"])
        .groupby(["model"])["score"]
        .count()
    )
    invalid_models = score_count[
        score_count < len(language) * len(dataset)
    ].index.tolist()
    df = df[~df["model"].isin(invalid_models)]
    for model in invalid_models:
        gr.Warning(
            f"<strong>{model}</strong> is lacking some scores thus hidden. Please report to the maintainers."
        )

    # Dropping incomplete models may leave nothing to show; the pivot below
    # would then lack the selected columns and fail, so stop here.
    if df.empty:
        return pd.DataFrame(columns=other_columns)

    # Step 2: Process Scores
    # Step 2.0: Map the scores along each (language, dataset) pair.
    # `assign` builds a new frame instead of writing into a filtered slice.
    mapped_scores = df.groupby(["language", "dataset"])["score"].transform(
        map_functions[map_function]
    )
    df = df.assign(score=mapped_scores)

    # Step 2.1: Reduce the scores along the column other than `group_by`
    reducer = reduce_functions[reduce_function]
    df = (
        df.groupby(other_columns + [group_by])
        .agg({"score": reducer})
        .reset_index()
    )

    # Step 2.2: Reduce the scores along `group_by` to get the overall score of each model
    overall_score = df.groupby(other_columns).agg({"score": reducer})["score"]

    # Step 2.3: Pivot the dataframe, then concat the overall score.
    # `droplevel` removes the constant "score" level of the column index.
    df = df.pivot(index=other_columns, columns=group_by, values=["score"]).droplevel(
        0, 1
    )
    df["Overall Score"] = overall_score

    # Step 3: Styling for display
    # - Sort the dataframe by the overall score
    # - Order the columns for better readability
    # - Highlight the best value in each score column
    # - Format numbers to 2 decimal places
    other_columns.remove("model")
    ascending = score_ascending[map_function]
    score_columns = ["Overall Score"] + group_by_columns
    styled = (
        df.reset_index()[["model"] + other_columns + score_columns]
        .sort_values(by="Overall Score", ascending=ascending)
        .style.format(precision=2)
    )
    # When smaller is better (ranks) the minimum is the winner, else the maximum.
    highlight = styled.highlight_min if ascending else styled.highlight_max
    return highlight(axis=0, color="#18864B", subset=score_columns)
def _build_table(
    model_type, language, dataset, group_by, map_function, reduce_function
):
    """Build the leaderboard table for the given control values.

    The parameter order mirrors the `inputs` list of the event listener below.
    """
    return prepare_dataframe(
        df=df,
        filters={
            "language": language,
            "dataset": dataset,
            "model_type": model_type,
        },
        group_by=group_by,
        map_function=map_function,
        reduce_function=reduce_function,
    )


with gr.Blocks(theme=gr.themes.Base()) as demo:
    gr.Markdown(MARKDOWN_HEADER)
    # NOTE(review): the original indentation was lost; everything below is
    # assumed to live inside this column — confirm against the deployed app.
    with gr.Column():
        # UI definition
        checkbox_language = gr.CheckboxGroup(
            choices=choices_language,
            value=choices_language,
            label="Language(s)",
            interactive=True,
        )
        checkbox_dataset = gr.CheckboxGroup(
            choices=choices_dataset,
            value=choices_dataset,
            label="Dataset(s)",
            interactive=True,
        )
        checkbox_model_type = gr.CheckboxGroup(
            choices=choices_model_type,
            value=choices_model_type,
            label="Model Type(s)",
            interactive=True,
        )
        dropdown_map_function = gr.Dropdown(
            # A real list rather than a dict view.
            choices=list(map_functions),
            value="Raw",
            label="Map Function",
            interactive=True,
            info=MARKDOWN_MAP_FUNCTION,
        )
        dropdown_reduce_function = gr.Dropdown(
            choices=list(reduce_functions),
            value="Mean",
            label="Reduce Function",
            interactive=True,
            info=MARKDOWN_REDUCE_FUNCTION,
        )
        ratio_group_by = gr.Radio(
            choices=["language", "dataset"],
            value="language",
            label="Group by",
            interactive=True,
        )
        # Initial table: built through the same path as later refreshes, with
        # the default value of every control.
        dataframe = gr.DataFrame(
            _build_table(
                choices_model_type,
                choices_language,
                choices_dataset,
                "language",
                "Raw",
                "Mean",
            )
        )

        # Event listeners: any control change rebuilds the table.
        gr.on(
            triggers=[
                checkbox_model_type.change,
                checkbox_language.change,
                checkbox_dataset.change,
                ratio_group_by.change,
                dropdown_reduce_function.change,
                dropdown_map_function.change,
            ],
            fn=_build_table,
            inputs=[
                checkbox_model_type,
                checkbox_language,
                checkbox_dataset,
                ratio_group_by,
                dropdown_map_function,
                dropdown_reduce_function,
            ],
            outputs=[dataframe],
        )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    launch_options = {"debug": True, "server_port": 7899}
    demo.launch(**launch_options)
