import fev
import gradio as gr
import pandas as pd

# Load the seasonal naive baseline results into a pandas DataFrame
# (not used below; the leaderboards are built directly from summary_urls)
df = pd.read_csv(
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv"
)

markdown_text = """
This space hosts evaluation results for time series forecasting models.

Benchmark definitions, model implementations, and evaluation results for individual tasks are available at https://github.com/autogluon/fev.

The results in this space are currently a minimal proof of concept. Stay tuned for more benchmarks, results for new models, and instructions on how to contribute your own results.
"""

summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]
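# Note: all of the result files above live in the same benchmark directory
# (.../benchmarks/chronos_zeroshot/results), one CSV per model, so the list
# could equally be generated from a list of model names, e.g.
#     summary_urls = [f"{results_dir}/{name}.csv" for name in model_names]
# The explicit list above is what the app actually uses.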

rename_cols = {
    "gmean_relative_error": "Average relative error",
    "avg_rank": "Average rank",
    "median_inference_time_s": "Median inference time (s)",
    "training_corpus_overlap": "Training corpus overlap (%)",
}
selected_cols = list(rename_cols.keys())


def highlight_zeroshot(styler):
    """Highlight training overlap for zero-shot models with bold green."""

    def style_func(val):
        if val == 0:
            return "color: green; font-weight: bold"
        else:
            return "color: black"

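    # Styler.map applies the styling function element-wise (pandas >= 2.1;
    # earlier pandas versions expose the same behavior as Styler.applymap).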
    return styler.map(style_func, subset=["Training corpus overlap (%)"])


leaderboards = {}
for metric in ["WQL", "MASE"]:
    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
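    # Show numeric columns with 3 decimal places; the training corpus overlap
    # column is rendered as a percentage instead.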
    format_dict = {}
    for col in lb.columns:
        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
    leaderboards[metric] = highlight_zeroshot(lb.reset_index().style.format(format_dict))
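
# Illustrative sketch (not used by the app, and not part of the fev API): the
# "Average relative error" column described in the UI text below is the geometric
# mean, across tasks, of model_error / baseline_error, which fev.leaderboard
# computes internally. Assumes strictly positive errors.
def _gmean_relative_error(model_errors, baseline_errors):
    """Geometric mean of per-task relative errors (model_error / baseline_error)."""
    import math

    ratios = [m / b for m, b in zip(model_errors, baseline_errors)]
    return math.exp(sum(math.log(r) for r in ratios) / len(ratios))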


with gr.Blocks() as demo:
    with gr.Tab("Chronos Benchmark II"):
        gr.Markdown("""
                    ## Chronos Benchmark II results

                    This tab contains results for various forecasting models on the 28 datasets used in Benchmark II in the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).

                    These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on these datasets), but some other models did include certain datasets in their training corpus.

                    Each table contains the following information:

                    * **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
                    * **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
                    * **Median inference time (s)**: Median of the times required to make predictions for the entire dataset (in seconds).
                    * **Training corpus overlap (%)**: Percentage of the datasets used in the benchmark that were included in the model's training corpus. Zero-shot models are highlighted in green.

                    Lower values are better for all of the above metrics.

                    Task definitions and detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information about the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).

                    """)
        gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).")
        gr.Dataframe(
            value=leaderboards["WQL"],
            datatype=["str", "number", "number", "number"],
            interactive=False,
        )

        gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE).
                    """)
        gr.Dataframe(
            value=leaderboards["MASE"],
            interactive=False,
        )

    with gr.Tab("About"):
        gr.Markdown(markdown_text)

if __name__ == "__main__":
    demo.launch()
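# Running `python app.py` starts a local Gradio server hosting this leaderboard UI.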