import fev
import gradio as gr

markdown_text = """
This space hosts evaluation results for time series forecasting models.

Benchmark definitions, model implementations, and evaluation results for individual tasks are available at https://github.com/autogluon/fev.

The results in this space are currently a minimal proof of concept. Stay tuned for more benchmarks, results for new models, and instructions on how to contribute your own results.
"""
summary_urls = [
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_arima.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_ets.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/auto_theta.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_mini.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/chronos_bolt_tiny.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_base.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_large.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/moirai_small.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/seasonal_naive.csv",
    "https://raw.githubusercontent.com/autogluon/fev/refs/heads/main/benchmarks/chronos_zeroshot/results/timesfm.csv",
]

rename_cols = {
    "gmean_relative_error": "Average relative error",
    "avg_rank": "Average rank",
    "median_inference_time_s": "Median inference time (s)",
    "training_corpus_overlap": "Training corpus overlap (%)",
}
selected_cols = list(rename_cols.keys())
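# selected_cols keeps only these four summary columns (in the insertion order
# of rename_cols) from the full fev leaderboard output.
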
def highlight_zeroshot(styler):
    """Highlight training corpus overlap for zero-shot models in bold green."""

    def style_func(val):
        if val == 0:
            return "color: green; font-weight: bold"
        else:
            return "color: black"

    return styler.map(style_func, subset=["Training corpus overlap (%)"])
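
# A minimal standalone sketch of the helper above (hypothetical data; assumes
# pandas >= 2.1, where Styler.map replaced Styler.applymap):
#
#     import pandas as pd
#     example = pd.DataFrame({"Training corpus overlap (%)": [0.0, 0.25]})
#     highlight_zeroshot(example.style)  # the 0.0 cell renders bold green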

leaderboards = {}
for metric in ["WQL", "MASE"]:
    lb = fev.leaderboard(summary_urls, metric_column=metric)[selected_cols].rename(columns=rename_cols)
    format_dict = {}
    for col in lb.columns:
        format_dict[col] = "{:.3f}" if col != "Training corpus overlap (%)" else "{:.1%}"
    leaderboards[metric] = highlight_zeroshot(lb.reset_index().style.format(format_dict))
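
# For reference, the "Average relative error" column is a geometric mean of
# per-task relative errors (model_error / baseline_error), i.e.
# exp(mean(log(r_i))). A sketch with two hypothetical tasks:
#
#     import numpy as np
#     rel_errors = np.array([0.8, 1.25])   # per-task model_error / baseline_error
#     np.exp(np.log(rel_errors).mean())    # -> 1.0, i.e. on par with the baseline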

with gr.Blocks() as demo:
    with gr.Tab("Chronos Benchmark II"):
        gr.Markdown("""
## Chronos Benchmark II results

This tab contains results for various forecasting models on the 28 datasets used in Benchmark II of the publication [Chronos: Learning the Language of Time Series](https://arxiv.org/abs/2403.07815).
These datasets were used for zero-shot evaluation of Chronos models (i.e., Chronos models were not trained on them), but some other models did include certain of these datasets in their training corpus.

Each table contains the following information:

* **Average relative error**: Geometric mean of the relative errors for each task. The relative error for each task is computed as `model_error / baseline_error`.
* **Average rank**: Arithmetic mean of the ranks achieved by each model on each task.
* **Median inference time (s)**: Median time, in seconds, required to make predictions for the entire dataset.
* **Training corpus overlap (%)**: Percentage of the benchmark datasets that were included in the model's training corpus. Zero-shot models are highlighted in <span style="color:green; font-weight:bold;">green</span>.

Lower values are better for all of the above metrics.

Task definitions and detailed results are available on [GitHub](https://github.com/autogluon/fev/tree/main/benchmarks/chronos_zeroshot). More information about the datasets is available in [Table 3 of the paper](https://arxiv.org/abs/2403.07815).
""")
gr.Markdown("### Probabilistic forecast accuracy\nMeasured by Weighted Quantile Loss (WQL).") | |
gr.Dataframe( | |
value=leaderboards["WQL"], | |
datatype=["str", "number", "number", "number"], | |
interactive=False, | |
) | |
gr.Markdown("""### Point forecast accuracy\nMeasured by Mean Absolute Scaled Error (MASE). | |
""") | |
gr.Dataframe( | |
value=leaderboards["MASE"], | |
interactive=False, | |
) | |
with gr.Tab("About"): | |
gr.Markdown(markdown_text) | |
if __name__ == "__main__": | |
demo.launch() | |
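
# Running this file directly (`python app.py`) serves the app locally
# (by default at http://localhost:7860); on a Hugging Face Space, app.py
# is launched automatically.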