|
from datasets import load_dataset |
|
import gradio as gr |
|
from huggingface_hub import HfApi, hf_hub_download |
|
from huggingface_hub.repocard import metadata_load |
|
import pandas as pd |
|
|
|
# The eight MTEB task categories (same set as the keys of TASK_TO_METRIC).
TASKS = [
    "BitextMining",
    "Classification",
    "Clustering",
    "PairClassification",
    "Reranking",
    "Retrieval",
    "STS",
    "Summarization",
]
|
|
|
# English classification datasets shown on the leaderboard.  Multilingual
# datasets carry an explicit "(en)" tag, matching the dataset names used in
# model-card results metadata.
TASK_LIST_CLASSIFICATION = [
    "AmazonCounterfactualClassification (en)",
    "AmazonPolarityClassification",
    "AmazonReviewsClassification (en)",
    "Banking77Classification",
    "EmotionClassification",
    "ImdbClassification",
    "MassiveIntentClassification (en)",
    "MassiveScenarioClassification (en)",
    "MTOPDomainClassification (en)",
    "MTOPIntentClassification (en)",
    "ToxicConversationsClassification",
    "TweetSentimentExtractionClassification",
]

# Same datasets with the language tag stripped — the bare names used by the
# "mteb/results" rows matched in add_task().
TASK_LIST_CLASSIFICATION_NORM = [x.replace(" (en)", "") for x in TASK_LIST_CLASSIFICATION]
|
|
|
# English clustering datasets shown on the leaderboard.
TASK_LIST_CLUSTERING = [
    "ArxivClusteringP2P",
    "ArxivClusteringS2S",
    "BiorxivClusteringP2P",
    "BiorxivClusteringS2S",
    "MedrxivClusteringP2P",
    "MedrxivClusteringS2S",
    "RedditClustering",
    "RedditClusteringP2P",
    "StackExchangeClustering",
    "StackExchangeClusteringP2P",
    "TwentyNewsgroupsClustering",
]
|
|
|
# English pair-classification datasets shown on the leaderboard.
TASK_LIST_PAIR_CLASSIFICATION = [
    "SprintDuplicateQuestions",
    "TwitterSemEval2015",
    "TwitterURLCorpus",
]
|
|
|
# English reranking datasets shown on the leaderboard.
TASK_LIST_RERANKING = [
    "AskUbuntuDupQuestions",
    "MindSmallReranking",
    "SciDocsRR",
    "StackOverflowDupQuestions",
]
|
|
|
# English retrieval datasets shown on the leaderboard (15 columns, with the
# twelve CQADupstack sub-tasks aggregated into "CQADupstackRetrieval").
TASK_LIST_RETRIEVAL = [
    "ArguAna",
    "ClimateFEVER",
    "CQADupstackRetrieval",
    "DBPedia",
    "FEVER",
    "FiQA2018",
    "HotpotQA",
    "MSMARCO",
    "NFCorpus",
    "NQ",
    "QuoraRetrieval",
    "SCIDOCS",
    "SciFact",
    "Touche2020",
    "TRECCOVID",
]

# Result rows may name the individual CQADupstack splits instead of the
# aggregated entry, so accept those dataset names too when tagging tasks.
TASK_LIST_RETRIEVAL_NORM = TASK_LIST_RETRIEVAL + [
    f"CQADupstack{topic}Retrieval"
    for topic in (
        "Android",
        "English",
        "Gaming",
        "Gis",
        "Mathematica",
        "Physics",
        "Programmers",
        "Stats",
        "Tex",
        "Unix",
        "Webmasters",
        "Wordpress",
    )
]
|
|
|
# English STS datasets shown on the leaderboard.  STS17/STS22 are
# multilingual, so they carry an explicit language-(pair) tag.
TASK_LIST_STS = [
    "BIOSSES",
    "SICK-R",
    "STS12",
    "STS13",
    "STS14",
    "STS15",
    "STS16",
    "STS17 (en-en)",
    "STS22 (en)",
    "STSBenchmark",
]

# Same datasets with the language tags stripped — the bare names used by the
# "mteb/results" rows matched in add_task().
TASK_LIST_STS_NORM = []
for _name in TASK_LIST_STS:
    for _tag in (" (en)", " (en-en)"):
        _name = _name.replace(_tag, "")
    TASK_LIST_STS_NORM.append(_name)
|
|
|
# English summarization datasets (currently a single one).
TASK_LIST_SUMMARIZATION = [
    "SummEval",
]

# Every English dataset across the seven task categories above (bitext
# mining has no English-only list); these are the columns averaged for the
# overall leaderboard in get_mteb_average().
TASK_LIST_EN = TASK_LIST_CLASSIFICATION + TASK_LIST_CLUSTERING + TASK_LIST_PAIR_CLASSIFICATION + TASK_LIST_RERANKING + TASK_LIST_RETRIEVAL + TASK_LIST_STS + TASK_LIST_SUMMARIZATION
|
|
|
# Main metric per task category.  Scores reported under any other metric are
# filtered out when the tables are built (see the external-results loop and
# get_mteb_data()).
TASK_TO_METRIC = {
    "BitextMining": "f1",
    "Clustering": "v_measure",
    "Classification": "accuracy",
    "PairClassification": "cos_sim_ap",
    "Reranking": "map",
    "Retrieval": "ndcg_at_10",
    "STS": "cos_sim_spearman",
    "Summarization": "cos_sim_spearman",
}
|
|
|
def make_clickable_model(model_name, link=None):
    """Render *model_name* as an HTML anchor for the leaderboard table.

    When ``link`` is omitted, the model's Hugging Face Hub page is used.
    Only the final path component of the name (after the last "/") is shown
    as the link text.
    """
    href = link if link is not None else f"https://huggingface.co/{model_name}"
    label = model_name.split("/")[-1]
    return f'<a target="_blank" style="text-decoration: underline" href="{href}">{label}</a>'
|
|
|
|
|
# Models whose scores are not self-reported via the "mteb" Hub tag; their
# results are loaded from the "mteb/results" dataset (one dataset config per
# model) in the pre-loading loop below.
EXTERNAL_MODELS = [
    "LASER2",
    "LaBSE",
    "all-MiniLM-L12-v2",
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "allenai-specter",
    "bert-base-uncased",
    "contriever-base-msmarco",
    "glove.6B.300d",
    "gtr-t5-base",
    "gtr-t5-large",
    "gtr-t5-xl",
    "gtr-t5-xxl",
    "komninos",
    "msmarco-bert-co-condensor",
    "paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2",
    "sentence-t5-base",
    "sentence-t5-large",
    "sentence-t5-xl",
    "sentence-t5-xxl",
    "sgpt-bloom-1b3-nli",
    "sup-simcse-bert-base-uncased",
    "text-similarity-ada-001",
    "unsup-simcse-bert-base-uncased",
]
|
# Landing page for each external model, alphabetized for easy maintenance.
# Models missing from this mapping (e.g. "contriever-base-msmarco",
# "sgpt-bloom-1b3-nli") fall back to the leaderboard space URL where this
# dict is consulted with .get().
# NOTE(review): the original dict listed the four gtr-t5-* entries and
# "allenai-specter" twice each with identical values; later duplicate keys
# silently overwrite earlier ones in a dict literal, so the duplicates were
# dead weight and are removed here.
EXTERNAL_MODEL_TO_LINK = {
    "LASER2": "https://github.com/facebookresearch/LASER",
    "LaBSE": "https://huggingface.co/sentence-transformers/LaBSE",
    "all-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L12-v2",
    "all-MiniLM-L6-v2": "https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2",
    "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
    "allenai-specter": "https://huggingface.co/sentence-transformers/allenai-specter",
    "bert-base-uncased": "https://huggingface.co/bert-base-uncased",
    "glove.6B.300d": "https://huggingface.co/sentence-transformers/average_word_embeddings_glove.6B.300d",
    "gtr-t5-base": "https://huggingface.co/sentence-transformers/gtr-t5-base",
    "gtr-t5-large": "https://huggingface.co/sentence-transformers/gtr-t5-large",
    "gtr-t5-xl": "https://huggingface.co/sentence-transformers/gtr-t5-xl",
    "gtr-t5-xxl": "https://huggingface.co/sentence-transformers/gtr-t5-xxl",
    "komninos": "https://huggingface.co/sentence-transformers/average_word_embeddings_komninos",
    "msmarco-bert-co-condensor": "https://huggingface.co/sentence-transformers/msmarco-bert-co-condensor",
    "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
    "sentence-t5-base": "https://huggingface.co/sentence-transformers/sentence-t5-base",
    "sentence-t5-large": "https://huggingface.co/sentence-transformers/sentence-t5-large",
    "sentence-t5-xl": "https://huggingface.co/sentence-transformers/sentence-t5-xl",
    "sentence-t5-xxl": "https://huggingface.co/sentence-transformers/sentence-t5-xxl",
    "sup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/sup-simcse-bert-base-uncased",
    "text-similarity-ada-001": "https://beta.openai.com/docs/guides/embeddings/types-of-embedding-models",
    "unsup-simcse-bert-base-uncased": "https://huggingface.co/princeton-nlp/unsup-simcse-bert-base-uncased",
}
|
|
|
|
|
# Score storage for external models, filled by the loop below:
# {model: {task: {main_metric: [row_dict, ...]}}} where each row_dict maps
# "Model" plus dataset names to rounded scores.
EXTERNAL_MODEL_RESULTS = {model: {k: {v: []} for k, v in TASK_TO_METRIC.items()} for model in EXTERNAL_MODELS}
|
|
|
def add_lang(examples):
    """Derive "mteb_dataset_name_with_lang" for a single result row.

    Rows evaluated on a specific language get it appended in parentheses,
    e.g. "MassiveIntentClassification (en)"; rows with a falsy
    "eval_language" keep the bare dataset name.
    """
    dataset = examples["mteb_dataset_name"]
    language = examples["eval_language"]
    examples["mteb_dataset_name_with_lang"] = f"{dataset} ({language})" if language else dataset
    return examples
|
|
|
def add_task(examples):
    """Tag a result row with its MTEB task category as "mteb_task".

    The category is inferred from the (language-free) dataset name via the
    per-task membership lists; any dataset found in none of them is assumed
    to be bitext mining.
    """
    membership = (
        ("Classification", TASK_LIST_CLASSIFICATION_NORM),
        ("Clustering", TASK_LIST_CLUSTERING),
        ("PairClassification", TASK_LIST_PAIR_CLASSIFICATION),
        ("Reranking", TASK_LIST_RERANKING),
        ("Retrieval", TASK_LIST_RETRIEVAL_NORM),
        ("STS", TASK_LIST_STS_NORM),
        ("Summarization", TASK_LIST_SUMMARIZATION),
    )
    dataset = examples["mteb_dataset_name"]
    for task, dataset_list in membership:
        if dataset in dataset_list:
            examples["mteb_task"] = task
            break
    else:
        # Bitext-mining datasets have no explicit list; they are the fallback.
        examples["mteb_task"] = "BitextMining"
    return examples
|
|
|
# Pre-load scores for every external model once at import time so requests
# do not have to re-download them.
for model in EXTERNAL_MODELS:
    # Each external model is a separate config of the "mteb/results" dataset.
    ds = load_dataset("mteb/results", model)

    ds = ds.map(add_lang)
    ds = ds.map(add_task)
    # Models without a known homepage link back to the leaderboard space.
    base_dict = {"Model": make_clickable_model(model, link=EXTERNAL_MODEL_TO_LINK.get(model, "https://huggingface.co/spaces/mteb/leaderboard"))}

    for task, metric in TASK_TO_METRIC.items():
        # Keep only this task's rows scored with its main metric, from the
        # "test" split, and collapse them into {dataset_name: score}.
        ds_dict = ds.filter(lambda x: (x["mteb_task"] == task) and (x["metric"] == metric))["test"].to_dict()
        ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
        EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
|
|
|
|
|
def get_mteb_data(tasks=["Clustering"], langs=[], fillna=True, task_to_metric=TASK_TO_METRIC):
    """Build a leaderboard DataFrame for the given task categories.

    Rows come from two sources: the pre-loaded EXTERNAL_MODEL_RESULTS, and
    the model-index metadata in the README of every Hub model tagged "mteb".

    Args:
        tasks: task categories to include (keys of ``task_to_metric``).
        langs: optional language codes; when given, only datasets evaluated
            in one of these languages (plus untagged datasets) are kept.
        fillna: when True, replace missing scores with "" for display.
        task_to_metric: task -> main-metric mapping used to select scores.

    Returns:
        A DataFrame with one row per model, "Model" as the first column and
        one column per dataset.

    NOTE(review): ``tasks=["Clustering"]`` and ``langs=[]`` are mutable
    default arguments; harmless here because both are only read, never
    mutated, but worth keeping in mind.
    """
    api = HfApi()
    models = api.list_models(filter="mteb")

    df_list = []
    for model in EXTERNAL_MODEL_RESULTS:
        results_list = [res for task in tasks for res in EXTERNAL_MODEL_RESULTS[model][task][task_to_metric[task]]]
        if langs:
            # Keep a column when its last whitespace token either equals the
            # whole key (i.e. the name has no " (lang)" suffix — this also
            # always keeps "Model") or matches one of the requested "(lang)"
            # tags.
            langs_format = [f"({lang})" for lang in langs]
            res = {k: v for d in results_list for k, v in d.items() if any([k.split(" ")[-1] in (k, x) for x in langs_format])}
        else:
            res = {k: v for d in results_list for k, v in d.items()}

        # > 1 because the dict always contains at least the "Model" entry;
        # models with no scores for these tasks are skipped.
        if len(res) > 1:
            df_list.append(res)

    for model in models:
        # Scores for tagged models live in the YAML model-index of their
        # model card.
        readme_path = hf_hub_download(model.modelId, filename="README.md")
        meta = metadata_load(readme_path)
        if langs:
            # "default" keeps results that carry no dataset config (i.e.
            # monolingual datasets) alongside the requested languages.
            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks) and (sub_res.get("dataset", {}).get("config", "default") in ("default", *langs))]
        else:
            task_results = [sub_res for sub_res in meta["model-index"][0]["results"] if (sub_res.get("task", {}).get("type", "") in tasks)]
        # For each result keep the first score reported under the task's
        # main metric; dataset names are prefixed "MTEB " in the metadata.
        out = [{res["dataset"]["name"].replace("MTEB ", ""): [round(score["value"], 2) for score in res["metrics"] if score["type"] == task_to_metric.get(res["task"]["type"])][0]} for res in task_results]
        out = {k: v for d in out for k, v in d.items()}
        out["Model"] = make_clickable_model(model.modelId)
        df_list.append(out)
    df = pd.DataFrame(df_list)

    # Alphabetical column order with "Model" moved to the front.
    cols = sorted(list(df.columns))
    cols.insert(0, cols.pop(cols.index("Model")))
    df = df[cols]
    if fillna:
        df.fillna("", inplace=True)
    return df
|
|
|
def get_mteb_average():
    """Compute the overall English leaderboard and the per-task tables.

    Populates the module-level DATA_* globals consumed by the Gradio UI and
    returns DATA_OVERALL (rank, model, and per-category average columns).
    """
    global DATA_OVERALL, DATA_CLASSIFICATION_EN, DATA_CLUSTERING, DATA_PAIR_CLASSIFICATION, DATA_RERANKING, DATA_RETRIEVAL, DATA_STS_EN, DATA_SUMMARIZATION, NUM_SCORES
    DATA_OVERALL = get_mteb_data(
        tasks=[
            "Classification",
            "Clustering",
            "PairClassification",
            "Reranking",
            "Retrieval",
            "STS",
            "Summarization",
        ],
        langs=["en", "en-en"],
        fillna=False
    )

    # Upper bound on the number of scores: rows x columns, counting the
    # Model column and missing cells too — hence the ">" in the UI text.
    NUM_SCORES = DATA_OVERALL.shape[0] * DATA_OVERALL.shape[1]

    # skipna=False: a model only gets an average once it has a score for
    # every dataset in the respective list.
    DATA_OVERALL.insert(1, f"Average ({len(TASK_LIST_EN)} datasets)", DATA_OVERALL[TASK_LIST_EN].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(2, f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_CLASSIFICATION].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(3, f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", DATA_OVERALL[TASK_LIST_CLUSTERING].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(4, f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", DATA_OVERALL[TASK_LIST_PAIR_CLASSIFICATION].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(5, f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", DATA_OVERALL[TASK_LIST_RERANKING].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(6, f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", DATA_OVERALL[TASK_LIST_RETRIEVAL].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(7, f"STS Average ({len(TASK_LIST_STS)} datasets)", DATA_OVERALL[TASK_LIST_STS].mean(axis=1, skipna=False))
    DATA_OVERALL.insert(8, f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)", DATA_OVERALL[TASK_LIST_SUMMARIZATION].mean(axis=1, skipna=False))
    DATA_OVERALL.sort_values(f"Average ({len(TASK_LIST_EN)} datasets)", ascending=False, inplace=True)

    # Rank reflects the sort on the overall average above.
    DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))

    DATA_OVERALL = DATA_OVERALL.round(2)

    # Blank out NaNs for display only AFTER all averages were computed.
    DATA_OVERALL.fillna("", inplace=True)

    # Per-category tables shown in the individual tabs.
    DATA_CLASSIFICATION_EN = DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION]
    DATA_CLUSTERING = DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING]
    DATA_PAIR_CLASSIFICATION = DATA_OVERALL[["Model"] + TASK_LIST_PAIR_CLASSIFICATION]
    DATA_RERANKING = DATA_OVERALL[["Model"] + TASK_LIST_RERANKING]
    DATA_RETRIEVAL = DATA_OVERALL[["Model"] + TASK_LIST_RETRIEVAL]
    DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
    DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]

    # The overall tab only shows rank, model, and the average columns.
    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]

    return DATA_OVERALL
|
|
|
# Build all DATA_* globals once at startup so the UI below can render them.
get_mteb_average()

block = gr.Blocks()
|
|
|
|
|
# Gradio UI: one tab per task category; tables backed by the DATA_* globals
# render immediately, the remaining (multilingual / bitext) tables are
# populated lazily via the block.load() calls at the bottom of the file.
with block:
    gr.Markdown(f"""
    Massive Text Embedding Benchmark (MTEB) Leaderboard. To submit, refer to the <a href="https://github.com/embeddings-benchmark/mteb#leaderboard" target="_blank" style="text-decoration: underline">MTEB GitHub repository</a> 🤗

    - **Total Datasets**: 56
    - **Total Languages**: 117
    - **Total Scores**: >{NUM_SCORES}
    - **Total Models**: {len(DATA_OVERALL)}
    """)
    with gr.Tabs():
        with gr.TabItem("Overall"):
            with gr.Row():
                gr.Markdown("""
                **Overall MTEB English leaderboard 🔮**

                - **Metric:** Various, refer to task tabs
                - **Languages:** English, refer to task tabs for others
                """)
            with gr.Row():
                data_overall = gr.components.Dataframe(
                    DATA_OVERALL,
                    datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
                    type="pandas",
                    wrap=True,
                )
            with gr.Row():
                # NOTE(review): the local name "data_run" is rebound in
                # several tabs below; each click handler is wired before the
                # rebinding, so the reuse is harmless.
                data_run = gr.Button("Refresh")
                data_run.click(get_mteb_average, inputs=None, outputs=data_overall)
        with gr.TabItem("BitextMining"):
            with gr.Row():
                gr.Markdown("""
                **Bitext Mining Leaderboard 🎌**

                - **Metric:** F1 (f1)
                - **Languages:** 117
                """)
            with gr.Row():
                # No initial data; filled on page load / refresh.
                data_bitext_mining = gr.components.Dataframe(
                    datatype=["markdown"] + ["number"] * 500,
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                # gr.Variable holds the constant task argument passed to
                # get_mteb_data on click.
                task_bitext_mining = gr.Variable(value=["BitextMining"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_bitext_mining],
                    outputs=data_bitext_mining,
                )
        with gr.TabItem("Classification"):
            with gr.TabItem("English"):
                with gr.Row():
                    gr.Markdown("""
                    **Classification Leaderboard ❤️**

                    - **Metric:** Accuracy (accuracy)
                    - **Languages:** English
                    """)
                with gr.Row():
                    data_classification_en = gr.components.Dataframe(
                        DATA_CLASSIFICATION_EN,
                        datatype=["markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
                        type="pandas",
                    )
                with gr.Row():
                    data_run_classification_en = gr.Button("Refresh")
                    task_classification_en = gr.Variable(value=["Classification"])
                    lang_classification_en = gr.Variable(value=["en"])
                    data_run_classification_en.click(
                        get_mteb_data,
                        inputs=[
                            task_classification_en,
                            lang_classification_en,
                        ],
                        outputs=data_classification_en,
                    )
            with gr.TabItem("Multilingual"):
                with gr.Row():
                    gr.Markdown("""
                    **Classification Multilingual Leaderboard 💜💚💙**

                    - **Metric:** Accuracy (accuracy)
                    - **Languages:** 51
                    """)
                with gr.Row():
                    data_classification = gr.components.Dataframe(
                        datatype=["markdown"] + ["number"] * 200,
                        type="pandas",
                    )
                with gr.Row():
                    data_run = gr.Button("Refresh")
                    task_classification = gr.Variable(value=["Classification"])
                    data_run.click(
                        get_mteb_data,
                        inputs=[task_classification],
                        outputs=data_classification,
                    )
        with gr.TabItem("Clustering"):
            with gr.Row():
                gr.Markdown("""
                **Clustering Leaderboard ✨**

                - **Metric:** Validity Measure (v_measure)
                - **Languages:** English
                """)
            with gr.Row():
                data_clustering = gr.components.Dataframe(
                    DATA_CLUSTERING,
                    datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_clustering = gr.Variable(value=["Clustering"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_clustering],
                    outputs=data_clustering,
                )
        with gr.TabItem("Pair Classification"):
            with gr.Row():
                gr.Markdown("""
                **Pair Classification Leaderboard 🎭**

                - **Metric:** Average Precision based on Cosine Similarities (cos_sim_ap)
                - **Languages:** English
                """)
            with gr.Row():
                data_pair_classification = gr.components.Dataframe(
                    DATA_PAIR_CLASSIFICATION,
                    datatype=["markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_pair_classification = gr.Variable(value=["PairClassification"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_pair_classification],
                    outputs=data_pair_classification,
                )
        with gr.TabItem("Retrieval"):
            with gr.Row():
                gr.Markdown("""
                **Retrieval Leaderboard 🔎**

                - **Metric:** Normalized Discounted Cumulative Gain @ k (ndcg_at_10)
                - **Languages:** English
                """)
            with gr.Row():
                data_retrieval = gr.components.Dataframe(
                    DATA_RETRIEVAL,
                    # NOTE(review): the "* 2" gives extra number columns
                    # beyond the current width — presumably headroom; confirm
                    # before removing.
                    datatype=["markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_retrieval = gr.Variable(value=["Retrieval"])
                data_run.click(
                    get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval
                )
        with gr.TabItem("Reranking"):
            with gr.Row():
                gr.Markdown("""
                **Reranking Leaderboard 🥈**

                - **Metric:** Mean Average Precision (MAP)
                - **Languages:** English
                """)
            with gr.Row():
                data_reranking = gr.components.Dataframe(
                    DATA_RERANKING,
                    datatype=["markdown"] + ["number"] * len(DATA_RERANKING.columns),
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_reranking = gr.Variable(value=["Reranking"])
                # NOTE(review): metric_reranking is defined but never passed
                # to get_mteb_data (the metric comes from TASK_TO_METRIC).
                metric_reranking = gr.Variable(value="map")
                data_run.click(
                    get_mteb_data, inputs=[task_reranking], outputs=data_reranking
                )
        with gr.TabItem("STS"):
            with gr.TabItem("English"):
                with gr.Row():
                    gr.Markdown("""
                    **STS Leaderboard 🤖**

                    - **Metric:** Spearman correlation based on cosine similarity
                    - **Languages:** English
                    """)
                with gr.Row():
                    data_sts_en = gr.components.Dataframe(
                        DATA_STS_EN,
                        datatype=["markdown"] + ["number"] * len(DATA_STS_EN.columns),
                        type="pandas",
                    )
                with gr.Row():
                    data_run_sts_en = gr.Button("Refresh")
                    task_sts_en = gr.Variable(value=["STS"])
                    # "en-en" covers crosslingual-style STS17 naming.
                    lang_sts_en = gr.Variable(value=["en", "en-en"])
                    data_run_sts_en.click(
                        get_mteb_data,
                        inputs=[task_sts_en, lang_sts_en],
                        outputs=data_sts_en,
                    )
            with gr.TabItem("Multilingual"):
                with gr.Row():
                    gr.Markdown("""
                    **STS Multilingual Leaderboard 👽**

                    - **Metric:** Spearman correlation based on cosine similarity
                    - **Languages:** Arabic, Chinese, Dutch, English, French, German, Italian, Korean, Polish, Russian, Spanish
                    """)
                with gr.Row():
                    data_sts = gr.components.Dataframe(
                        datatype=["markdown"] + ["number"] * 100,
                        type="pandas",
                    )
                with gr.Row():
                    data_run = gr.Button("Refresh")
                    task_sts = gr.Variable(value=["STS"])
                    data_run.click(get_mteb_data, inputs=[task_sts], outputs=data_sts)
        with gr.TabItem("Summarization"):
            with gr.Row():
                gr.Markdown("""
                **Summarization Leaderboard 📜**

                - **Metric:** Spearman correlation based on cosine similarity
                - **Languages:** English
                """)
            with gr.Row():
                data_summarization = gr.components.Dataframe(
                    DATA_SUMMARIZATION,
                    datatype=["markdown"] + ["number"] * 2,
                    type="pandas",
                )
            with gr.Row():
                data_run = gr.Button("Refresh")
                task_summarization = gr.Variable(value=["Summarization"])
                data_run.click(
                    get_mteb_data,
                    inputs=[task_summarization],
                    outputs=data_summarization,
                )
    gr.Markdown(f"""
    <p style="text-align: center;">Made with ❤️ for NLP by <a href=https://huggingface.co/Muennighoff>Niklas Muennighoff</a>.</p>
    """)
|
|
|
|
|
# Refresh every table once when the page is loaded; this is what populates
# the Dataframes that were created above without initial data.
block.load(get_mteb_data, inputs=[task_bitext_mining], outputs=data_bitext_mining)
block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
block.load(get_mteb_data, inputs=[task_sts], outputs=data_sts)
block.load(get_mteb_data, inputs=[task_summarization], outputs=data_summarization)

block.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|