Muennighoff committed
Commit 17e0108
Parent(s): a51beac
Fix dataframe dtypes for proper sorting
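Why the dtypes matter: the old get_mteb_data cast every column to str before returning (the cast_to_str path removed below), so Gradio's column sort compared strings and ordered scores lexicographically. Keeping numeric dtypes and declaring per-column datatypes makes the sort numeric. A minimal sketch of the difference, with made-up data rather than the real leaderboard columns:

import pandas as pd

# Made-up scores; the real frame is built by get_mteb_data() in app.py.
df = pd.DataFrame({"Model": ["a", "b", "c"], "Score": [9.8, 10.2, 2.5]})

# Old behaviour: cast to str, so sorting is lexicographic and "10.2" comes before "9.8".
print(df.astype(str).sort_values("Score")["Score"].tolist())  # ['10.2', '2.5', '9.8']

# New behaviour: keep float dtypes, so sorting is numeric.
print(df.sort_values("Score")["Score"].tolist())  # [2.5, 9.8, 10.2]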
app.py
CHANGED
@@ -206,7 +206,7 @@ for model in EXTERNAL_MODELS:
                 EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
 
-def get_mteb_data(tasks=["Clustering"], langs=[], cast_to_str=True, task_to_metric=TASK_TO_METRIC):
+def get_mteb_data(tasks=["Clustering"], langs=[], task_to_metric=TASK_TO_METRIC):
     api = HfApi()
     models = api.list_models(filter="mteb")
     # Initialize list to models that we cannot fetch metadata from
@@ -255,8 +255,6 @@ def get_mteb_data(tasks=["Clustering"], langs=[], cast_to_str=True, task_to_metric=TASK_TO_METRIC):
     cols.insert(0, cols.pop(cols.index("Model")))
     df = df[cols]
     df.fillna("", inplace=True)
-    if cast_to_str:
-        return df.astype(str) # Cast to str as Gradio does not accept floats
     return df
 
 def get_mteb_average():
@@ -272,7 +270,6 @@ def get_mteb_average():
             "Summarization",
         ],
         langs=["en", "en-en"],
-        cast_to_str=False
     )
     # Approximation (Missing Bitext Mining & including some nans)
     NUM_SCORES = DATA_OVERALL.shape[0] * DATA_OVERALL.shape[1]
@@ -292,7 +289,7 @@ def get_mteb_average():
     # Start ranking from 1
     DATA_OVERALL.insert(0, "Rank", list(range(1, len(DATA_OVERALL) + 1)))
 
-    DATA_OVERALL = DATA_OVERALL.round(2)
+    DATA_OVERALL = DATA_OVERALL.round(2)
 
     DATA_CLASSIFICATION_EN = DATA_OVERALL[["Model"] + TASK_LIST_CLASSIFICATION]
     DATA_CLUSTERING = DATA_OVERALL[["Model"] + TASK_LIST_CLUSTERING]
@@ -331,7 +328,7 @@ with block:
     with gr.Row():
         data_overall = gr.components.Dataframe(
             DATA_OVERALL,
-            datatype=["markdown"] * len(DATA_OVERALL.columns),
+            datatype=["number", "markdown"] + ["number"] * len(DATA_OVERALL.columns),
             type="pandas",
             wrap=True,
         )
@@ -348,7 +345,7 @@ with block:
         """)
     with gr.Row():
         data_bitext_mining = gr.components.Dataframe(
-            datatype=["markdown"] * 500, # hack when we don't know how many columns
+            datatype=["markdown"] + ["number"] * 500, # hack when we don't know how many columns
             type="pandas",
         )
     with gr.Row():
@@ -371,7 +368,7 @@ with block:
     with gr.Row():
         data_classification_en = gr.components.Dataframe(
             DATA_CLASSIFICATION_EN,
-            datatype=["markdown"] * len(DATA_CLASSIFICATION_EN.columns),
+            datatype=["markdown"] + ["number"] * len(DATA_CLASSIFICATION_EN.columns),
             type="pandas",
         )
     with gr.Row():
@@ -396,7 +393,7 @@ with block:
         """)
     with gr.Row():
        data_classification = gr.components.Dataframe(
-            datatype=["markdown"] * 200, # hack when we don't know how many columns
+            datatype=["markdown"] + ["number"] * 200, # hack when we don't know how many columns
             type="pandas",
         )
     with gr.Row():
@@ -418,7 +415,7 @@ with block:
     with gr.Row():
         data_clustering = gr.components.Dataframe(
             DATA_CLUSTERING,
-            datatype=["markdown"] * len(DATA_CLUSTERING.columns),
+            datatype=["markdown"] + ["number"] * len(DATA_CLUSTERING.columns),
             type="pandas",
         )
     with gr.Row():
@@ -440,7 +437,7 @@ with block:
     with gr.Row():
         data_pair_classification = gr.components.Dataframe(
             DATA_PAIR_CLASSIFICATION,
-            datatype=["markdown"] * len(DATA_PAIR_CLASSIFICATION.columns),
+            datatype=["markdown"] + ["number"] * len(DATA_PAIR_CLASSIFICATION.columns),
             type="pandas",
         )
     with gr.Row():
@@ -462,7 +459,8 @@ with block:
     with gr.Row():
         data_retrieval = gr.components.Dataframe(
             DATA_RETRIEVAL,
-            datatype=["markdown"] * len(DATA_RETRIEVAL.columns),
+            # Add support for more columns than existing as a buffer for CQADupstack & other Retrieval tasks (e.g. MSMARCOv2)
+            datatype=["markdown"] + ["number"] * len(DATA_RETRIEVAL.columns) * 2,
             type="pandas",
         )
     with gr.Row():
@@ -482,7 +480,7 @@ with block:
     with gr.Row():
         data_reranking = gr.components.Dataframe(
             DATA_RERANKING,
-            datatype=["markdown"] * len(DATA_RERANKING.columns),
+            datatype=["markdown"] + ["number"] * len(DATA_RERANKING.columns),
             type="pandas",
         )
     with gr.Row():
@@ -504,7 +502,7 @@ with block:
     with gr.Row():
         data_sts_en = gr.components.Dataframe(
             DATA_STS_EN,
-            datatype=["markdown"] * len(DATA_STS_EN.columns),
+            datatype=["markdown"] + ["number"] * len(DATA_STS_EN.columns),
             type="pandas",
         )
     with gr.Row():
@@ -526,7 +524,7 @@ with block:
         """)
     with gr.Row():
         data_sts = gr.components.Dataframe(
-            datatype=["markdown"] * 100, # hack when we don't know how many columns
+            datatype=["markdown"] + ["number"] * 100, # hack when we don't know how many columns
             type="pandas",
         )
     with gr.Row():
@@ -543,8 +541,8 @@ with block:
         """)
     with gr.Row():
         data_summarization = gr.components.Dataframe(
-            DATA_SUMMARIZATION,
-            datatype="markdown",
+            DATA_SUMMARIZATION,
+            datatype=["markdown"] + ["number"] * 2,
             type="pandas",
         )
     with gr.Row():
@@ -564,6 +562,7 @@ with block:
     block.load(get_mteb_data, inputs=[task_classification_en, lang_classification_en], outputs=data_classification_en)
     block.load(get_mteb_data, inputs=[task_classification], outputs=data_classification)
     block.load(get_mteb_data, inputs=[task_clustering], outputs=data_clustering)
+    block.load(get_mteb_data, inputs=[task_pair_classification], outputs=data_pair_classification)
     block.load(get_mteb_data, inputs=[task_retrieval], outputs=data_retrieval)
     block.load(get_mteb_data, inputs=[task_reranking], outputs=data_reranking)
     block.load(get_mteb_data, inputs=[task_sts_en, lang_sts_en], outputs=data_sts_en)
@@ -577,6 +576,7 @@ block.launch()
 # Could check if tasks are valid (Currently users could just invent new tasks - similar for languages)
 # Could make it load in the background without the Gradio logo closer to the Deep RL space
 # Could add graphs / other visual content
+# Could add verification marks
 
 # Sources:
 # https://huggingface.co/spaces/gradio/leaderboard
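Usage note on the new datatype lists: gr.components.Dataframe takes datatype as a per-column list, so the tables above declare the Model column as "markdown" (the model names stay rendered as links) and the score columns as "number" (so header-click sorting compares floats); the overall table gets an extra leading "number" for its Rank column. A stand-alone sketch of the same pattern, assuming a Gradio 3.x style API and invented data rather than real MTEB results:

import gradio as gr
import pandas as pd

# Invented leaderboard slice; app.py builds the real frame via get_mteb_data().
df = pd.DataFrame({
    "Model": ["[model-a](https://hf.co/model-a)", "[model-b](https://hf.co/model-b)"],
    "TaskA": [61.5, 59.2],
    "TaskB": [70.1, 72.3],
})

with gr.Blocks() as demo:
    gr.components.Dataframe(
        df,
        # First column rendered as markdown links, remaining columns as numbers
        # so header-click sorting is numeric instead of lexicographic.
        datatype=["markdown"] + ["number"] * (len(df.columns) - 1),
        type="pandas",
    )

# demo.launch()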