Spaces:
Sleeping
Sleeping
Upload 14 files
Browse files- app.py +458 -41
- external_models.csv +31 -31
- extract_portuguese_leaderboard.py +2 -0
- portuguese_leaderboard.csv +0 -0
app.py
CHANGED
@@ -24,14 +24,14 @@ def load_portuguese_leaderboard_data() -> pd.DataFrame:
|
|
24 |
if os.path.exists(csv_path):
|
25 |
df = pd.read_csv(csv_path)
|
26 |
# Select only the relevant columns
|
27 |
-
relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
|
28 |
df = df[relevant_columns].copy()
|
29 |
|
30 |
# Rename columns to match the existing format
|
31 |
df = df.rename(columns={
|
32 |
'assin2_rte': 'ASSIN2 RTE',
|
33 |
'assin2_sts': 'ASSIN2 STS',
|
34 |
-
'faquad_nli': '
|
35 |
'hatebr_offensive': 'HateBR'
|
36 |
})
|
37 |
|
@@ -62,13 +62,16 @@ def load_external_models_data() -> pd.DataFrame:
|
|
62 |
'model': 'model_name',
|
63 |
'assin2_rte': 'ASSIN2 RTE',
|
64 |
'assin2_sts': 'ASSIN2 STS',
|
65 |
-
'faquad_nli': '
|
66 |
'hatebr_offensive': 'HateBR'
|
67 |
})
|
68 |
|
69 |
# Add source information
|
70 |
df['source'] = 'external_models'
|
71 |
|
|
|
|
|
|
|
72 |
print(f"Loaded {len(df)} external models")
|
73 |
return df
|
74 |
else:
|
@@ -84,7 +87,7 @@ PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
|
|
84 |
# Load external models data
|
85 |
EXTERNAL_MODELS_DATA = load_external_models_data()
|
86 |
|
87 |
-
def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
|
88 |
"""Create a simplified benchmark table with one column per dataset."""
|
89 |
# Get all dataset names
|
90 |
dataset_names = sorted(NAPOLAB_DATASETS.keys())
|
@@ -120,14 +123,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
120 |
model_data[model_name] = {
|
121 |
'dataset_scores': {},
|
122 |
'url': None,
|
123 |
-
'source': 'portuguese_leaderboard'
|
|
|
124 |
}
|
125 |
|
126 |
# Map Portuguese leaderboard columns to dataset names
|
127 |
column_mapping = {
|
128 |
'ASSIN2 RTE': 'assin2_rte',
|
129 |
'ASSIN2 STS': 'assin2_sts',
|
130 |
-
'
|
131 |
'HateBR': 'hatebr'
|
132 |
}
|
133 |
|
@@ -146,14 +150,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
146 |
model_data[model_name] = {
|
147 |
'dataset_scores': {},
|
148 |
'url': row.get('link', ''),
|
149 |
-
'source': 'external_models'
|
|
|
150 |
}
|
151 |
|
152 |
# Map external models columns to dataset names
|
153 |
column_mapping = {
|
154 |
'ASSIN2 RTE': 'assin2_rte',
|
155 |
'ASSIN2 STS': 'assin2_sts',
|
156 |
-
'
|
157 |
'HateBR': 'hatebr'
|
158 |
}
|
159 |
|
@@ -177,6 +182,9 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
177 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
178 |
source = model_metadata.get('source', 'unknown')
|
179 |
model_data[model_name]['source'] = source
|
|
|
|
|
|
|
180 |
|
181 |
# Create table data
|
182 |
table_data = []
|
@@ -198,6 +206,12 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
|
|
198 |
if source == 'unknown':
|
199 |
continue
|
200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
201 |
# Create clickable link for model name
|
202 |
if data['url']:
|
203 |
model_display = f"[{model_name}]({data['url']})"
|
@@ -394,7 +408,7 @@ def cleanup_current_csv():
|
|
394 |
print(f"Error deleting file {current_csv_file}: {e}")
|
395 |
|
396 |
|
397 |
-
def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
|
398 |
"""Create a radar chart showing model performance across all datasets."""
|
399 |
# Use selected datasets if provided, otherwise use all datasets
|
400 |
if selected_datasets is None:
|
@@ -431,14 +445,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
431 |
model_data[model_name] = {
|
432 |
'performances': {},
|
433 |
'architecture': 'Unknown',
|
434 |
-
'source': 'portuguese_leaderboard'
|
|
|
435 |
}
|
436 |
|
437 |
# Map Portuguese leaderboard columns to dataset names
|
438 |
column_mapping = {
|
439 |
'ASSIN2 RTE': 'assin2_rte',
|
440 |
'ASSIN2 STS': 'assin2_sts',
|
441 |
-
'
|
442 |
'HateBR': 'hatebr'
|
443 |
}
|
444 |
|
@@ -457,14 +472,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
457 |
model_data[model_name] = {
|
458 |
'performances': {},
|
459 |
'architecture': 'Unknown',
|
460 |
-
'source': 'external_models'
|
|
|
461 |
}
|
462 |
|
463 |
# Map external models columns to dataset names
|
464 |
column_mapping = {
|
465 |
'ASSIN2 RTE': 'assin2_rte',
|
466 |
'ASSIN2 STS': 'assin2_sts',
|
467 |
-
'
|
468 |
'HateBR': 'hatebr'
|
469 |
}
|
470 |
|
@@ -488,6 +504,9 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
488 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
489 |
source = model_metadata.get('source', 'unknown')
|
490 |
model_data[model_name]['source'] = source
|
|
|
|
|
|
|
491 |
|
492 |
# Apply source filtering
|
493 |
filtered_model_data = {}
|
@@ -507,6 +526,12 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
|
|
507 |
if source == 'unknown':
|
508 |
continue
|
509 |
|
|
|
|
|
|
|
|
|
|
|
|
|
510 |
filtered_model_data[model_name] = data
|
511 |
|
512 |
# Apply incomplete model filtering
|
@@ -731,8 +756,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
731 |
dataset_checkboxes = []
|
732 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
733 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
734 |
-
# Default to selected only for ASSIN 2 STS,
|
735 |
-
default_value =
|
736 |
checkbox = gr.Checkbox(
|
737 |
label=display_name,
|
738 |
value=default_value
|
@@ -774,6 +799,22 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
774 |
value=True
|
775 |
)
|
776 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
777 |
# Search bar for filtering models
|
778 |
search_query = gr.Textbox(
|
779 |
label="Search models by name (supports regex)",
|
@@ -807,8 +848,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
807 |
analysis_dataset_checkboxes = []
|
808 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
809 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
810 |
-
# Default to selected only for ASSIN 2 STS,
|
811 |
-
default_value =
|
812 |
checkbox = gr.Checkbox(
|
813 |
label=display_name,
|
814 |
value=default_value
|
@@ -853,6 +894,18 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
853 |
value=True
|
854 |
)
|
855 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
856 |
# Search bar for filtering models in radar chart
|
857 |
search_query_analysis = gr.Textbox(
|
858 |
label="Search models by name (supports regex)",
|
@@ -863,6 +916,9 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
863 |
|
864 |
model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
|
865 |
|
|
|
|
|
|
|
866 |
gr.Markdown("""
|
867 |
**How to interact with the chart:**
|
868 |
- **Click on legend items** to show/hide specific models.
|
@@ -918,6 +974,272 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
918 |
|
919 |
""")
|
920 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
921 |
# Event handlers
|
922 |
def update_radar_chart(*args):
|
923 |
# Extract arguments for radar chart
|
@@ -929,6 +1251,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
929 |
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
|
930 |
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
|
931 |
search_query = args[len(analysis_dataset_checkboxes) + 6]
|
|
|
932 |
|
933 |
# Convert dataset selections to list of selected dataset names
|
934 |
selected_datasets = []
|
@@ -936,7 +1259,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
936 |
if dataset_values[i]:
|
937 |
selected_datasets.append(dataset_name)
|
938 |
|
939 |
-
return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
|
940 |
|
941 |
def update_benchmark_table(*args):
|
942 |
# Extract arguments
|
@@ -948,6 +1271,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
948 |
show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
|
949 |
show_external_models = args[len(dataset_checkboxes) + 5]
|
950 |
search_query = args[len(dataset_checkboxes) + 6]
|
|
|
951 |
|
952 |
# Convert dataset selections to list of selected dataset names
|
953 |
selected_datasets = []
|
@@ -955,65 +1279,85 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
955 |
if dataset_values[i]:
|
956 |
selected_datasets.append(dataset_name)
|
957 |
|
958 |
-
df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
|
959 |
|
960 |
return df
|
961 |
|
962 |
-
|
963 |
-
|
964 |
-
|
965 |
-
|
966 |
-
|
967 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
968 |
|
969 |
# Connect dataset checkboxes to update table
|
970 |
for dataset_name, checkbox in dataset_checkboxes:
|
971 |
checkbox.change(
|
972 |
update_benchmark_table,
|
973 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
974 |
outputs=benchmark_table
|
975 |
)
|
976 |
|
977 |
hide_incomplete_models.change(
|
978 |
update_benchmark_table,
|
979 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
980 |
outputs=benchmark_table
|
981 |
)
|
982 |
|
983 |
min_average_performance.change(
|
984 |
update_benchmark_table,
|
985 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
986 |
outputs=benchmark_table
|
987 |
)
|
988 |
|
989 |
show_napolab_thesis.change(
|
990 |
update_benchmark_table,
|
991 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
992 |
outputs=benchmark_table
|
993 |
)
|
994 |
|
995 |
show_teenytinyllama.change(
|
996 |
update_benchmark_table,
|
997 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
998 |
outputs=benchmark_table
|
999 |
)
|
1000 |
|
1001 |
show_portuguese_leaderboard.change(
|
1002 |
update_benchmark_table,
|
1003 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
1004 |
outputs=benchmark_table
|
1005 |
)
|
1006 |
|
1007 |
show_external_models.change(
|
1008 |
update_benchmark_table,
|
1009 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
1010 |
outputs=benchmark_table
|
1011 |
)
|
1012 |
|
1013 |
# Connect search query to update table
|
1014 |
search_query.change(
|
1015 |
update_benchmark_table,
|
1016 |
-
inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1017 |
outputs=benchmark_table
|
1018 |
)
|
1019 |
|
@@ -1036,52 +1380,125 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
|
|
1036 |
for dataset_name, checkbox in analysis_dataset_checkboxes:
|
1037 |
checkbox.change(
|
1038 |
update_radar_chart,
|
1039 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1040 |
outputs=model_analysis_chart
|
1041 |
)
|
1042 |
|
1043 |
hide_incomplete_models_analysis.change(
|
1044 |
update_radar_chart,
|
1045 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1046 |
outputs=model_analysis_chart
|
1047 |
)
|
1048 |
|
1049 |
min_average_performance_analysis.change(
|
1050 |
update_radar_chart,
|
1051 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1052 |
outputs=model_analysis_chart
|
1053 |
)
|
1054 |
|
1055 |
show_napolab_thesis_analysis.change(
|
1056 |
update_radar_chart,
|
1057 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1058 |
outputs=model_analysis_chart
|
1059 |
)
|
1060 |
|
1061 |
show_teenytinyllama_analysis.change(
|
1062 |
update_radar_chart,
|
1063 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1064 |
outputs=model_analysis_chart
|
1065 |
)
|
1066 |
|
1067 |
show_portuguese_leaderboard_analysis.change(
|
1068 |
update_radar_chart,
|
1069 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1070 |
outputs=model_analysis_chart
|
1071 |
)
|
1072 |
|
1073 |
show_external_models_analysis.change(
|
1074 |
update_radar_chart,
|
1075 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1076 |
outputs=model_analysis_chart
|
1077 |
)
|
1078 |
|
1079 |
# Connect search query to update radar chart
|
1080 |
search_query_analysis.change(
|
1081 |
update_radar_chart,
|
1082 |
-
inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
|
1083 |
outputs=model_analysis_chart
|
1084 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1085 |
|
1086 |
if __name__ == "__main__":
|
1087 |
app.launch(server_name="0.0.0.0", server_port=7860)
|
|
|
24 |
if os.path.exists(csv_path):
|
25 |
df = pd.read_csv(csv_path)
|
26 |
# Select only the relevant columns
|
27 |
+
relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
|
28 |
df = df[relevant_columns].copy()
|
29 |
|
30 |
# Rename columns to match the existing format
|
31 |
df = df.rename(columns={
|
32 |
'assin2_rte': 'ASSIN2 RTE',
|
33 |
'assin2_sts': 'ASSIN2 STS',
|
34 |
+
'faquad_nli': 'FaQUaD-NLI',
|
35 |
'hatebr_offensive': 'HateBR'
|
36 |
})
|
37 |
|
|
|
62 |
'model': 'model_name',
|
63 |
'assin2_rte': 'ASSIN2 RTE',
|
64 |
'assin2_sts': 'ASSIN2 STS',
|
65 |
+
'faquad_nli': 'FaQUaD-NLI',
|
66 |
'hatebr_offensive': 'HateBR'
|
67 |
})
|
68 |
|
69 |
# Add source information
|
70 |
df['source'] = 'external_models'
|
71 |
|
72 |
+
# Add model_num_parameters column with 0 for external models
|
73 |
+
df['model_num_parameters'] = 0
|
74 |
+
|
75 |
print(f"Loaded {len(df)} external models")
|
76 |
return df
|
77 |
else:
|
|
|
87 |
# Load external models data
|
88 |
EXTERNAL_MODELS_DATA = load_external_models_data()
|
89 |
|
90 |
+
def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame:
|
91 |
"""Create a simplified benchmark table with one column per dataset."""
|
92 |
# Get all dataset names
|
93 |
dataset_names = sorted(NAPOLAB_DATASETS.keys())
|
|
|
123 |
model_data[model_name] = {
|
124 |
'dataset_scores': {},
|
125 |
'url': None,
|
126 |
+
'source': 'portuguese_leaderboard',
|
127 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
128 |
}
|
129 |
|
130 |
# Map Portuguese leaderboard columns to dataset names
|
131 |
column_mapping = {
|
132 |
'ASSIN2 RTE': 'assin2_rte',
|
133 |
'ASSIN2 STS': 'assin2_sts',
|
134 |
+
'FaQUaD-NLI': 'faquad-nli',
|
135 |
'HateBR': 'hatebr'
|
136 |
}
|
137 |
|
|
|
150 |
model_data[model_name] = {
|
151 |
'dataset_scores': {},
|
152 |
'url': row.get('link', ''),
|
153 |
+
'source': 'external_models',
|
154 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
155 |
}
|
156 |
|
157 |
# Map external models columns to dataset names
|
158 |
column_mapping = {
|
159 |
'ASSIN2 RTE': 'assin2_rte',
|
160 |
'ASSIN2 STS': 'assin2_sts',
|
161 |
+
'FaQUaD-NLI': 'faquad-nli',
|
162 |
'HateBR': 'hatebr'
|
163 |
}
|
164 |
|
|
|
182 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
183 |
source = model_metadata.get('source', 'unknown')
|
184 |
model_data[model_name]['source'] = source
|
185 |
+
|
186 |
+
# Add num_parameters for existing models (set to 0 as they don't have this info)
|
187 |
+
model_data[model_name]['num_parameters'] = 0
|
188 |
|
189 |
# Create table data
|
190 |
table_data = []
|
|
|
206 |
if source == 'unknown':
|
207 |
continue
|
208 |
|
209 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
210 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
211 |
+
num_parameters = data.get('num_parameters', 0)
|
212 |
+
if num_parameters > max_num_parameters:
|
213 |
+
continue
|
214 |
+
|
215 |
# Create clickable link for model name
|
216 |
if data['url']:
|
217 |
model_display = f"[{model_name}]({data['url']})"
|
|
|
408 |
print(f"Error deleting file {current_csv_file}: {e}")
|
409 |
|
410 |
|
411 |
+
def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
|
412 |
"""Create a radar chart showing model performance across all datasets."""
|
413 |
# Use selected datasets if provided, otherwise use all datasets
|
414 |
if selected_datasets is None:
|
|
|
445 |
model_data[model_name] = {
|
446 |
'performances': {},
|
447 |
'architecture': 'Unknown',
|
448 |
+
'source': 'portuguese_leaderboard',
|
449 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
450 |
}
|
451 |
|
452 |
# Map Portuguese leaderboard columns to dataset names
|
453 |
column_mapping = {
|
454 |
'ASSIN2 RTE': 'assin2_rte',
|
455 |
'ASSIN2 STS': 'assin2_sts',
|
456 |
+
'FaQUaD-NLI': 'faquad-nli',
|
457 |
'HateBR': 'hatebr'
|
458 |
}
|
459 |
|
|
|
472 |
model_data[model_name] = {
|
473 |
'performances': {},
|
474 |
'architecture': 'Unknown',
|
475 |
+
'source': 'external_models',
|
476 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
477 |
}
|
478 |
|
479 |
# Map external models columns to dataset names
|
480 |
column_mapping = {
|
481 |
'ASSIN2 RTE': 'assin2_rte',
|
482 |
'ASSIN2 STS': 'assin2_sts',
|
483 |
+
'FaQUaD-NLI': 'faquad-nli',
|
484 |
'HateBR': 'hatebr'
|
485 |
}
|
486 |
|
|
|
504 |
model_metadata = MODEL_METADATA.get(model_name, {})
|
505 |
source = model_metadata.get('source', 'unknown')
|
506 |
model_data[model_name]['source'] = source
|
507 |
+
|
508 |
+
# Add num_parameters for existing models (set to 0 as they don't have this info)
|
509 |
+
model_data[model_name]['num_parameters'] = 0
|
510 |
|
511 |
# Apply source filtering
|
512 |
filtered_model_data = {}
|
|
|
526 |
if source == 'unknown':
|
527 |
continue
|
528 |
|
529 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
530 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
531 |
+
num_parameters = data.get('num_parameters', 0)
|
532 |
+
if num_parameters > max_num_parameters:
|
533 |
+
continue
|
534 |
+
|
535 |
filtered_model_data[model_name] = data
|
536 |
|
537 |
# Apply incomplete model filtering
|
|
|
756 |
dataset_checkboxes = []
|
757 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
758 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
759 |
+
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
|
760 |
+
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
|
761 |
checkbox = gr.Checkbox(
|
762 |
label=display_name,
|
763 |
value=default_value
|
|
|
799 |
value=True
|
800 |
)
|
801 |
|
802 |
+
# Calculate max parameters for slider
|
803 |
+
max_params = 0
|
804 |
+
if not PORTUGUESE_LEADERBOARD_DATA.empty:
|
805 |
+
max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
|
806 |
+
|
807 |
+
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
|
808 |
+
with gr.Row():
|
809 |
+
max_num_parameters = gr.Slider(
|
810 |
+
minimum=0,
|
811 |
+
maximum=max_params,
|
812 |
+
value=0,
|
813 |
+
step=1,
|
814 |
+
label="Maximum Number of Parameters",
|
815 |
+
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
|
816 |
+
)
|
817 |
+
|
818 |
# Search bar for filtering models
|
819 |
search_query = gr.Textbox(
|
820 |
label="Search models by name (supports regex)",
|
|
|
848 |
analysis_dataset_checkboxes = []
|
849 |
for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
|
850 |
display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
|
851 |
+
# Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
|
852 |
+
default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
|
853 |
checkbox = gr.Checkbox(
|
854 |
label=display_name,
|
855 |
value=default_value
|
|
|
894 |
value=True
|
895 |
)
|
896 |
|
897 |
+
# Parameter slider for Model Analysis tab
|
898 |
+
with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
|
899 |
+
with gr.Row():
|
900 |
+
max_num_parameters_analysis = gr.Slider(
|
901 |
+
minimum=0,
|
902 |
+
maximum=max_params,
|
903 |
+
value=0,
|
904 |
+
step=1,
|
905 |
+
label="Maximum Number of Parameters",
|
906 |
+
info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
|
907 |
+
)
|
908 |
+
|
909 |
# Search bar for filtering models in radar chart
|
910 |
search_query_analysis = gr.Textbox(
|
911 |
label="Search models by name (supports regex)",
|
|
|
916 |
|
917 |
model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")
|
918 |
|
919 |
+
# Add scatter plot below radar chart
|
920 |
+
model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
|
921 |
+
|
922 |
gr.Markdown("""
|
923 |
**How to interact with the chart:**
|
924 |
- **Click on legend items** to show/hide specific models.
|
|
|
974 |
|
975 |
""")
|
976 |
|
977 |
+
def create_model_performance_scatter(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
|
978 |
+
"""Create a scatter plot showing model performance vs number of parameters."""
|
979 |
+
# Use selected datasets if provided, otherwise use all datasets
|
980 |
+
if selected_datasets is None:
|
981 |
+
selected_datasets = list(NAPOLAB_DATASETS.keys())
|
982 |
+
|
983 |
+
# Collect data for each model
|
984 |
+
model_data = {}
|
985 |
+
|
986 |
+
# Process existing benchmark results
|
987 |
+
for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
|
988 |
+
if dataset_name in selected_datasets:
|
989 |
+
for model_name, metrics in models.items():
|
990 |
+
if model_name not in model_data:
|
991 |
+
# Get actual source from MODEL_METADATA
|
992 |
+
model_metadata = MODEL_METADATA.get(model_name, {})
|
993 |
+
actual_source = model_metadata.get('source', 'unknown')
|
994 |
+
|
995 |
+
model_data[model_name] = {
|
996 |
+
'performances': {},
|
997 |
+
'architecture': model_metadata.get('architecture', 'Unknown'),
|
998 |
+
'source': actual_source,
|
999 |
+
'num_parameters': 0
|
1000 |
+
}
|
1001 |
+
|
1002 |
+
# Calculate average performance for this dataset
|
1003 |
+
avg_performance = np.mean(list(metrics.values()))
|
1004 |
+
model_data[model_name]['performances'][dataset_name] = avg_performance
|
1005 |
+
|
1006 |
+
# Process Portuguese leaderboard data
|
1007 |
+
if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
|
1008 |
+
for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
|
1009 |
+
model_name = row['model_name']
|
1010 |
+
|
1011 |
+
if model_name not in model_data:
|
1012 |
+
model_data[model_name] = {
|
1013 |
+
'performances': {},
|
1014 |
+
'architecture': 'Unknown',
|
1015 |
+
'source': 'portuguese_leaderboard',
|
1016 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
1017 |
+
}
|
1018 |
+
|
1019 |
+
# Map Portuguese leaderboard columns to dataset names
|
1020 |
+
column_mapping = {
|
1021 |
+
'ASSIN2 RTE': 'assin2_rte',
|
1022 |
+
'ASSIN2 STS': 'assin2_sts',
|
1023 |
+
'FaQUaD-NLI': 'faquad-nli',
|
1024 |
+
'HateBR': 'hatebr'
|
1025 |
+
}
|
1026 |
+
|
1027 |
+
for display_name, dataset_name in column_mapping.items():
|
1028 |
+
if dataset_name in selected_datasets:
|
1029 |
+
score = row[display_name]
|
1030 |
+
if pd.notna(score) and score > 0:
|
1031 |
+
model_data[model_name]['performances'][dataset_name] = score
|
1032 |
+
|
1033 |
+
# Process external models data
|
1034 |
+
if show_external_models and not EXTERNAL_MODELS_DATA.empty:
|
1035 |
+
for _, row in EXTERNAL_MODELS_DATA.iterrows():
|
1036 |
+
model_name = row['model_name']
|
1037 |
+
|
1038 |
+
if model_name not in model_data:
|
1039 |
+
model_data[model_name] = {
|
1040 |
+
'performances': {},
|
1041 |
+
'architecture': 'Unknown',
|
1042 |
+
'source': 'external_models',
|
1043 |
+
'num_parameters': row.get('model_num_parameters', 0)
|
1044 |
+
}
|
1045 |
+
|
1046 |
+
# Map external models columns to dataset names
|
1047 |
+
column_mapping = {
|
1048 |
+
'ASSIN2 RTE': 'assin2_rte',
|
1049 |
+
'ASSIN2 STS': 'assin2_sts',
|
1050 |
+
'FaQUaD-NLI': 'faquad-nli',
|
1051 |
+
'HateBR': 'hatebr'
|
1052 |
+
}
|
1053 |
+
|
1054 |
+
for display_name, dataset_name in column_mapping.items():
|
1055 |
+
if dataset_name in selected_datasets:
|
1056 |
+
score = row[display_name]
|
1057 |
+
if pd.notna(score) and score > 0:
|
1058 |
+
model_data[model_name]['performances'][dataset_name] = score
|
1059 |
+
|
1060 |
+
# Apply source filtering
|
1061 |
+
filtered_model_data = {}
|
1062 |
+
for model_name, data in model_data.items():
|
1063 |
+
source = data.get('source', 'existing')
|
1064 |
+
|
1065 |
+
# Apply show filters - only show models from sources that are checked
|
1066 |
+
if source == 'napolab_thesis' and not show_napolab_thesis:
|
1067 |
+
continue
|
1068 |
+
if source == 'teenytinyllama_paper' and not show_teenytinyllama:
|
1069 |
+
continue
|
1070 |
+
if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
|
1071 |
+
continue
|
1072 |
+
if source == 'external_models' and not show_external_models:
|
1073 |
+
continue
|
1074 |
+
# Hide models with unknown source (should not happen with proper data)
|
1075 |
+
if source == 'unknown':
|
1076 |
+
continue
|
1077 |
+
|
1078 |
+
# Apply parameter filtering (only for Portuguese leaderboard models)
|
1079 |
+
if max_num_parameters > 0 and source == 'portuguese_leaderboard':
|
1080 |
+
num_parameters = data.get('num_parameters', 0)
|
1081 |
+
if num_parameters > max_num_parameters:
|
1082 |
+
continue
|
1083 |
+
|
1084 |
+
filtered_model_data[model_name] = data
|
1085 |
+
|
1086 |
+
# Apply incomplete model filtering
|
1087 |
+
if hide_incomplete_models and selected_datasets:
|
1088 |
+
final_filtered_data = {}
|
1089 |
+
for model_name, data in filtered_model_data.items():
|
1090 |
+
has_all_scores = True
|
1091 |
+
for dataset_name in selected_datasets:
|
1092 |
+
if data['performances'].get(dataset_name, 0) == 0:
|
1093 |
+
has_all_scores = False
|
1094 |
+
break
|
1095 |
+
if has_all_scores:
|
1096 |
+
final_filtered_data[model_name] = data
|
1097 |
+
filtered_model_data = final_filtered_data
|
1098 |
+
|
1099 |
+
# Apply minimum average performance filtering
|
1100 |
+
if min_average_performance > 0 and selected_datasets:
|
1101 |
+
final_filtered_data = {}
|
1102 |
+
for model_name, data in filtered_model_data.items():
|
1103 |
+
# Calculate average performance for selected datasets
|
1104 |
+
scores = []
|
1105 |
+
for dataset_name in selected_datasets:
|
1106 |
+
score = data['performances'].get(dataset_name, 0)
|
1107 |
+
if score > 0: # Only include non-zero scores
|
1108 |
+
scores.append(score)
|
1109 |
+
|
1110 |
+
if scores:
|
1111 |
+
avg_performance = np.mean(scores)
|
1112 |
+
if avg_performance >= min_average_performance:
|
1113 |
+
final_filtered_data[model_name] = data
|
1114 |
+
filtered_model_data = final_filtered_data
|
1115 |
+
|
1116 |
+
# Apply search query filtering
|
1117 |
+
if search_query:
|
1118 |
+
final_filtered_data = {}
|
1119 |
+
try:
|
1120 |
+
# Use regex pattern matching
|
1121 |
+
import re
|
1122 |
+
pattern = re.compile(search_query, re.IGNORECASE)
|
1123 |
+
for model_name, data in filtered_model_data.items():
|
1124 |
+
if pattern.search(model_name):
|
1125 |
+
final_filtered_data[model_name] = data
|
1126 |
+
except re.error:
|
1127 |
+
# Fallback to simple string matching if regex is invalid
|
1128 |
+
for model_name, data in filtered_model_data.items():
|
1129 |
+
if search_query.lower() in model_name.lower():
|
1130 |
+
final_filtered_data[model_name] = data
|
1131 |
+
filtered_model_data = final_filtered_data
|
1132 |
+
|
1133 |
+
# Prepare data for scatter plot
|
1134 |
+
scatter_data = []
|
1135 |
+
for model_name, data in filtered_model_data.items():
|
1136 |
+
# Calculate average performance for selected datasets
|
1137 |
+
scores = []
|
1138 |
+
for dataset_name in selected_datasets:
|
1139 |
+
score = data['performances'].get(dataset_name, 0)
|
1140 |
+
if score > 0: # Only include non-zero scores
|
1141 |
+
scores.append(score)
|
1142 |
+
|
1143 |
+
if scores:
|
1144 |
+
avg_performance = np.mean(scores)
|
1145 |
+
num_parameters = data.get('num_parameters', 0)
|
1146 |
+
source = data.get('source', 'unknown')
|
1147 |
+
|
1148 |
+
scatter_data.append({
|
1149 |
+
'model_name': model_name,
|
1150 |
+
'avg_performance': avg_performance,
|
1151 |
+
'num_parameters': num_parameters,
|
1152 |
+
'source': source
|
1153 |
+
})
|
1154 |
+
|
1155 |
+
if not scatter_data:
|
1156 |
+
# Create empty figure if no data
|
1157 |
+
fig = go.Figure()
|
1158 |
+
fig.add_annotation(
|
1159 |
+
text="No data available for the selected filters",
|
1160 |
+
xref="paper", yref="paper",
|
1161 |
+
x=0.5, y=0.5, showarrow=False,
|
1162 |
+
font=dict(size=16)
|
1163 |
+
)
|
1164 |
+
fig.update_layout(
|
1165 |
+
title="Model Performance vs Number of Parameters",
|
1166 |
+
xaxis_title="Number of Parameters",
|
1167 |
+
yaxis_title="Average Performance Score",
|
1168 |
+
height=500
|
1169 |
+
)
|
1170 |
+
return fig
|
1171 |
+
|
1172 |
+
# Create scatter plot
|
1173 |
+
df_scatter = pd.DataFrame(scatter_data)
|
1174 |
+
|
1175 |
+
# Create color mapping for sources
|
1176 |
+
color_map = {
|
1177 |
+
'portuguese_leaderboard': '#1f77b4',
|
1178 |
+
'external_models': '#ff7f0e',
|
1179 |
+
'napolab_thesis': '#2ca02c',
|
1180 |
+
'teenytinyllama_paper': '#d62728',
|
1181 |
+
'unknown': '#9467bd'
|
1182 |
+
}
|
1183 |
+
|
1184 |
+
# Create display name mapping for sources
|
1185 |
+
display_name_map = {
|
1186 |
+
'portuguese_leaderboard': 'Open PT LLM Leaderboard',
|
1187 |
+
'external_models': 'Proprietary Models',
|
1188 |
+
'napolab_thesis': 'Napolab Thesis',
|
1189 |
+
'teenytinyllama_paper': 'TeenyTinyLlama Paper',
|
1190 |
+
'unknown': 'Unknown Source'
|
1191 |
+
}
|
1192 |
+
|
1193 |
+
fig = go.Figure()
|
1194 |
+
|
1195 |
+
for source in df_scatter['source'].unique():
|
1196 |
+
source_data = df_scatter[df_scatter['source'] == source]
|
1197 |
+
color = color_map.get(source, '#7f7f7f')
|
1198 |
+
display_name = display_name_map.get(source, source.replace('_', ' ').title())
|
1199 |
+
|
1200 |
+
fig.add_trace(go.Scatter(
|
1201 |
+
x=source_data['num_parameters'],
|
1202 |
+
y=source_data['avg_performance'],
|
1203 |
+
mode='markers',
|
1204 |
+
name=display_name,
|
1205 |
+
marker=dict(
|
1206 |
+
color=color,
|
1207 |
+
size=8,
|
1208 |
+
opacity=0.7
|
1209 |
+
),
|
1210 |
+
text=source_data['model_name'],
|
1211 |
+
hovertemplate=(
|
1212 |
+
"<b>%{text}</b><br>" +
|
1213 |
+
"Average Performance: %{y:.3f}<br>" +
|
1214 |
+
"Number of Parameters: %{x:,}<br>" +
|
1215 |
+
"Source: " + display_name + "<br>" +
|
1216 |
+
"<extra></extra>"
|
1217 |
+
)
|
1218 |
+
))
|
1219 |
+
|
1220 |
+
fig.update_layout(
|
1221 |
+
title="Model Performance vs Number of Parameters",
|
1222 |
+
xaxis_title="Number of Parameters",
|
1223 |
+
yaxis_title="Average Performance Score",
|
1224 |
+
height=500,
|
1225 |
+
showlegend=True,
|
1226 |
+
plot_bgcolor='rgba(255, 255, 255, 0)',
|
1227 |
+
paper_bgcolor='rgba(255, 255, 255, 0)',
|
1228 |
+
legend=dict(
|
1229 |
+
yanchor="top",
|
1230 |
+
y=-0.15,
|
1231 |
+
xanchor="center",
|
1232 |
+
x=0.5,
|
1233 |
+
bgcolor='rgba(255, 255, 255, 0.95)',
|
1234 |
+
bordercolor='rgba(0, 0, 0, 0.2)',
|
1235 |
+
borderwidth=1,
|
1236 |
+
orientation="h"
|
1237 |
+
),
|
1238 |
+
margin=dict(l=50, r=50, t=100, b=100)
|
1239 |
+
)
|
1240 |
+
|
1241 |
+
return fig
|
1242 |
+
|
1243 |
# Event handlers
|
1244 |
def update_radar_chart(*args):
|
1245 |
# Extract arguments for radar chart
|
|
|
1251 |
show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
|
1252 |
show_external_models = args[len(analysis_dataset_checkboxes) + 5]
|
1253 |
search_query = args[len(analysis_dataset_checkboxes) + 6]
|
1254 |
+
max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
|
1255 |
|
1256 |
# Convert dataset selections to list of selected dataset names
|
1257 |
selected_datasets = []
|
|
|
1259 |
if dataset_values[i]:
|
1260 |
selected_datasets.append(dataset_name)
|
1261 |
|
1262 |
+
return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
|
1263 |
|
1264 |
def update_benchmark_table(*args):
|
1265 |
# Extract arguments
|
|
|
1271 |
show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
|
1272 |
show_external_models = args[len(dataset_checkboxes) + 5]
|
1273 |
search_query = args[len(dataset_checkboxes) + 6]
|
1274 |
+
max_num_parameters = args[len(dataset_checkboxes) + 7]
|
1275 |
|
1276 |
# Convert dataset selections to list of selected dataset names
|
1277 |
selected_datasets = []
|
|
|
1279 |
if dataset_values[i]:
|
1280 |
selected_datasets.append(dataset_name)
|
1281 |
|
1282 |
+
df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)
|
1283 |
|
1284 |
return df
|
1285 |
|
1286 |
+
def update_scatter_plot(*args):
    """Rebuild the scatter plot from the current analysis-tab control values.

    ``args`` holds one value per entry of ``analysis_dataset_checkboxes`` (in
    that order), followed by eight control values: the hide-incomplete flag,
    the minimum average performance as a percentage, the four source toggles
    (Napolab thesis, TeenyTinyLlama, Portuguese leaderboard, external models),
    the search query, and the maximum number of parameters.

    Returns the result of ``create_model_performance_scatter``.
    """
    checkbox_count = len(analysis_dataset_checkboxes)
    checkbox_states = args[:checkbox_count]

    def control(position):
        # Control values sit directly after the per-dataset checkbox values.
        return args[checkbox_count + position]

    hide_incomplete = control(0)
    min_avg_fraction = control(1) / 100.0  # Convert percentage to decimal
    napolab_enabled = control(2)
    teenytinyllama_enabled = control(3)
    leaderboard_enabled = control(4)
    external_enabled = control(5)
    query = control(6)
    parameter_limit = control(7)

    # Keep only the names of the datasets whose checkbox value is truthy
    chosen_datasets = [
        name
        for (name, _), is_checked in zip(analysis_dataset_checkboxes, checkbox_states)
        if is_checked
    ]

    return create_model_performance_scatter(
        chosen_datasets,
        napolab_enabled,
        teenytinyllama_enabled,
        leaderboard_enabled,
        external_enabled,
        hide_incomplete,
        min_avg_fraction,
        query,
        parameter_limit,
    )
|
1305 |
|
1306 |
# Connect every benchmark-table control to update_benchmark_table.
#
# The handler receives the dataset checkbox values first, followed by the
# eight filter controls in the order listed below; update_benchmark_table
# unpacks its *args positionally, so this order must not change.
benchmark_table_inputs = [cb for _, cb in dataset_checkboxes] + [
    hide_incomplete_models,
    min_average_performance,
    show_napolab_thesis,
    show_teenytinyllama,
    show_portuguese_leaderboard,
    show_external_models,
    search_query,
    max_num_parameters,
]

# Every input is also a trigger: changing any of them refreshes the table.
# Registration order matches the list order (dataset checkboxes first).
for table_control in benchmark_table_inputs:
    table_control.change(
        update_benchmark_table,
        inputs=benchmark_table_inputs,
        outputs=benchmark_table
    )
|
1363 |
|
|
|
1380 |
# Shared, ordered input list for the analysis tab. Both update_radar_chart and
# update_scatter_plot unpack their *args positionally: dataset checkbox values
# first, then the eight filter controls in exactly this order.
analysis_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
    hide_incomplete_models_analysis,
    min_average_performance_analysis,
    show_napolab_thesis_analysis,
    show_teenytinyllama_analysis,
    show_portuguese_leaderboard_analysis,
    show_external_models_analysis,
    search_query_analysis,
    max_num_parameters_analysis,
]

# Connect all analysis controls to update radar chart
for analysis_control in analysis_inputs:
    analysis_control.change(
        update_radar_chart,
        inputs=analysis_inputs,
        outputs=model_analysis_chart
    )

# Connect all analysis controls to update scatter plot
# (registered after the radar handlers, as a separate pass)
for analysis_control in analysis_inputs:
    analysis_control.change(
        update_scatter_plot,
        inputs=analysis_inputs,
        outputs=model_scatter_plot
    )
|
1492 |
+
|
1493 |
+
# Connect events
# Values used to render every output once on app start. They follow the
# positional layout the update_* handlers expect: one flag per dataset
# (datasets sorted by key; only the display names listed below start
# selected), then hide-incomplete=True, minimum average performance=80
# (a percentage; the handlers divide it by 100), the four source toggles
# enabled, an empty search query, and max_num_parameters=0.
default_selected_names = ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
initial_control_values = [
    NAPOLAB_DATASETS[name].get('name', name) in default_selected_names
    for name in sorted(NAPOLAB_DATASETS.keys())
] + [True, 80, True, True, True, True, "", 0]

# Load model analysis chart on app start
app.load(lambda: update_radar_chart(*initial_control_values), outputs=model_analysis_chart)

# Load scatter plot on app start
app.load(lambda: update_scatter_plot(*initial_control_values), outputs=model_scatter_plot)

# Load benchmark table on app start
app.load(lambda: update_benchmark_table(*initial_control_values), outputs=benchmark_table)
|
1502 |
|
1503 |
if __name__ == "__main__":
|
1504 |
app.launch(server_name="0.0.0.0", server_port=7860)
|
external_models.csv
CHANGED
@@ -1,31 +1,31 @@
|
|
1 |
-
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
|
2 |
-
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
|
3 |
-
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
|
4 |
-
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
|
5 |
-
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
|
6 |
-
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
|
7 |
-
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
|
8 |
-
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
|
9 |
-
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
|
10 |
-
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
|
11 |
-
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
|
12 |
-
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
|
13 |
-
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
|
14 |
-
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
|
15 |
-
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
|
16 |
-
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
|
17 |
-
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
|
18 |
-
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
|
19 |
-
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
|
20 |
-
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
|
21 |
-
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
|
22 |
-
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
|
23 |
-
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
24 |
-
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
|
25 |
-
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
26 |
-
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
|
27 |
-
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
|
28 |
-
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
|
29 |
-
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
|
30 |
-
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
|
31 |
-
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
|
|
|
1 |
+
model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
|
2 |
+
sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
|
3 |
+
sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
|
4 |
+
gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
|
5 |
+
claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
|
6 |
+
gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
|
7 |
+
gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
|
8 |
+
deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
|
9 |
+
gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
|
10 |
+
gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
|
11 |
+
gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
|
12 |
+
nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
|
13 |
+
llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
|
14 |
+
sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
|
15 |
+
llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
|
16 |
+
llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
|
17 |
+
gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
|
18 |
+
gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
|
19 |
+
gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
|
20 |
+
gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
|
21 |
+
gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
|
22 |
+
deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
|
23 |
+
qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
24 |
+
qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
|
25 |
+
qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
|
26 |
+
qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
|
27 |
+
gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
|
28 |
+
claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
|
29 |
+
llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
|
30 |
+
llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
|
31 |
+
gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
|
extract_portuguese_leaderboard.py
CHANGED
@@ -83,6 +83,7 @@ def extract_data_from_json(json_file_path):
|
|
83 |
# Extract model information
|
84 |
model_name = config_general.get('model_name', '')
|
85 |
model_private = config_general.get('model_private', False)
|
|
|
86 |
|
87 |
# Extract results
|
88 |
all_grouped = results.get('all_grouped', {})
|
@@ -98,6 +99,7 @@ def extract_data_from_json(json_file_path):
|
|
98 |
'json_file': str(json_file_path),
|
99 |
'model_name': model_name,
|
100 |
'model_private': model_private,
|
|
|
101 |
'assin2_rte': assin2_rte,
|
102 |
'assin2_sts': assin2_sts,
|
103 |
'faquad_nli': faquad_nli,
|
|
|
83 |
# Extract model information
|
84 |
model_name = config_general.get('model_name', '')
|
85 |
model_private = config_general.get('model_private', False)
|
86 |
+
model_num_parameters = config_general.get('model_num_parameters', 0)
|
87 |
|
88 |
# Extract results
|
89 |
all_grouped = results.get('all_grouped', {})
|
|
|
99 |
'json_file': str(json_file_path),
|
100 |
'model_name': model_name,
|
101 |
'model_private': model_private,
|
102 |
+
'model_num_parameters': model_num_parameters,
|
103 |
'assin2_rte': assin2_rte,
|
104 |
'assin2_sts': assin2_sts,
|
105 |
'faquad_nli': faquad_nli,
|
portuguese_leaderboard.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|