ruanchaves committed
Commit 2c482cc · verified · 1 parent: 225b6ff

Upload 14 files
app.py CHANGED
@@ -24,14 +24,14 @@ def load_portuguese_leaderboard_data() -> pd.DataFrame:
     if os.path.exists(csv_path):
         df = pd.read_csv(csv_path)
         # Select only the relevant columns
-        relevant_columns = ['model_name', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
+        relevant_columns = ['model_name', 'model_num_parameters', 'assin2_rte', 'assin2_sts', 'faquad_nli', 'hatebr_offensive']
         df = df[relevant_columns].copy()

         # Rename columns to match the existing format
         df = df.rename(columns={
             'assin2_rte': 'ASSIN2 RTE',
             'assin2_sts': 'ASSIN2 STS',
-            'faquad_nli': 'FaQuAD-NLI',
+            'faquad_nli': 'FaQUaD-NLI',
             'hatebr_offensive': 'HateBR'
         })

@@ -62,13 +62,16 @@ def load_external_models_data() -> pd.DataFrame:
             'model': 'model_name',
             'assin2_rte': 'ASSIN2 RTE',
             'assin2_sts': 'ASSIN2 STS',
-            'faquad_nli': 'FaQuAD-NLI',
+            'faquad_nli': 'FaQUaD-NLI',
             'hatebr_offensive': 'HateBR'
         })

         # Add source information
         df['source'] = 'external_models'

+        # Add model_num_parameters column with 0 for external models
+        df['model_num_parameters'] = 0
+
         print(f"Loaded {len(df)} external models")
         return df
     else:
@@ -84,7 +87,7 @@ PORTUGUESE_LEADERBOARD_DATA = load_portuguese_leaderboard_data()
 # Load external models data
 EXTERNAL_MODELS_DATA = load_external_models_data()

-def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> pd.DataFrame:
+def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> pd.DataFrame:
     """Create a simplified benchmark table with one column per dataset."""
     # Get all dataset names
     dataset_names = sorted(NAPOLAB_DATASETS.keys())
@@ -120,14 +123,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
                 model_data[model_name] = {
                     'dataset_scores': {},
                     'url': None,
-                    'source': 'portuguese_leaderboard'
+                    'source': 'portuguese_leaderboard',
+                    'num_parameters': row.get('model_num_parameters', 0)
                 }

             # Map Portuguese leaderboard columns to dataset names
             column_mapping = {
                 'ASSIN2 RTE': 'assin2_rte',
                 'ASSIN2 STS': 'assin2_sts',
-                'FaQuAD-NLI': 'faquad-nli',
+                'FaQUaD-NLI': 'faquad-nli',
                 'HateBR': 'hatebr'
             }

@@ -146,14 +150,15 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
                 model_data[model_name] = {
                     'dataset_scores': {},
                     'url': row.get('link', ''),
-                    'source': 'external_models'
+                    'source': 'external_models',
+                    'num_parameters': row.get('model_num_parameters', 0)
                 }

             # Map external models columns to dataset names
             column_mapping = {
                 'ASSIN2 RTE': 'assin2_rte',
                 'ASSIN2 STS': 'assin2_sts',
-                'FaQuAD-NLI': 'faquad-nli',
+                'FaQUaD-NLI': 'faquad-nli',
                 'HateBR': 'hatebr'
             }

@@ -177,6 +182,9 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
         model_metadata = MODEL_METADATA.get(model_name, {})
         source = model_metadata.get('source', 'unknown')
         model_data[model_name]['source'] = source
+
+        # Add num_parameters for existing models (set to 0 as they don't have this info)
+        model_data[model_name]['num_parameters'] = 0

     # Create table data
     table_data = []
@@ -198,6 +206,12 @@ def create_simplified_benchmark_table(selected_datasets: List[str] = None, show_
         if source == 'unknown':
             continue

+        # Apply parameter filtering (only for Portuguese leaderboard models)
+        if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+            num_parameters = data.get('num_parameters', 0)
+            if num_parameters > max_num_parameters:
+                continue
+
         # Create clickable link for model name
         if data['url']:
             model_display = f"[{model_name}]({data['url']})"
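The two hunks above implement the new size filter. Its semantics: `max_num_parameters == 0` means the slider is untouched and nothing is filtered, and only models whose `source` is `'portuguese_leaderboard'` are ever excluded, since only those carry real parameter counts. A standalone sketch of the predicate (hypothetical helper, not code from this commit):

```python
# Sketch of the size-filter semantics added above; keep_model is a
# hypothetical helper, not a function from app.py.
def keep_model(source: str, num_parameters: int, max_num_parameters: int) -> bool:
    # 0 means the slider was not moved, so nothing is filtered out.
    if max_num_parameters > 0 and source == 'portuguese_leaderboard':
        return num_parameters <= max_num_parameters
    return True

assert keep_model('portuguese_leaderboard', 7_000_000_000, 0)          # slider off
assert not keep_model('portuguese_leaderboard', 7_000_000_000, 10**9)  # over the cap
assert keep_model('external_models', 0, 10**9)                         # other sources pass
```

Other sources store `num_parameters = 0`, so they would pass the comparison even without the source guard; the guard mainly documents intent and protects against future non-zero counts for non-leaderboard models.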
@@ -394,7 +408,7 @@ def cleanup_current_csv():
             print(f"Error deleting file {current_csv_file}: {e}")


-def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "") -> go.Figure:
+def create_model_performance_radar(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
     """Create a radar chart showing model performance across all datasets."""
     # Use selected datasets if provided, otherwise use all datasets
     if selected_datasets is None:
@@ -431,14 +445,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
                 model_data[model_name] = {
                     'performances': {},
                     'architecture': 'Unknown',
-                    'source': 'portuguese_leaderboard'
+                    'source': 'portuguese_leaderboard',
+                    'num_parameters': row.get('model_num_parameters', 0)
                 }

             # Map Portuguese leaderboard columns to dataset names
             column_mapping = {
                 'ASSIN2 RTE': 'assin2_rte',
                 'ASSIN2 STS': 'assin2_sts',
-                'FaQuAD-NLI': 'faquad-nli',
+                'FaQUaD-NLI': 'faquad-nli',
                 'HateBR': 'hatebr'
             }

@@ -457,14 +472,15 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
                 model_data[model_name] = {
                     'performances': {},
                     'architecture': 'Unknown',
-                    'source': 'external_models'
+                    'source': 'external_models',
+                    'num_parameters': row.get('model_num_parameters', 0)
                 }

             # Map external models columns to dataset names
             column_mapping = {
                 'ASSIN2 RTE': 'assin2_rte',
                 'ASSIN2 STS': 'assin2_sts',
-                'FaQuAD-NLI': 'faquad-nli',
+                'FaQUaD-NLI': 'faquad-nli',
                 'HateBR': 'hatebr'
             }

@@ -488,6 +504,9 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
         model_metadata = MODEL_METADATA.get(model_name, {})
         source = model_metadata.get('source', 'unknown')
         model_data[model_name]['source'] = source
+
+        # Add num_parameters for existing models (set to 0 as they don't have this info)
+        model_data[model_name]['num_parameters'] = 0

     # Apply source filtering
     filtered_model_data = {}
@@ -507,6 +526,12 @@ def create_model_performance_radar(selected_datasets: List[str] = None, show_nap
         if source == 'unknown':
             continue

+        # Apply parameter filtering (only for Portuguese leaderboard models)
+        if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+            num_parameters = data.get('num_parameters', 0)
+            if num_parameters > max_num_parameters:
+                continue
+
         filtered_model_data[model_name] = data

     # Apply incomplete model filtering
@@ -731,8 +756,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
             dataset_checkboxes = []
             for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                 display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
-                # Default to selected only for ASSIN 2 STS, FaQuAD-NLI, and HateBR
-                default_value = dataset_name in ['assin2_sts', 'faquad-nli', 'hatebr']
+                # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
+                default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
                 checkbox = gr.Checkbox(
                     label=display_name,
                     value=default_value
@@ -774,6 +799,22 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
                 value=True
             )

+            # Calculate max parameters for slider
+            max_params = 0
+            if not PORTUGUESE_LEADERBOARD_DATA.empty:
+                max_params = int(PORTUGUESE_LEADERBOARD_DATA['model_num_parameters'].max())
+
+            with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
+                with gr.Row():
+                    max_num_parameters = gr.Slider(
+                        minimum=0,
+                        maximum=max_params,
+                        value=0,
+                        step=1,
+                        label="Maximum Number of Parameters",
+                        info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
+                    )
+
             # Search bar for filtering models
             search_query = gr.Textbox(
                 label="Search models by name (supports regex)",
@@ -807,8 +848,8 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
             analysis_dataset_checkboxes = []
             for dataset_name in sorted(NAPOLAB_DATASETS.keys()):
                 display_name = NAPOLAB_DATASETS[dataset_name].get('name', dataset_name)
-                # Default to selected only for ASSIN 2 STS, FaQuAD-NLI, and HateBR
-                default_value = dataset_name in ['assin2_sts', 'faquad-nli', 'hatebr']
+                # Default to selected only for ASSIN 2 STS, FaQUaD-NLI, and HateBR
+                default_value = display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR']
                 checkbox = gr.Checkbox(
                     label=display_name,
                     value=default_value
@@ -853,6 +894,18 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
                 value=True
             )

+            # Parameter slider for Model Analysis tab
+            with gr.Accordion("Filter by Model Size: (Click to expand)", open=False):
+                with gr.Row():
+                    max_num_parameters_analysis = gr.Slider(
+                        minimum=0,
+                        maximum=max_params,
+                        value=0,
+                        step=1,
+                        label="Maximum Number of Parameters",
+                        info="This slider is applicable only to Open PT LLM Leaderboard models. For other models, it will have no effect."
+                    )
+
             # Search bar for filtering models in radar chart
             search_query_analysis = gr.Textbox(
                 label="Search models by name (supports regex)",
@@ -863,6 +916,9 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:

             model_analysis_chart = gr.Plot(label="Model Performance Radar Chart")

+            # Add scatter plot below radar chart
+            model_scatter_plot = gr.Plot(label="Model Performance vs Number of Parameters")
+
             gr.Markdown("""
             **How to interact with the chart:**
             - **Click on legend items** to show/hide specific models.
@@ -918,6 +974,272 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:

             """)

+    def create_model_performance_scatter(selected_datasets: List[str] = None, show_napolab_thesis: bool = True, show_teenytinyllama: bool = True, show_portuguese_leaderboard: bool = True, show_external_models: bool = True, hide_incomplete_models: bool = False, min_average_performance: float = 0.0, search_query: str = "", max_num_parameters: int = 0) -> go.Figure:
+        """Create a scatter plot showing model performance vs number of parameters."""
+        # Use selected datasets if provided, otherwise use all datasets
+        if selected_datasets is None:
+            selected_datasets = list(NAPOLAB_DATASETS.keys())
+
+        # Collect data for each model
+        model_data = {}
+
+        # Process existing benchmark results
+        for dataset_name, models in SAMPLE_BENCHMARK_RESULTS.items():
+            if dataset_name in selected_datasets:
+                for model_name, metrics in models.items():
+                    if model_name not in model_data:
+                        # Get actual source from MODEL_METADATA
+                        model_metadata = MODEL_METADATA.get(model_name, {})
+                        actual_source = model_metadata.get('source', 'unknown')
+
+                        model_data[model_name] = {
+                            'performances': {},
+                            'architecture': model_metadata.get('architecture', 'Unknown'),
+                            'source': actual_source,
+                            'num_parameters': 0
+                        }
+
+                    # Calculate average performance for this dataset
+                    avg_performance = np.mean(list(metrics.values()))
+                    model_data[model_name]['performances'][dataset_name] = avg_performance
+
+        # Process Portuguese leaderboard data
+        if show_portuguese_leaderboard and not PORTUGUESE_LEADERBOARD_DATA.empty:
+            for _, row in PORTUGUESE_LEADERBOARD_DATA.iterrows():
+                model_name = row['model_name']
+
+                if model_name not in model_data:
+                    model_data[model_name] = {
+                        'performances': {},
+                        'architecture': 'Unknown',
+                        'source': 'portuguese_leaderboard',
+                        'num_parameters': row.get('model_num_parameters', 0)
+                    }
+
+                # Map Portuguese leaderboard columns to dataset names
+                column_mapping = {
+                    'ASSIN2 RTE': 'assin2_rte',
+                    'ASSIN2 STS': 'assin2_sts',
+                    'FaQUaD-NLI': 'faquad-nli',
+                    'HateBR': 'hatebr'
+                }
+
+                for display_name, dataset_name in column_mapping.items():
+                    if dataset_name in selected_datasets:
+                        score = row[display_name]
+                        if pd.notna(score) and score > 0:
+                            model_data[model_name]['performances'][dataset_name] = score
+
+        # Process external models data
+        if show_external_models and not EXTERNAL_MODELS_DATA.empty:
+            for _, row in EXTERNAL_MODELS_DATA.iterrows():
+                model_name = row['model_name']
+
+                if model_name not in model_data:
+                    model_data[model_name] = {
+                        'performances': {},
+                        'architecture': 'Unknown',
+                        'source': 'external_models',
+                        'num_parameters': row.get('model_num_parameters', 0)
+                    }
+
+                # Map external models columns to dataset names
+                column_mapping = {
+                    'ASSIN2 RTE': 'assin2_rte',
+                    'ASSIN2 STS': 'assin2_sts',
+                    'FaQUaD-NLI': 'faquad-nli',
+                    'HateBR': 'hatebr'
+                }
+
+                for display_name, dataset_name in column_mapping.items():
+                    if dataset_name in selected_datasets:
+                        score = row[display_name]
+                        if pd.notna(score) and score > 0:
+                            model_data[model_name]['performances'][dataset_name] = score
+
+        # Apply source filtering
+        filtered_model_data = {}
+        for model_name, data in model_data.items():
+            source = data.get('source', 'existing')
+
+            # Apply show filters - only show models from sources that are checked
+            if source == 'napolab_thesis' and not show_napolab_thesis:
+                continue
+            if source == 'teenytinyllama_paper' and not show_teenytinyllama:
+                continue
+            if source == 'portuguese_leaderboard' and not show_portuguese_leaderboard:
+                continue
+            if source == 'external_models' and not show_external_models:
+                continue
+            # Hide models with unknown source (should not happen with proper data)
+            if source == 'unknown':
+                continue
+
+            # Apply parameter filtering (only for Portuguese leaderboard models)
+            if max_num_parameters > 0 and source == 'portuguese_leaderboard':
+                num_parameters = data.get('num_parameters', 0)
+                if num_parameters > max_num_parameters:
+                    continue
+
+            filtered_model_data[model_name] = data
+
+        # Apply incomplete model filtering
+        if hide_incomplete_models and selected_datasets:
+            final_filtered_data = {}
+            for model_name, data in filtered_model_data.items():
+                has_all_scores = True
+                for dataset_name in selected_datasets:
+                    if data['performances'].get(dataset_name, 0) == 0:
+                        has_all_scores = False
+                        break
+                if has_all_scores:
+                    final_filtered_data[model_name] = data
+            filtered_model_data = final_filtered_data
+
+        # Apply minimum average performance filtering
+        if min_average_performance > 0 and selected_datasets:
+            final_filtered_data = {}
+            for model_name, data in filtered_model_data.items():
+                # Calculate average performance for selected datasets
+                scores = []
+                for dataset_name in selected_datasets:
+                    score = data['performances'].get(dataset_name, 0)
+                    if score > 0:  # Only include non-zero scores
+                        scores.append(score)
+
+                if scores:
+                    avg_performance = np.mean(scores)
+                    if avg_performance >= min_average_performance:
+                        final_filtered_data[model_name] = data
+            filtered_model_data = final_filtered_data
+
+        # Apply search query filtering
+        if search_query:
+            final_filtered_data = {}
+            try:
+                # Use regex pattern matching
+                import re
+                pattern = re.compile(search_query, re.IGNORECASE)
+                for model_name, data in filtered_model_data.items():
+                    if pattern.search(model_name):
+                        final_filtered_data[model_name] = data
+            except re.error:
+                # Fallback to simple string matching if regex is invalid
+                for model_name, data in filtered_model_data.items():
+                    if search_query.lower() in model_name.lower():
+                        final_filtered_data[model_name] = data
+            filtered_model_data = final_filtered_data
+
+        # Prepare data for scatter plot
+        scatter_data = []
+        for model_name, data in filtered_model_data.items():
+            # Calculate average performance for selected datasets
+            scores = []
+            for dataset_name in selected_datasets:
+                score = data['performances'].get(dataset_name, 0)
+                if score > 0:  # Only include non-zero scores
+                    scores.append(score)
+
+            if scores:
+                avg_performance = np.mean(scores)
+                num_parameters = data.get('num_parameters', 0)
+                source = data.get('source', 'unknown')
+
+                scatter_data.append({
+                    'model_name': model_name,
+                    'avg_performance': avg_performance,
+                    'num_parameters': num_parameters,
+                    'source': source
+                })
+
+        if not scatter_data:
+            # Create empty figure if no data
+            fig = go.Figure()
+            fig.add_annotation(
+                text="No data available for the selected filters",
+                xref="paper", yref="paper",
+                x=0.5, y=0.5, showarrow=False,
+                font=dict(size=16)
+            )
+            fig.update_layout(
+                title="Model Performance vs Number of Parameters",
+                xaxis_title="Number of Parameters",
+                yaxis_title="Average Performance Score",
+                height=500
+            )
+            return fig
+
+        # Create scatter plot
+        df_scatter = pd.DataFrame(scatter_data)
+
+        # Create color mapping for sources
+        color_map = {
+            'portuguese_leaderboard': '#1f77b4',
+            'external_models': '#ff7f0e',
+            'napolab_thesis': '#2ca02c',
+            'teenytinyllama_paper': '#d62728',
+            'unknown': '#9467bd'
+        }
+
+        # Create display name mapping for sources
+        display_name_map = {
+            'portuguese_leaderboard': 'Open PT LLM Leaderboard',
+            'external_models': 'Proprietary Models',
+            'napolab_thesis': 'Napolab Thesis',
+            'teenytinyllama_paper': 'TeenyTinyLlama Paper',
+            'unknown': 'Unknown Source'
+        }
+
+        fig = go.Figure()
+
+        for source in df_scatter['source'].unique():
+            source_data = df_scatter[df_scatter['source'] == source]
+            color = color_map.get(source, '#7f7f7f')
+            display_name = display_name_map.get(source, source.replace('_', ' ').title())
+
+            fig.add_trace(go.Scatter(
+                x=source_data['num_parameters'],
+                y=source_data['avg_performance'],
+                mode='markers',
+                name=display_name,
+                marker=dict(
+                    color=color,
+                    size=8,
+                    opacity=0.7
+                ),
+                text=source_data['model_name'],
+                hovertemplate=(
+                    "<b>%{text}</b><br>" +
+                    "Average Performance: %{y:.3f}<br>" +
+                    "Number of Parameters: %{x:,}<br>" +
+                    "Source: " + display_name + "<br>" +
+                    "<extra></extra>"
+                )
+            ))
+
+        fig.update_layout(
+            title="Model Performance vs Number of Parameters",
+            xaxis_title="Number of Parameters",
+            yaxis_title="Average Performance Score",
+            height=500,
+            showlegend=True,
+            plot_bgcolor='rgba(255, 255, 255, 0)',
+            paper_bgcolor='rgba(255, 255, 255, 0)',
+            legend=dict(
+                yanchor="top",
+                y=-0.15,
+                xanchor="center",
+                x=0.5,
+                bgcolor='rgba(255, 255, 255, 0.95)',
+                bordercolor='rgba(0, 0, 0, 0.2)',
+                borderwidth=1,
+                orientation="h"
+            ),
+            margin=dict(l=50, r=50, t=100, b=100)
+        )
+
+        return fig
+
     # Event handlers
     def update_radar_chart(*args):
         # Extract arguments for radar chart
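The hunk above adds `create_model_performance_scatter`, which reuses the radar chart's filtering pipeline and returns a Plotly figure. A usage sketch, assuming it runs inside app.py's module context (the dataset keys match those used elsewhere in the file):

```python
# Usage sketch (not part of the commit): render the scatter directly,
# with the app's default dataset selection and the size slider off.
fig = create_model_performance_scatter(
    selected_datasets=['assin2_sts', 'faquad-nli', 'hatebr'],
    max_num_parameters=0,  # 0 = no parameter cap
)
fig.show()
```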
@@ -929,6 +1251,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
         show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
         show_external_models = args[len(analysis_dataset_checkboxes) + 5]
         search_query = args[len(analysis_dataset_checkboxes) + 6]
+        max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]

         # Convert dataset selections to list of selected dataset names
         selected_datasets = []
@@ -936,7 +1259,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
             if dataset_values[i]:
                 selected_datasets.append(dataset_name)

-        return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
+        return create_model_performance_radar(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)

     def update_benchmark_table(*args):
         # Extract arguments
@@ -948,6 +1271,7 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
         show_portuguese_leaderboard = args[len(dataset_checkboxes) + 4]
         show_external_models = args[len(dataset_checkboxes) + 5]
         search_query = args[len(dataset_checkboxes) + 6]
+        max_num_parameters = args[len(dataset_checkboxes) + 7]

         # Convert dataset selections to list of selected dataset names
         selected_datasets = []
@@ -955,65 +1279,85 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
             if dataset_values[i]:
                 selected_datasets.append(dataset_name)

-        df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query)
+        df = create_simplified_benchmark_table(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)

         return df

-    # Connect events
-    # Load model analysis chart on app start
-    app.load(lambda: update_radar_chart(*([name in ['assin2_sts', 'faquad-nli', 'hatebr'] for name in sorted(NAPOLAB_DATASETS.keys())] + [True, 80, True, True, True, True, ""])), outputs=model_analysis_chart)
-
-    # Load benchmark table on app start
-    app.load(lambda: update_benchmark_table(*([name in ['assin2_sts', 'faquad-nli', 'hatebr'] for name in sorted(NAPOLAB_DATASETS.keys())] + [True, 80, True, True, True, True, ""])), outputs=benchmark_table)
+    def update_scatter_plot(*args):
+        # Extract arguments for scatter plot
+        dataset_values = args[:len(analysis_dataset_checkboxes)]
+        hide_incomplete_models = args[len(analysis_dataset_checkboxes)]
+        min_average_performance = args[len(analysis_dataset_checkboxes) + 1] / 100.0  # Convert percentage to decimal
+        show_napolab_thesis = args[len(analysis_dataset_checkboxes) + 2]
+        show_teenytinyllama = args[len(analysis_dataset_checkboxes) + 3]
+        show_portuguese_leaderboard = args[len(analysis_dataset_checkboxes) + 4]
+        show_external_models = args[len(analysis_dataset_checkboxes) + 5]
+        search_query = args[len(analysis_dataset_checkboxes) + 6]
+        max_num_parameters = args[len(analysis_dataset_checkboxes) + 7]
+
+        # Convert dataset selections to list of selected dataset names
+        selected_datasets = []
+        for i, (dataset_name, _) in enumerate(analysis_dataset_checkboxes):
+            if dataset_values[i]:
+                selected_datasets.append(dataset_name)
+
+        return create_model_performance_scatter(selected_datasets, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, hide_incomplete_models, min_average_performance, search_query, max_num_parameters)

     # Connect dataset checkboxes to update table
     for dataset_name, checkbox in dataset_checkboxes:
         checkbox.change(
             update_benchmark_table,
-            inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+            inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
             outputs=benchmark_table
         )

     hide_incomplete_models.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     min_average_performance.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     show_napolab_thesis.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     show_teenytinyllama.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     show_portuguese_leaderboard.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     show_external_models.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

     # Connect search query to update table
     search_query.change(
         update_benchmark_table,
-        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query],
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
+        outputs=benchmark_table
+    )
+
+    # Connect max_num_parameters to update table
+    max_num_parameters.change(
+        update_benchmark_table,
+        inputs=[cb for _, cb in dataset_checkboxes] + [hide_incomplete_models, min_average_performance, show_napolab_thesis, show_teenytinyllama, show_portuguese_leaderboard, show_external_models, search_query, max_num_parameters],
         outputs=benchmark_table
     )

@@ -1036,52 +1380,125 @@ with gr.Blocks(title="Napolab Leaderboard", theme=gr.themes.Soft()) as app:
     for dataset_name, checkbox in analysis_dataset_checkboxes:
         checkbox.change(
             update_radar_chart,
-            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
             outputs=model_analysis_chart
         )

     hide_incomplete_models_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     min_average_performance_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     show_napolab_thesis_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     show_teenytinyllama_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     show_portuguese_leaderboard_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     show_external_models_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )

     # Connect search query to update radar chart
     search_query_analysis.change(
         update_radar_chart,
-        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis],
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
         outputs=model_analysis_chart
     )
+
+    # Connect max_num_parameters_analysis to update radar chart
+    max_num_parameters_analysis.change(
+        update_radar_chart,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_analysis_chart
+    )
+
+    # Connect all analysis controls to update scatter plot
+    for dataset_name, checkbox in analysis_dataset_checkboxes:
+        checkbox.change(
+            update_scatter_plot,
+            inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+            outputs=model_scatter_plot
+        )
+
+    hide_incomplete_models_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    min_average_performance_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    show_napolab_thesis_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    show_teenytinyllama_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    show_portuguese_leaderboard_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    show_external_models_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    search_query_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    max_num_parameters_analysis.change(
+        update_scatter_plot,
+        inputs=[cb for _, cb in analysis_dataset_checkboxes] + [hide_incomplete_models_analysis, min_average_performance_analysis, show_napolab_thesis_analysis, show_teenytinyllama_analysis, show_portuguese_leaderboard_analysis, show_external_models_analysis, search_query_analysis, max_num_parameters_analysis],
+        outputs=model_scatter_plot
+    )
+
+    # Connect events
+    # Load model analysis chart on app start
+    app.load(lambda: update_radar_chart(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_analysis_chart)
+
+    # Load scatter plot on app start
+    app.load(lambda: update_scatter_plot(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=model_scatter_plot)
+
+    # Load benchmark table on app start
+    app.load(lambda: update_benchmark_table(*([display_name in ['ASSIN 2 STS', 'FaQUaD-NLI', 'HateBR'] for _, display_name in [(name, NAPOLAB_DATASETS[name].get('name', name)) for name in sorted(NAPOLAB_DATASETS.keys())]] + [True, 80, True, True, True, True, "", 0])), outputs=benchmark_table)

 if __name__ == "__main__":
     app.launch(server_name="0.0.0.0", server_port=7860)
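Every `.change(...)` registration in the diff above repeats the same inputs list verbatim. A possible loop-based equivalent for the analysis tab, sketched with the component names defined in app.py (a refactor idea, not part of the commit):

```python
# Possible refactor sketch: register each analysis control once, driving
# both the radar chart and the new scatter plot. Every input component is
# also a trigger here, which matches the wiring in the diff above.
analysis_inputs = [cb for _, cb in analysis_dataset_checkboxes] + [
    hide_incomplete_models_analysis, min_average_performance_analysis,
    show_napolab_thesis_analysis, show_teenytinyllama_analysis,
    show_portuguese_leaderboard_analysis, show_external_models_analysis,
    search_query_analysis, max_num_parameters_analysis,
]

for control in analysis_inputs:
    control.change(update_radar_chart, inputs=analysis_inputs, outputs=model_analysis_chart)
    control.change(update_scatter_plot, inputs=analysis_inputs, outputs=model_scatter_plot)
```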
external_models.csv CHANGED
@@ -1,31 +1,31 @@
-model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
-sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
-sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
-gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
-claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
-gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
-gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
-deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
-gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
-gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
-gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
-nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
-llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
-sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
-llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
-llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
-gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
-gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
-gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
-gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
-gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
-deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
-qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
-qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
-qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
-qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
-gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
-claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
-llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
-llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
-gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
+model,link,assin2_sts,assin2_rte,faquad_nli,hatebr_offensive
+sabia-2-small,https://www.maritaca.ai/,0.7053302344881672,0.9121728362223306,0.7575848453041435,0.753800795680591
+sabia-2-medium,https://www.maritaca.ai/,0.7804108376537757,0.923459363368553,0.7657657657657658,0.8349989882997386
+gpt-3.5-turbo-0125,https://www.openai.com/,0.7378460201077941,0.8823038414050672,0.746353108609074,0.8056205941193919
+claude-3-haiku-20240307,https://www.claude.ai/,0.7892124744168747,0.9184462138121732,0.6340996599941455,0.8023698759439051
+gemini-1.0-pro,https://ai.google.dev/,0.7058831239763663,0.8945993304651698,0.7070913567220611,0.8086330094493972
+gemini-1.5-pro-preview-0409,https://cloud.google.com/vertex-ai,0.8159702278408203,0.9328989988467518,0.7290756302521009,0.8697698647467024
+deepseek-v2-chat,https://www.deepseek.com/,0.8533174657651231,0.9440170304568147,0.7995469048381548,0.8842986491071644
+gemini-1.5-flash-preview-0514,https://cloud.google.com/vertex-ai,0.841655158151231,0.9362097477374545,0.8092185592185592,0.9099110141445836
+gemini-1.5-flash-001,https://cloud.google.com/vertex-ai,0.838806085610371,0.9366169973822607,0.7963910785668922,0.9092078461170015
+gpt-4o-mini-2024-07-18,https://www.openai.com/,0.7259038954527597,0.942809846745341,0.819807735300693,0.8682357029532165
+nemotron-4-340b-instruct,https://huggingface.co/nvidia/Nemotron-4-340B-Instruct,0.7857731021403329,0.9489354458928496,0.8194444444444444,0.8641580001234928
+llama_405b_instruct,https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct,0.7888441732870783,0.9476445477916471,0.825063276593557,0.9073940659389119
+sabia-3,https://www.maritaca.ai/,0.8253863689009022,0.9477034821619312,0.8243848812618203,0.8278737774590023
+llama3_3_70b,https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct,0.7275578599896508,0.9407071010860484,0.8787563033858187,0.9024358249091997
+llama3_2_90b,https://huggingface.co/meta-llama/Llama-3.2-90B-Vision-Instruct,0.7368518566379951,0.9216548775103446,0.8632015306122449,0.8965270877302478
+gemini-1.5-flash-002,https://cloud.google.com/vertex-ai,0.8380176734291938,0.941176117215237,0.8360786822325283,0.9046145161133335
+gemini-1.5-flash-8b-001,https://aistudio.google.com,0.7638946799836569,0.9329452628161146,0.7937022965448601,0.850497640901663
+gemini-2.0-flash-001,https://cloud.google.com/vertex-ai,0.8440142633742483,0.9305165510724053,0.7533651260745065,0.8890432813545366
+gemini-2.0-flash-lite-001,https://cloud.google.com/vertex-ai,0.8492479991621328,0.9216548775103446,0.7652777777777777,0.8522499647780968
+gemini-2.5-pro-exp-03-25,https://aistudio.google.com,0.837785744915033,0.9415510158830285,0.8738735797309651,0.9248478168290788
+deepSeek-v3-0324,https://huggingface.co/deepseek-ai/DeepSeek-V3-0324,0.8145997097875548,0.9421860387625551,0.796751127001399,0.9060129756724185
+qwen2-5-vl-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct,0.7595538567467497,0.9472975104201871,0.8447190882122586,0.8810695094657859
+qwen2-5-72b-instruct,https://huggingface.co/Qwen/Qwen2.5-72B-Instruct,0.8230708844558656,0.9509720145268106,0.8194444444444444,0.8810033427242816
+qwen2-5-vl-32b-instruct,https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct,0.7780549055529008,0.9472975104201871,0.8447190882122586,0.8810695094657859
+qwen-turbo-2024-11-01,https://www.alibabacloud.com/en/product/modelstudio,0.7640477700456898,0.9260451969385788,0.8128063725490196,0.8567933277676292
+gpt-4o-2024-08-06,https://www.openai.com/,0.8078677969518289,0.9407235712144604,0.8654396266184885,0.9320137873994456
+claude-3-7-sonnet-20250219,https://www.anthropic.com/,0.8087979933117393,0.9472965253044003,0.8097848807348216,0.9125114739050616
+llama-4-scout-16e,https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct,0.7741640227983941,0.9312877465954967,0.8567037452287072,0.8813700069483281
+llama-4-maverick-128e,https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct,0.7333246903202654,0.9329419027588105,0.7823695413019562,0.9047550357833591
+gemma-3-27b-it,https://huggingface.co/google/gemma-3-27b-it,0.8147646517017526,0.9411147367212748,0.8143210816987241,0.8729414870796344
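Note the old and new sides of this hunk are identical in rendered text, so the change is most likely invisible (whitespace or line endings). For reference, a sketch of loading this file with pandas and ranking models by their mean over the four task columns; the column names are those in the header row above, and the relative path is assumed:

```python
import pandas as pd

# Load the proprietary-model scores shipped in external_models.csv and
# rank models by their mean over the four Napolab task columns.
df = pd.read_csv("external_models.csv")
task_cols = ["assin2_sts", "assin2_rte", "faquad_nli", "hatebr_offensive"]
df["average"] = df[task_cols].mean(axis=1)
print(df.sort_values("average", ascending=False)[["model", "average"]].head(5))
```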
extract_portuguese_leaderboard.py CHANGED
@@ -83,6 +83,7 @@ def extract_data_from_json(json_file_path):
     # Extract model information
     model_name = config_general.get('model_name', '')
     model_private = config_general.get('model_private', False)
+    model_num_parameters = config_general.get('model_num_parameters', 0)

     # Extract results
     all_grouped = results.get('all_grouped', {})
@@ -98,6 +99,7 @@ def extract_data_from_json(json_file_path):
         'json_file': str(json_file_path),
         'model_name': model_name,
         'model_private': model_private,
+        'model_num_parameters': model_num_parameters,
         'assin2_rte': assin2_rte,
         'assin2_sts': assin2_sts,
         'faquad_nli': faquad_nli,
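These hunks thread one new field from the leaderboard result files into the extracted CSV. A self-contained sketch of the read, using an illustrative payload (real result files carry full `config_general` and `results` sections):

```python
import json

# Illustrative payload only; the field names match the hunks above.
payload = '{"config_general": {"model_name": "org/model-7b", "model_private": false, "model_num_parameters": 7241732096}}'
config_general = json.loads(payload).get("config_general", {})
model_num_parameters = config_general.get("model_num_parameters", 0)  # 0 when absent
print(model_num_parameters)
```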
portuguese_leaderboard.csv CHANGED
The diff for this file is too large to render. See raw diff