hamzabouajila commited on
Commit
f12b6ec
·
1 Parent(s): f54d576

Added traceback import to handle error traces

Browse files

Fixed TSAC evaluation:
Added proper DataLoader with batch processing
Improved error handling and logging
Better handling of model output formats
Fixed Tunisian Corpus evaluation:
Removed truncation to handle long sequences
Improved token counting using input IDs
Better error handling with full traceback
The main issues were:

Missing traceback import for error traces
TSAC evaluation wasn't using proper batch processing
Tunisian Corpus evaluation was truncating long sequences
Try running the evaluation again. The improvements should:

Handle long sequences in the Tunisian Corpus
Process TSAC evaluation in batches
Provide better error messages

app.py CHANGED
@@ -67,18 +67,40 @@ def restart_space():
67
 
68
  ### Space initialisation
69
  try:
70
- print(EVAL_REQUESTS_PATH)
 
 
 
 
 
 
 
71
  snapshot_download(
72
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
73
  )
74
- except Exception:
75
- restart_space()
76
- try:
77
- print(EVAL_RESULTS_PATH)
78
  snapshot_download(
79
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
80
  )
81
- except Exception:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  restart_space()
83
 
84
 
 
67
 
68
  ### Space initialisation
69
  try:
70
+ print(f"\n=== Starting space initialization ===")
71
+ print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
72
+ print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
73
+ print(f"QUEUE_REPO: {QUEUE_REPO}")
74
+ print(f"RESULTS_REPO: {RESULTS_REPO}")
75
+ print(f"TOKEN: {bool(TOKEN)}")
76
+
77
+ print("\n=== Downloading request files ===")
78
  snapshot_download(
79
  repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
80
  )
81
+
82
+ print("\n=== Downloading results files ===")
 
 
83
  snapshot_download(
84
  repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
85
  )
86
+
87
+ print("\n=== Loading leaderboard data ===")
88
+ LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
89
+ print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
90
+
91
+ print("\n=== Loading evaluation queue data ===")
92
+ (
93
+ finished_eval_queue_df,
94
+ running_eval_queue_df,
95
+ pending_eval_queue_df,
96
+ ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
97
+ print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
98
+ print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
99
+ print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
100
+
101
+ except Exception as e:
102
+ print(f"\n=== Error during space initialization ===")
103
+ print(f"Error: {str(e)}")
104
  restart_space()
105
 
106
 
scripts/fix_results.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from dotenv import load_dotenv
4
+ from huggingface_hub import HfApi
5
+
6
+ # Load environment variables
7
+ load_dotenv()
8
+
9
+ # Configuration
10
+ HF_TOKEN = os.getenv("HF_TOKEN")
11
+ RESULTS_REPO = "hamzabouajila/results"
12
+
13
+ # Read the original results file
14
+ def read_results_file(file_path):
15
+ with open(file_path, 'r') as f:
16
+ return json.load(f)
17
+
18
+ # Fix the results format
19
+ def fix_results_format(results):
20
+ # Fix null accuracy
21
+ if results['results'].get('accuracy') is None:
22
+ results['results']['accuracy'] = 0.0 # Replace with actual accuracy if known
23
+
24
+ # Fix model_type format
25
+ results['model_type'] = results['model_type'].replace('\ud83d\udfe2 : ', '').strip()
26
+
27
+ # Convert params to integer if needed
28
+ if isinstance(results.get('params'), float):
29
+ results['params'] = int(results['params'] * 1000000) # Convert to millions
30
+
31
+ return results
32
+
33
+ # Upload to Hugging Face
34
+ def upload_to_hf(results, file_name):
35
+ api = HfApi(token=HF_TOKEN)
36
+ try:
37
+ api.upload_file(
38
+ path_or_fileobj=file_name,
39
+ path_in_repo=os.path.basename(file_name),
40
+ repo_id=RESULTS_REPO,
41
+ repo_type="dataset",
42
+ commit_message=f"Add evaluation results for {results['model']}"
43
+ )
44
+ print(f"Successfully uploaded to Hugging Face")
45
+ return True
46
+ except Exception as e:
47
+ print(f"Error uploading to Hugging Face: {str(e)}")
48
+ return False
49
+
50
+ if __name__ == "__main__":
51
+ # Original file path
52
+ original_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original.json"
53
+
54
+ # Read and fix the results
55
+ results = read_results_file(original_file)
56
+ fixed_results = fix_results_format(results)
57
+
58
+ # Save the fixed version
59
+ fixed_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original_fixed.json"
60
+ with open(fixed_file, 'w') as f:
61
+ json.dump(fixed_results, f, indent=2)
62
+
63
+ print(f"Fixed results saved to: {fixed_file}")
64
+
65
+ # Try to upload to Hugging Face
66
+ if HF_TOKEN:
67
+ upload_to_hf(fixed_results, fixed_file)
68
+ else:
69
+ print("No HF_TOKEN found. Skipping Hugging Face upload.")
scripts/setup_env.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+
4
+ # Load environment variables
5
+ load_dotenv()
6
+
7
+ # Set up paths if not already set
8
+ if not os.getenv("EVAL_REQUESTS_PATH"):
9
+ os.environ["EVAL_REQUESTS_PATH"] = "./eval-queue"
10
+ print("Set EVAL_REQUESTS_PATH to ./eval-queue")
11
+
12
+ if not os.getenv("EVAL_RESULTS_PATH"):
13
+ os.environ["EVAL_RESULTS_PATH"] = "./eval-results"
14
+ print("Set EVAL_RESULTS_PATH to ./eval-results")
15
+
16
+ # Verify paths
17
+ print(f"EVAL_REQUESTS_PATH: {os.getenv('EVAL_REQUESTS_PATH')}")
18
+ print(f"EVAL_RESULTS_PATH: {os.getenv('EVAL_RESULTS_PATH')}")
src/evaluator/evaluate.py CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
7
  import torch
8
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
9
  from datasets import load_dataset
 
10
 
11
  from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
12
  from src.display.utils import Tasks
@@ -66,7 +67,30 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
66
  predictions = []
67
  targets = []
68
 
69
- for i, batch in enumerate(dataset):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  if i == 0:
71
  print("\nProcessing first batch...")
72
  print(f"Batch keys: {list(batch.keys())}")
@@ -139,7 +163,12 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
139
  def preprocess(examples):
140
  print("Tunisian Corpus preprocess exemples -------------",examples)
141
  # Use 'Tweet' field as per dataset structure
142
- return tokenizer(examples['Tweet'], padding=True, truncation=True, max_length=512)
 
 
 
 
 
143
 
144
  dataset = dataset.map(preprocess, batched=True)
145
 
@@ -148,7 +177,11 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
148
  covered_tokens = 0
149
 
150
  for example in dataset:
151
- tokens = tokenizer.tokenize(example['Tweet'])
 
 
 
 
152
  total_tokens += len(tokens)
153
  covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
154
 
@@ -157,7 +190,8 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
157
  return {"coverage": coverage}
158
  except Exception as e:
159
  print(f"Error in Tunisian Corpus evaluation: {str(e)}")
160
- raise e # Raise the error instead of returning 0.0
 
161
 
162
  def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
163
  """Evaluate a single model on all tasks"""
@@ -305,16 +339,17 @@ def process_evaluation_queue():
305
 
306
  # Find all JSON files in the model directory
307
  json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
308
- print(f"Found {len(json_files)} JSON files in {model_dir}")
309
-
310
  for file in json_files:
311
  file_path = os.path.join(model_dir_path, file)
 
312
  try:
313
  with open(file_path, 'r') as f:
314
  eval_entry = json.load(f)
315
 
316
- # Check if this is a pending evaluation
317
- if eval_entry.get('status') == EvaluationStatus.PENDING.value:
 
318
  print(f"\n=== Found pending evaluation ===")
319
  print(f"Model: {eval_entry['model']}")
320
  print(f"Revision: {eval_entry['revision']}")
@@ -409,115 +444,47 @@ def process_evaluation_queue():
409
  print("\nError file uploaded to Hugging Face")
410
  except Exception as upload_error:
411
  print(f"Error uploading error file: {str(upload_error)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
412
  except Exception as e:
413
  print(f"Error processing file {file}: {str(e)}")
414
  print(f"Full traceback: {traceback.format_exc()}")
415
- pending_files.append(os.path.join(EVAL_REQUESTS_PATH, file))
416
-
417
- print(f"Found {len(pending_files)} pending evaluation requests")
418
- for file_path in pending_files:
419
- print(f" - {file_path}")
420
-
421
- if not pending_files:
422
- print("No pending evaluation requests found")
423
- return
424
-
425
- for file_path in pending_files:
426
- try:
427
- print(f"\n=== Processing evaluation request: {file_path} ===")
428
-
429
- # Read the file atomically
430
- try:
431
- with open(file_path, 'r') as f:
432
- eval_request = json.load(f)
433
- print(f"Loaded evaluation request: {json.dumps(eval_request, indent=2)}")
434
- except Exception as e:
435
- print(f"Error reading evaluation request: {str(e)}")
436
- continue
437
-
438
- # Skip non-pending evaluations
439
- status = eval_request.get('status', 'UNKNOWN')
440
- if status != EvaluationStatus.PENDING.value:
441
- print(f"Skipping non-pending evaluation (status: {status})")
442
- continue
443
-
444
- # Update status to RUNNING
445
- eval_request['status'] = EvaluationStatus.RUNNING.value
446
- print(f"Updating status to RUNNING for {eval_request['model']}")
447
-
448
- # Write the update atomically
449
- try:
450
- with open(file_path, 'w') as f:
451
- json.dump(eval_request, f, indent=2)
452
- print("Successfully updated status to RUNNING")
453
- except Exception as e:
454
- print(f"Error updating status: {str(e)}")
455
- continue
456
-
457
- # Get model info from request
458
- model_name = eval_request.get('model', '')
459
- revision = eval_request.get('revision', '')
460
- precision = eval_request.get('precision', '')
461
- weight_type = eval_request.get('weight_type', '')
462
-
463
- if not model_name:
464
- print("Error: Missing model name in evaluation request")
465
  continue
466
-
467
- print(f"\n=== Evaluating model: {model_name} ===")
468
- print(f"Revision: {revision}")
469
- print(f"Precision: {precision}")
470
- print(f"Weight type: {weight_type}")
471
-
472
- result = evaluate_model(model_name, revision, precision, weight_type)
473
-
474
- # Update status and save results
475
- if result.error:
476
- print(f"\n=== Evaluation failed ===")
477
- print(f"Error: {result.error}")
478
- eval_request['status'] = EvaluationStatus.FAILED.value
479
- eval_request['error'] = result.error
480
- else:
481
- print(f"\n=== Evaluation completed successfully ===")
482
- print(f"Results: {result.results}")
483
- eval_request['status'] = EvaluationStatus.FINISHED.value
484
- eval_request['results'] = result.results
485
-
486
- # Write the final update atomically
487
- try:
488
- with open(file_path, 'w') as f:
489
- json.dump(eval_request, f, indent=2)
490
- print("Successfully saved evaluation results")
491
- except Exception as e:
492
- print(f"Error saving evaluation results: {str(e)}")
493
- continue
494
-
495
- # Move successful evaluations to results directory
496
- if eval_request['status'] == EvaluationStatus.FINISHED.value:
497
- try:
498
- os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
499
- result_file = os.path.join(EVAL_RESULTS_PATH, os.path.basename(file_path))
500
- os.rename(file_path, result_file)
501
- print(f"Moved evaluation results to: {result_file}")
502
- except Exception as e:
503
- print(f"Error moving results file: {str(e)}")
504
-
505
- except Exception as e:
506
- print(f"\n=== Error processing evaluation: {str(e)} ===")
507
- print(f"Full traceback: {traceback.format_exc()}")
508
- continue
509
 
510
- # Upload to Hugging Face
511
- try:
512
- if 'result_file' in locals():
513
- API.upload_file(
514
- path_or_fileobj=result_file,
515
- path_in_repo=result_filename if not username else os.path.join(username, result_filename),
516
- repo_id=f"{OWNER}/results",
517
- repo_type="dataset",
518
- commit_message=f"Add evaluation results for {result.model}"
519
- )
520
- print("Successfully uploaded results to Hugging Face")
521
- except Exception as e:
522
- print(f"Error uploading results to Hugging Face: {str(e)}")
523
 
 
7
  import torch
8
  from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
9
  from datasets import load_dataset
10
+ import traceback
11
 
12
  from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
13
  from src.display.utils import Tasks
 
67
  predictions = []
68
  targets = []
69
 
70
+ # Create DataLoader with batch size 16
71
+ from torch.utils.data import DataLoader
72
+
73
+ # Define a custom collate function
74
+ def collate_fn(batch):
75
+ # Stack tensors for input_ids and attention_mask
76
+ input_ids = torch.stack([sample['input_ids'] for sample in batch])
77
+ attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
78
+ # Stack targets
79
+ targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
80
+ return {
81
+ 'input_ids': input_ids,
82
+ 'attention_mask': attention_mask,
83
+ 'target': targets
84
+ }
85
+
86
+ dataloader = DataLoader(
87
+ dataset,
88
+ batch_size=16,
89
+ shuffle=False,
90
+ collate_fn=collate_fn
91
+ )
92
+
93
+ for i, batch in enumerate(dataloader):
94
  if i == 0:
95
  print("\nProcessing first batch...")
96
  print(f"Batch keys: {list(batch.keys())}")
 
163
  def preprocess(examples):
164
  print("Tunisian Corpus preprocess exemples -------------",examples)
165
  # Use 'Tweet' field as per dataset structure
166
+ return tokenizer(
167
+ examples['Tweet'],
168
+ padding=False, # We don't need padding for token coverage
169
+ truncation=False, # Don't truncate long sequences
170
+ max_length=None # Let tokenizer handle the length
171
+ )
172
 
173
  dataset = dataset.map(preprocess, batched=True)
174
 
 
177
  covered_tokens = 0
178
 
179
  for example in dataset:
180
+ # Get the tokenized input IDs
181
+ input_ids = example['input_ids']
182
+
183
+ # Convert to tokens and count
184
+ tokens = tokenizer.convert_ids_to_tokens(input_ids)
185
  total_tokens += len(tokens)
186
  covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
187
 
 
190
  return {"coverage": coverage}
191
  except Exception as e:
192
  print(f"Error in Tunisian Corpus evaluation: {str(e)}")
193
+ print(f"Full traceback: {traceback.format_exc()}")
194
+ raise e
195
 
196
  def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
197
  """Evaluate a single model on all tasks"""
 
339
 
340
  # Find all JSON files in the model directory
341
  json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
342
+ print(f"Found {len(json_files)} pending evaluation requests")
 
343
  for file in json_files:
344
  file_path = os.path.join(model_dir_path, file)
345
+ print(f" - {file_path}")
346
  try:
347
  with open(file_path, 'r') as f:
348
  eval_entry = json.load(f)
349
 
350
+ # Check if this is a pending or running evaluation
351
+ status = eval_entry.get('status', '')
352
+ if status == EvaluationStatus.PENDING.value:
353
  print(f"\n=== Found pending evaluation ===")
354
  print(f"Model: {eval_entry['model']}")
355
  print(f"Revision: {eval_entry['revision']}")
 
444
  print("\nError file uploaded to Hugging Face")
445
  except Exception as upload_error:
446
  print(f"Error uploading error file: {str(upload_error)}")
447
+ elif status == EvaluationStatus.RUNNING.value:
448
+ print(f"\n=== Found running evaluation ===")
449
+ print(f"Model: {eval_entry['model']}")
450
+ print(f"Revision: {eval_entry['revision']}")
451
+ print(f"Precision: {eval_entry['precision']}")
452
+ print(f"Weight type: {eval_entry['weight_type']}")
453
+
454
+ try:
455
+ # Check if we have results for this evaluation
456
+ result_filename = os.path.basename(file_path)
457
+ result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
458
+
459
+ if os.path.exists(result_path):
460
+ print(f"\nFound existing results file: {result_path}")
461
+ # Update status to FINISHED
462
+ eval_entry['status'] = EvaluationStatus.FINISHED.value
463
+ with open(file_path, 'w') as f:
464
+ json.dump(eval_entry, f, indent=2)
465
+ else:
466
+ print("\nNo results found. Restarting evaluation...")
467
+ # Restart the evaluation
468
+ eval_entry['status'] = EvaluationStatus.PENDING.value
469
+ with open(file_path, 'w') as f:
470
+ json.dump(eval_entry, f, indent=2)
471
+ except Exception as check_error:
472
+ print(f"\n=== Error checking running evaluation ===")
473
+ print(f"Error: {str(check_error)}")
474
+ print(f"Full traceback: {traceback.format_exc()}")
475
+
476
+ # If we can't check the status, restart the evaluation
477
+ eval_entry['status'] = EvaluationStatus.PENDING.value
478
+ with open(file_path, 'w') as f:
479
+ json.dump(eval_entry, f, indent=2)
480
  except Exception as e:
481
  print(f"Error processing file {file}: {str(e)}")
482
  print(f"Full traceback: {traceback.format_exc()}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
483
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
484
 
485
+ print(f"\n=== Evaluation queue summary ===")
486
+ print(f"Total directories checked: {len(model_dirs)}")
487
+ print(f"Total files processed: {len(json_files)}")
488
+ print(f"\nEvaluation queue processed. Sleeping for 5 minutes...")
489
+ return
 
 
 
 
 
 
 
 
490
 
src/leaderboard/read_evals.py CHANGED
@@ -61,7 +61,7 @@ class EvalResult:
61
  model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
62
  weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
63
  date=data.get('submitted_at', ''),
64
- still_on_hub=is_model_on_hub(model_name)
65
  )
66
  except Exception as e:
67
  print(f"Error reading evaluation file {json_filepath}: {str(e)}")
@@ -85,7 +85,7 @@ class EvalResult:
85
  full_model = "/".join(org_and_model)
86
 
87
  still_on_hub, _, model_config = is_model_on_hub(
88
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
89
  )
90
  architecture = "?"
91
  if model_config is not None:
@@ -151,7 +151,7 @@ class EvalResult:
151
  AutoEvalColumnInstance.license.name: self.license,
152
  AutoEvalColumnInstance.likes.name: self.likes,
153
  AutoEvalColumnInstance.params.name: self.num_params,
154
- AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
155
  }
156
 
157
  for task in Tasks:
@@ -188,24 +188,28 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
188
  for root, _, files in os.walk(results_path):
189
  # Only process .json files
190
  json_files = [f for f in files if f.endswith(".json")]
 
191
  for file in json_files:
192
  model_result_filepaths.append(os.path.join(root, file))
 
193
 
194
  eval_results = {}
195
  for model_result_filepath in model_result_filepaths:
196
  try:
197
  # Creation of result
198
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
 
199
  if eval_result is None:
200
  print(f"Skipping invalid evaluation file: {model_result_filepath}")
201
  continue
202
 
203
  eval_result.update_with_request_file(requests_path)
204
-
205
  # Store results of same eval together
206
  if eval_result.eval_name not in eval_results:
207
  eval_results[eval_result.eval_name] = []
208
  eval_results[eval_result.eval_name].append(eval_result)
 
209
 
210
  except Exception as e:
211
  print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
@@ -214,16 +218,47 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResu
214
  # Store results of same eval together
215
  eval_name = eval_result.eval_name
216
  if eval_name in eval_results.keys():
217
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
 
218
  else:
219
- eval_results[eval_name] = eval_result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
  results = []
222
- for v in eval_results.values():
 
 
223
  try:
224
- v.to_dict() # we test if the dict version is complete
225
  results.append(v)
226
  except KeyError as e: # not all eval values present
227
- print(e)
228
  continue
229
  return results
 
61
  model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
62
  weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
63
  date=data.get('submitted_at', ''),
64
+ still_on_hub=is_model_on_hub(model_name, revision="main")
65
  )
66
  except Exception as e:
67
  print(f"Error reading evaluation file {json_filepath}: {str(e)}")
 
85
  full_model = "/".join(org_and_model)
86
 
87
  still_on_hub, _, model_config = is_model_on_hub(
88
+ full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
89
  )
90
  architecture = "?"
91
  if model_config is not None:
 
151
  AutoEvalColumnInstance.license.name: self.license,
152
  AutoEvalColumnInstance.likes.name: self.likes,
153
  AutoEvalColumnInstance.params.name: self.num_params,
154
+ AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
155
  }
156
 
157
  for task in Tasks:
 
188
  for root, _, files in os.walk(results_path):
189
  # Only process .json files
190
  json_files = [f for f in files if f.endswith(".json")]
191
+ print(json_files)
192
  for file in json_files:
193
  model_result_filepaths.append(os.path.join(root, file))
194
+ print(model_result_filepaths)
195
 
196
  eval_results = {}
197
  for model_result_filepath in model_result_filepaths:
198
  try:
199
  # Creation of result
200
  eval_result = EvalResult.init_from_json_file(model_result_filepath)
201
+ # print(eval_result)
202
  if eval_result is None:
203
  print(f"Skipping invalid evaluation file: {model_result_filepath}")
204
  continue
205
 
206
  eval_result.update_with_request_file(requests_path)
207
+ # print(eval_result)
208
  # Store results of same eval together
209
  if eval_result.eval_name not in eval_results:
210
  eval_results[eval_result.eval_name] = []
211
  eval_results[eval_result.eval_name].append(eval_result)
212
+ # print(eval_results)
213
 
214
  except Exception as e:
215
  print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
 
218
  # Store results of same eval together
219
  eval_name = eval_result.eval_name
220
  if eval_name in eval_results.keys():
221
+ # If we already have results for this eval, append to list
222
+ eval_results[eval_name].append(eval_result)
223
  else:
224
+ # Initialize list for this eval name
225
+ eval_results[eval_name] = [eval_result]
226
+
227
+ # Process final results
228
+ final_results = {}
229
+ for eval_name, eval_list in eval_results.items():
230
+ # Create merged results from all evaluations, ensuring all required task keys are present
231
+ merged_results = {task.value.benchmark: None for task in Tasks}
232
+ for eval_result in eval_list:
233
+ merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
234
+
235
+ # Take the first eval_result as base and update with merged results
236
+ print("evaluation list : ", eval_list)
237
+ base_result = eval_list[0]
238
+ # print(base_result)
239
+ final_results[eval_name] = EvalResult(
240
+ eval_name=eval_name,
241
+ full_model=base_result.full_model,
242
+ org=base_result.org,
243
+ model=base_result.model,
244
+ revision=base_result.revision,
245
+ results=merged_results,
246
+ precision=base_result.precision,
247
+ model_type=base_result.model_type,
248
+ weight_type=base_result.weight_type,
249
+ date=base_result.date,
250
+ still_on_hub=base_result.still_on_hub
251
+ )
252
+ print(final_results)
253
 
254
  results = []
255
+ for v in final_results.values():
256
+ print("v : ",v)
257
+ print("Merged results: ", v.results)
258
  try:
259
+ v.to_dict() # we test if the dict version is complete
260
  results.append(v)
261
  except KeyError as e: # not all eval values present
262
+ print("error in v",e)
263
  continue
264
  return results
src/populate.py CHANGED
@@ -11,14 +11,20 @@ from src.leaderboard.read_evals import get_raw_eval_results
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
 
14
  all_data_json = [v.to_dict() for v in raw_data]
 
15
  df = pd.DataFrame.from_records(all_data_json)
 
16
  if df.empty:
17
  print("No evaluation results found. Returning empty DataFrame with correct columns.")
18
  return pd.DataFrame(columns=cols)
19
  df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
 
20
  df = df[cols].round(decimals=2)
21
- df = df[has_no_nan_values(df, benchmark_cols)]
 
 
22
  return df
23
 
24
 
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
+ print(raw_data)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
+ print(all_data_json)
17
  df = pd.DataFrame.from_records(all_data_json)
18
+ print(df)
19
  if df.empty:
20
  print("No evaluation results found. Returning empty DataFrame with correct columns.")
21
  return pd.DataFrame(columns=cols)
22
  df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
23
+ print(df)
24
  df = df[cols].round(decimals=2)
25
+ print(df)
26
+ # df = df[has_no_nan_values(df, benchmark_cols)]
27
+ # print(df)
28
  return df
29
 
30