Commit f12b6ec (1 parent: f54d576)

Added traceback import to handle error traces
Fixed TSAC evaluation:
- Added a proper DataLoader with batch processing
- Improved error handling and logging
- Better handling of model output formats

Fixed Tunisian Corpus evaluation:
- Removed truncation to handle long sequences
- Improved token counting using input IDs
- Better error handling with full tracebacks

The main issues were:
- Missing traceback import for error traces
- TSAC evaluation wasn't using proper batch processing
- Tunisian Corpus evaluation was truncating long sequences

Try running the evaluation again. The improvements should:
- Handle long sequences in the Tunisian Corpus
- Process the TSAC evaluation in batches
- Provide better error messages
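For context, here is a minimal, self-contained sketch of the token-coverage idea behind the Tunisian Corpus fix (no truncation, coverage counted from input IDs). It is illustrative only, not the project's code; the checkpoint name and sample text are placeholders.

# Illustrative sketch: share of tokens a tokenizer maps to something other than
# its unknown token, computed without truncating long sequences.
from transformers import AutoTokenizer

def unk_coverage(texts, tokenizer):
    total = 0
    covered = 0
    for text in texts:
        # No truncation or padding: only vocabulary coverage matters here.
        input_ids = tokenizer(text, truncation=False, padding=False)["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        total += len(tokens)
        covered += sum(1 for t in tokens if t != tokenizer.unk_token)
    return covered / total if total else 0.0

if __name__ == "__main__":
    # Placeholder checkpoint; any Hugging Face tokenizer works the same way.
    tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    print(unk_coverage(["3asslema, chnoua ahwelek?"], tok))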
Files changed:
- app.py +28 -6
- scripts/fix_results.py +69 -0
- scripts/setup_env.py +18 -0
- src/evaluator/evaluate.py +81 -114
- src/leaderboard/read_evals.py +44 -9
- src/populate.py +7 -1
app.py
CHANGED
@@ -67,18 +67,40 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(
+    print(f"\n=== Starting space initialization ===")
+    print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
+    print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+    print(f"QUEUE_REPO: {QUEUE_REPO}")
+    print(f"RESULTS_REPO: {RESULTS_REPO}")
+    print(f"TOKEN: {bool(TOKEN)}")
+
+    print("\n=== Downloading request files ===")
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
-
-    try:
-        print(EVAL_RESULTS_PATH)
+
+    print("\n=== Downloading results files ===")
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
+
+    print("\n=== Loading leaderboard data ===")
+    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+    print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
+
+    print("\n=== Loading evaluation queue data ===")
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
+    print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
+    print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
+
+except Exception as e:
+    print(f"\n=== Error during space initialization ===")
+    print(f"Error: {str(e)}")
     restart_space()
 
 
scripts/fix_results.py
ADDED
@@ -0,0 +1,69 @@
+import json
+import os
+from dotenv import load_dotenv
+from huggingface_hub import HfApi
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+HF_TOKEN = os.getenv("HF_TOKEN")
+RESULTS_REPO = "hamzabouajila/results"
+
+# Read the original results file
+def read_results_file(file_path):
+    with open(file_path, 'r') as f:
+        return json.load(f)
+
+# Fix the results format
+def fix_results_format(results):
+    # Fix null accuracy
+    if results['results'].get('accuracy') is None:
+        results['results']['accuracy'] = 0.0  # Replace with actual accuracy if known
+
+    # Fix model_type format
+    results['model_type'] = results['model_type'].replace('\ud83d\udfe2 : ', '').strip()
+
+    # Convert params to integer if needed
+    if isinstance(results.get('params'), float):
+        results['params'] = int(results['params'] * 1000000)  # Convert to millions
+
+    return results
+
+# Upload to Hugging Face
+def upload_to_hf(results, file_name):
+    api = HfApi(token=HF_TOKEN)
+    try:
+        api.upload_file(
+            path_or_fileobj=file_name,
+            path_in_repo=os.path.basename(file_name),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {results['model']}"
+        )
+        print(f"Successfully uploaded to Hugging Face")
+        return True
+    except Exception as e:
+        print(f"Error uploading to Hugging Face: {str(e)}")
+        return False
+
+if __name__ == "__main__":
+    # Original file path
+    original_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original.json"
+
+    # Read and fix the results
+    results = read_results_file(original_file)
+    fixed_results = fix_results_format(results)
+
+    # Save the fixed version
+    fixed_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original_fixed.json"
+    with open(fixed_file, 'w') as f:
+        json.dump(fixed_results, f, indent=2)
+
+    print(f"Fixed results saved to: {fixed_file}")
+
+    # Try to upload to Hugging Face
+    if HF_TOKEN:
+        upload_to_hf(fixed_results, fixed_file)
+    else:
+        print("No HF_TOKEN found. Skipping Hugging Face upload.")
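A quick, hypothetical illustration of the numeric fix-ups performed by fix_results_format (the values below are invented and do not come from the real TunBERT result file):

# Invented input: accuracy missing, params given as a float in millions.
entry = {
    "model": "tunis-ai/TunBERT",
    "model_type": "pretrained",
    "params": 0.11,                    # float, interpreted as millions
    "results": {"accuracy": None},     # missing score
}
fixed = fix_results_format(entry)
assert fixed["results"]["accuracy"] == 0.0
assert fixed["params"] == 110000       # int(0.11 * 1000000)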
scripts/setup_env.py
ADDED
@@ -0,0 +1,18 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Set up paths if not already set
+if not os.getenv("EVAL_REQUESTS_PATH"):
+    os.environ["EVAL_REQUESTS_PATH"] = "./eval-queue"
+    print("Set EVAL_REQUESTS_PATH to ./eval-queue")
+
+if not os.getenv("EVAL_RESULTS_PATH"):
+    os.environ["EVAL_RESULTS_PATH"] = "./eval-results"
+    print("Set EVAL_RESULTS_PATH to ./eval-results")
+
+# Verify paths
+print(f"EVAL_REQUESTS_PATH: {os.getenv('EVAL_REQUESTS_PATH')}")
+print(f"EVAL_RESULTS_PATH: {os.getenv('EVAL_RESULTS_PATH')}")
src/evaluator/evaluate.py
CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from datasets import load_dataset
+import traceback
 
 from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.display.utils import Tasks
@@ -66,7 +67,30 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
     predictions = []
     targets = []
 
-
+    # Create DataLoader with batch size 16
+    from torch.utils.data import DataLoader
+
+    # Define a custom collate function
+    def collate_fn(batch):
+        # Stack tensors for input_ids and attention_mask
+        input_ids = torch.stack([sample['input_ids'] for sample in batch])
+        attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
+        # Stack targets
+        targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'target': targets
+        }
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=16,
+        shuffle=False,
+        collate_fn=collate_fn
+    )
+
+    for i, batch in enumerate(dataloader):
         if i == 0:
             print("\nProcessing first batch...")
             print(f"Batch keys: {list(batch.keys())}")
@@ -139,7 +163,12 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
     def preprocess(examples):
         print("Tunisian Corpus preprocess exemples -------------", examples)
         # Use 'Tweet' field as per dataset structure
-        return tokenizer(
+        return tokenizer(
+            examples['Tweet'],
+            padding=False,      # We don't need padding for token coverage
+            truncation=False,   # Don't truncate long sequences
+            max_length=None     # Let tokenizer handle the length
+        )
 
     dataset = dataset.map(preprocess, batched=True)
 
@@ -148,7 +177,11 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
     covered_tokens = 0
 
     for example in dataset:
-
+        # Get the tokenized input IDs
+        input_ids = example['input_ids']
+
+        # Convert to tokens and count
+        tokens = tokenizer.convert_ids_to_tokens(input_ids)
         total_tokens += len(tokens)
         covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
 
@@ -157,7 +190,8 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
         return {"coverage": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
-
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
@@ -305,16 +339,17 @@ def process_evaluation_queue():
 
         # Find all JSON files in the model directory
         json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
-        print(f"Found {len(json_files)}
-
+        print(f"Found {len(json_files)} pending evaluation requests")
         for file in json_files:
            file_path = os.path.join(model_dir_path, file)
+            print(f" - {file_path}")
            try:
                with open(file_path, 'r') as f:
                    eval_entry = json.load(f)
 
-                # Check if this is a pending evaluation
-
+                # Check if this is a pending or running evaluation
+                status = eval_entry.get('status', '')
+                if status == EvaluationStatus.PENDING.value:
                    print(f"\n=== Found pending evaluation ===")
                    print(f"Model: {eval_entry['model']}")
                    print(f"Revision: {eval_entry['revision']}")
@@ -409,115 +444,47 @@ def process_evaluation_queue():
                        print("\nError file uploaded to Hugging Face")
                    except Exception as upload_error:
                        print(f"Error uploading error file: {str(upload_error)}")
+                elif status == EvaluationStatus.RUNNING.value:
+                    print(f"\n=== Found running evaluation ===")
+                    print(f"Model: {eval_entry['model']}")
+                    print(f"Revision: {eval_entry['revision']}")
+                    print(f"Precision: {eval_entry['precision']}")
+                    print(f"Weight type: {eval_entry['weight_type']}")
+
+                    try:
+                        # Check if we have results for this evaluation
+                        result_filename = os.path.basename(file_path)
+                        result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+                        if os.path.exists(result_path):
+                            print(f"\nFound existing results file: {result_path}")
+                            # Update status to FINISHED
+                            eval_entry['status'] = EvaluationStatus.FINISHED.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                        else:
+                            print("\nNo results found. Restarting evaluation...")
+                            # Restart the evaluation
+                            eval_entry['status'] = EvaluationStatus.PENDING.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                    except Exception as check_error:
+                        print(f"\n=== Error checking running evaluation ===")
+                        print(f"Error: {str(check_error)}")
+                        print(f"Full traceback: {traceback.format_exc()}")
+
+                        # If we can't check the status, restart the evaluation
+                        eval_entry['status'] = EvaluationStatus.PENDING.value
+                        with open(file_path, 'w') as f:
+                            json.dump(eval_entry, f, indent=2)
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
                print(f"Full traceback: {traceback.format_exc()}")
-                pending_files.append(os.path.join(EVAL_REQUESTS_PATH, file))
-
-    print(f"Found {len(pending_files)} pending evaluation requests")
-    for file_path in pending_files:
-        print(f" - {file_path}")
-
-    if not pending_files:
-        print("No pending evaluation requests found")
-        return
-
-    for file_path in pending_files:
-        try:
-            print(f"\n=== Processing evaluation request: {file_path} ===")
-
-            # Read the file atomically
-            try:
-                with open(file_path, 'r') as f:
-                    eval_request = json.load(f)
-                print(f"Loaded evaluation request: {json.dumps(eval_request, indent=2)}")
-            except Exception as e:
-                print(f"Error reading evaluation request: {str(e)}")
-                continue
-
-            # Skip non-pending evaluations
-            status = eval_request.get('status', 'UNKNOWN')
-            if status != EvaluationStatus.PENDING.value:
-                print(f"Skipping non-pending evaluation (status: {status})")
-                continue
-
-            # Update status to RUNNING
-            eval_request['status'] = EvaluationStatus.RUNNING.value
-            print(f"Updating status to RUNNING for {eval_request['model']}")
-
-            # Write the update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully updated status to RUNNING")
-            except Exception as e:
-                print(f"Error updating status: {str(e)}")
-                continue
-
-            # Get model info from request
-            model_name = eval_request.get('model', '')
-            revision = eval_request.get('revision', '')
-            precision = eval_request.get('precision', '')
-            weight_type = eval_request.get('weight_type', '')
-
-            if not model_name:
-                print("Error: Missing model name in evaluation request")
                continue
-
-            print(f"\n=== Evaluating model: {model_name} ===")
-            print(f"Revision: {revision}")
-            print(f"Precision: {precision}")
-            print(f"Weight type: {weight_type}")
-
-            result = evaluate_model(model_name, revision, precision, weight_type)
-
-            # Update status and save results
-            if result.error:
-                print(f"\n=== Evaluation failed ===")
-                print(f"Error: {result.error}")
-                eval_request['status'] = EvaluationStatus.FAILED.value
-                eval_request['error'] = result.error
-            else:
-                print(f"\n=== Evaluation completed successfully ===")
-                print(f"Results: {result.results}")
-                eval_request['status'] = EvaluationStatus.FINISHED.value
-                eval_request['results'] = result.results
-
-            # Write the final update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully saved evaluation results")
-            except Exception as e:
-                print(f"Error saving evaluation results: {str(e)}")
-                continue
-
-            # Move successful evaluations to results directory
-            if eval_request['status'] == EvaluationStatus.FINISHED.value:
-                try:
-                    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
-                    result_file = os.path.join(EVAL_RESULTS_PATH, os.path.basename(file_path))
-                    os.rename(file_path, result_file)
-                    print(f"Moved evaluation results to: {result_file}")
-                except Exception as e:
-                    print(f"Error moving results file: {str(e)}")
-
-        except Exception as e:
-            print(f"\n=== Error processing evaluation: {str(e)} ===")
-            print(f"Full traceback: {traceback.format_exc()}")
-            continue
 
-
-
-
-
-
-                path_in_repo=result_filename if not username else os.path.join(username, result_filename),
-                repo_id=f"{OWNER}/results",
-                repo_type="dataset",
-                commit_message=f"Add evaluation results for {result.model}"
-            )
-            print("Successfully uploaded results to Hugging Face")
-        except Exception as e:
-            print(f"Error uploading results to Hugging Face: {str(e)}")
+    print(f"\n=== Evaluation queue summary ===")
+    print(f"Total directories checked: {len(model_dirs)}")
+    print(f"Total files processed: {len(json_files)}")
+    print(f"\nEvaluation queue processed. Sleeping for 5 minutes...")
+    return
 
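The collate_fn added to evaluate_tsac_sentiment stacks per-sample tensors, which only works if the mapped dataset already yields equal-length torch tensors (e.g. padded to a fixed length and set to torch format). A minimal, self-contained sketch of the same batching pattern, using invented toy samples rather than the TSAC dataset:

import torch
from torch.utils.data import DataLoader

# Invented toy samples standing in for tokenized TSAC examples.
# torch.stack requires equal-length sequences, so the real pipeline must pad first.
samples = [
    {"input_ids": torch.tensor([101, 7592, 102]),
     "attention_mask": torch.tensor([1, 1, 1]),
     "target": 1}
    for _ in range(4)
]

def collate_fn(batch):
    # Stack per-sample tensors into batch tensors, mirroring the diff above.
    return {
        "input_ids": torch.stack([s["input_ids"] for s in batch]),
        "attention_mask": torch.stack([s["attention_mask"] for s in batch]),
        "target": torch.stack([torch.tensor(s["target"]) for s in batch]),
    }

loader = DataLoader(samples, batch_size=2, shuffle=False, collate_fn=collate_fn)
for batch in loader:
    print(batch["input_ids"].shape)  # torch.Size([2, 3])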
src/leaderboard/read_evals.py
CHANGED
@@ -61,7 +61,7 @@ class EvalResult:
                 model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
                 weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
                 date=data.get('submitted_at', ''),
-                still_on_hub=is_model_on_hub(model_name)
+                still_on_hub=is_model_on_hub(model_name, revision="main")
             )
         except Exception as e:
             print(f"Error reading evaluation file {json_filepath}: {str(e)}")
@@ -85,7 +85,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -151,7 +151,7 @@ class EvalResult:
             AutoEvalColumnInstance.license.name: self.license,
             AutoEvalColumnInstance.likes.name: self.likes,
             AutoEvalColumnInstance.params.name: self.num_params,
-            AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }
 
         for task in Tasks:
@@ -188,24 +188,28 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for root, _, files in os.walk(results_path):
         # Only process .json files
         json_files = [f for f in files if f.endswith(".json")]
+        print(json_files)
        for file in json_files:
            model_result_filepaths.append(os.path.join(root, file))
+    print(model_result_filepaths)
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
        try:
            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            # print(eval_result)
            if eval_result is None:
                print(f"Skipping invalid evaluation file: {model_result_filepath}")
                continue
 
            eval_result.update_with_request_file(requests_path)
-
+            # print(eval_result)
            # Store results of same eval together
            if eval_result.eval_name not in eval_results:
                eval_results[eval_result.eval_name] = []
            eval_results[eval_result.eval_name].append(eval_result)
+            # print(eval_results)
 
        except Exception as e:
            print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
@@ -214,16 +218,47 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     # Store results of same eval together
     eval_name = eval_result.eval_name
     if eval_name in eval_results.keys():
-
+        # If we already have results for this eval, append to list
+        eval_results[eval_name].append(eval_result)
     else:
-
+        # Initialize list for this eval name
+        eval_results[eval_name] = [eval_result]
+
+    # Process final results
+    final_results = {}
+    for eval_name, eval_list in eval_results.items():
+        # Create merged results from all evaluations, ensuring all required task keys are present
+        merged_results = {task.value.benchmark: None for task in Tasks}
+        for eval_result in eval_list:
+            merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
+
+        # Take the first eval_result as base and update with merged results
+        print("evaluation list : ", eval_list)
+        base_result = eval_list[0]
+        # print(base_result)
+        final_results[eval_name] = EvalResult(
+            eval_name=eval_name,
+            full_model=base_result.full_model,
+            org=base_result.org,
+            model=base_result.model,
+            revision=base_result.revision,
+            results=merged_results,
+            precision=base_result.precision,
+            model_type=base_result.model_type,
+            weight_type=base_result.weight_type,
+            date=base_result.date,
+            still_on_hub=base_result.still_on_hub
+        )
+    print(final_results)
 
     results = []
-    for v in
+    for v in final_results.values():
+        print("v : ", v)
+        print("Merged results: ", v.results)
        try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
-            print(e)
+            print("error in v", e)
            continue
    return results
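The merge step added to get_raw_eval_results builds one entry per eval name and fills in whichever task scores each partial result file provides. A toy illustration of that dictionary merge (task names and scores invented for the example):

# Toy illustration of the per-eval merge; not the project's actual task names.
TASKS = ["tsac_sentiment", "tunisian_corpus_coverage"]

partial_results = [
    {"tsac_sentiment": 0.81},               # first result file for the model
    {"tunisian_corpus_coverage": 0.97},     # second result file, same eval name
]

merged = {task: None for task in TASKS}     # ensure every task key exists
for res in partial_results:
    merged.update({k: v for k, v in res.items() if v is not None})

print(merged)  # {'tsac_sentiment': 0.81, 'tunisian_corpus_coverage': 0.97}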
src/populate.py
CHANGED
@@ -11,14 +11,20 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
+    print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
+    print(df)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)
     df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
+    print(df)
     df = df[cols].round(decimals=2)
-
+    print(df)
+    # df = df[has_no_nan_values(df, benchmark_cols)]
+    # print(df)
     return df
 
 