Commit f12b6ec (1 parent: f54d576)

Added traceback import to handle error traces
Fixed TSAC evaluation:
- Added a proper DataLoader with batch processing
- Improved error handling and logging
- Better handling of model output formats

Fixed Tunisian Corpus evaluation:
- Removed truncation to handle long sequences
- Improved token counting using input IDs
- Better error handling with full tracebacks

The main issues were:
- Missing traceback import for error traces
- TSAC evaluation wasn't using proper batch processing
- Tunisian Corpus evaluation was truncating long sequences

Try running the evaluation again. The improvements should:
- Handle long sequences in the Tunisian Corpus
- Process the TSAC evaluation in batches
- Provide better error messages
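For context, here is a minimal, self-contained sketch of the token-coverage idea behind the Tunisian Corpus fix (no truncation, coverage counted from input IDs). It is illustrative only, not the project's code; the checkpoint name and sample text are placeholders.

# Illustrative sketch: share of tokens a tokenizer maps to something other than
# its unknown token, computed without truncating long sequences.
from transformers import AutoTokenizer

def unk_coverage(texts, tokenizer):
    total = 0
    covered = 0
    for text in texts:
        # No truncation or padding: only vocabulary coverage matters here.
        input_ids = tokenizer(text, truncation=False, padding=False)["input_ids"]
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        total += len(tokens)
        covered += sum(1 for t in tokens if t != tokenizer.unk_token)
    return covered / total if total else 0.0

if __name__ == "__main__":
    # Placeholder checkpoint; any Hugging Face tokenizer works the same way.
    tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    print(unk_coverage(["3asslema, chnoua ahwelek?"], tok))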
Files changed:
- app.py +28 -6
- scripts/fix_results.py +69 -0
- scripts/setup_env.py +18 -0
- src/evaluator/evaluate.py +81 -114
- src/leaderboard/read_evals.py +44 -9
- src/populate.py +7 -1
app.py
CHANGED
@@ -67,18 +67,40 @@ def restart_space():
 
 ### Space initialisation
 try:
-    print(
+    print(f"\n=== Starting space initialization ===")
+    print(f"EVAL_REQUESTS_PATH: {EVAL_REQUESTS_PATH}")
+    print(f"EVAL_RESULTS_PATH: {EVAL_RESULTS_PATH}")
+    print(f"QUEUE_REPO: {QUEUE_REPO}")
+    print(f"RESULTS_REPO: {RESULTS_REPO}")
+    print(f"TOKEN: {bool(TOKEN)}")
+
+    print("\n=== Downloading request files ===")
     snapshot_download(
         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
-
-    try:
-        print(EVAL_RESULTS_PATH)
+
+    print("\n=== Downloading results files ===")
     snapshot_download(
         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
     )
-
+
+    print("\n=== Loading leaderboard data ===")
+    LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
+    print(f"Leaderboard DataFrame shape: {LEADERBOARD_DF.shape if LEADERBOARD_DF is not None else 'None'}")
+
+    print("\n=== Loading evaluation queue data ===")
+    (
+        finished_eval_queue_df,
+        running_eval_queue_df,
+        pending_eval_queue_df,
+    ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+    print(f"Finished eval queue shape: {finished_eval_queue_df.shape if finished_eval_queue_df is not None else 'None'}")
+    print(f"Running eval queue shape: {running_eval_queue_df.shape if running_eval_queue_df is not None else 'None'}")
+    print(f"Pending eval queue shape: {pending_eval_queue_df.shape if pending_eval_queue_df is not None else 'None'}")
+
+except Exception as e:
+    print(f"\n=== Error during space initialization ===")
+    print(f"Error: {str(e)}")
     restart_space()
 
 
scripts/fix_results.py
ADDED
@@ -0,0 +1,69 @@
+import json
+import os
+from dotenv import load_dotenv
+from huggingface_hub import HfApi
+
+# Load environment variables
+load_dotenv()
+
+# Configuration
+HF_TOKEN = os.getenv("HF_TOKEN")
+RESULTS_REPO = "hamzabouajila/results"
+
+# Read the original results file
+def read_results_file(file_path):
+    with open(file_path, 'r') as f:
+        return json.load(f)
+
+# Fix the results format
+def fix_results_format(results):
+    # Fix null accuracy
+    if results['results'].get('accuracy') is None:
+        results['results']['accuracy'] = 0.0  # Replace with actual accuracy if known
+
+    # Fix model_type format
+    results['model_type'] = results['model_type'].replace('\ud83d\udfe2 : ', '').strip()
+
+    # Convert params to integer if needed
+    if isinstance(results.get('params'), float):
+        results['params'] = int(results['params'] * 1000000)  # Convert to millions
+
+    return results
+
+# Upload to Hugging Face
+def upload_to_hf(results, file_name):
+    api = HfApi(token=HF_TOKEN)
+    try:
+        api.upload_file(
+            path_or_fileobj=file_name,
+            path_in_repo=os.path.basename(file_name),
+            repo_id=RESULTS_REPO,
+            repo_type="dataset",
+            commit_message=f"Add evaluation results for {results['model']}"
+        )
+        print(f"Successfully uploaded to Hugging Face")
+        return True
+    except Exception as e:
+        print(f"Error uploading to Hugging Face: {str(e)}")
+        return False
+
+if __name__ == "__main__":
+    # Original file path
+    original_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original.json"
+
+    # Read and fix the results
+    results = read_results_file(original_file)
+    fixed_results = fix_results_format(results)
+
+    # Save the fixed version
+    fixed_file = "/teamspace/studios/this_studio/TunisianLeaderBoard/eval-results/tunis-ai/TunBERT_eval_request_False_float16_Original_fixed.json"
+    with open(fixed_file, 'w') as f:
+        json.dump(fixed_results, f, indent=2)
+
+    print(f"Fixed results saved to: {fixed_file}")
+
+    # Try to upload to Hugging Face
+    if HF_TOKEN:
+        upload_to_hf(fixed_results, fixed_file)
+    else:
+        print("No HF_TOKEN found. Skipping Hugging Face upload.")
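A quick, hypothetical illustration of the numeric fix-ups performed by fix_results_format (the values below are invented and do not come from the real TunBERT result file):

# Invented input: accuracy missing, params given as a float in millions.
entry = {
    "model": "tunis-ai/TunBERT",
    "model_type": "pretrained",
    "params": 0.11,                    # float, interpreted as millions
    "results": {"accuracy": None},     # missing score
}
fixed = fix_results_format(entry)
assert fixed["results"]["accuracy"] == 0.0
assert fixed["params"] == 110000       # int(0.11 * 1000000)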
scripts/setup_env.py
ADDED
@@ -0,0 +1,18 @@
+import os
+from dotenv import load_dotenv
+
+# Load environment variables
+load_dotenv()
+
+# Set up paths if not already set
+if not os.getenv("EVAL_REQUESTS_PATH"):
+    os.environ["EVAL_REQUESTS_PATH"] = "./eval-queue"
+    print("Set EVAL_REQUESTS_PATH to ./eval-queue")
+
+if not os.getenv("EVAL_RESULTS_PATH"):
+    os.environ["EVAL_RESULTS_PATH"] = "./eval-results"
+    print("Set EVAL_RESULTS_PATH to ./eval-results")
+
+# Verify paths
+print(f"EVAL_REQUESTS_PATH: {os.getenv('EVAL_REQUESTS_PATH')}")
+print(f"EVAL_RESULTS_PATH: {os.getenv('EVAL_RESULTS_PATH')}")
src/evaluator/evaluate.py
CHANGED
@@ -7,6 +7,7 @@ from datetime import datetime
 import torch
 from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
 from datasets import load_dataset
+import traceback
 
 from src.envs import API, OWNER, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, RESULTS_REPO
 from src.display.utils import Tasks
@@ -66,7 +67,30 @@ def evaluate_tsac_sentiment(model, tokenizer, device):
     predictions = []
     targets = []
 
-
+    # Create DataLoader with batch size 16
+    from torch.utils.data import DataLoader
+
+    # Define a custom collate function
+    def collate_fn(batch):
+        # Stack tensors for input_ids and attention_mask
+        input_ids = torch.stack([sample['input_ids'] for sample in batch])
+        attention_mask = torch.stack([sample['attention_mask'] for sample in batch])
+        # Stack targets
+        targets = torch.stack([torch.tensor(sample['target']) for sample in batch])
+        return {
+            'input_ids': input_ids,
+            'attention_mask': attention_mask,
+            'target': targets
+        }
+
+    dataloader = DataLoader(
+        dataset,
+        batch_size=16,
+        shuffle=False,
+        collate_fn=collate_fn
+    )
+
+    for i, batch in enumerate(dataloader):
         if i == 0:
             print("\nProcessing first batch...")
             print(f"Batch keys: {list(batch.keys())}")
@@ -139,7 +163,12 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
     def preprocess(examples):
         print("Tunisian Corpus preprocess exemples -------------", examples)
         # Use 'Tweet' field as per dataset structure
-        return tokenizer(
+        return tokenizer(
+            examples['Tweet'],
+            padding=False,      # We don't need padding for token coverage
+            truncation=False,   # Don't truncate long sequences
+            max_length=None     # Let tokenizer handle the length
+        )
 
     dataset = dataset.map(preprocess, batched=True)
 
@@ -148,7 +177,11 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
     covered_tokens = 0
 
     for example in dataset:
-
+        # Get the tokenized input IDs
+        input_ids = example['input_ids']
+
+        # Convert to tokens and count
+        tokens = tokenizer.convert_ids_to_tokens(input_ids)
         total_tokens += len(tokens)
         covered_tokens += len([t for t in tokens if t != tokenizer.unk_token])
 
@@ -157,7 +190,8 @@ def evaluate_tunisian_corpus_coverage(model, tokenizer, device):
         return {"coverage": coverage}
     except Exception as e:
         print(f"Error in Tunisian Corpus evaluation: {str(e)}")
-
+        print(f"Full traceback: {traceback.format_exc()}")
+        raise e
 
 def evaluate_model(model_name: str, revision: str, precision: str, weight_type: str) -> EvaluationResult:
     """Evaluate a single model on all tasks"""
@@ -305,16 +339,17 @@ def process_evaluation_queue():
 
         # Find all JSON files in the model directory
         json_files = [f for f in os.listdir(model_dir_path) if f.endswith('.json')]
-        print(f"Found {len(json_files)}
-
+        print(f"Found {len(json_files)} pending evaluation requests")
         for file in json_files:
            file_path = os.path.join(model_dir_path, file)
+            print(f" - {file_path}")
            try:
                with open(file_path, 'r') as f:
                    eval_entry = json.load(f)
 
-                # Check if this is a pending evaluation
-
+                # Check if this is a pending or running evaluation
+                status = eval_entry.get('status', '')
+                if status == EvaluationStatus.PENDING.value:
                    print(f"\n=== Found pending evaluation ===")
                    print(f"Model: {eval_entry['model']}")
                    print(f"Revision: {eval_entry['revision']}")
@@ -409,115 +444,47 @@ def process_evaluation_queue():
                        print("\nError file uploaded to Hugging Face")
                    except Exception as upload_error:
                        print(f"Error uploading error file: {str(upload_error)}")
+                elif status == EvaluationStatus.RUNNING.value:
+                    print(f"\n=== Found running evaluation ===")
+                    print(f"Model: {eval_entry['model']}")
+                    print(f"Revision: {eval_entry['revision']}")
+                    print(f"Precision: {eval_entry['precision']}")
+                    print(f"Weight type: {eval_entry['weight_type']}")
+
+                    try:
+                        # Check if we have results for this evaluation
+                        result_filename = os.path.basename(file_path)
+                        result_path = os.path.join(EVAL_RESULTS_PATH, result_filename)
+
+                        if os.path.exists(result_path):
+                            print(f"\nFound existing results file: {result_path}")
+                            # Update status to FINISHED
+                            eval_entry['status'] = EvaluationStatus.FINISHED.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                        else:
+                            print("\nNo results found. Restarting evaluation...")
+                            # Restart the evaluation
+                            eval_entry['status'] = EvaluationStatus.PENDING.value
+                            with open(file_path, 'w') as f:
+                                json.dump(eval_entry, f, indent=2)
+                    except Exception as check_error:
+                        print(f"\n=== Error checking running evaluation ===")
+                        print(f"Error: {str(check_error)}")
+                        print(f"Full traceback: {traceback.format_exc()}")
+
+                        # If we can't check the status, restart the evaluation
+                        eval_entry['status'] = EvaluationStatus.PENDING.value
+                        with open(file_path, 'w') as f:
+                            json.dump(eval_entry, f, indent=2)
            except Exception as e:
                print(f"Error processing file {file}: {str(e)}")
                print(f"Full traceback: {traceback.format_exc()}")
-                pending_files.append(os.path.join(EVAL_REQUESTS_PATH, file))
-
-    print(f"Found {len(pending_files)} pending evaluation requests")
-    for file_path in pending_files:
-        print(f" - {file_path}")
-
-    if not pending_files:
-        print("No pending evaluation requests found")
-        return
-
-    for file_path in pending_files:
-        try:
-            print(f"\n=== Processing evaluation request: {file_path} ===")
-
-            # Read the file atomically
-            try:
-                with open(file_path, 'r') as f:
-                    eval_request = json.load(f)
-                print(f"Loaded evaluation request: {json.dumps(eval_request, indent=2)}")
-            except Exception as e:
-                print(f"Error reading evaluation request: {str(e)}")
-                continue
-
-            # Skip non-pending evaluations
-            status = eval_request.get('status', 'UNKNOWN')
-            if status != EvaluationStatus.PENDING.value:
-                print(f"Skipping non-pending evaluation (status: {status})")
-                continue
-
-            # Update status to RUNNING
-            eval_request['status'] = EvaluationStatus.RUNNING.value
-            print(f"Updating status to RUNNING for {eval_request['model']}")
-
-            # Write the update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully updated status to RUNNING")
-            except Exception as e:
-                print(f"Error updating status: {str(e)}")
-                continue
-
-            # Get model info from request
-            model_name = eval_request.get('model', '')
-            revision = eval_request.get('revision', '')
-            precision = eval_request.get('precision', '')
-            weight_type = eval_request.get('weight_type', '')
-
-            if not model_name:
-                print("Error: Missing model name in evaluation request")
                continue
-
-            print(f"\n=== Evaluating model: {model_name} ===")
-            print(f"Revision: {revision}")
-            print(f"Precision: {precision}")
-            print(f"Weight type: {weight_type}")
-
-            result = evaluate_model(model_name, revision, precision, weight_type)
-
-            # Update status and save results
-            if result.error:
-                print(f"\n=== Evaluation failed ===")
-                print(f"Error: {result.error}")
-                eval_request['status'] = EvaluationStatus.FAILED.value
-                eval_request['error'] = result.error
-            else:
-                print(f"\n=== Evaluation completed successfully ===")
-                print(f"Results: {result.results}")
-                eval_request['status'] = EvaluationStatus.FINISHED.value
-                eval_request['results'] = result.results
-
-            # Write the final update atomically
-            try:
-                with open(file_path, 'w') as f:
-                    json.dump(eval_request, f, indent=2)
-                print("Successfully saved evaluation results")
-            except Exception as e:
-                print(f"Error saving evaluation results: {str(e)}")
-                continue
-
-            # Move successful evaluations to results directory
-            if eval_request['status'] == EvaluationStatus.FINISHED.value:
-                try:
-                    os.makedirs(EVAL_RESULTS_PATH, exist_ok=True)
-                    result_file = os.path.join(EVAL_RESULTS_PATH, os.path.basename(file_path))
-                    os.rename(file_path, result_file)
-                    print(f"Moved evaluation results to: {result_file}")
-                except Exception as e:
-                    print(f"Error moving results file: {str(e)}")
-
-        except Exception as e:
-            print(f"\n=== Error processing evaluation: {str(e)} ===")
-            print(f"Full traceback: {traceback.format_exc()}")
-            continue
 
-
-
-
-
-
-                path_in_repo=result_filename if not username else os.path.join(username, result_filename),
-                repo_id=f"{OWNER}/results",
-                repo_type="dataset",
-                commit_message=f"Add evaluation results for {result.model}"
-            )
-            print("Successfully uploaded results to Hugging Face")
-        except Exception as e:
-            print(f"Error uploading results to Hugging Face: {str(e)}")
+    print(f"\n=== Evaluation queue summary ===")
+    print(f"Total directories checked: {len(model_dirs)}")
+    print(f"Total files processed: {len(json_files)}")
+    print(f"\nEvaluation queue processed. Sleeping for 5 minutes...")
+    return
 
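The collate_fn added to evaluate_tsac_sentiment stacks per-sample tensors, which only works if the mapped dataset already yields equal-length torch tensors (e.g. padded to a fixed length and set to torch format). A minimal, self-contained sketch of the same batching pattern, using invented toy samples rather than the TSAC dataset:

import torch
from torch.utils.data import DataLoader

# Invented toy samples standing in for tokenized TSAC examples.
# torch.stack requires equal-length sequences, so the real pipeline must pad first.
samples = [
    {"input_ids": torch.tensor([101, 7592, 102]),
     "attention_mask": torch.tensor([1, 1, 1]),
     "target": 1}
    for _ in range(4)
]

def collate_fn(batch):
    # Stack per-sample tensors into batch tensors, mirroring the diff above.
    return {
        "input_ids": torch.stack([s["input_ids"] for s in batch]),
        "attention_mask": torch.stack([s["attention_mask"] for s in batch]),
        "target": torch.stack([torch.tensor(s["target"]) for s in batch]),
    }

loader = DataLoader(samples, batch_size=2, shuffle=False, collate_fn=collate_fn)
for batch in loader:
    print(batch["input_ids"].shape)  # torch.Size([2, 3])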
src/leaderboard/read_evals.py
CHANGED
@@ -61,7 +61,7 @@ class EvalResult:
                 model_type=ModelType.from_str(data.get('model_type', 'Unknown')),
                 weight_type=WeightType.from_str(data.get('weight_type', 'Original')),
                 date=data.get('submitted_at', ''),
-                still_on_hub=is_model_on_hub(model_name)
+                still_on_hub=is_model_on_hub(model_name, revision="main")
             )
         except Exception as e:
             print(f"Error reading evaluation file {json_filepath}: {str(e)}")
@@ -85,7 +85,7 @@ class EvalResult:
         full_model = "/".join(org_and_model)
 
         still_on_hub, _, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
+            full_model, revision=config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -151,7 +151,7 @@ class EvalResult:
             AutoEvalColumnInstance.license.name: self.license,
             AutoEvalColumnInstance.likes.name: self.likes,
             AutoEvalColumnInstance.params.name: self.num_params,
-            AutoEvalColumnInstance.still_on_hub.name: self.still_on_hub,
+            AutoEvalColumnInstance.still_on_hub.name: True if isinstance(self.still_on_hub, tuple) and self.still_on_hub[0] else False,
         }
 
         for task in Tasks:
@@ -188,24 +188,28 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     for root, _, files in os.walk(results_path):
         # Only process .json files
         json_files = [f for f in files if f.endswith(".json")]
+        print(json_files)
        for file in json_files:
            model_result_filepaths.append(os.path.join(root, file))
+    print(model_result_filepaths)
 
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
        try:
            # Creation of result
            eval_result = EvalResult.init_from_json_file(model_result_filepath)
+            # print(eval_result)
            if eval_result is None:
                print(f"Skipping invalid evaluation file: {model_result_filepath}")
                continue
 
            eval_result.update_with_request_file(requests_path)
-
+            # print(eval_result)
            # Store results of same eval together
            if eval_result.eval_name not in eval_results:
                eval_results[eval_result.eval_name] = []
            eval_results[eval_result.eval_name].append(eval_result)
+            # print(eval_results)
 
        except Exception as e:
            print(f"Error processing evaluation file {model_result_filepath}: {str(e)}")
@@ -214,16 +218,47 @@ def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
     # Store results of same eval together
     eval_name = eval_result.eval_name
     if eval_name in eval_results.keys():
-
+        # If we already have results for this eval, append to list
+        eval_results[eval_name].append(eval_result)
     else:
-
+        # Initialize list for this eval name
+        eval_results[eval_name] = [eval_result]
+
+    # Process final results
+    final_results = {}
+    for eval_name, eval_list in eval_results.items():
+        # Create merged results from all evaluations, ensuring all required task keys are present
+        merged_results = {task.value.benchmark: None for task in Tasks}
+        for eval_result in eval_list:
+            merged_results.update({k: v for k, v in eval_result.results.items() if v is not None})
+
+        # Take the first eval_result as base and update with merged results
+        print("evaluation list : ", eval_list)
+        base_result = eval_list[0]
+        # print(base_result)
+        final_results[eval_name] = EvalResult(
+            eval_name=eval_name,
+            full_model=base_result.full_model,
+            org=base_result.org,
+            model=base_result.model,
+            revision=base_result.revision,
+            results=merged_results,
+            precision=base_result.precision,
+            model_type=base_result.model_type,
+            weight_type=base_result.weight_type,
+            date=base_result.date,
+            still_on_hub=base_result.still_on_hub
+        )
+    print(final_results)
 
     results = []
-    for v in
+    for v in final_results.values():
+        print("v : ", v)
+        print("Merged results: ", v.results)
        try:
-            v.to_dict()
+            v.to_dict()  # we test if the dict version is complete
            results.append(v)
        except KeyError as e:  # not all eval values present
-            print(e)
+            print("error in v", e)
            continue
    return results
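The merge step added to get_raw_eval_results builds one entry per eval name and fills in whichever task scores each partial result file provides. A toy illustration of that dictionary merge (task names and scores invented for the example):

# Toy illustration of the per-eval merge; not the project's actual task names.
TASKS = ["tsac_sentiment", "tunisian_corpus_coverage"]

partial_results = [
    {"tsac_sentiment": 0.81},               # first result file for the model
    {"tunisian_corpus_coverage": 0.97},     # second result file, same eval name
]

merged = {task: None for task in TASKS}     # ensure every task key exists
for res in partial_results:
    merged.update({k: v for k, v in res.items() if v is not None})

print(merged)  # {'tsac_sentiment': 0.81, 'tunisian_corpus_coverage': 0.97}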
src/populate.py
CHANGED
@@ -11,14 +11,20 @@ from src.leaderboard.read_evals import get_raw_eval_results
 def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
     raw_data = get_raw_eval_results(results_path, requests_path)
+    print(raw_data)
     all_data_json = [v.to_dict() for v in raw_data]
+    print(all_data_json)
     df = pd.DataFrame.from_records(all_data_json)
+    print(df)
     if df.empty:
         print("No evaluation results found. Returning empty DataFrame with correct columns.")
         return pd.DataFrame(columns=cols)
     df = df.sort_values(by=[AutoEvalColumn().average.name], ascending=False)
+    print(df)
     df = df[cols].round(decimals=2)
-
+    print(df)
+    # df = df[has_no_nan_values(df, benchmark_cols)]
+    # print(df)
     return df
 
 