import json
import logging
from typing import List

import numpy as np
import pandas as pd
from datasets import load_dataset
from huggingface_hub import HfApi

from src.config import RESULTS_REPO, QUEUE_REPO
from src.display.formatting import make_clickable_model

logger = logging.getLogger(__name__)

def get_leaderboard_df(cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
    """Create the leaderboard DataFrame from the evaluation results dataset.

    ``benchmark_cols`` is accepted but not currently used by this function.
    """
logger.info(f"Fetching evaluation results from {RESULTS_REPO}")
    try:
        # Load the results dataset directly from the Hub
        dataset = load_dataset(RESULTS_REPO, split="train")
logger.debug(f"Loaded dataset with {len(dataset)} rows")
logger.debug(f"Dataset features: {dataset.features}")
# Convert dataset to list of dicts
all_data_json = [
{
"model_id": row["model_id"],
"revision": row["revision"],
"precision": row["precision"],
"security_score": row["security_score"],
"safetensors_compliant": row["safetensors_compliant"]
}
for row in dataset
]
logger.debug(f"Converted dataset to: {json.dumps(all_data_json, indent=2)}")
except Exception as e:
logger.error(f"Error loading dataset from {RESULTS_REPO}: {str(e)}", exc_info=True)
return pd.DataFrame(columns=cols) # Return empty DataFrame on error
logger.info(f"Fetched {len(all_data_json)} results")
logger.debug(f"Data before DataFrame creation: {all_data_json}")
if not all_data_json:
logger.warning("No valid data found!")
return pd.DataFrame(columns=cols)
df = pd.DataFrame(all_data_json)
logger.info(f"Created DataFrame with columns: {df.columns.tolist()}")
logger.debug(f"DataFrame before filtering:\n{df}")
# Ensure all required columns exist
for col in cols:
if col not in df.columns:
logger.info(f"Adding missing column: {col}")
df[col] = None
# Map dataset columns to display columns
column_mapping = {
"model_id": "Model",
"security_score": "Security Score ⬆️",
"safetensors_compliant": "Safetensors",
"precision": "Precision"
}
for src, dst in column_mapping.items():
if src in df.columns:
df[dst] = df[src]
logger.debug(f"Mapped column {src} to {dst}")
# Sort by Security Score if available
if "Security Score ⬆️" in df.columns and not df["Security Score ⬆️"].isnull().all():
df = df.sort_values(by="Security Score ⬆️", ascending=False)
logger.info("DataFrame sorted by Security Score")
else:
logger.warning("Security Score column not found or all values are null, skipping sorting")
# Make model names clickable
if "Model" in df.columns:
df["Model"] = df["Model"].apply(make_clickable_model)
# Select only the columns we want to display
df = df[cols]
    # Round numeric columns to two decimals for display; select_dtypes already
    # guarantees these columns are numeric, so no extra coercion is needed
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].round(decimals=2)
logger.debug(f"DataFrame after column selection and rounding:\n{df}")
logger.info(f"Final DataFrame has {len(df)} rows")
return df
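
# Example usage (a minimal sketch; the real display column lists are defined by
# the app's callers, so the literals below are illustrative assumptions based
# on the column mapping above):
#
#   cols = ["Model", "Security Score ⬆️", "Safetensors", "Precision"]
#   leaderboard_df = get_leaderboard_df(cols, benchmark_cols=["Security Score ⬆️"])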

def get_evaluation_queue_df(cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Create the finished, running, and pending DataFrames for the evaluation queue requests."""
logger.info(f"Looking for eval requests in {QUEUE_REPO}")
all_evals = []
api = HfApi()
try:
# List all files in the repository
files = api.list_repo_files(repo_id=QUEUE_REPO, repo_type="dataset")
# Filter for JSON files
json_files = [f for f in files if f.endswith('.json')]
for file in json_files:
try:
                # Download each JSON file and read it from the returned local path
                local_path = api.hf_hub_download(repo_id=QUEUE_REPO, filename=file, repo_type="dataset")
                logger.info(f"Reading JSON file: {file}")
                with open(local_path, 'r') as fp:
                    data = json.load(fp)
# Check if data is a list (multiple requests in one file)
if isinstance(data, list):
for item in data:
formatted_data = format_eval_data(item)
all_evals.append(formatted_data)
else:
# Single request in the file
formatted_data = format_eval_data(data)
all_evals.append(formatted_data)
except Exception as e:
logger.error(f"Error processing file {file}: {str(e)}", exc_info=True)
except Exception as e:
logger.error(f"Error fetching requests from {QUEUE_REPO}: {str(e)}", exc_info=True)
logger.info(f"Found {len(all_evals)} total eval requests")
pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
running_list = [e for e in all_evals if e["status"] == "RUNNING"]
finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
logger.info(f"Pending: {len(pending_list)}, Running: {len(running_list)}, Finished: {len(finished_list)}")
    # from_records with columns=cols already selects and orders the columns,
    # so no further subsetting is needed
    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished, df_running, df_pending
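
# Example usage (illustrative; the queue column list comes from the caller and
# should match the keys produced by format_eval_data below):
#
#   queue_cols = ["model", "revision", "precision", "weight_type", "model_type", "status"]
#   finished_df, running_df, pending_df = get_evaluation_queue_df(queue_cols)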

def format_eval_data(data: dict) -> dict:
    """Format a raw eval request record into the row structure used by the queue tables."""
model_name = data.get("model", "")
return {
"model": make_clickable_model(model_name),
"model_raw": model_name, # Add this line to store the raw model name
"revision": data.get("revision", "main"),
"private": data.get("private", False),
"precision": data.get("precision", ""),
"weight_type": data.get("weight_type", ""),
"model_type": data.get("model_type", ""),
"status": data.get("status", "")
}
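
# For reference, a request file in QUEUE_REPO is expected to contain either a
# single JSON object or a list of objects shaped roughly as below (a sketch
# inferred from the .get() defaults above; the values are illustrative):
#
#   {
#       "model": "org/model-name",
#       "revision": "main",
#       "private": false,
#       "precision": "float16",
#       "weight_type": "Original",
#       "model_type": "pretrained",
#       "status": "PENDING"
#   }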