"""Build the leaderboard and evaluation-queue DataFrames from data on the Hugging Face Hub."""

import json
import logging
from typing import List

import numpy as np
import pandas as pd
from huggingface_hub import HfApi

from src.config import QUEUE_REPO, RESULTS_REPO
from src.display.formatting import make_clickable_model
from src.leaderboard.read_evals import get_raw_eval_results

logger = logging.getLogger(__name__)

def get_leaderboard_df(cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
    """Build the leaderboard DataFrame from the individual evaluation results in RESULTS_REPO.

    `benchmark_cols` is currently unused.
    """
    logger.info(f"Fetching evaluation results from {RESULTS_REPO}")

    try:
        # Load the dataset directly
        from datasets import load_dataset
        dataset = load_dataset(RESULTS_REPO, split="train")
        logger.debug(f"Loaded dataset with {len(dataset)} rows")
        logger.debug(f"Dataset features: {dataset.features}")

        # Convert dataset to list of dicts
        all_data_json = [
            {
                "model_id": row["model_id"],
                "revision": row["revision"],
                "precision": row["precision"],
                "security_score": row["security_score"],
                "safetensors_compliant": row["safetensors_compliant"]
            }
            for row in dataset
        ]

        logger.debug(f"Converted dataset to: {json.dumps(all_data_json, indent=2)}")

    except Exception as e:
        logger.error(f"Error loading dataset from {RESULTS_REPO}: {str(e)}", exc_info=True)
        return pd.DataFrame(columns=cols)  # Return empty DataFrame on error

    logger.info(f"Fetched {len(all_data_json)} results")
    logger.debug(f"Data before DataFrame creation: {all_data_json}")

    if not all_data_json:
        logger.warning("No valid data found!")
        return pd.DataFrame(columns=cols)

    df = pd.DataFrame(all_data_json)
    logger.info(f"Created DataFrame with columns: {df.columns.tolist()}")
    logger.debug(f"DataFrame before filtering:\n{df}")

    # Ensure all required columns exist
    for col in cols:
        if col not in df.columns:
            logger.info(f"Adding missing column: {col}")
            df[col] = None

    # Map dataset columns to display columns
    column_mapping = {
        "model_id": "Model",
        "security_score": "Security Score ⬆️",
        "safetensors_compliant": "Safetensors",
        "precision": "Precision"
    }
    
    for src, dst in column_mapping.items():
        if src in df.columns:
            df[dst] = df[src]
            logger.debug(f"Mapped column {src} to {dst}")

    # Sort by Security Score if available
    if "Security Score ⬆️" in df.columns and not df["Security Score ⬆️"].isnull().all():
        df = df.sort_values(by="Security Score ⬆️", ascending=False)
        logger.info("DataFrame sorted by Security Score")
    else:
        logger.warning("Security Score column not found or all values are null, skipping sorting")

    # Make model names clickable
    if "Model" in df.columns:
        df["Model"] = df["Model"].apply(make_clickable_model)

    # Select only the columns we want to display
    df = df[cols]

    # Round numeric columns for display (select_dtypes already guarantees they are numeric)
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].round(decimals=2)

    logger.debug(f"DataFrame after column selection and rounding:\n{df}")
    logger.info(f"Final DataFrame has {len(df)} rows")
    return df


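# Illustrative shape of a single eval-request JSON file in QUEUE_REPO (the keys mirror
# what format_eval_data reads below; the values are made-up examples, not real entries):
# {
#     "model": "org/model-name",
#     "revision": "main",
#     "private": false,
#     "precision": "float16",
#     "weight_type": "Original",
#     "model_type": "pretrained",
#     "status": "PENDING"
# }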
def get_evaluation_queue_df(cols: list) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Build the finished, running, and pending DataFrames for the evaluation queue."""
    logger.info(f"Looking for eval requests in {QUEUE_REPO}")
    all_evals = []

    api = HfApi()

    try:
        # List all files in the repository
        files = api.list_repo_files(repo_id=QUEUE_REPO, repo_type="dataset")

        # Filter for JSON files
        json_files = [f for f in files if f.endswith('.json')]

        for file in json_files:
            try:
                # Download and read each JSON file
                content = api.hf_hub_download(repo_id=QUEUE_REPO, filename=file, repo_type="dataset")
                logger.info(f"Reading JSON file: {file}")
                with open(content, 'r') as fp:
                    data = json.load(fp)

                # Check if data is a list (multiple requests in one file)
                if isinstance(data, list):
                    for item in data:
                        formatted_data = format_eval_data(item)
                        all_evals.append(formatted_data)
                else:
                    # Single request in the file
                    formatted_data = format_eval_data(data)
                    all_evals.append(formatted_data)
            except Exception as e:
                logger.error(f"Error processing file {file}: {str(e)}", exc_info=True)

    except Exception as e:
        logger.error(f"Error fetching requests from {QUEUE_REPO}: {str(e)}", exc_info=True)

    logger.info(f"Found {len(all_evals)} total eval requests")
    pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
    running_list = [e for e in all_evals if e["status"] == "RUNNING"]
    finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]

    logger.info(f"Pending: {len(pending_list)}, Running: {len(running_list)}, Finished: {len(finished_list)}")

    df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
    df_running = pd.DataFrame.from_records(running_list, columns=cols)
    df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
    return df_finished[cols], df_running[cols], df_pending[cols]

def format_eval_data(data: dict) -> dict:
    """Format the evaluation data into the required structure"""
    model_name = data.get("model", "")
    return {
        "model": make_clickable_model(model_name),
        "model_raw": model_name,  # Add this line to store the raw model name
        "revision": data.get("revision", "main"),
        "private": data.get("private", False),
        "precision": data.get("precision", ""),
        "weight_type": data.get("weight_type", ""),
        "model_type": data.get("model_type", ""),
        "status": data.get("status", "")
    }
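

# --- Illustrative usage only: a minimal sketch. The column lists below are assumptions
# chosen to match the mappings in this module, not the project's real configuration. ---
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    # Hypothetical display columns, matching the mapping applied in get_leaderboard_df.
    display_cols = ["Model", "Security Score ⬆️", "Safetensors", "Precision"]
    leaderboard = get_leaderboard_df(cols=display_cols, benchmark_cols=[])
    print(leaderboard.head())

    # Hypothetical queue columns, matching the keys produced by format_eval_data.
    queue_cols = ["model", "revision", "precision", "weight_type", "model_type", "status"]
    finished, running, pending = get_evaluation_queue_df(queue_cols)
    print(f"finished={len(finished)} running={len(running)} pending={len(pending)}")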