Clean up
Browse files
- app.py +15 -24
- debug.py +1 -1
- hub/version.txt +1 -1
- init_huggingface_dataset.py +0 -85
- logs/evaluation.log +0 -0
- logs/security_eval.log +0 -0
- src/populate.py +4 -6
- src/submission/submit.py +1 -4
app.py
CHANGED
@@ -1,11 +1,10 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download
+from huggingface_hub import snapshot_download
 import pandas as pd
 import os
 import logging
-import json
 from datetime import datetime
 from datasets import Dataset
 
@@ -25,9 +24,7 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    AutoEvalColumn,
     ModelType,
-    fields,
     WeightType,
     Precision
 )
@@ -42,7 +39,7 @@ from src.envs import (
     TOKEN
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import
+from src.submission.submit import initialize_queue_repo, initialize_results_repo
 
 
 # Setup logging
@@ -65,13 +62,13 @@ def initialize_space():
     logger.info("Initializing space")
     try:
         logger.info(f"Downloading queue data from {QUEUE_REPO}")
-
+
         # Initialize queue repository if needed
         if not initialize_queue_repo():
             logger.error("Failed to initialize queue repository")
             restart_space()
             return
-
+
         snapshot_download(
             repo_id=QUEUE_REPO,
             local_dir=EVAL_REQUESTS_PATH,
@@ -86,13 +83,13 @@ def initialize_space():
 
     try:
         logger.info(f"Downloading results data from {RESULTS_REPO}")
-
+
         # Initialize results repository if needed
         if not initialize_results_repo():
             logger.error("Failed to initialize results repository")
             restart_space()
             return
-
+
         snapshot_download(
             repo_id=RESULTS_REPO,
             local_dir=EVAL_RESULTS_PATH,
@@ -117,12 +114,6 @@ LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_COLS)
 
-# Function to update the leaderboard
-def update_leaderboard():
-    global LEADERBOARD_DF
-    LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
-    return LEADERBOARD_DF
-
 def process_evaluation_queue():
     """Process pending evaluation requests."""
     logger.info("Processing evaluation queue")
@@ -172,40 +163,40 @@ def update_request_status(model_name, status):
         # Load the current dataset
        from datasets import load_dataset
        dataset = load_dataset(QUEUE_REPO, split="train")
-
+
        # Convert to dictionary for easier manipulation
        data_dict = dataset.to_dict()
-
+
        # Find the most recent request for this model
        indices = [i for i, m in enumerate(data_dict["model_raw"]) if m == model_name]
-
+
        if not indices:
            logger.error(f"No request found for model {model_name}")
            return
-
+
        # Get the most recent request (last index)
        latest_index = indices[-1]
-
+
        # Update the status for the found request
        data_dict["status"][latest_index] = status
-
+
        # Create new dataset with updated status
        updated_dataset = Dataset.from_dict(data_dict)
-
+
        # Push the updated dataset back to the hub with a descriptive commit message
        updated_dataset.push_to_hub(
            QUEUE_REPO,
            split="train",
            commit_message=f"Update status to {status} for {model_name}"
        )
-
+
        logger.info(f"Updated status for {model_name} to {status}")
    except Exception as e:
        logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True)
 
 # Remove the extract_model_name function as it's no longer needed
 
-
+
 
 def save_results_to_repo(results, repo):
     """Save evaluation results to the specified repository."""
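Note: the update_request_status() hunk above uses the datasets library's read-modify-write pattern for editing a hosted dataset in place. Below is a minimal standalone sketch of that same pattern, built only from the repo variable, column names, and calls visible in the diff; the function name and the commented example values are illustrative and not part of this commit.

from datasets import Dataset, load_dataset

def set_request_status(queue_repo: str, model_name: str, status: str) -> None:
    # Pull the current queue split and convert it to plain Python lists.
    dataset = load_dataset(queue_repo, split="train")
    data = dataset.to_dict()

    # Find the most recent request for this model; bail out if none exists.
    matches = [i for i, m in enumerate(data["model_raw"]) if m == model_name]
    if not matches:
        return
    data["status"][matches[-1]] = status

    # Rebuild the dataset and push it back with a descriptive commit message.
    Dataset.from_dict(data).push_to_hub(
        queue_repo,
        split="train",
        commit_message=f"Update status to {status} for {model_name}",
    )

# Example usage (model name and status value are illustrative):
# set_request_status(QUEUE_REPO, "example/model", "PENDING")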
debug.py
CHANGED
@@ -1,5 +1,5 @@
 import pandas as pd
-from src.display.utils import
+from src.display.utils import BENCHMARK_COLS
 from src.about import Tasks
 from src.leaderboard.read_evals import get_raw_eval_results
 
hub/version.txt
CHANGED
@@ -1 +1 @@
-1
+0.1
init_huggingface_dataset.py
DELETED
@@ -1,85 +0,0 @@
-from datasets import Dataset
-from huggingface_hub import HfApi, login
-import os
-
-# Initialize the dataset with a sample entry
-initial_data = {
-    "model": ["example/model"],
-    "model_raw": ["example/model"],
-    "base_model": ["gpt2"],
-    "revision": ["main"],
-    "precision": ["fp16"],
-    "weight_type": ["Safetensors"],
-    "model_type": ["Pretrained"],
-    "status": ["PENDING"],
-    "timestamp": ["2025-01-26T15:15:09.693973"],
-    "security_score": [0.5],
-    "safetensors_compliant": [True],
-    "hub_license": ["MIT"],
-    "hub_likes": [0],
-    "params_billion": [0.5],
-    "available_on_hub": [True],
-    "model_sha": ["abc123"]
-}
-
-# Create a Dataset object
-dataset = Dataset.from_dict(initial_data)
-
-# Login to Hugging Face (you'll need to set the HUGGINGFACE_TOKEN environment variable)
-login()
-
-# Push the dataset to the Hugging Face Hub
-dataset.push_to_hub("stacklok/results")
-
-# Create a dataset card
-dataset_card = """
----
-language:
-- en
-license:
-- mit
----
-
-# Dataset Card for stacklok/results
-
-This dataset contains evaluation results for various models, focusing on security scores and other relevant metrics.
-
-## Dataset Structure
-
-The dataset contains the following fields:
-- `model`: The identifier of the model
-- `model_raw`: The raw model identifier
-- `base_model`: The base model if applicable
-- `revision`: The revision or version of the model
-- `precision`: The precision used for the model (e.g., fp16, fp32)
-- `weight_type`: Type of weights used
-- `model_type`: Type of the model
-- `status`: Current status of the evaluation
-- `timestamp`: When the evaluation was performed
-- `security_score`: A score representing the model's security evaluation
-- `safetensors_compliant`: A boolean indicating whether the model is compliant with safetensors
-- `hub_license`: The license of the model on Hugging Face Hub
-- `hub_likes`: Number of likes on Hugging Face Hub
-- `params_billion`: Number of parameters in billions
-- `available_on_hub`: Whether the model is available on Hugging Face Hub
-- `model_sha`: SHA hash of the model
-
-## Usage
-
-This dataset is used to populate the secure code leaderboard, providing insights into the security aspects of various models.
-"""
-
-# Write the dataset card
-with open("README.md", "w") as f:
-    f.write(dataset_card)
-
-# Upload the dataset card
-api = HfApi()
-api.upload_file(
-    path_or_fileobj="README.md",
-    path_in_repo="README.md",
-    repo_id="stacklok/results",
-    repo_type="dataset"
-)
-
-print("Dataset initialized and card uploaded successfully!")
logs/evaluation.log
DELETED
File without changes

logs/security_eval.log
DELETED
File without changes
src/populate.py
CHANGED
@@ -1,17 +1,15 @@
 import json
-
+
 import numpy as np
 import pandas as pd
 import logging
-from typing import List
-
+from typing import List
+from src.config import RESULTS_REPO, QUEUE_REPO
 from src.display.formatting import make_clickable_model
-from src.leaderboard.read_evals import get_raw_eval_results
 
 logger = logging.getLogger(__name__)
 
-
-from src.config import RESULTS_REPO, QUEUE_REPO
+
 
 def get_leaderboard_df(cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
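Note: populate.py now takes RESULTS_REPO and QUEUE_REPO from src.config, a module that is not part of this diff. A hypothetical sketch of what such a module could contain, assuming it only centralizes the two repo IDs; the results repo ID matches the one pushed to by the deleted init_huggingface_dataset.py, while the queue default and the environment-variable names are assumptions.

# src/config.py -- hypothetical sketch, not included in this commit
import os

# Hub dataset that stores evaluation results ("stacklok/results" is the ID
# used by the deleted init_huggingface_dataset.py script).
RESULTS_REPO = os.getenv("RESULTS_REPO", "stacklok/results")

# Hub dataset that stores pending evaluation requests (placeholder default).
QUEUE_REPO = os.getenv("QUEUE_REPO", "stacklok/requests")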
src/submission/submit.py
CHANGED
@@ -1,5 +1,3 @@
-import json
-import os
 import logging
 from datetime import datetime, timezone
 from typing import Dict, Tuple, Optional, Any
@@ -331,8 +329,7 @@ def initialize_results_repo():
         return True
     except Exception:
         logger.info("Results repository not initialized, creating initial dataset")
-
-        # Initialize with a sample entry as per init_huggingface_dataset.py
+
         initial_data = {
             "model": ["example/model"],
             "model_raw": ["example/model"],
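Note: the fallback branch above seeds initialize_results_repo() with the same sample entry that the deleted init_huggingface_dataset.py used. The remainder of the function falls outside this hunk; assuming it mirrors the deleted script, the fallback would finish roughly as in the sketch below (the helper name is illustrative, not the actual code).

from datasets import Dataset

def seed_results_repo(initial_data: dict, results_repo: str) -> bool:
    # Build a one-row dataset from the sample entry and push it to the Hub,
    # mirroring what init_huggingface_dataset.py did before it was removed.
    Dataset.from_dict(initial_data).push_to_hub(results_repo)
    return True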