lukehinds committed on
Commit bd09cee
1 Parent(s): e20e214
app.py CHANGED
@@ -1,11 +1,10 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import snapshot_download, HfApi
+from huggingface_hub import snapshot_download
 import pandas as pd
 import os
 import logging
-import json
 from datetime import datetime
 from datasets import Dataset

@@ -25,9 +24,7 @@ from src.display.utils import (
     COLS,
     EVAL_COLS,
     EVAL_TYPES,
-    AutoEvalColumn,
     ModelType,
-    fields,
     WeightType,
     Precision
 )
@@ -42,7 +39,7 @@ from src.envs import (
     TOKEN
 )
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval, initialize_queue_repo, initialize_results_repo
+from src.submission.submit import initialize_queue_repo, initialize_results_repo


 # Setup logging
@@ -65,13 +62,13 @@ def initialize_space():
     logger.info("Initializing space")
     try:
         logger.info(f"Downloading queue data from {QUEUE_REPO}")
-
+
         # Initialize queue repository if needed
         if not initialize_queue_repo():
             logger.error("Failed to initialize queue repository")
             restart_space()
             return
-
+
         snapshot_download(
             repo_id=QUEUE_REPO,
             local_dir=EVAL_REQUESTS_PATH,
@@ -86,13 +83,13 @@ def initialize_space():

     try:
         logger.info(f"Downloading results data from {RESULTS_REPO}")
-
+
         # Initialize results repository if needed
         if not initialize_results_repo():
             logger.error("Failed to initialize results repository")
             restart_space()
             return
-
+
         snapshot_download(
             repo_id=RESULTS_REPO,
             local_dir=EVAL_RESULTS_PATH,
@@ -117,12 +114,6 @@ LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
     pending_eval_queue_df,
 ) = get_evaluation_queue_df(EVAL_COLS)

-# Function to update the leaderboard
-def update_leaderboard():
-    global LEADERBOARD_DF
-    LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
-    return LEADERBOARD_DF
-
 def process_evaluation_queue():
     """Process pending evaluation requests."""
     logger.info("Processing evaluation queue")
@@ -172,40 +163,40 @@ def update_request_status(model_name, status):
         # Load the current dataset
         from datasets import load_dataset
         dataset = load_dataset(QUEUE_REPO, split="train")
-
+
         # Convert to dictionary for easier manipulation
         data_dict = dataset.to_dict()
-
+
         # Find the most recent request for this model
         indices = [i for i, m in enumerate(data_dict["model_raw"]) if m == model_name]
-
+
         if not indices:
             logger.error(f"No request found for model {model_name}")
             return
-
+
         # Get the most recent request (last index)
         latest_index = indices[-1]
-
+
         # Update the status for the found request
         data_dict["status"][latest_index] = status
-
+
         # Create new dataset with updated status
         updated_dataset = Dataset.from_dict(data_dict)
-
+
         # Push the updated dataset back to the hub with a descriptive commit message
         updated_dataset.push_to_hub(
             QUEUE_REPO,
             split="train",
             commit_message=f"Update status to {status} for {model_name}"
         )
-
+
         logger.info(f"Updated status for {model_name} to {status}")
     except Exception as e:
         logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True)

 # Remove the extract_model_name function as it's no longer needed

-from huggingface_hub import HfApi
+

 def save_results_to_repo(results, repo):
     """Save evaluation results to the specified repository."""
debug.py CHANGED
@@ -1,5 +1,5 @@
 import pandas as pd
-from src.display.utils import COLS, BENCHMARK_COLS
+from src.display.utils import BENCHMARK_COLS
 from src.about import Tasks
 from src.leaderboard.read_evals import get_raw_eval_results

 
hub/version.txt CHANGED
@@ -1 +1 @@
-1
+0.1
init_huggingface_dataset.py DELETED
@@ -1,85 +0,0 @@
-from datasets import Dataset
-from huggingface_hub import HfApi, login
-import os
-
-# Initialize the dataset with a sample entry
-initial_data = {
-    "model": ["example/model"],
-    "model_raw": ["example/model"],
-    "base_model": ["gpt2"],
-    "revision": ["main"],
-    "precision": ["fp16"],
-    "weight_type": ["Safetensors"],
-    "model_type": ["Pretrained"],
-    "status": ["PENDING"],
-    "timestamp": ["2025-01-26T15:15:09.693973"],
-    "security_score": [0.5],
-    "safetensors_compliant": [True],
-    "hub_license": ["MIT"],
-    "hub_likes": [0],
-    "params_billion": [0.5],
-    "available_on_hub": [True],
-    "model_sha": ["abc123"]
-}
-
-# Create a Dataset object
-dataset = Dataset.from_dict(initial_data)
-
-# Login to Hugging Face (you'll need to set the HUGGINGFACE_TOKEN environment variable)
-login()
-
-# Push the dataset to the Hugging Face Hub
-dataset.push_to_hub("stacklok/results")
-
-# Create a dataset card
-dataset_card = """
----
-language:
-- en
-license:
-- mit
----
-
-# Dataset Card for stacklok/results
-
-This dataset contains evaluation results for various models, focusing on security scores and other relevant metrics.
-
-## Dataset Structure
-
-The dataset contains the following fields:
-- `model`: The identifier of the model
-- `model_raw`: The raw model identifier
-- `base_model`: The base model if applicable
-- `revision`: The revision or version of the model
-- `precision`: The precision used for the model (e.g., fp16, fp32)
-- `weight_type`: Type of weights used
-- `model_type`: Type of the model
-- `status`: Current status of the evaluation
-- `timestamp`: When the evaluation was performed
-- `security_score`: A score representing the model's security evaluation
-- `safetensors_compliant`: A boolean indicating whether the model is compliant with safetensors
-- `hub_license`: The license of the model on Hugging Face Hub
-- `hub_likes`: Number of likes on Hugging Face Hub
-- `params_billion`: Number of parameters in billions
-- `available_on_hub`: Whether the model is available on Hugging Face Hub
-- `model_sha`: SHA hash of the model
-
-## Usage
-
-This dataset is used to populate the secure code leaderboard, providing insights into the security aspects of various models.
-"""
-
-# Write the dataset card
-with open("README.md", "w") as f:
-    f.write(dataset_card)
-
-# Upload the dataset card
-api = HfApi()
-api.upload_file(
-    path_or_fileobj="README.md",
-    path_in_repo="README.md",
-    repo_id="stacklok/results",
-    repo_type="dataset"
-)
-
-print("Dataset initialized and card uploaded successfully!")
 
logs/evaluation.log DELETED
File without changes
logs/security_eval.log DELETED
File without changes
src/populate.py CHANGED
@@ -1,17 +1,15 @@
 import json
-import os
+
 import numpy as np
 import pandas as pd
 import logging
-from typing import List, Dict, Any
-
+from typing import List
+from src.config import RESULTS_REPO, QUEUE_REPO
 from src.display.formatting import make_clickable_model
-from src.leaderboard.read_evals import get_raw_eval_results

 logger = logging.getLogger(__name__)

-from huggingface_hub import HfApi
-from src.config import RESULTS_REPO, QUEUE_REPO
+

 def get_leaderboard_df(cols: List[str], benchmark_cols: List[str]) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
src/submission/submit.py CHANGED
@@ -1,5 +1,3 @@
-import json
-import os
 import logging
 from datetime import datetime, timezone
 from typing import Dict, Tuple, Optional, Any
@@ -331,8 +329,7 @@ def initialize_results_repo():
         return True
     except Exception:
         logger.info("Results repository not initialized, creating initial dataset")
-
-        # Initialize with a sample entry as per init_huggingface_dataset.py
+
         initial_data = {
             "model": ["example/model"],
             "model_raw": ["example/model"],