import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import pandas as pd
import os
import logging
from datetime import datetime
from datasets import Dataset, load_dataset
from src.core.evaluation import EvaluationManager, EvaluationRequest
from src.logging_config import setup_logging
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
ModelType,
WeightType,
Precision
)
from src.envs import (
API,
CACHE_PATH,
EVAL_REQUESTS_PATH,
EVAL_RESULTS_PATH,
QUEUE_REPO,
REPO_ID,
RESULTS_REPO,
TOKEN
)
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import initialize_queue_repo, initialize_results_repo
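# Gradio entry point for the security leaderboard Space: it restores queue and
# results data from the Hub, renders the leaderboard and submission UI, and
# schedules background evaluation runs.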
# Setup logging
setup_logging(log_dir="logs")
logger = logging.getLogger('web')
# Initialize evaluation manager
evaluation_manager = EvaluationManager(
results_dir=EVAL_RESULTS_PATH,
backup_dir=os.path.join(CACHE_PATH, "eval-backups")
)
def restart_space():
"""Restart the Hugging Face space."""
logger.info("Restarting space")
API.restart_space(repo_id=REPO_ID)
def initialize_space():
"""Initialize the space by downloading required data."""
logger.info("Initializing space")
try:
logger.info(f"Downloading queue data from {QUEUE_REPO}")
# Initialize queue repository if needed
if not initialize_queue_repo():
logger.error("Failed to initialize queue repository")
restart_space()
return
snapshot_download(
repo_id=QUEUE_REPO,
local_dir=EVAL_REQUESTS_PATH,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN
)
except Exception as e:
logger.error(f"Failed to download queue data: {str(e)}")
restart_space()
try:
logger.info(f"Downloading results data from {RESULTS_REPO}")
# Initialize results repository if needed
if not initialize_results_repo():
logger.error("Failed to initialize results repository")
restart_space()
return
snapshot_download(
repo_id=RESULTS_REPO,
local_dir=EVAL_RESULTS_PATH,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN
)
except Exception as e:
logger.error(f"Failed to download results data: {str(e)}")
restart_space()
# Initialize space
initialize_space()
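# Initial data snapshots for the UI: the leaderboard table and the three
# evaluation queues. They are refreshed later by update_leaderboard() and
# update_evaluation_tables().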
LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_COLS)
def process_evaluation_queue():
"""Process pending evaluation requests."""
logger.info("Processing evaluation queue")
# Fetch pending requests from Hugging Face repository
_, _, pending_requests = get_evaluation_queue_df(EVAL_COLS + ['model_raw', 'timestamp'])
for _, request in pending_requests.iterrows():
# Resolve the model name before the try block so the error handler below
# can always reference it, even if the request row is malformed
model_name = request.get('model_raw', 'unknown')
try:
logger.info(f"Processing request for model: {model_name}")
# Update status to RUNNING
update_request_status(model_name, "RUNNING")
# Convert queue request to evaluation request
eval_request = EvaluationRequest(
model=model_name,
revision=request['revision'],
precision=request['precision'],
weight_type=request['weight_type'],
submitted_time=request['timestamp'], # Use the actual timestamp field
model_type=request.get('model_type', '')
)
# Run evaluation
results = evaluation_manager.run_evaluation(eval_request)
logger.info(f"Evaluation complete for {model_name}")
# Save results to stacklok/results
save_results_to_repo(results, RESULTS_REPO)
# Update request status in stacklok/requests
update_request_status(model_name, "FINISHED")
# Update leaderboard
update_leaderboard()
except Exception as e:
logger.error(f"Evaluation failed for {model_name}: {str(e)}", exc_info=True)
# Update request status to indicate failure
update_request_status(model_name, "FAILED")
def update_request_status(model_name, status):
"""Update the status of a request in the Hugging Face repository."""
try:
# Load the current queue dataset from the Hub
dataset = load_dataset(QUEUE_REPO, split="train")
# Convert to dictionary for easier manipulation
data_dict = dataset.to_dict()
# Find the most recent request for this model
indices = [i for i, m in enumerate(data_dict["model_raw"]) if m == model_name]
if not indices:
logger.error(f"No request found for model {model_name}")
return
# Get the most recent request (last index)
latest_index = indices[-1]
# Update the status for the found request
data_dict["status"][latest_index] = status
# Create new dataset with updated status
updated_dataset = Dataset.from_dict(data_dict)
# Push the updated dataset back to the hub with a descriptive commit message
updated_dataset.push_to_hub(
QUEUE_REPO,
split="train",
commit_message=f"Update status to {status} for {model_name}"
)
logger.info(f"Updated status for {model_name} to {status}")
except Exception as e:
logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True)
def save_results_to_repo(results, repo):
"""Save evaluation results to the specified repository."""
try:
model_id = results.get('model', '')
if not model_id:
raise ValueError("Model ID not found in results")
# Convert all values to lists if they aren't already
dataset_dict = {
k: [v] if not isinstance(v, list) else v
for k, v in results.items()
}
# Create a Dataset object from the results
dataset = Dataset.from_dict(dataset_dict)
# Push the dataset to the Hugging Face Hub
dataset.push_to_hub(repo, split="train")
logger.info(f"Saved results for {model_id} to {repo}")
except Exception as e:
logger.error(f"Failed to save results to {repo}: {str(e)}", exc_info=True)
def update_leaderboard():
"""Update the leaderboard with latest evaluation results."""
global LEADERBOARD_DF
LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
return LEADERBOARD_DF
def init_leaderboard(df):
"""Initialize the leaderboard with the given DataFrame."""
if df is None or df.empty:
df = pd.DataFrame(columns=COLS)
logger.info("Creating empty leaderboard - no evaluations completed yet")
else:
logger.info(f"Initializing leaderboard with {len(df)} rows")
# Ensure all required columns exist
for col in COLS:
if col not in df.columns:
logger.warning(f"Column {col} not found in DataFrame, adding with None values")
df[col] = None
# Map dataset columns to display columns
column_mapping = {
"model_id": "Model",
"security_score": "Security Score ⬆️",
"safetensors_compliant": "Safetensors",
"precision": "Precision"
}
for src, dst in column_mapping.items():
if src in df.columns:
df[dst] = df[src]
logger.debug(f"Mapped column {src} to {dst}")
# Sort by Security Score if available
if "Security Score ⬆️" in df.columns:
df = df.sort_values(by="Security Score ⬆️", ascending=False)
logger.info("Sorted leaderboard by Security Score")
# Select only the columns we want to display
df = df[COLS]
logger.info(f"Final leaderboard columns: {df.columns.tolist()}")
logger.debug(f"Leaderboard data:\n{df}")
# Create the leaderboard using gradio_leaderboard
return Leaderboard(
value=df,
datatype=["html" if col == "Model" else "number" if col == "Security Score ⬆️" else "bool" if col == "Safetensors" else "str" for col in COLS],
select_columns=SelectColumns(
default_selection=COLS,
cant_deselect=["Model", "Security Score ⬆️", "Safetensors"],
label="Select Columns to Display:",
),
search_columns=["Model"],
filter_columns=[
ColumnFilter("Safetensors", type="boolean", label="Show only Safetensors models"),
ColumnFilter("Security Score ⬆️", type="slider", min=0, max=1, label="Minimum Security Score"),
],
interactive=False,
)
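# Build the Gradio interface: a leaderboard tab, an about tab, and a submission
# tab that also shows the finished / running / pending evaluation queues.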
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ”’ Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submit Model", elem_id="submit-tab", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Row():
gr.Markdown("# πŸ”’ Submit Your Model for Security Evaluation", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(
label="Model name (organization/model-name)",
placeholder="huggingface/model-name"
)
revision_name_textbox = gr.Textbox(
label="Revision commit",
placeholder="main"
)
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weight Format",
multiselect=False,
value="Safetensors",
interactive=True,
)
base_model_name_textbox = gr.Textbox(
label="Base model (for delta or adapter weights)",
placeholder="Optional: base model path"
)
with gr.Row():
gr.Markdown(
"""
### Security Requirements:
1. Model weights must be in safetensors format
2. Model card must include security considerations
3. Model will be evaluated on secure coding capabilities
""",
elem_classes="markdown-text"
)
submit_button = gr.Button("Submit for Security Evaluation")
submission_result = gr.Markdown()
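# Submission flow: the form data is written to QUEUE_REPO as a PENDING request,
# and a one-off background job is scheduled to process the queue immediately.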
def handle_submission(model, base_model, revision, precision, weight_type, model_type):
"""Handle new model submission."""
try:
logger.info(f"New submission received for {model}")
# Prepare request data as a dataset-compatible dictionary (all values must be lists)
request_data = {
"model": [model],
"model_raw": [model], # Store raw model name for processing
"base_model": [base_model if base_model else ""],
"revision": [revision if revision else "main"],
"precision": [precision],
"weight_type": [weight_type],
"model_type": [model_type],
"status": ["PENDING"],
"timestamp": [datetime.now().isoformat()]
}
# Convert to dataset and push to hub
dataset = Dataset.from_dict(request_data)
dataset.push_to_hub(
QUEUE_REPO,
config_name=model.replace("/", "_"),
split="train"
)
logger.info(f"Added request for {model} to {QUEUE_REPO}")
# Get updated pending evaluations
_, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)
# Start processing queue in background
scheduler.add_job(process_evaluation_queue, id='process_queue_job', replace_existing=True)
return "Submission successful! Your model has been added to the evaluation queue. Please check the 'Pending Evaluation Queue' for status updates.", pending_eval_queue_df
except Exception as e:
logger.error(f"Submission failed: {str(e)}", exc_info=True)
return f"Error: {str(e)}", None
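# Wire the submit button: inputs come from the form fields above; outputs update
# the status message and the pending-evaluation table.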
submit_button.click(
handle_submission,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
[submission_result, pending_eval_table],
)
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# Update evaluation tables periodically
def update_evaluation_tables():
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)
return finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
# Setup schedulers
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(process_evaluation_queue, "interval", seconds=300) # Process queue every 5 minutes
scheduler.start()
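# APScheduler runs each job in a worker thread; by default (max_instances=1) a
# queue-processing run that is still in progress is not started again, so
# overlapping runs of the same job are skipped.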
logger.info("Application startup complete")
# Refresh the evaluation tables every 60 seconds while the app is open;
# this must be registered before launch(), which blocks the main thread
demo.load(update_evaluation_tables, outputs=[finished_eval_table, running_eval_table, pending_eval_table], every=60)
demo.queue(default_concurrency_limit=40).launch()