import gradio as gr
from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download
import pandas as pd
import os
import logging
from datetime import datetime
from datasets import Dataset, load_dataset
from src.core.evaluation import EvaluationManager, EvaluationRequest
from src.logging_config import setup_logging
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
INTRODUCTION_TEXT,
LLM_BENCHMARKS_TEXT,
TITLE,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
BENCHMARK_COLS,
COLS,
EVAL_COLS,
EVAL_TYPES,
ModelType,
WeightType,
Precision
)
from src.envs import (
API,
CACHE_PATH,
EVAL_REQUESTS_PATH,
EVAL_RESULTS_PATH,
QUEUE_REPO,
REPO_ID,
RESULTS_REPO,
TOKEN
)
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import initialize_queue_repo, initialize_results_repo
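# Gradio entry point for the security leaderboard Space: it restores queue and
# results data from the Hub, renders the leaderboard and submission UI, and
# schedules background evaluation runs.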
# Setup logging
setup_logging(log_dir="logs")
logger = logging.getLogger('web')
# Initialize evaluation manager
evaluation_manager = EvaluationManager(
results_dir=EVAL_RESULTS_PATH,
backup_dir=os.path.join(CACHE_PATH, "eval-backups")
)
def restart_space():
"""Restart the Hugging Face space."""
logger.info("Restarting space")
API.restart_space(repo_id=REPO_ID)
def initialize_space():
"""Initialize the space by downloading required data."""
logger.info("Initializing space")
try:
logger.info(f"Downloading queue data from {QUEUE_REPO}")
# Initialize queue repository if needed
if not initialize_queue_repo():
logger.error("Failed to initialize queue repository")
restart_space()
return
snapshot_download(
repo_id=QUEUE_REPO,
local_dir=EVAL_REQUESTS_PATH,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN
)
except Exception as e:
logger.error(f"Failed to download queue data: {str(e)}")
restart_space()
try:
logger.info(f"Downloading results data from {RESULTS_REPO}")
# Initialize results repository if needed
if not initialize_results_repo():
logger.error("Failed to initialize results repository")
restart_space()
return
snapshot_download(
repo_id=RESULTS_REPO,
local_dir=EVAL_RESULTS_PATH,
repo_type="dataset",
tqdm_class=None,
etag_timeout=30,
token=TOKEN
)
except Exception as e:
logger.error(f"Failed to download results data: {str(e)}")
restart_space()
# Initialize space
initialize_space()
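# Initial data snapshots for the UI: the leaderboard table and the three
# evaluation queues. They are refreshed later by update_leaderboard() and
# update_evaluation_tables().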
LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
(
finished_eval_queue_df,
running_eval_queue_df,
pending_eval_queue_df,
) = get_evaluation_queue_df(EVAL_COLS)
def process_evaluation_queue():
"""Process pending evaluation requests."""
logger.info("Processing evaluation queue")
# Fetch pending requests from Hugging Face repository
_, _, pending_requests = get_evaluation_queue_df(EVAL_COLS + ['model_raw', 'timestamp'])
for _, request in pending_requests.iterrows():
# Resolve the model name before the try block so the error handler below
# can always reference it, even if the request row is malformed
model_name = request.get('model_raw', 'unknown')
try:
logger.info(f"Processing request for model: {model_name}")
# Update status to RUNNING
update_request_status(model_name, "RUNNING")
# Convert queue request to evaluation request
eval_request = EvaluationRequest(
model=model_name,
revision=request['revision'],
precision=request['precision'],
weight_type=request['weight_type'],
submitted_time=request['timestamp'], # Use the actual timestamp field
model_type=request.get('model_type', '')
)
# Run evaluation
results = evaluation_manager.run_evaluation(eval_request)
logger.info(f"Evaluation complete for {model_name}")
# Save results to stacklok/results
save_results_to_repo(results, RESULTS_REPO)
# Update request status in stacklok/requests
update_request_status(model_name, "FINISHED")
# Update leaderboard
update_leaderboard()
except Exception as e:
logger.error(f"Evaluation failed for {model_name}: {str(e)}", exc_info=True)
# Update request status to indicate failure
update_request_status(model_name, "FAILED")
def update_request_status(model_name, status):
"""Update the status of a request in the Hugging Face repository."""
try:
# Load the current queue dataset from the Hub
dataset = load_dataset(QUEUE_REPO, split="train")
# Convert to dictionary for easier manipulation
data_dict = dataset.to_dict()
# Find the most recent request for this model
indices = [i for i, m in enumerate(data_dict["model_raw"]) if m == model_name]
if not indices:
logger.error(f"No request found for model {model_name}")
return
# Get the most recent request (last index)
latest_index = indices[-1]
# Update the status for the found request
data_dict["status"][latest_index] = status
# Create new dataset with updated status
updated_dataset = Dataset.from_dict(data_dict)
# Push the updated dataset back to the hub with a descriptive commit message
updated_dataset.push_to_hub(
QUEUE_REPO,
split="train",
commit_message=f"Update status to {status} for {model_name}"
)
logger.info(f"Updated status for {model_name} to {status}")
except Exception as e:
logger.error(f"Failed to update status for {model_name}: {str(e)}", exc_info=True)
def save_results_to_repo(results, repo):
"""Save evaluation results to the specified repository."""
try:
model_id = results.get('model', '')
if not model_id:
raise ValueError("Model ID not found in results")
# Convert all values to lists if they aren't already
dataset_dict = {
k: [v] if not isinstance(v, list) else v
for k, v in results.items()
}
# Create a Dataset object from the results
dataset = Dataset.from_dict(dataset_dict)
# Push the dataset to the Hugging Face Hub
dataset.push_to_hub(repo, split="train")
logger.info(f"Saved results for {model_id} to {repo}")
except Exception as e:
logger.error(f"Failed to save results to {repo}: {str(e)}", exc_info=True)
def update_leaderboard():
"""Update the leaderboard with latest evaluation results."""
global LEADERBOARD_DF
LEADERBOARD_DF = get_leaderboard_df(COLS, BENCHMARK_COLS)
return LEADERBOARD_DF
def init_leaderboard(df):
"""Initialize the leaderboard with the given DataFrame."""
if df is None or df.empty:
df = pd.DataFrame(columns=COLS)
logger.info("Creating empty leaderboard - no evaluations completed yet")
else:
logger.info(f"Initializing leaderboard with {len(df)} rows")
# Ensure all required columns exist
for col in COLS:
if col not in df.columns:
logger.warning(f"Column {col} not found in DataFrame, adding with None values")
df[col] = None
# Map dataset columns to display columns
column_mapping = {
"model_id": "Model",
"security_score": "Security Score ⬆️",
"safetensors_compliant": "Safetensors",
"precision": "Precision"
}
for src, dst in column_mapping.items():
if src in df.columns:
df[dst] = df[src]
logger.debug(f"Mapped column {src} to {dst}")
# Sort by Security Score if available
if "Security Score ⬆️" in df.columns:
df = df.sort_values(by="Security Score ⬆️", ascending=False)
logger.info("Sorted leaderboard by Security Score")
# Select only the columns we want to display
df = df[COLS]
logger.info(f"Final leaderboard columns: {df.columns.tolist()}")
logger.debug(f"Leaderboard data:\n{df}")
# Create the leaderboard using gradio_leaderboard
return Leaderboard(
value=df,
datatype=["html" if col == "Model" else "number" if col == "Security Score ⬆️" else "bool" if col == "Safetensors" else "str" for col in COLS],
select_columns=SelectColumns(
default_selection=COLS,
cant_deselect=["Model", "Security Score ⬆️", "Safetensors"],
label="Select Columns to Display:",
),
search_columns=["Model"],
filter_columns=[
ColumnFilter("Safetensors", type="boolean", label="Show only Safetensors models"),
ColumnFilter("Security Score ⬆️", type="slider", min=0, max=1, label="Minimum Security Score"),
],
interactive=False,
)
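# Build the Gradio interface: a leaderboard tab, an about tab, and a submission
# tab that also shows the finished / running / pending evaluation queues.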
demo = gr.Blocks(css=custom_css)
with demo:
gr.HTML(TITLE)
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
with gr.Tabs(elem_classes="tab-buttons") as tabs:
with gr.TabItem("πŸ”’ Security Leaderboard", elem_id="security-leaderboard-tab", id=0):
leaderboard = init_leaderboard(LEADERBOARD_DF)
with gr.TabItem("πŸ“ About", elem_id="about-tab", id=2):
gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
with gr.TabItem("πŸš€ Submit Model", elem_id="submit-tab", id=3):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Column():
with gr.Accordion(
f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})",
open=False,
):
with gr.Row():
finished_eval_table = gr.components.Dataframe(
value=finished_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})",
open=False,
):
with gr.Row():
running_eval_table = gr.components.Dataframe(
value=running_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Accordion(
f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
open=False,
):
with gr.Row():
pending_eval_table = gr.components.Dataframe(
value=pending_eval_queue_df,
headers=EVAL_COLS,
datatype=EVAL_TYPES,
row_count=5,
)
with gr.Row():
gr.Markdown("# πŸ”’ Submit Your Model for Security Evaluation", elem_classes="markdown-text")
with gr.Row():
with gr.Column():
model_name_textbox = gr.Textbox(
label="Model name (organization/model-name)",
placeholder="huggingface/model-name"
)
revision_name_textbox = gr.Textbox(
label="Revision commit",
placeholder="main"
)
model_type = gr.Dropdown(
choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
label="Model type",
multiselect=False,
value=None,
interactive=True,
)
with gr.Column():
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
weight_type = gr.Dropdown(
choices=[i.value.name for i in WeightType],
label="Weight Format",
multiselect=False,
value="Safetensors",
interactive=True,
)
base_model_name_textbox = gr.Textbox(
label="Base model (for delta or adapter weights)",
placeholder="Optional: base model path"
)
with gr.Row():
gr.Markdown(
"""
### Security Requirements:
1. Model weights must be in safetensors format
2. Model card must include security considerations
3. Model will be evaluated on secure coding capabilities
""",
elem_classes="markdown-text"
)
submit_button = gr.Button("Submit for Security Evaluation")
submission_result = gr.Markdown()
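# Submission flow: the form data is written to QUEUE_REPO as a PENDING request,
# and a one-off background job is scheduled to process the queue immediately.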
def handle_submission(model, base_model, revision, precision, weight_type, model_type):
"""Handle new model submission."""
try:
logger.info(f"New submission received for {model}")
# Prepare request data as a dataset-compatible dictionary (all values must be lists)
request_data = {
"model": [model],
"model_raw": [model], # Store raw model name for processing
"base_model": [base_model if base_model else ""],
"revision": [revision if revision else "main"],
"precision": [precision],
"weight_type": [weight_type],
"model_type": [model_type],
"status": ["PENDING"],
"timestamp": [datetime.now().isoformat()]
}
# Convert to dataset and push to hub
dataset = Dataset.from_dict(request_data)
dataset.push_to_hub(
QUEUE_REPO,
config_name=model.replace("/", "_"),
split="train"
)
logger.info(f"Added request for {model} to {QUEUE_REPO}")
# Get updated pending evaluations
_, _, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)
# Start processing queue in background
scheduler.add_job(process_evaluation_queue, id='process_queue_job', replace_existing=True)
return "Submission successful! Your model has been added to the evaluation queue. Please check the 'Pending Evaluation Queue' for status updates.", pending_eval_queue_df
except Exception as e:
logger.error(f"Submission failed: {str(e)}", exc_info=True)
return f"Error: {str(e)}", None
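# Wire the submit button: inputs come from the form fields above; outputs update
# the status message and the pending-evaluation table.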
submit_button.click(
handle_submission,
[
model_name_textbox,
base_model_name_textbox,
revision_name_textbox,
precision,
weight_type,
model_type,
],
[submission_result, pending_eval_table],
)
with gr.Row():
with gr.Accordion("πŸ“™ Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=20,
elem_id="citation-button",
show_copy_button=True,
)
# Update evaluation tables periodically
def update_evaluation_tables():
finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df = get_evaluation_queue_df(EVAL_COLS)
return finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df
# Setup schedulers
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.add_job(process_evaluation_queue, "interval", seconds=300) # Process queue every 5 minutes
scheduler.start()
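# APScheduler runs each job in a worker thread; by default (max_instances=1) a
# queue-processing run that is still in progress is not started again, so
# overlapping runs of the same job are skipped.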
logger.info("Application startup complete")
# Refresh the evaluation tables every 60 seconds while the app is open;
# this must be registered before launch(), which blocks the main thread
demo.load(update_evaluation_tables, outputs=[finished_eval_table, running_eval_table, pending_eval_table], every=60)
demo.queue(default_concurrency_limit=40).launch()