abdev-leaderboard / validation.py
loodvanniekerkginkgo's picture
Small possible edge case with empty username
1938471
import pandas as pd
import io
import gradio as gr
import requests
from constants import (
REQUIRED_COLUMNS,
ASSAY_LIST,
CV_COLUMN,
EXAMPLE_FILE_DICT,
ANTIBODY_NAMES_DICT,
)
def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.

    The check requests https://huggingface.co/<username> and inspects the
    response.

    Parameters
    ----------
    username: str
        The username to validate. Surrounding whitespace is ignored and
        ``None`` is treated like an empty string.

    Returns
    -------
    bool
        True if the profile page exists. Every failure is reported by
        raising, so this function never returns False.

    Raises
    ------
    gr.Error: If the username is empty or invalid, the profile doesn't exist,
        or Hugging Face could not be reached.
    """
    # Function-scope import so the module-level import block stays untouched.
    from urllib.parse import quote

    username = (username or "").strip()
    if username == "":
        raise gr.Error("❌ Please provide a username.")

    # "." and ".." are path segments, not names; HTTP clients may collapse
    # them so the request would hit a different page entirely.
    if username.strip(".") == "":
        raise gr.Error(
            f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
        )

    # Percent-encode the whole segment (including "/", "?" and "#") so the
    # input can only ever address https://huggingface.co/<one-segment>.
    profile_url = f"https://huggingface.co/{quote(username, safe='')}"

    # Only the network call lives in the try block; the status handling below
    # raises gr.Error on its own and needs no exception translation.
    try:
        response = requests.get(profile_url, timeout=10)
    except requests.exceptions.Timeout as exc:
        raise gr.Error(
            "❌ Timeout while checking username. Please try again."
        ) from exc
    except requests.exceptions.ConnectionError as exc:
        raise gr.Error(
            "❌ Unable to connect to Hugging Face. Please check your internet connection."
        ) from exc
    except requests.exceptions.RequestException as exc:
        raise gr.Error(f"❌ Error validating username: {str(exc)}") from exc

    if response.status_code == 404:
        raise gr.Error(
            f"❌ Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
        )
    if response.status_code != 200:
        raise gr.Error(
            f"❌ Unable to verify username '{username}'. Please try again later."
        )

    # Additional check: make sure it's actually a user profile page
    # and not some other page that happens to exist.
    page_text = response.text.lower()
    if "profile" in page_text or "models" in page_text:
        return True
    raise gr.Error(
        f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
    )
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Parse the uploaded CSV text into a DataFrame.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If the CSV cannot be read or parsed
    """
    try:
        # pandas wants a file-like object, so wrap the text in a buffer.
        parsed = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError:
        raise gr.Error("❌ CSV file is empty or contains no valid data")
    except pd.errors.ParserError as parse_error:
        raise gr.Error(f"❌ Invalid CSV format<br><br>Error: {parse_error!s}")
    except UnicodeDecodeError:
        encoding_message = (
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
        raise gr.Error(encoding_message)
    except Exception as unexpected:
        raise gr.Error(f"❌ Unexpected error reading CSV file: {unexpected!s}")
    else:
        return parsed
def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """
    Check a cross-validation submission against the canonical fold assignments.

    The canonical folds are read from the example file registered for
    ``submission_type`` and left-joined with the submitted folds on
    ``antibody_name``; any row where the two fold values differ is a mismatch.

    Raises
    ------
    gr.Error: If the fold column is absent or any fold assignment differs
    """
    # Must have CV_COLUMN for CV submissions
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")

    key_columns = ["antibody_name", CV_COLUMN]
    expected_label = f"{CV_COLUMN}_expected"
    submitted_label = f"{CV_COLUMN}_submitted"

    # Canonical assignments on the left, so every expected antibody keeps a row.
    canonical_folds = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[key_columns]
    comparison = canonical_folds.merge(
        df[key_columns],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )

    disagrees = comparison[expected_label] != comparison[submitted_label]
    mismatched_rows = comparison[disagrees]
    if len(mismatched_rows) == 0:
        return

    # Report only the first three mismatches to keep the message short.
    examples = [
        f"{row['antibody_name']} (expected fold {row[expected_label]}, got {row[submitted_label]})"
        for _, row in mismatched_rows.head(3).iterrows()
    ]
    raise gr.Error(
        f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
    )
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """
    Reject a full-dataset submission that carries the CV fold column.

    Raises
    ------
    gr.Error: If ``CV_COLUMN`` is present in the DataFrame
    """
    if CV_COLUMN not in df.columns:
        return

    message = (
        f"❌ Your submission contains a '{CV_COLUMN}' column. "
        "Please select 'Cross-Validation Predictions' if you want to submit CV results."
    )
    raise gr.Error(message)
def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Return the DataFrame columns that appear in ASSAY_LIST, in DataFrame order."""
    assay_columns = []
    for column_name in df.columns:
        if column_name in ASSAY_LIST:
            assay_columns.append(column_name)
    return assay_columns
def _join_names(names) -> str:
    """Render a collection of labels as a sorted, comma-separated string."""
    # str() first: labels parsed from a CSV may be ints/floats, and str.join
    # raises TypeError on non-string items. Sorting makes messages stable.
    return ", ".join(sorted(str(name) for name in names))


def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in this order: known submission type, required columns, at
    least one assay column, non-empty data, no missing values in the
    submission columns, unique antibody names, no unrecognized antibody names,
    no missing antibody names, then the submission-type specific checks.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    ValueError: If submission_type is not a key of EXAMPLE_FILE_DICT
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {_join_names(missing_columns)}")

    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # Submission columns are name, sequence, and at least one assay column
    submission_columns = list(REQUIRED_COLUMNS) + assay_columns

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    # Build both name sets once; they are compared in both directions below.
    expected_names = set(ANTIBODY_NAMES_DICT[submission_type])
    submitted_names = set(df["antibody_name"])

    # All antibody names should be recognizable
    unrecognized_antibodies = submitted_names - expected_names
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {_join_names(unrecognized_antibodies)}"
        )

    # All antibody names should be present
    missing_antibodies = expected_names - submitted_names
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {_join_names(missing_antibodies)}"
        )

    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)
def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Run the full validation pipeline on an uploaded CSV file.

    The text is first parsed into a DataFrame, then the DataFrame is checked
    for the given submission type.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    validate_dataframe(validate_csv_can_be_read(file_content), submission_type)