|
import pandas as pd |
|
import io |
|
import gradio as gr |
|
import requests |
|
from constants import ( |
|
REQUIRED_COLUMNS, |
|
ASSAY_LIST, |
|
CV_COLUMN, |
|
EXAMPLE_FILE_DICT, |
|
ANTIBODY_NAMES_DICT, |
|
) |
|
|
|
|
|
def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.

    Requests https://huggingface.co/<username> and inspects the response.

    Parameters
    ----------
    username: str
        The username to validate. Surrounding whitespace is ignored and
        None is treated like an empty string.

    Returns
    -------
    bool
        True if the profile page exists and looks like a user profile.
        The function never returns False: every failure is reported by
        raising gr.Error.

    Raises
    ------
    gr.Error: If the username is empty, the profile doesn't exist, the page
        does not look like a profile, or the check could not be completed
    """
    # Function-scope import keeps this block self-contained.
    from urllib.parse import quote

    username = (username or "").strip()
    if username == "":
        raise gr.Error("β Please provide a username.")

    # Percent-encode the whole name (safe="" also encodes "/") so input such
    # as "user/repo", "name?x=1" or "name#frag" cannot address a different
    # page on the site and be mistaken for a profile.
    profile_url = f"https://huggingface.co/{quote(username, safe='')}"

    # Only the network call can raise a requests exception, so keep the
    # try body limited to it.
    try:
        response = requests.get(profile_url, timeout=10)
    except requests.exceptions.Timeout as e:
        raise gr.Error("β Timeout while checking username. Please try again.") from e
    except requests.exceptions.ConnectionError as e:
        raise gr.Error(
            "β Unable to connect to Hugging Face. Please check your internet connection."
        ) from e
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"β Error validating username: {str(e)}") from e

    if response.status_code == 200:
        # Heuristic: a 200 page is accepted only if its text mentions
        # "profile" or "models" (case-insensitive).
        page_text = response.text.lower()
        if "profile" in page_text or "models" in page_text:
            return True
        raise gr.Error(
            f"β '{username}' does not appear to be a valid Hugging Face user profile"
        )

    if response.status_code == 404:
        raise gr.Error(
            f"β Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
        )

    # Any other status code: the check itself could not be completed.
    raise gr.Error(
        f"β Unable to verify username '{username}'. Please try again later."
    )
|
|
|
|
|
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Parse the uploaded CSV text into a DataFrame.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The DataFrame produced by pandas when parsing succeeds.

    Raises
    ------
    gr.Error: If the content is empty, malformed, wrongly encoded, or
        cannot be read for any other reason
    """
    try:
        # Wrap the text in an in-memory buffer so pandas can read it like a file.
        parsed = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError:
        raise gr.Error("β CSV file is empty or contains no valid data")
    except pd.errors.ParserError as parse_err:
        raise gr.Error(f"β Invalid CSV format<br><br>Error: {str(parse_err)}")
    except UnicodeDecodeError:
        encoding_message = (
            "β File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
        raise gr.Error(encoding_message)
    except Exception as unexpected:
        # Catch-all so any other failure still surfaces as a gr.Error.
        raise gr.Error(f"β Unexpected error reading CSV file: {str(unexpected)}")
    else:
        return parsed
|
|
|
|
|
def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """
    Check that a cross-validation submission uses the canonical fold assignments.

    The expected folds are read from the CSV at EXAMPLE_FILE_DICT[submission_type]
    and compared, per antibody name, with the CV_COLUMN values in the submission.

    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions.
    submission_type: str
        Key into EXAMPLE_FILE_DICT selecting the reference file.

    Raises
    ------
    gr.Error: If the CV_COLUMN column is absent or any fold assignment
        differs from the reference file
    """
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"β CV submissions must include a '{CV_COLUMN}' column")

    key_columns = ["antibody_name", CV_COLUMN]
    expected_label = f"{CV_COLUMN}_expected"
    submitted_label = f"{CV_COLUMN}_submitted"

    reference = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[key_columns]

    # Left join keeps every reference antibody; one that is absent from the
    # submission gets NaN as its submitted fold and therefore compares unequal.
    joined = reference.merge(
        df[key_columns],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )

    differs = joined[expected_label] != joined[submitted_label]
    mismatched = joined[differs]
    if len(mismatched) == 0:
        return

    # Report at most the first three offending rows to keep the message short.
    samples = [
        f"{rec['antibody_name']} (expected fold {rec[expected_label]}, got {rec[submitted_label]})"
        for _, rec in mismatched.head(3).iterrows()
    ]
    raise gr.Error(
        f"β Fold assignments don't match canonical CV folds: {'; '.join(samples)}"
    )
|
|
|
|
|
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """
    Reject a full-dataset submission that carries a CV fold column.

    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions.

    Raises
    ------
    gr.Error: If the CV_COLUMN column is present in the submission
    """
    has_fold_column = CV_COLUMN in df.columns
    if not has_fold_column:
        return

    message = (
        f"β Your submission contains a '{CV_COLUMN}' column. "
        "Please select 'Cross-Validation Predictions' if you want to submit CV results."
    )
    raise gr.Error(message)
|
|
|
|
|
def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Return the DataFrame's column names that appear in ASSAY_LIST, in column order."""
    matched: list[str] = []
    for name in df.columns:
        if name in ASSAY_LIST:
            matched.append(name)
    return matched
|
|
|
|
|
def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in this order and stop at the first failure: required
    columns, at least one assay column, non-empty data, no missing values
    in required/assay columns, one row per antibody, no unrecognized
    antibody names, no missing antibodies, and finally the check specific
    to the submission type.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    ValueError: If submission_type is not a key of EXAMPLE_FILE_DICT
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Keep REQUIRED_COLUMNS order so the message is deterministic.
    present_columns = set(df.columns)
    missing_columns = [col for col in REQUIRED_COLUMNS if col not in present_columns]
    if missing_columns:
        raise gr.Error(f"β Missing required columns: {', '.join(missing_columns)}")

    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            "β CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # list(...) so the concatenation also works if REQUIRED_COLUMNS is a tuple.
    submission_columns = list(REQUIRED_COLUMNS) + assay_columns

    if df.empty:
        raise gr.Error("β CSV file is empty")

    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")

    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    submitted_names = set(df["antibody_name"])
    known_names = set(ANTIBODY_NAMES_DICT[submission_type])

    # Names are converted with str() before joining: a column that pandas
    # parsed as numeric would otherwise make str.join raise TypeError
    # instead of producing the intended gr.Error. Sorting makes the message
    # independent of set iteration order.
    unrecognized_antibodies = sorted(str(name) for name in submitted_names - known_names)
    if unrecognized_antibodies:
        raise gr.Error(
            f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
        )

    missing_antibodies = sorted(str(name) for name in known_names - submitted_names)
    if missing_antibodies:
        raise gr.Error(
            f"β Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
        )

    # Dispatch on the submission-type suffix.
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:
        validate_full_dataset_submission(df)
|
|
|
|
|
def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Validate the uploaded CSV file.

    Parses the text with validate_csv_can_be_read and passes the resulting
    DataFrame to validate_dataframe.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    validate_dataframe(validate_csv_can_be_read(file_content), submission_type)
|
|