File size: 8,115 Bytes
eb50e2e e0516fa 0f3e1b5 b2a1e67 0f3e1b5 eb50e2e 8f9985e e0516fa 1938471 e0516fa 62b6599 e0516fa eb50e2e 8f9985e eb50e2e 8f9985e eb50e2e 8f9985e eb50e2e 8f9985e eb50e2e 8f9985e eb50e2e 8f9985e eb50e2e 845443f eb50e2e 8f9985e bff3b9b b2a1e67 eb50e2e 8f9985e eb50e2e b2a1e67 bff3b9b 8f9985e eb50e2e b2a1e67 eb50e2e 8f9985e 0f3e1b5 b2a1e67 0f3e1b5 eb50e2e 8f9985e 0f3e1b5 eb50e2e 8f9985e 4d9df8e b2a1e67 4d9df8e f412a50 b2a1e67 bff3b9b b2a1e67 8f9985e eb50e2e b2a1e67 eb50e2e 8f9985e eb50e2e b2a1e67 bff3b9b 8f9985e eb50e2e b2a1e67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 |
import pandas as pd
import io
import gradio as gr
import requests
from constants import (
REQUIRED_COLUMNS,
ASSAY_LIST,
CV_COLUMN,
EXAMPLE_FILE_DICT,
ANTIBODY_NAMES_DICT,
)
def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.

    Sends a GET request to https://huggingface.co/<username> and accepts the
    name only when the page exists and its text looks like a profile page.

    Parameters
    ----------
    username: str
        The username to validate. Surrounding whitespace is ignored and
        ``None`` is treated like an empty string.

    Returns
    -------
    bool
        Always True when the function returns; every failure is reported by
        raising instead of returning False.

    Raises
    ------
    gr.Error: If the username is empty, the profile doesn't exist, the page
        does not look like a user profile, or the request fails
    """
    # Function-scope import so this block does not need a new file-level import.
    from urllib.parse import quote

    username = (username or "").strip()
    if username == "":
        raise gr.Error("β Please provide a username.")

    # Percent-encode the name so characters such as "/", "?" or "#" cannot
    # steer the lookup to a different Hugging Face page than the profile.
    encoded_username = quote(username, safe="")
    profile_url = f"https://huggingface.co/{encoded_username}"

    # Only the network call can raise a requests exception, so keep the try
    # body limited to it.
    try:
        response = requests.get(profile_url, timeout=10)
    except requests.exceptions.Timeout as e:
        raise gr.Error("β Timeout while checking username. Please try again.") from e
    except requests.exceptions.ConnectionError as e:
        raise gr.Error(
            "β Unable to connect to Hugging Face. Please check your internet connection."
        ) from e
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"β Error validating username: {str(e)}") from e

    if response.status_code == 404:
        raise gr.Error(
            f"β Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
        )
    if response.status_code != 200:
        raise gr.Error(
            f"β Unable to verify username '{username}'. Please try again later."
        )

    # Additional check: make sure it's actually a user profile page
    # and not some other page that happens to exist
    page_text = response.text.lower()
    if "profile" in page_text or "models" in page_text:
        return True
    raise gr.Error(
        f"β '{username}' does not appear to be a valid Hugging Face user profile"
    )
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Parse uploaded CSV text into a DataFrame, reporting failures to the user.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The DataFrame produced by ``pd.read_csv`` for the given text.

    Raises
    ------
    gr.Error: If the text is empty, malformed, badly encoded, or otherwise
        cannot be parsed
    """
    try:
        # pd.read_csv expects a path or file-like object, so wrap the text.
        parsed = pd.read_csv(io.StringIO(file_content))
    except pd.errors.EmptyDataError:
        raise gr.Error("β CSV file is empty or contains no valid data")
    except pd.errors.ParserError as e:
        raise gr.Error(f"β Invalid CSV format<br><br>Error: {str(e)}")
    except UnicodeDecodeError:
        encoding_message = (
            "β File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
        raise gr.Error(encoding_message)
    except Exception as e:
        # Anything else is still surfaced to the user rather than crashing.
        raise gr.Error(f"β Unexpected error reading CSV file: {str(e)}")
    else:
        return parsed
def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """
    Check that a cross-validation submission uses the canonical fold assignments.

    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions.
    submission_type: str
        Key into EXAMPLE_FILE_DICT selecting the file the reference folds
        are read from.

    Raises
    ------
    gr.Error: If the fold column is missing or any fold differs from the
        reference assignment
    """
    # Must have CV_COLUMN for CV submissions
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"β CV submissions must include a '{CV_COLUMN}' column")

    key_columns = ["antibody_name", CV_COLUMN]
    expected_col = f"{CV_COLUMN}_expected"
    submitted_col = f"{CV_COLUMN}_submitted"

    # Load canonical fold assignments
    reference_folds = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[key_columns]

    # Left merge keeps every reference antibody; one without a submitted row
    # gets a missing submitted fold and therefore compares as a mismatch.
    side_by_side = reference_folds.merge(
        df[key_columns],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )

    # CV fold assignments should match
    disagrees = side_by_side[expected_col] != side_by_side[submitted_col]
    fold_mismatches = side_by_side[disagrees]
    if len(fold_mismatches) == 0:
        return

    # Only the first three mismatches are shown in the error message.
    examples = [
        f"{row['antibody_name']} (expected fold {row[expected_col]}, got {row[submitted_col]})"
        for _, row in fold_mismatches.head(3).iterrows()
    ]
    raise gr.Error(
        f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
    )
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """
    Reject a full-dataset submission that carries the cross-validation fold column.

    Parameters
    ----------
    df: pd.DataFrame
        The submitted predictions.

    Raises
    ------
    gr.Error: If the DataFrame contains the CV_COLUMN column
    """
    has_fold_column = CV_COLUMN in df.columns
    if not has_fold_column:
        return

    message = (
        f"β Your submission contains a '{CV_COLUMN}' column. "
        "Please select 'Cross-Validation Predictions' if you want to submit CV results."
    )
    raise gr.Error(message)
def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """
    Return the DataFrame's columns that are listed in ASSAY_LIST.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame whose columns are inspected.

    Returns
    -------
    list[str]
        Matching column names, in the order they appear in the DataFrame.
    """
    found = []
    for column in df.columns:
        if column in ASSAY_LIST:
            found.append(column)
    return found
def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Checks run in order and stop at the first failure: required columns
    present, at least one assay column, data not empty, no missing values in
    the submission columns, unique antibody names, only recognized antibody
    names, every expected antibody present, and finally the checks specific
    to the submission type.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    ValueError: If submission_type is not a key of EXAMPLE_FILE_DICT
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        # Sorted so the message is the same on every run.
        raise gr.Error(
            f"β Missing required columns: {', '.join(sorted(map(str, missing_columns)))}"
        )

    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            "β CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # Submission are name, sequence, and at least one assay column.
    # Unpacking works whether REQUIRED_COLUMNS is a list or a tuple.
    submission_columns = [*REQUIRED_COLUMNS, *assay_columns]

    # Data should not be empty
    if df.empty:
        raise gr.Error("β CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")

    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    expected_names = set(ANTIBODY_NAMES_DICT[submission_type])
    submitted_names = set(df["antibody_name"])

    # All antibody names should be recognizable
    unrecognized_antibodies = submitted_names - expected_names
    if unrecognized_antibodies:
        # Names are stringified before joining: pandas may have parsed the
        # column as numbers, and str.join raises TypeError on non-strings.
        unrecognized_text = ", ".join(sorted(map(str, unrecognized_antibodies)))
        raise gr.Error(f"β Found unrecognized antibody names: {unrecognized_text}")

    # All antibody names should be present
    missing_antibodies = expected_names - submitted_names
    if missing_antibodies:
        missing_text = ", ".join(sorted(map(str, missing_antibodies)))
        raise gr.Error(
            f"β Missing predictions for {len(missing_antibodies)} antibodies: {missing_text}"
        )

    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)
def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Parse the uploaded CSV text and validate the resulting table.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If the CSV cannot be parsed or the parsed data fails validation
    """
    # Parsing comes first so content checks only ever see a readable table.
    validate_dataframe(validate_csv_can_be_read(file_content), submission_type)
|