import io

import pandas as pd
import requests

import gradio as gr
from constants import (
    REQUIRED_COLUMNS,
    ASSAY_LIST,
    CV_COLUMN,
    EXAMPLE_FILE_DICT,
    ANTIBODY_NAMES_DICT,
)


def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.

    Just check https://huggingface.co/username exists.

    Parameters
    ----------
    username: str
        The username to validate

    Returns
    -------
    bool
        True if the username is valid and the profile exists.
        (Invalid usernames never return — they raise gr.Error.)

    Raises
    ------
    gr.Error: If username is invalid or profile doesn't exist
    """
    username = username.strip()
    if username == "":
        raise gr.Error("❌ Please provide a username.")

    # Check if the Hugging Face profile exists
    profile_url = f"https://huggingface.co/{username}"
    try:
        response = requests.get(profile_url, timeout=10)
        if response.status_code == 200:
            # Additional check: make sure it's actually a user profile page
            # and not some other page that happens to exist
            if "profile" in response.text.lower() or "models" in response.text.lower():
                return True
            else:
                raise gr.Error(
                    f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
                )
        elif response.status_code == 404:
            raise gr.Error(
                f"❌ Hugging Face user '{username}' does not exist. "
                "Please check the username or create an account at https://huggingface.co. "
                "This is used to track unique submissions."
            )
        else:
            raise gr.Error(
                f"❌ Unable to verify username '{username}'. Please try again later."
            )
    except requests.exceptions.Timeout:
        raise gr.Error("❌ Timeout while checking username. Please try again.") from None
    except requests.exceptions.ConnectionError:
        raise gr.Error(
            "❌ Unable to connect to Hugging Face. Please check your internet connection."
        ) from None
    except requests.exceptions.RequestException as e:
        # Chain the original exception so the full cause is preserved in logs.
        raise gr.Error(f"❌ Error validating username: {str(e)}") from e


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Read CSV content
        df = pd.read_csv(io.StringIO(file_content))
        return df
    except pd.errors.EmptyDataError:
        raise gr.Error("❌ CSV file is empty or contains no valid data") from None
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format\n\nError: {str(e)}") from e
    except UnicodeDecodeError:
        raise gr.Error(
            "❌ File encoding error\n\n"
            "Your file appears to have an unsupported encoding.\n"
            "Please save your CSV file with UTF-8 encoding and try again."
        ) from None
    except Exception as e:
        raise gr.Error(f"❌ Unexpected error reading CSV file: {str(e)}") from e


def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """Validate cross-validation submission.

    Checks that the submission carries the CV fold column and that every
    antibody's fold assignment matches the canonical fold file for this
    submission type.

    Raises
    ------
    gr.Error: If the CV column is missing or fold assignments diverge.
    """
    # Must have CV_COLUMN for CV submissions
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")

    # Load canonical fold assignments
    expected_cv_df = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[
        ["antibody_name", CV_COLUMN]
    ]
    antibody_check = expected_cv_df.merge(
        df[["antibody_name", CV_COLUMN]],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )

    # CV fold assignments should match
    fold_mismatches = antibody_check[
        antibody_check[f"{CV_COLUMN}_expected"]
        != antibody_check[f"{CV_COLUMN}_submitted"]
    ]
    if len(fold_mismatches) > 0:
        # Show at most 3 concrete mismatches to keep the error readable.
        examples = []
        for _, row in fold_mismatches.head(3).iterrows():
            examples.append(
                f"{row['antibody_name']} (expected fold {row[f'{CV_COLUMN}_expected']}, "
                f"got {row[f'{CV_COLUMN}_submitted']})"
            )
        raise gr.Error(
            f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
        )


def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """Validate full dataset submission.

    A full-dataset submission must NOT carry the CV fold column; its
    presence suggests the user meant to submit CV predictions instead.

    Raises
    ------
    gr.Error: If the CV fold column is present.
    """
    if CV_COLUMN in df.columns:
        raise gr.Error(
            f"❌ Your submission contains a '{CV_COLUMN}' column. "
            "Please select 'Cross-Validation Predictions' if you want to submit CV results."
        )


def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Get all assay columns from the DataFrame (columns present in ASSAY_LIST)."""
    return [col for col in df.columns if col in ASSAY_LIST]


def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    ValueError: If submission_type is not a recognized type (programmer error,
        not a user-facing validation failure).
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {', '.join(sorted(missing_columns))}")

    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )

    # Submission columns are name, sequence, and at least one assay column
    submission_columns = REQUIRED_COLUMNS + assay_columns

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )

    # All antibody names should be recognizable (sorted for deterministic messages)
    unrecognized_antibodies = set(df["antibody_name"]) - set(
        ANTIBODY_NAMES_DICT[submission_type]
    )
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {', '.join(sorted(unrecognized_antibodies))}"
        )

    # All antibody names should be present
    missing_antibodies = set(ANTIBODY_NAMES_DICT[submission_type]) - set(
        df["antibody_name"]
    )
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: "
            f"{', '.join(sorted(missing_antibodies))}"
        )

    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full_dataset
        validate_full_dataset_submission(df)


def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Validate the uploaded CSV file.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    df = validate_csv_can_be_read(file_content)
    validate_dataframe(df, submission_type)