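"""Validation helpers for GDPa1 submissions uploaded through the Gradio app.

Checks that the submitter's Hugging Face username exists, that the uploaded CSV
parses, and that its columns, antibody names, and (for cross-validation
submissions) fold assignments match the canonical format defined in constants.py.
"""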
import pandas as pd
import io
import gradio as gr
import requests
from constants import (
    REQUIRED_COLUMNS,
    ASSAY_LIST,
    CV_COLUMN,
    EXAMPLE_FILE_DICT,
    ANTIBODY_NAMES_DICT,
)


def validate_username(username: str) -> bool:
    """
    Validate that the username corresponds to a real Hugging Face profile.
    Simply checks that https://huggingface.co/<username> exists.

    Parameters
    ----------
    username: str
        The username to validate

    Returns
    -------
    bool
        True if the username is valid and profile exists, False otherwise

    Raises
    ------
    gr.Error: If username is invalid or profile doesn't exist
    """
    username = username.strip()
    if username == "":
        raise gr.Error("❌ Please provide a username.")

    # Check if the Hugging Face profile exists
    profile_url = f"https://huggingface.co/{username}"

    try:
        response = requests.get(profile_url, timeout=10)

        if response.status_code == 200:
            # Additional check: make sure it's actually a user profile page
            # and not some other page that happens to exist
            if "profile" in response.text.lower() or "models" in response.text.lower():
                return True
            else:
                raise gr.Error(
                    f"❌ '{username}' does not appear to be a valid Hugging Face user profile"
                )
        elif response.status_code == 404:
            raise gr.Error(
                f"❌ Hugging Face user '{username}' does not exist. Please check the username or create an account at https://huggingface.co. This is used to track unique submissions."
            )
        else:
            raise gr.Error(
                f"❌ Unable to verify username '{username}'. Please try again later."
            )

    except requests.exceptions.Timeout:
        raise gr.Error("❌ Timeout while checking username. Please try again.")
    except requests.exceptions.ConnectionError:
        raise gr.Error(
            "❌ Unable to connect to Hugging Face. Please check your internet connection."
        )
    except requests.exceptions.RequestException as e:
        raise gr.Error(f"❌ Error validating username: {str(e)}")


def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Validate that the CSV file can be read and parsed.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed DataFrame if successful.

    Raises
    ------
    gr.Error: If CSV cannot be read or parsed
    """
    try:
        # Read CSV content
        df = pd.read_csv(io.StringIO(file_content))
        return df

    except pd.errors.EmptyDataError:
        raise gr.Error("❌ CSV file is empty or contains no valid data")
    except pd.errors.ParserError as e:
        raise gr.Error(f"❌ Invalid CSV format<br><br>" f"Error: {str(e)}")
    except UnicodeDecodeError:
        raise gr.Error(
            "❌ File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
    except Exception as e:
        raise gr.Error(f"❌ Unexpected error reading CSV file: {str(e)}")


def validate_cv_submission(
    df: pd.DataFrame, submission_type: str = "GDPa1_cross_validation"
) -> None:
    """Validate cross-validation submission"""
    # Must have CV_COLUMN for CV submissions
    if CV_COLUMN not in df.columns:
        raise gr.Error(f"❌ CV submissions must include a '{CV_COLUMN}' column")

    # Load canonical fold assignments
    expected_cv_df = pd.read_csv(EXAMPLE_FILE_DICT[submission_type])[
        ["antibody_name", CV_COLUMN]
    ]
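    # Left-join the submitted folds onto the canonical assignments so each
    # antibody's expected and submitted fold can be compared row by row.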
    antibody_check = expected_cv_df.merge(
        df[["antibody_name", CV_COLUMN]],
        on="antibody_name",
        how="left",
        suffixes=("_expected", "_submitted"),
    )
    # CV fold assignments should match
    fold_mismatches = antibody_check[
        antibody_check[f"{CV_COLUMN}_expected"]
        != antibody_check[f"{CV_COLUMN}_submitted"]
    ]
    if len(fold_mismatches) > 0:
        examples = []
        for _, row in fold_mismatches.head(3).iterrows():
            examples.append(
                f"{row['antibody_name']} (expected fold {row[f'{CV_COLUMN}_expected']}, got {row[f'{CV_COLUMN}_submitted']})"
            )
        raise gr.Error(
            f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
        )


def validate_full_dataset_submission(df: pd.DataFrame) -> None:
    """Validate full dataset submission"""
    if CV_COLUMN in df.columns:
        raise gr.Error(
            f"❌ Your submission contains a '{CV_COLUMN}' column. "
            "Please select 'Cross-Validation Predictions' if you want to submit CV results."
        )


def get_assay_columns(df: pd.DataFrame) -> list[str]:
    """Get all assay columns from the DataFrame"""
    return [col for col in df.columns if col in ASSAY_LIST]


def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None:
    """
    Validate the DataFrame content and structure.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    if submission_type not in EXAMPLE_FILE_DICT:
        raise ValueError(f"Invalid submission type: {submission_type}")

    # Required columns should be present
    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
    if missing_columns:
        raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")

    # Should include at least 1 assay column
    assay_columns = get_assay_columns(df)
    if len(assay_columns) < 1:
        raise gr.Error(
            "❌ CSV should include at least one of the following assay columns: "
            + ", ".join(ASSAY_LIST)
        )
    # Submission columns are the required name/sequence columns plus at least one assay column
    submission_columns = REQUIRED_COLUMNS + assay_columns

    # Data should not be empty
    if df.empty:
        raise gr.Error("❌ CSV file is empty")

    # No missing values in submission columns
    for col in submission_columns:
        missing_count = df[col].isnull().sum()
        if missing_count > 0:
            raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")

    # All names should be unique
    n_duplicates = df["antibody_name"].duplicated().sum()
    if n_duplicates > 0:
        raise gr.Error(
            f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
        )
    # All antibody names should be recognizable
    unrecognized_antibodies = set(df["antibody_name"]) - set(
        ANTIBODY_NAMES_DICT[submission_type]
    )
    if unrecognized_antibodies:
        raise gr.Error(
            f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
        )

    # All antibody names should be present
    missing_antibodies = set(ANTIBODY_NAMES_DICT[submission_type]) - set(
        df["antibody_name"]
    )
    if missing_antibodies:
        raise gr.Error(
            f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
        )
    # Submission-type specific validation
    if submission_type.endswith("_cross_validation"):
        validate_cv_submission(df, submission_type)
    else:  # full-dataset submission (e.g. GDPa1)
        validate_full_dataset_submission(df)


def validate_csv_file(file_content: str, submission_type: str = "GDPa1") -> None:
    """
    Validate the uploaded CSV file.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.
    submission_type: str
        Type of submission: "GDPa1" or "GDPa1_cross_validation"

    Raises
    ------
    gr.Error: If validation fails
    """
    df = validate_csv_can_be_read(file_content)
    validate_dataframe(df, submission_type)
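

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): the Gradio app is assumed to read
    # the uploaded CSV into a string before calling validate_csv_file.
    # "example_submission.csv" and "some-hf-username" are placeholder values,
    # not a real file or account.
    try:
        with open("example_submission.csv", encoding="utf-8") as f:
            file_content = f.read()
        validate_username("some-hf-username")  # network call to huggingface.co
        validate_csv_file(file_content, submission_type="GDPa1")
        print("Submission passed validation.")
    except gr.Error as err:
        # gr.Error carries the user-facing message shown in the Gradio UI.
        print(f"Validation failed: {err}")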