Check that all antibodies are present and remove redundant validation
Browse files
- constants.py +0 -1
- test/test_validation.py +3 -6
- validation.py +9 -43
constants.py
CHANGED
@@ -30,7 +30,6 @@ ASSAY_EMOJIS = {
|
|
30 |
}
|
31 |
|
32 |
# Input CSV file requirements
|
33 |
-
MINIMAL_NUMBER_OF_ROWS: int = 50
|
34 |
REQUIRED_COLUMNS: list[str] = [
|
35 |
"antibody_name",
|
36 |
"vh_protein_sequence",
|
|
|
30 |
}
|
31 |
|
32 |
# Input CSV file requirements
|
|
|
33 |
REQUIRED_COLUMNS: list[str] = [
|
34 |
"antibody_name",
|
35 |
"vh_protein_sequence",
|
test/test_validation.py
CHANGED
@@ -71,14 +71,11 @@ class TestValidateDataframe:
|
|
71 |
|
72 |
assert "CSV file is empty" in str(exc_info.value)
|
73 |
|
74 |
-
def
|
75 |
-
df = valid_input_dataframe.head(
|
76 |
with pytest.raises(gr.Error) as exc_info:
|
77 |
validate_dataframe(df)
|
78 |
-
|
79 |
-
assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
|
80 |
-
exc_info.value
|
81 |
-
)
|
82 |
|
83 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
84 |
bad_column = REQUIRED_COLUMNS[0]
|
|
|
71 |
|
72 |
assert "CSV file is empty" in str(exc_info.value)
|
73 |
|
74 |
+
def test_missing_antibodies_raises_error(self, valid_input_dataframe):
|
75 |
+
df = valid_input_dataframe.head(50)
|
76 |
with pytest.raises(gr.Error) as exc_info:
|
77 |
validate_dataframe(df)
|
78 |
+
assert "Missing predictions for" in str(exc_info.value)
|
|
|
|
|
|
|
79 |
|
80 |
def test_missing_values_raises_error(self, valid_input_dataframe):
|
81 |
bad_column = REQUIRED_COLUMNS[0]
|
validation.py
CHANGED
@@ -3,7 +3,6 @@ import io
|
|
3 |
import gradio as gr
|
4 |
from constants import (
|
5 |
REQUIRED_COLUMNS,
|
6 |
-
MINIMAL_NUMBER_OF_ROWS,
|
7 |
ASSAY_LIST,
|
8 |
CV_COLUMN,
|
9 |
EXAMPLE_FILE_DICT,
|
@@ -64,16 +63,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
|
|
64 |
how="left",
|
65 |
suffixes=("_expected", "_submitted"),
|
66 |
)
|
67 |
-
# All antibodies should be present if using CV
|
68 |
-
missing_antibodies_mask = antibody_check[f"{CV_COLUMN}_submitted"].isna()
|
69 |
-
n_missing_antibodies = missing_antibodies_mask.sum()
|
70 |
-
if n_missing_antibodies > 0:
|
71 |
-
missing_antibodies = (
|
72 |
-
antibody_check[missing_antibodies_mask]["antibody_name"].head(5).tolist()
|
73 |
-
)
|
74 |
-
raise gr.Error(
|
75 |
-
f"β Missing predictions for {n_missing_antibodies} antibodies. Examples: {', '.join(missing_antibodies)}"
|
76 |
-
)
|
77 |
# CV fold assignments should match
|
78 |
fold_mismatches = antibody_check[
|
79 |
antibody_check[f"{CV_COLUMN}_expected"]
|
@@ -89,26 +78,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
|
|
89 |
f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
90 |
)
|
91 |
|
92 |
-
# Merge on both columns for assay validation
|
93 |
-
merged_cv_df = expected_cv_df.merge(df, on=["antibody_name", CV_COLUMN], how="left")
|
94 |
-
|
95 |
-
# Check for missing assay predictions
|
96 |
-
assay_columns = get_assay_columns(merged_cv_df)
|
97 |
-
for assay_column in assay_columns:
|
98 |
-
missing_antibodies = merged_cv_df[merged_cv_df[assay_column].isna()][
|
99 |
-
"antibody_name"
|
100 |
-
].unique()
|
101 |
-
if len(missing_antibodies) > 0:
|
102 |
-
raise gr.Error(
|
103 |
-
f"β Missing {assay_column} predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies[:5])}"
|
104 |
-
)
|
105 |
-
|
106 |
-
# Step 5: Check that submission length matches expected
|
107 |
-
if len(merged_cv_df) != len(expected_cv_df):
|
108 |
-
raise gr.Error(
|
109 |
-
f"β Expected {len(expected_cv_df)} rows, got {len(merged_cv_df)}"
|
110 |
-
)
|
111 |
-
|
112 |
|
113 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
114 |
"""Validate full dataset submission"""
|
@@ -118,13 +87,6 @@ def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
|
118 |
"Please select 'Cross-Validation Predictions' if you want to submit CV results."
|
119 |
)
|
120 |
|
121 |
-
# All names should be unique (duplicates check from original validation)
|
122 |
-
n_duplicates = df["antibody_name"].duplicated().sum()
|
123 |
-
if n_duplicates > 0:
|
124 |
-
raise gr.Error(
|
125 |
-
f"β Standard submissions should have only one prediction per antibody. Found {n_duplicates} duplicates."
|
126 |
-
)
|
127 |
-
|
128 |
|
129 |
def get_assay_columns(df: pd.DataFrame) -> list[str]:
|
130 |
"""Get all assay columns from the DataFrame"""
|
@@ -174,17 +136,12 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
174 |
if missing_count > 0:
|
175 |
raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
|
176 |
|
177 |
-
# Above minimal number of rows
|
178 |
-
if len(df) < MINIMAL_NUMBER_OF_ROWS:
|
179 |
-
raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
|
180 |
-
|
181 |
# All names should be unique
|
182 |
n_duplicates = df["antibody_name"].duplicated().sum()
|
183 |
if n_duplicates > 0:
|
184 |
raise gr.Error(
|
185 |
f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
186 |
)
|
187 |
-
|
188 |
# All antibody names should be recognizable
|
189 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
190 |
ANTIBODY_NAMES_DICT[submission_type]
|
@@ -193,6 +150,15 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
|
|
193 |
raise gr.Error(
|
194 |
f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
|
195 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
# Submission-type specific validation
|
197 |
if submission_type.endswith("_CV"):
|
198 |
validate_cv_submission(df, submission_type)
|
|
|
3 |
import gradio as gr
|
4 |
from constants import (
|
5 |
REQUIRED_COLUMNS,
|
|
|
6 |
ASSAY_LIST,
|
7 |
CV_COLUMN,
|
8 |
EXAMPLE_FILE_DICT,
|
|
|
63 |
how="left",
|
64 |
suffixes=("_expected", "_submitted"),
|
65 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
# CV fold assignments should match
|
67 |
fold_mismatches = antibody_check[
|
68 |
antibody_check[f"{CV_COLUMN}_expected"]
|
|
|
78 |
f"β Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
|
79 |
)
|
80 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
|
82 |
def validate_full_dataset_submission(df: pd.DataFrame) -> None:
|
83 |
"""Validate full dataset submission"""
|
|
|
87 |
"Please select 'Cross-Validation Predictions' if you want to submit CV results."
|
88 |
)
|
89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
90 |
|
91 |
def get_assay_columns(df: pd.DataFrame) -> list[str]:
|
92 |
"""Get all assay columns from the DataFrame"""
|
|
|
136 |
if missing_count > 0:
|
137 |
raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
|
138 |
|
|
|
|
|
|
|
|
|
139 |
# All names should be unique
|
140 |
n_duplicates = df["antibody_name"].duplicated().sum()
|
141 |
if n_duplicates > 0:
|
142 |
raise gr.Error(
|
143 |
f"β CSV should have only one row per antibody. Found {n_duplicates} duplicates."
|
144 |
)
|
|
|
145 |
# All antibody names should be recognizable
|
146 |
unrecognized_antibodies = set(df["antibody_name"]) - set(
|
147 |
ANTIBODY_NAMES_DICT[submission_type]
|
|
|
150 |
raise gr.Error(
|
151 |
f"β Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
|
152 |
)
|
153 |
+
|
154 |
+
# All antibody names should be present
|
155 |
+
missing_antibodies = set(ANTIBODY_NAMES_DICT[submission_type]) - set(
|
156 |
+
df["antibody_name"]
|
157 |
+
)
|
158 |
+
if missing_antibodies:
|
159 |
+
raise gr.Error(
|
160 |
+
f"β Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
|
161 |
+
)
|
162 |
# Submission-type specific validation
|
163 |
if submission_type.endswith("_CV"):
|
164 |
validate_cv_submission(df, submission_type)
|