pquintero committed on
Commit
f412a50
·
1 Parent(s): b2a1e67

check all abs are present and rm redundant validation

Browse files
Files changed (3) hide show
  1. constants.py +0 -1
  2. test/test_validation.py +3 -6
  3. validation.py +9 -43
constants.py CHANGED
@@ -30,7 +30,6 @@ ASSAY_EMOJIS = {
30
  }
31
 
32
  # Input CSV file requirements
33
- MINIMAL_NUMBER_OF_ROWS: int = 50
34
  REQUIRED_COLUMNS: list[str] = [
35
  "antibody_name",
36
  "vh_protein_sequence",
 
30
  }
31
 
32
  # Input CSV file requirements
 
33
  REQUIRED_COLUMNS: list[str] = [
34
  "antibody_name",
35
  "vh_protein_sequence",
test/test_validation.py CHANGED
@@ -71,14 +71,11 @@ class TestValidateDataframe:
71
 
72
  assert "CSV file is empty" in str(exc_info.value)
73
 
74
- def test_insufficient_rows_raises_error(self, valid_input_dataframe):
75
- df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
76
  with pytest.raises(gr.Error) as exc_info:
77
  validate_dataframe(df)
78
-
79
- assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
80
- exc_info.value
81
- )
82
 
83
  def test_missing_values_raises_error(self, valid_input_dataframe):
84
  bad_column = REQUIRED_COLUMNS[0]
 
71
 
72
  assert "CSV file is empty" in str(exc_info.value)
73
 
74
+ def test_missing_antibodies_raises_error(self, valid_input_dataframe):
75
+ df = valid_input_dataframe.head(50)
76
  with pytest.raises(gr.Error) as exc_info:
77
  validate_dataframe(df)
78
+ assert "Missing predictions for" in str(exc_info.value)
 
 
 
79
 
80
  def test_missing_values_raises_error(self, valid_input_dataframe):
81
  bad_column = REQUIRED_COLUMNS[0]
validation.py CHANGED
@@ -3,7 +3,6 @@ import io
3
  import gradio as gr
4
  from constants import (
5
  REQUIRED_COLUMNS,
6
- MINIMAL_NUMBER_OF_ROWS,
7
  ASSAY_LIST,
8
  CV_COLUMN,
9
  EXAMPLE_FILE_DICT,
@@ -64,16 +63,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
64
  how="left",
65
  suffixes=("_expected", "_submitted"),
66
  )
67
- # All antibodies should be present if using CV
68
- missing_antibodies_mask = antibody_check[f"{CV_COLUMN}_submitted"].isna()
69
- n_missing_antibodies = missing_antibodies_mask.sum()
70
- if n_missing_antibodies > 0:
71
- missing_antibodies = (
72
- antibody_check[missing_antibodies_mask]["antibody_name"].head(5).tolist()
73
- )
74
- raise gr.Error(
75
- f"❌ Missing predictions for {n_missing_antibodies} antibodies. Examples: {', '.join(missing_antibodies)}"
76
- )
77
  # CV fold assignments should match
78
  fold_mismatches = antibody_check[
79
  antibody_check[f"{CV_COLUMN}_expected"]
@@ -89,26 +78,6 @@ def validate_cv_submission(df: pd.DataFrame, submission_type: str = "GDPa1_CV")
89
  f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
90
  )
91
 
92
- # Merge on both columns for assay validation
93
- merged_cv_df = expected_cv_df.merge(df, on=["antibody_name", CV_COLUMN], how="left")
94
-
95
- # Check for missing assay predictions
96
- assay_columns = get_assay_columns(merged_cv_df)
97
- for assay_column in assay_columns:
98
- missing_antibodies = merged_cv_df[merged_cv_df[assay_column].isna()][
99
- "antibody_name"
100
- ].unique()
101
- if len(missing_antibodies) > 0:
102
- raise gr.Error(
103
- f"❌ Missing {assay_column} predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies[:5])}"
104
- )
105
-
106
- # Step 5: Check that submission length matches expected
107
- if len(merged_cv_df) != len(expected_cv_df):
108
- raise gr.Error(
109
- f"❌ Expected {len(expected_cv_df)} rows, got {len(merged_cv_df)}"
110
- )
111
-
112
 
113
  def validate_full_dataset_submission(df: pd.DataFrame) -> None:
114
  """Validate full dataset submission"""
@@ -118,13 +87,6 @@ def validate_full_dataset_submission(df: pd.DataFrame) -> None:
118
  "Please select 'Cross-Validation Predictions' if you want to submit CV results."
119
  )
120
 
121
- # All names should be unique (duplicates check from original validation)
122
- n_duplicates = df["antibody_name"].duplicated().sum()
123
- if n_duplicates > 0:
124
- raise gr.Error(
125
- f"❌ Standard submissions should have only one prediction per antibody. Found {n_duplicates} duplicates."
126
- )
127
-
128
 
129
  def get_assay_columns(df: pd.DataFrame) -> list[str]:
130
  """Get all assay columns from the DataFrame"""
@@ -174,17 +136,12 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
174
  if missing_count > 0:
175
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
176
 
177
- # Above minimal number of rows
178
- if len(df) < MINIMAL_NUMBER_OF_ROWS:
179
- raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
180
-
181
  # All names should be unique
182
  n_duplicates = df["antibody_name"].duplicated().sum()
183
  if n_duplicates > 0:
184
  raise gr.Error(
185
  f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
186
  )
187
-
188
  # All antibody names should be recognizable
189
  unrecognized_antibodies = set(df["antibody_name"]) - set(
190
  ANTIBODY_NAMES_DICT[submission_type]
@@ -193,6 +150,15 @@ def validate_dataframe(df: pd.DataFrame, submission_type: str = "GDPa1") -> None
193
  raise gr.Error(
194
  f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
195
  )
 
 
 
 
 
 
 
 
 
196
  # Submission-type specific validation
197
  if submission_type.endswith("_CV"):
198
  validate_cv_submission(df, submission_type)
 
3
  import gradio as gr
4
  from constants import (
5
  REQUIRED_COLUMNS,
 
6
  ASSAY_LIST,
7
  CV_COLUMN,
8
  EXAMPLE_FILE_DICT,
 
63
  how="left",
64
  suffixes=("_expected", "_submitted"),
65
  )
 
 
 
 
 
 
 
 
 
 
66
  # CV fold assignments should match
67
  fold_mismatches = antibody_check[
68
  antibody_check[f"{CV_COLUMN}_expected"]
 
78
  f"❌ Fold assignments don't match canonical CV folds: {'; '.join(examples)}"
79
  )
80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
82
  def validate_full_dataset_submission(df: pd.DataFrame) -> None:
83
  """Validate full dataset submission"""
 
87
  "Please select 'Cross-Validation Predictions' if you want to submit CV results."
88
  )
89
 
 
 
 
 
 
 
 
90
 
91
  def get_assay_columns(df: pd.DataFrame) -> list[str]:
92
  """Get all assay columns from the DataFrame"""
 
136
  if missing_count > 0:
137
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
138
 
 
 
 
 
139
  # All names should be unique
140
  n_duplicates = df["antibody_name"].duplicated().sum()
141
  if n_duplicates > 0:
142
  raise gr.Error(
143
  f"❌ CSV should have only one row per antibody. Found {n_duplicates} duplicates."
144
  )
 
145
  # All antibody names should be recognizable
146
  unrecognized_antibodies = set(df["antibody_name"]) - set(
147
  ANTIBODY_NAMES_DICT[submission_type]
 
150
  raise gr.Error(
151
  f"❌ Found unrecognized antibody names: {', '.join(unrecognized_antibodies)}"
152
  )
153
+
154
+ # All antibody names should be present
155
+ missing_antibodies = set(ANTIBODY_NAMES_DICT[submission_type]) - set(
156
+ df["antibody_name"]
157
+ )
158
+ if missing_antibodies:
159
+ raise gr.Error(
160
+ f"❌ Missing predictions for {len(missing_antibodies)} antibodies: {', '.join(missing_antibodies)}"
161
+ )
162
  # Submission-type specific validation
163
  if submission_type.endswith("_CV"):
164
  validate_cv_submission(df, submission_type)