pquintero committed on
Commit
0f3e1b5
·
1 Parent(s): fc0bf00

at least 1 assay column required

Browse files
Files changed (4) hide show
  1. constants.py +1 -1
  2. test/conftest.py +2 -19
  3. test/test_validation.py +10 -3
  4. validation.py +18 -3
constants.py CHANGED
@@ -35,7 +35,7 @@ REQUIRED_COLUMNS: list[str] = [
35
  "antibody_name",
36
  "vh_protein_sequence",
37
  "vl_protein_sequence",
38
- ] + ASSAY_LIST
39
  ANTIBODY_NAMES = pd.read_csv("data/example-predictions.csv")["antibody_name"].tolist()
40
 
41
  # Huggingface API
 
35
  "antibody_name",
36
  "vh_protein_sequence",
37
  "vl_protein_sequence",
38
+ ]
39
  ANTIBODY_NAMES = pd.read_csv("data/example-predictions.csv")["antibody_name"].tolist()
40
 
41
  # Huggingface API
test/conftest.py CHANGED
@@ -1,27 +1,10 @@
1
  import pytest
2
  import pandas as pd
3
- from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST, ANTIBODY_NAMES
4
 
5
 
6
  @pytest.fixture
7
- def valid_csv_data():
8
- return {
9
- "antibody_name": ANTIBODY_NAMES[:MINIMAL_NUMBER_OF_ROWS],
10
- "vh_protein_sequence": [
11
- "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
12
- ]
13
- * MINIMAL_NUMBER_OF_ROWS,
14
- "vl_protein_sequence": [
15
- "DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"
16
- ]
17
- * MINIMAL_NUMBER_OF_ROWS,
18
- **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
19
- }
20
-
21
-
22
- @pytest.fixture
23
- def valid_input_dataframe(valid_csv_data):
24
- return pd.DataFrame(valid_csv_data)
25
 
26
 
27
  @pytest.fixture
 
1
  import pytest
2
  import pandas as pd
 
3
 
4
 
5
  @pytest.fixture
6
+ def valid_input_dataframe():
7
+ return pd.read_csv("data/example-predictions.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
 
10
  @pytest.fixture
test/test_validation.py CHANGED
@@ -2,7 +2,7 @@ import pytest
2
  import pandas as pd
3
  import gradio as gr
4
  from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
5
- from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
6
 
7
 
8
  class TestValidateCsvCanBeRead:
@@ -11,8 +11,6 @@ class TestValidateCsvCanBeRead:
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
12
  df = validate_csv_can_be_read(valid_csv_content)
13
  assert isinstance(df, pd.DataFrame)
14
- assert len(df) == MINIMAL_NUMBER_OF_ROWS
15
- assert list(df.columns) == list(REQUIRED_COLUMNS)
16
 
17
  def test_empty_csv_raises_error(self):
18
  empty_csv = ""
@@ -56,6 +54,15 @@ class TestValidateDataframe:
56
 
57
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
58
 
 
 
 
 
 
 
 
 
 
59
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
60
  empty_df = valid_input_dataframe.head(0)
61
 
 
2
  import pandas as pd
3
  import gradio as gr
4
  from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
5
+ from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
6
 
7
 
8
  class TestValidateCsvCanBeRead:
 
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
12
  df = validate_csv_can_be_read(valid_csv_content)
13
  assert isinstance(df, pd.DataFrame)
 
 
14
 
15
  def test_empty_csv_raises_error(self):
16
  empty_csv = ""
 
54
 
55
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
56
 
57
+ def test_at_least_one_assay_column_raises_error(self, valid_input_dataframe):
58
+ df = valid_input_dataframe.copy()
59
+ df.drop(columns=ASSAY_LIST, inplace=True, errors="ignore")
60
+ with pytest.raises(gr.Error) as exc_info:
61
+ validate_dataframe(df)
62
+ assert "CSV should include at least one of the following assay columns" in str(
63
+ exc_info.value
64
+ )
65
+
66
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
67
  empty_df = valid_input_dataframe.head(0)
68
 
validation.py CHANGED
@@ -1,7 +1,12 @@
1
  import pandas as pd
2
  import io
3
  import gradio as gr
4
- from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS, ANTIBODY_NAMES
 
 
 
 
 
5
 
6
 
7
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
@@ -57,12 +62,22 @@ def validate_dataframe(df: pd.DataFrame) -> None:
57
  if missing_columns:
58
  raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")
59
 
 
 
 
 
 
 
 
 
 
 
60
  # Data should not be empty
61
  if df.empty:
62
  raise gr.Error("❌ CSV file is empty")
63
 
64
- # No missing values in required columns
65
- for col in REQUIRED_COLUMNS:
66
  missing_count = df[col].isnull().sum()
67
  if missing_count > 0:
68
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
 
1
  import pandas as pd
2
  import io
3
  import gradio as gr
4
+ from constants import (
5
+ REQUIRED_COLUMNS,
6
+ MINIMAL_NUMBER_OF_ROWS,
7
+ ANTIBODY_NAMES,
8
+ ASSAY_LIST,
9
+ )
10
 
11
 
12
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
 
62
  if missing_columns:
63
  raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")
64
 
65
+ # Should include at least 1 assay column
66
+ assay_columns = [col for col in df.columns if col in ASSAY_LIST]
67
+ if len(assay_columns) < 1:
68
+ raise gr.Error(
69
+ "❌ CSV should include at least one of the following assay columns: "
70
+ + ", ".join(ASSAY_LIST)
71
+ )
72
+ # Submission columns are name, sequences, and at least one assay column
73
+ submission_columns = REQUIRED_COLUMNS + assay_columns
74
+
75
  # Data should not be empty
76
  if df.empty:
77
  raise gr.Error("❌ CSV file is empty")
78
 
79
+ # No missing values in submission columns
80
+ for col in submission_columns:
81
  missing_count = df[col].isnull().sum()
82
  if missing_count > 0:
83
  raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")