pquintero committed
Commit eb50e2e · 1 Parent(s): 7a1c35b

validate dataframe with tests

Files changed (5)
  1. submit.py +14 -2
  2. test/__init__.py +0 -0
  3. test/conftest.py +26 -0
  4. test/test_validation.py +109 -0
  5. validation.py +101 -0
submit.py CHANGED
@@ -2,12 +2,15 @@ from pathlib import Path
 import tempfile
 from typing import BinaryIO
 import json
+import pandas as pd
+import io

 import gradio as gr
 from datetime import datetime
 import uuid

-from about import API, submissions_repo
+from constants import API, SUBMISSIONS_REPO
+from validation import validate_csv_file

 def make_submission(
     submitted_file: BinaryIO,
@@ -17,17 +20,26 @@ def make_submission(
     if user_state is None:
         raise gr.Error("You must submit your username to submit a file.")

+    if submitted_file is None:
+        raise gr.Error("Please upload a CSV file before submitting.")
+
     file_path = submitted_file.name

     if not file_path:
         raise gr.Error("Uploaded file object does not have a valid file path.")

     path_obj = Path(file_path)
+
+    if path_obj.suffix.lower() != '.csv':
+        raise gr.Error("File must be a CSV file. Please upload a .csv file.")
+
     timestamp = datetime.utcnow().isoformat()
     submission_id = str(uuid.uuid4())

     with (path_obj.open("rb") as f_in):
         file_content = f_in.read().decode("utf-8")
+
+    validate_csv_file(file_content)

     # write to dataset
     filename = f"{submission_id}.json"
@@ -49,7 +61,7 @@
     API.upload_file(
         path_or_fileobj=tmp_name,
         path_in_repo=filename,
-        repo_id=submissions_repo,
+        repo_id=SUBMISSIONS_REPO,
         repo_type="dataset",
         commit_message=f"Add submission for {user_state} at {timestamp}"
     )
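Note: the rewritten import (`from constants import API, SUBMISSIONS_REPO`) and the test modules below rely on a constants module that this commit does not touch. The sketch below only mirrors the names those files use; every value is an assumption for illustration, not the repository's actual configuration.

# constants.py -- hypothetical sketch, NOT part of this commit; all values are placeholders.
from huggingface_hub import HfApi

API = HfApi()                              # hub client used by API.upload_file in submit.py
SUBMISSIONS_REPO = "some-org/submissions"  # placeholder dataset repo id
MINIMAL_NUMBER_OF_ROWS = 10                # illustrative threshold only
ASSAY_LIST = [                             # guessed from the CSV header in test_validation.py
    "SEC %Monomer", "HIC", "PR_CHO", "AC-SINS_pH6.0", "AC-SINS_pH7.4", "Tm",
]
REQUIRED_COLUMNS = [                       # ordering mirrors the conftest.py fixture
    "antibody_id", "antibody_name", "vh_protein_sequence", "vl_protein_sequence", *ASSAY_LIST,
]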
test/__init__.py ADDED
(empty file)
test/conftest.py ADDED
@@ -0,0 +1,26 @@
+import pytest
+import pandas as pd
+from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
+
+
+@pytest.fixture
+def valid_csv_data():
+    """Fixture providing valid CSV data with all required columns"""
+    return {
+        "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "vh_protein_sequence": ["EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"] * MINIMAL_NUMBER_OF_ROWS,
+        "vl_protein_sequence": ["DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"] * MINIMAL_NUMBER_OF_ROWS,
+        **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
+    }
+
+
+@pytest.fixture
+def valid_input_dataframe(valid_csv_data):
+    """Fixture providing a valid input dataframe"""
+    return pd.DataFrame(valid_csv_data)
+
+@pytest.fixture
+def valid_csv_content(valid_input_dataframe):
+    """Fixture providing valid CSV content as string"""
+    return valid_input_dataframe.to_csv(index=False)
test/test_validation.py ADDED
@@ -0,0 +1,109 @@
+import pytest
+import pandas as pd
+import gradio as gr
+from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
+from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
+
+
+class TestValidateCsvCanBeRead:
+    """Test cases for validate_csv_can_be_read function"""
+
+    def test_valid_csv_can_be_read(self, valid_csv_content):
+        """Test that valid CSV content can be read"""
+        df = validate_csv_can_be_read(valid_csv_content)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == MINIMAL_NUMBER_OF_ROWS
+        assert list(df.columns) == list(REQUIRED_COLUMNS)
+
+    def test_empty_csv_raises_error(self):
+        """Test that empty CSV raises an error"""
+        empty_csv = ""
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_csv_can_be_read(empty_csv)
+
+        assert "empty or contains no valid data" in str(exc_info.value)
+
+    def test_invalid_csv_format_raises_error(self):
+        """Test that invalid CSV format raises an error"""
+        # Create a CSV with malformed structure that pandas cannot parse
+        malformed_csv = "column1,column2\nvalue1,\"unclosed quote\nvalue4,value5"
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_csv_can_be_read(malformed_csv)
+
+        assert "Invalid CSV format" in str(exc_info.value)
+
+    def test_csv_with_quoted_fields_can_be_read(self):
+        """Test that CSV with quoted fields can be read"""
+        # Create CSV with quoted fields and enough rows
+        base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
+        csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
+        csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
+
+        df = validate_csv_can_be_read(csv_content)
+        assert isinstance(df, pd.DataFrame)
+        assert len(df) == MINIMAL_NUMBER_OF_ROWS
+
+
+class TestValidateDataframe:
+    """Test cases for validate_dataframe function"""
+
+    def test_valid_dataframe_passes(self, valid_input_dataframe):
+        """Test that valid DataFrame passes validation"""
+        validate_dataframe(valid_input_dataframe)
+
+    def test_missing_columns_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with missing columns raises an error"""
+        missing_column = REQUIRED_COLUMNS[0]
+        df = valid_input_dataframe.copy()
+        df.drop(columns=[missing_column], inplace=True)
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"Missing required columns: {missing_column}" in str(exc_info.value)
+
+
+    def test_empty_dataframe_raises_error(self, valid_input_dataframe):
+        """Test that empty DataFrame raises an error"""
+        empty_df = valid_input_dataframe.head(0)
+
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(empty_df)
+
+        assert "CSV file is empty" in str(exc_info.value)
+
+    def test_insufficient_rows_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with insufficient rows raises an error"""
+        df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(exc_info.value)
+
+    def test_missing_values_raises_error(self, valid_input_dataframe):
+        """Test that DataFrame with missing values raises an error"""
+        bad_column = REQUIRED_COLUMNS[0]
+        df = valid_input_dataframe.copy()
+        df[bad_column] = [None] * len(df)
+        with pytest.raises(gr.Error) as exc_info:
+            validate_dataframe(df)
+
+        assert f"contains {len(df)} missing values" in str(exc_info.value)
+
+    def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
+        """Test that DataFrame with extra columns passes validation"""
+        extra_column = "extra_column_1"
+        df = valid_input_dataframe.copy()
+        df[extra_column] = ["extra1"] * len(df)
+        df[extra_column] = ["extra2"] * len(df)
+        validate_dataframe(df)
+
+
+class TestValidateCsvFile:
+    """Test cases for the combined validate_csv_file function"""
+
+    def test_valid_csv_passes(self, valid_csv_content):
+        """Test that a valid CSV with all required columns passes validation"""
+        validate_csv_file(valid_csv_content)
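The commit adds these tests but no runner configuration. Assuming a standard pytest setup where the test/ package and its conftest.py are discoverable from the repository root, the suite can be run with `pytest test/`, or programmatically:

# Minimal sketch: invoke the new test module via pytest's Python API (assumes pytest is installed).
import pytest

exit_code = pytest.main(["-q", "test/test_validation.py"])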
validation.py ADDED
@@ -0,0 +1,101 @@
+import pandas as pd
+import io
+import gradio as gr
+from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
+
+def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
+    """
+    Validate that the CSV file can be read and parsed.
+
+    Parameters
+    ----------
+    file_content: str
+        The content of the uploaded CSV file.
+
+    Returns
+    -------
+    pd.DataFrame
+        The parsed DataFrame if successful.
+
+    Raises
+    ------
+    gr.Error: If CSV cannot be read or parsed
+    """
+    try:
+        # Read CSV content
+        df = pd.read_csv(io.StringIO(file_content))
+        return df
+
+    except pd.errors.EmptyDataError:
+        raise gr.Error(
+            "❌ CSV file is empty or contains no valid data"
+        )
+    except pd.errors.ParserError as e:
+        raise gr.Error(
+            f"❌ Invalid CSV format<br><br>"
+            f"Error: {str(e)}"
+        )
+    except UnicodeDecodeError:
+        raise gr.Error(
+            "❌ File encoding error<br><br>"
+            "Your file appears to have an unsupported encoding.<br>"
+            "Please save your CSV file with UTF-8 encoding and try again."
+        )
+
+def validate_dataframe(df: pd.DataFrame) -> None:
+    """
+    Validate the DataFrame content and structure.
+
+    Parameters
+    ----------
+    df: pd.DataFrame
+        The DataFrame to validate.
+
+    Raises
+    ------
+    gr.Error: If validation fails
+    """
+    # Required columns should be present
+    missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
+    if missing_columns:
+        raise gr.Error(
+            f"❌ Missing required columns: {', '.join(missing_columns)}"
+        )
+
+    # Data should not be empty
+    if df.empty:
+        raise gr.Error(
+            "❌ CSV file is empty"
+        )
+
+    # Check for missing values in required columns
+    for col in REQUIRED_COLUMNS:
+        missing_count = df[col].isnull().sum()
+        if missing_count > 0:
+            raise gr.Error(
+                f"❌ Column '{col}' contains {missing_count} missing values"
+            )
+
+    # Check for reasonable number of rows
+    if len(df) < MINIMAL_NUMBER_OF_ROWS:
+        raise gr.Error(
+            f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
+        )
+
+    print(f"✅ CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}")
+
+def validate_csv_file(file_content: str) -> None:
+    """
+    Validate the uploaded CSV file.
+
+    Parameters
+    ----------
+    file_content: str
+        The content of the uploaded CSV file.
+
+    Raises
+    ------
+    gr.Error: If validation fails
+    """
+    df = validate_csv_can_be_read(file_content)
+    validate_dataframe(df)
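For a sense of how the new entry point behaves outside the Gradio submit flow, here is a minimal usage sketch. Only the call pattern (parse the CSV, then run the structural checks, with failures raised as gr.Error) comes from this commit; the sample CSV and its columns are made up and would fail the required-columns check under any realistic REQUIRED_COLUMNS.

# Hypothetical standalone use of validation.py; assumes constants.py provides
# REQUIRED_COLUMNS and MINIMAL_NUMBER_OF_ROWS as imported above.
import gradio as gr

from validation import validate_csv_file

sample_csv = "antibody_id,antibody_name\nAB001,AB001\n"  # placeholder columns, deliberately incomplete

try:
    validate_csv_file(sample_csv)   # reads the CSV, then checks columns, emptiness, NaNs, row count
except gr.Error as err:             # validation failures surface as gr.Error so Gradio can display them
    print(f"Submission rejected: {err}")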