validate dataframe with tests
Browse files- submit.py +14 -2
- test/__init__.py +0 -0
- test/conftest.py +26 -0
- test/test_validation.py +109 -0
- validation.py +101 -0
submit.py
CHANGED
@@ -2,12 +2,15 @@ from pathlib import Path
|
|
2 |
import tempfile
|
3 |
from typing import BinaryIO
|
4 |
import json
|
|
|
|
|
5 |
|
6 |
import gradio as gr
|
7 |
from datetime import datetime
|
8 |
import uuid
|
9 |
|
10 |
-
from
|
|
|
11 |
|
12 |
def make_submission(
|
13 |
submitted_file: BinaryIO,
|
@@ -17,17 +20,26 @@ def make_submission(
|
|
17 |
if user_state is None:
|
18 |
raise gr.Error("You must submit your username to submit a file.")
|
19 |
|
|
|
|
|
|
|
20 |
file_path = submitted_file.name
|
21 |
|
22 |
if not file_path:
|
23 |
raise gr.Error("Uploaded file object does not have a valid file path.")
|
24 |
|
25 |
path_obj = Path(file_path)
|
|
|
|
|
|
|
|
|
26 |
timestamp = datetime.utcnow().isoformat()
|
27 |
submission_id = str(uuid.uuid4())
|
28 |
|
29 |
with (path_obj.open("rb") as f_in):
|
30 |
file_content = f_in.read().decode("utf-8")
|
|
|
|
|
31 |
|
32 |
# write to dataset
|
33 |
filename = f"{submission_id}.json"
|
@@ -49,7 +61,7 @@ def make_submission(
|
|
49 |
API.upload_file(
|
50 |
path_or_fileobj=tmp_name,
|
51 |
path_in_repo=filename,
|
52 |
-
repo_id=
|
53 |
repo_type="dataset",
|
54 |
commit_message=f"Add submission for {user_state} at {timestamp}"
|
55 |
)
|
|
|
2 |
import tempfile
|
3 |
from typing import BinaryIO
|
4 |
import json
|
5 |
+
import pandas as pd
|
6 |
+
import io
|
7 |
|
8 |
import gradio as gr
|
9 |
from datetime import datetime
|
10 |
import uuid
|
11 |
|
12 |
+
from constants import API, SUBMISSIONS_REPO
|
13 |
+
from validation import validate_csv_file
|
14 |
|
15 |
def make_submission(
|
16 |
submitted_file: BinaryIO,
|
|
|
20 |
if user_state is None:
|
21 |
raise gr.Error("You must submit your username to submit a file.")
|
22 |
|
23 |
+
if submitted_file is None:
|
24 |
+
raise gr.Error("Please upload a CSV file before submitting.")
|
25 |
+
|
26 |
file_path = submitted_file.name
|
27 |
|
28 |
if not file_path:
|
29 |
raise gr.Error("Uploaded file object does not have a valid file path.")
|
30 |
|
31 |
path_obj = Path(file_path)
|
32 |
+
|
33 |
+
if path_obj.suffix.lower() != '.csv':
|
34 |
+
raise gr.Error("File must be a CSV file. Please upload a .csv file.")
|
35 |
+
|
36 |
timestamp = datetime.utcnow().isoformat()
|
37 |
submission_id = str(uuid.uuid4())
|
38 |
|
39 |
with (path_obj.open("rb") as f_in):
|
40 |
file_content = f_in.read().decode("utf-8")
|
41 |
+
|
42 |
+
validate_csv_file(file_content)
|
43 |
|
44 |
# write to dataset
|
45 |
filename = f"{submission_id}.json"
|
|
|
61 |
API.upload_file(
|
62 |
path_or_fileobj=tmp_name,
|
63 |
path_in_repo=filename,
|
64 |
+
repo_id=SUBMISSIONS_REPO,
|
65 |
repo_type="dataset",
|
66 |
commit_message=f"Add submission for {user_state} at {timestamp}"
|
67 |
)
|
test/__init__.py
ADDED
File without changes
|
test/conftest.py
ADDED
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import pandas as pd
|
3 |
+
from constants import MINIMAL_NUMBER_OF_ROWS, ASSAY_LIST
|
4 |
+
|
5 |
+
|
6 |
+
@pytest.fixture
|
7 |
+
def valid_csv_data():
|
8 |
+
"""Fixture providing valid CSV data with all required columns"""
|
9 |
+
return {
|
10 |
+
"antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
|
11 |
+
"antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
|
12 |
+
"vh_protein_sequence": ["EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"] * MINIMAL_NUMBER_OF_ROWS,
|
13 |
+
"vl_protein_sequence": ["DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"] * MINIMAL_NUMBER_OF_ROWS,
|
14 |
+
**{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
|
15 |
+
}
|
16 |
+
|
17 |
+
|
18 |
+
@pytest.fixture
|
19 |
+
def valid_input_dataframe(valid_csv_data):
|
20 |
+
"""Fixture providing a valid input dataframe"""
|
21 |
+
return pd.DataFrame(valid_csv_data)
|
22 |
+
|
23 |
+
@pytest.fixture
|
24 |
+
def valid_csv_content(valid_input_dataframe):
|
25 |
+
"""Fixture providing valid CSV content as string"""
|
26 |
+
return valid_input_dataframe.to_csv(index=False)
|
test/test_validation.py
ADDED
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pytest
|
2 |
+
import pandas as pd
|
3 |
+
import gradio as gr
|
4 |
+
from validation import validate_csv_file, validate_csv_can_be_read, validate_dataframe
|
5 |
+
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
|
6 |
+
|
7 |
+
|
8 |
+
class TestValidateCsvCanBeRead:
    """Tests for the validate_csv_can_be_read helper."""

    def test_valid_csv_can_be_read(self, valid_csv_content):
        """A well-formed CSV parses into a DataFrame of the expected shape."""
        parsed = validate_csv_can_be_read(valid_csv_content)
        assert isinstance(parsed, pd.DataFrame)
        assert len(parsed) == MINIMAL_NUMBER_OF_ROWS
        assert list(parsed.columns) == list(REQUIRED_COLUMNS)

    def test_empty_csv_raises_error(self):
        """Empty content is rejected with a helpful message."""
        with pytest.raises(gr.Error) as exc_info:
            validate_csv_can_be_read("")

        assert "empty or contains no valid data" in str(exc_info.value)

    def test_invalid_csv_format_raises_error(self):
        """Structurally broken CSV content is rejected."""
        # An unterminated quote makes the pandas parser fail.
        broken = "column1,column2\nvalue1,\"unclosed quote\nvalue4,value5"

        with pytest.raises(gr.Error) as exc_info:
            validate_csv_can_be_read(broken)

        assert "Invalid CSV format" in str(exc_info.value)

    def test_csv_with_quoted_fields_can_be_read(self):
        """Quoted fields (long protein sequences) parse correctly."""
        header = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
        row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
        content = header + "\n".join([row] * MINIMAL_NUMBER_OF_ROWS)

        result = validate_csv_can_be_read(content)
        assert isinstance(result, pd.DataFrame)
        assert len(result) == MINIMAL_NUMBER_OF_ROWS
|
47 |
+
|
48 |
+
|
49 |
+
class TestValidateDataframe:
    """Test cases for validate_dataframe function"""

    def test_valid_dataframe_passes(self, valid_input_dataframe):
        """Test that valid DataFrame passes validation"""
        validate_dataframe(valid_input_dataframe)

    def test_missing_columns_raises_error(self, valid_input_dataframe):
        """Test that DataFrame with missing columns raises an error"""
        missing_column = REQUIRED_COLUMNS[0]
        df = valid_input_dataframe.copy()
        df.drop(columns=[missing_column], inplace=True)

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert f"Missing required columns: {missing_column}" in str(exc_info.value)

    def test_empty_dataframe_raises_error(self, valid_input_dataframe):
        """Test that empty DataFrame raises an error"""
        # head(0) keeps the full column set but drops every row.
        empty_df = valid_input_dataframe.head(0)

        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(empty_df)

        assert "CSV file is empty" in str(exc_info.value)

    def test_insufficient_rows_raises_error(self, valid_input_dataframe):
        """Test that DataFrame with insufficient rows raises an error"""
        df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(exc_info.value)

    def test_missing_values_raises_error(self, valid_input_dataframe):
        """Test that DataFrame with missing values raises an error"""
        bad_column = REQUIRED_COLUMNS[0]
        df = valid_input_dataframe.copy()
        df[bad_column] = [None] * len(df)
        with pytest.raises(gr.Error) as exc_info:
            validate_dataframe(df)

        assert f"contains {len(df)} missing values" in str(exc_info.value)

    def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
        """Test that DataFrame with extra columns passes validation"""
        df = valid_input_dataframe.copy()
        # BUG FIX: the original assigned to the same column name twice, so the
        # second assignment overwrote the first and only ONE extra column was
        # ever present. Use two distinct names to actually exercise multiple
        # extra columns being tolerated.
        df["extra_column_1"] = ["extra1"] * len(df)
        df["extra_column_2"] = ["extra2"] * len(df)
        validate_dataframe(df)
|
102 |
+
|
103 |
+
|
104 |
+
class TestValidateCsvFile:
    """Test cases for the combined validate_csv_file function"""

    def test_valid_csv_passes(self, valid_csv_content):
        """Test that a valid CSV with all required columns passes validation"""
        # Happy path: the full pipeline (parse + structural checks) should
        # complete without raising gr.Error.
        validate_csv_file(valid_csv_content)
|
validation.py
ADDED
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
import io
|
3 |
+
import gradio as gr
|
4 |
+
from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
|
5 |
+
|
6 |
+
def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
    """
    Parse uploaded CSV text into a DataFrame, translating parser failures
    into user-facing errors.

    Parameters
    ----------
    file_content: str
        Raw text of the uploaded CSV file.

    Returns
    -------
    pd.DataFrame
        The parsed table.

    Raises
    ------
    gr.Error: If the content is empty, malformed, or badly encoded.
    """
    buffer = io.StringIO(file_content)
    try:
        parsed = pd.read_csv(buffer)
    except pd.errors.EmptyDataError:
        raise gr.Error(
            "β CSV file is empty or contains no valid data"
        )
    except pd.errors.ParserError as e:
        raise gr.Error(
            f"β Invalid CSV format<br><br>"
            f"Error: {str(e)}"
        )
    except UnicodeDecodeError:
        raise gr.Error(
            "β File encoding error<br><br>"
            "Your file appears to have an unsupported encoding.<br>"
            "Please save your CSV file with UTF-8 encoding and try again."
        )
    return parsed
|
44 |
+
|
45 |
+
def validate_dataframe(df: pd.DataFrame) -> None:
    """
    Check the structure and content of a submission DataFrame.

    Parameters
    ----------
    df: pd.DataFrame
        The DataFrame to validate.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Every required column must be present.
    absent = set(REQUIRED_COLUMNS) - set(df.columns)
    if absent:
        raise gr.Error(
            f"β Missing required columns: {', '.join(absent)}"
        )

    # Reject a table with no rows at all.
    if df.empty:
        raise gr.Error(
            "β CSV file is empty"
        )

    # Required columns may not contain nulls.
    for column in REQUIRED_COLUMNS:
        null_count = df[column].isnull().sum()
        if null_count > 0:
            raise gr.Error(
                f"β Column '{column}' contains {null_count} missing values"
            )

    # Enforce the minimum submission size.
    if len(df) < MINIMAL_NUMBER_OF_ROWS:
        raise gr.Error(
            f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
        )

    print(f"β CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}")
|
86 |
+
|
87 |
+
def validate_csv_file(file_content: str) -> None:
    """
    Run the full validation pipeline on an uploaded CSV.

    Parameters
    ----------
    file_content: str
        The content of the uploaded CSV file.

    Raises
    ------
    gr.Error: If validation fails
    """
    # Parse first, then validate the resulting table.
    validate_dataframe(validate_csv_can_be_read(file_content))
|