precommit

Changed files:
- .pre-commit-config.yaml +34 -0
- about.py +2 -3
- app.py +36 -22
- constants.py +5 -5
- evaluation.py +4 -6
- requirements.txt +1 -1
- submit.py +11 -16
- test/conftest.py +9 -2
- test/test_validation.py +28 -27
- utils.py +28 -18
- validation.py +24 -32
.pre-commit-config.yaml  ADDED
@@ -0,0 +1,34 @@
+exclude: '^.*\.(ipynb|json)$'
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-added-large-files
+      - id: check-ast
+      - id: check-builtin-literals
+      - id: check-case-conflict
+      - id: check-docstring-first
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-xml
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+        exclude: |
+          (?x)^(
+              .bumpversion.cfg
+          )$
+  # Fast Python linter and formatter - replaces flake8, isort, and black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+      # Run the Ruff linter
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      # Run the Ruff formatter
+      - id: ruff-format
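Once this config is committed, the hooks above run on every `git commit`. A minimal sketch of exercising them from Python rather than the shell (an illustration only, assuming the `pre-commit` package is installed; both calls simply shell out to the standard `pre-commit` CLI):

    import subprocess

    # Register the git hook once per clone, then run every configured hook
    # against the whole repository rather than only the staged files.
    subprocess.run(["pre-commit", "install"], check=True)
    subprocess.run(["pre-commit", "run", "--all-files"], check=False)

As the comment in the config notes, the two Ruff hooks replace flake8, isort, and black; most of the Python diffs below are the result of that formatter pass.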
about.py  CHANGED
@@ -1,4 +1,3 @@
-
 ABOUT_TEXT = """
 ## About this challenge

@@ -6,7 +5,7 @@ We're inviting the ML/bio community to predict developability properties for 244
 
 **What is antibody developability?**
 
-Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
+Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
 Properties such as these can often hinder the progression of an antibody to the clinic, and are collectively referred to as 'developability'.
 Here we show 5 of these properties and invite the community to submit and develop better predictors, which will be tested out on a heldout private set to assess model generalization.
 
@@ -28,4 +27,4 @@ We'd like to add some more existing models to the leaderboard. Some examples of
 """
 FAQS = {
     "Example FAQ with dropdown": """Full answer to this question""",
-}
+}
app.py  CHANGED
@@ -7,7 +7,8 @@ from utils import fetch_hf_results, show_output_box
 from constants import ASSAY_LIST, ASSAY_RENAME, ASSAY_EMOJIS, ASSAY_DESCRIPTION
 from about import ABOUT_TEXT, FAQS
 from submit import make_submission
+
+
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
     # Previous things that were nice in the constellaration leaderboard:
     # Having a submission time column, and a user column where the username is clickable (this is a pro for usability but con for anonymity)
@@ -16,10 +17,11 @@ def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None)
     column_order = ["model", "property", "spearman", "spearman_cross_val"]
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
     if assay is not None:
+        df = df[df["assay"] == assay]
     df = df[column_order]
     return df.sort_values(by="spearman", ascending=False)

+
 def get_leaderboard_object(assay: str | None = None):
     filter_columns = ["model"]
     if assay is None:
@@ -33,15 +35,16 @@ def get_leaderboard_object(assay: str | None = None):
         search_columns=["model"],
         filter_columns=filter_columns,
         every=60,
-        render=True
+        render=True,
     )

+
 with gr.Blocks() as demo:
     gr.Markdown("""
    ## Welcome to the Ginkgo Antibody Developability Benchmark!
+
    **Beta version, not publicly launched yet**
+
    Participants can submit their model to the leaderboard by uploading a CSV file (see the "✏️ Submit" tab).
    See more details in the "βAbout" tab.
    """)
@@ -58,15 +61,20 @@ with gr.Blocks() as demo:
         for question, answer in FAQS.items():
             with gr.Accordion(question):
                 gr.Markdown(answer)
+
     # Procedurally make these 5 tabs
     for assay in ASSAY_LIST:
+        with gr.TabItem(
+            f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}",
+            elem_id="abdev-benchmark-tab-table",
+        ):
             gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
             get_leaderboard_object(assay=assay)
-    with gr.TabItem("π Overall", elem_id="abdev-benchmark-tab-table"):
+
+    with gr.TabItem("π Overall", elem_id="abdev-benchmark-tab-table"):
+        gr.Markdown(
+            "# Antibody Developability Benchmark Leaderboard over all properties"
+        )
         get_leaderboard_object()

     with gr.TabItem("✏️ Submit", elem_id="boundary-benchmark-tab-table"):
@@ -74,27 +82,31 @@ with gr.Blocks() as demo:
            """
            # Antibody Developability Submission
            Upload a CSV to get a score!
+
            Please use your Hugging Face account name to submit your model - we use this to track separate submissions, but only Hugging Face/Ginkgo will see these usernames (unless you choose to make them public).
            Your submission will be evaluated and added to the leaderboard.
            """
         )
-        filename = gr.State(value=None)
-        eval_state = gr.State(value=None)
+        filename = gr.State(value=None)
+        eval_state = gr.State(value=None)
         user_state = gr.State(value=None)
         anonymous_state = gr.State(value=False)

+        login_button = gr.LoginButton(
+            value="Sign in with Hugging Face to see account name"
+        )  # Note(Lood): Is this mandatory?

         with gr.Row():
             with gr.Column():
                 username_input = gr.Textbox(
-                    label="Username",
+                    label="Username",
                     placeholder="Enter your Hugging Face username",
-                    info="This will be displayed on the leaderboard."
+                    info="This will be displayed on the leaderboard.",
                 )

+                anonymous_checkbox = gr.Checkbox(
+                    label="Would you like to keep your submission anonymous?"
+                )  # Can make this ticked by default
             with gr.Column():
                 submission_file = gr.File(label="Submission CSV")

@@ -102,14 +114,16 @@ with gr.Blocks() as demo:
         username_input.change(
             fn=lambda x: x if x.strip() else None,
             inputs=username_input,
-            outputs=user_state
-        )
+            outputs=user_state,
+        )

         submit_btn = gr.Button("Evaluate")
         message = gr.Textbox(label="Status", lines=1, visible=False)
         # help message
+        gr.Markdown(
+            "If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space."
+        )
+
         submit_btn.click(
             make_submission,
             inputs=[submission_file, user_state, anonymous_state],
@@ -126,7 +140,7 @@ with gr.Blocks() as demo:
        💬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or visit the Community tab at the top of this page.
        </div>
        """,
-        elem_id="contact-footer"
+        elem_id="contact-footer",
    )

 if __name__ == "__main__":
constants.py  CHANGED
@@ -36,14 +36,14 @@ REQUIRED_COLUMNS: list[str] = [
     "antibody_name",
     "vh_protein_sequence",
     "vl_protein_sequence",
-]
+] + ASSAY_LIST

 # Huggingface API
 TOKEN = os.environ.get("HF_TOKEN")
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)

 # Huggingface repos
-ORGANIZATION="ginkgo-datapoints"
+ORGANIZATION = "ginkgo-datapoints"
+SUBMISSIONS_REPO = f"{ORGANIZATION}/abdev-bench-submissions"
+RESULTS_REPO = f"{ORGANIZATION}/abdev-bench-results"
evaluation.py  CHANGED
@@ -1,9 +1,7 @@
-import json
-from pathlib import Path
-
 def evaluate_problem(
-    problem_type: str,
+    problem_type: str,
+    input_file: str,
+    # ) -> problems.EvaluationSingleObjective | problems.EvaluationMultiObjective:
 ):
     pass
     # with Path(input_file).open("r") as f:
@@ -25,6 +23,6 @@
     # result = problems.MHDStableQIStellarator().evaluate(boundaries)
     # case _:
     #     raise ValueError(f"Unknown problem type: {problem_type}")
+
     # print("Finished evaluation.")
     # return result
requirements.txt  CHANGED
@@ -3,4 +3,4 @@ datasets
 huggingface_hub
 gradio-leaderboard
 gradio[oauth]
-# plotly
+# plotly
submit.py  CHANGED
@@ -2,8 +2,6 @@ from pathlib import Path
 import tempfile
 from typing import BinaryIO
 import json
-import pandas as pd
-import io

 import gradio as gr
 from datetime import datetime
@@ -12,33 +10,30 @@ import uuid
 from constants import API, SUBMISSIONS_REPO
 from validation import validate_csv_file

-def make_submission(
-    submitted_file: BinaryIO,
-    user_state,
-    anonymous_state):

+def make_submission(submitted_file: BinaryIO, user_state, anonymous_state):
     if user_state is None:
         raise gr.Error("You must submit your username to submit a file.")
+
     if submitted_file is None:
         raise gr.Error("Please upload a CSV file before submitting.")
+
     file_path = submitted_file.name

     if not file_path:
         raise gr.Error("Uploaded file object does not have a valid file path.")
+
     path_obj = Path(file_path)

+    if path_obj.suffix.lower() != ".csv":
         raise gr.Error("File must be a CSV file. Please upload a .csv file.")
+
     timestamp = datetime.utcnow().isoformat()
     submission_id = str(uuid.uuid4())

+    with path_obj.open("rb") as f_in:
         file_content = f_in.read().decode("utf-8")
+
     validate_csv_file(file_content)

     # write to dataset
@@ -51,7 +46,7 @@ def make_submission(
         "evaluated": False,
         "user": user_state,
         "anonymous": anonymous_state,
-        "csv_content": file_content
+        "csv_content": file_content,
     }
     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
         json.dump(record, tmp, indent=2)
@@ -59,11 +54,11 @@ def make_submission(
         tmp_name = tmp.name

     API.upload_file(
-        path_or_fileobj=tmp_name,
+        path_or_fileobj=tmp_name,
         path_in_repo=filename,
         repo_id=SUBMISSIONS_REPO,
         repo_type="dataset",
-        commit_message=f"Add submission for {user_state} at {timestamp}"
+        commit_message=f"Add submission for {user_state} at {timestamp}",
     )
     Path(tmp_name).unlink()
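For orientation, `make_submission` wraps the validated CSV text in a JSON record and uploads it to SUBMISSIONS_REPO. A local sketch of the staging step, limited to the fields visible in this diff (the values are illustrative, and the real record is assembled inside `make_submission`, which also computes a timestamp and submission id):

    import json
    import tempfile

    # Illustrative field values only.
    record = {
        "evaluated": False,
        "user": "my-hf-username",
        "anonymous": False,
        "csv_content": "antibody_name,vh_protein_sequence,vl_protein_sequence\n...",
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
        json.dump(record, tmp, indent=2)
        print("Staged submission record at", tmp.name)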
test/conftest.py  CHANGED
@@ -9,8 +9,14 @@ def valid_csv_data():
     return {
         "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
         "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "vh_protein_sequence": [
+            "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
+        ]
+        * MINIMAL_NUMBER_OF_ROWS,
+        "vl_protein_sequence": [
+            "DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"
+        ]
+        * MINIMAL_NUMBER_OF_ROWS,
         **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
     }

@@ -20,6 +26,7 @@ def valid_input_dataframe(valid_csv_data):
     """Fixture providing a valid input dataframe"""
     return pd.DataFrame(valid_csv_data)

+
 @pytest.fixture
 def valid_csv_content(valid_input_dataframe):
     """Fixture providing valid CSV content as string"""
test/test_validation.py  CHANGED
@@ -7,40 +7,40 @@ from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS

 class TestValidateCsvCanBeRead:
     """Test cases for validate_csv_can_be_read function"""
+
     def test_valid_csv_can_be_read(self, valid_csv_content):
         """Test that valid CSV content can be read"""
         df = validate_csv_can_be_read(valid_csv_content)
         assert isinstance(df, pd.DataFrame)
         assert len(df) == MINIMAL_NUMBER_OF_ROWS
         assert list(df.columns) == list(REQUIRED_COLUMNS)
+
     def test_empty_csv_raises_error(self):
         """Test that empty CSV raises an error"""
         empty_csv = ""
+
         with pytest.raises(gr.Error) as exc_info:
             validate_csv_can_be_read(empty_csv)
+
         assert "empty or contains no valid data" in str(exc_info.value)
+
     def test_invalid_csv_format_raises_error(self):
         """Test that invalid CSV format raises an error"""
         # Create a CSV with malformed structure that pandas cannot parse
+        malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
+
         with pytest.raises(gr.Error) as exc_info:
             validate_csv_can_be_read(malformed_csv)
+
         assert "Invalid CSV format" in str(exc_info.value)
+
     def test_csv_with_quoted_fields_can_be_read(self):
         """Test that CSV with quoted fields can be read"""
         # Create CSV with quoted fields and enough rows
         base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
         csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
         csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
+
         df = validate_csv_can_be_read(csv_content)
         assert isinstance(df, pd.DataFrame)
         assert len(df) == MINIMAL_NUMBER_OF_ROWS
@@ -48,40 +48,41 @@ class TestValidateCsvCanBeRead:

 class TestValidateDataframe:
     """Test cases for validate_dataframe function"""
+
     def test_valid_dataframe_passes(self, valid_input_dataframe):
         """Test that valid DataFrame passes validation"""
         validate_dataframe(valid_input_dataframe)
+
     def test_missing_columns_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with missing columns raises an error"""
         missing_column = REQUIRED_COLUMNS[0]
         df = valid_input_dataframe.copy()
         df.drop(columns=[missing_column], inplace=True)
+
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
+
         assert f"Missing required columns: {missing_column}" in str(exc_info.value)
+
     def test_empty_dataframe_raises_error(self, valid_input_dataframe):
         """Test that empty DataFrame raises an error"""
         empty_df = valid_input_dataframe.head(0)
+
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(empty_df)
+
         assert "CSV file is empty" in str(exc_info.value)
+
     def test_insufficient_rows_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with insufficient rows raises an error"""
         df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
-        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
+
+        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
+            exc_info.value
+        )
+
     def test_missing_values_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with missing values raises an error"""
         bad_column = REQUIRED_COLUMNS[0]
@@ -89,9 +90,9 @@ class TestValidateDataframe:
         df[bad_column] = [None] * len(df)
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
+
         assert f"contains {len(df)} missing values" in str(exc_info.value)
+
     def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
         """Test that DataFrame with extra columns passes validation"""
         extra_column = "extra_column_1"
@@ -99,11 +100,11 @@
         df[extra_column] = ["extra1"] * len(df)
         df[extra_column] = ["extra2"] * len(df)
         validate_dataframe(df)
+

 class TestValidateCsvFile:
     """Test cases for the combined validate_csv_file function"""
+
     def test_valid_csv_passes(self, valid_csv_content):
         """Test that a valid CSV with all required columns passes validation"""
-        validate_csv_file(valid_csv_content)
+        validate_csv_file(valid_csv_content)
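These tests, together with the fixtures in test/conftest.py, run under plain pytest. A short sketch of invoking them programmatically, equivalent to running `pytest test/ -v` from the repository root (the test directory name is taken from the paths in this commit):

    import pytest

    # Propagate pytest's exit code so CI treats failures correctly.
    raise SystemExit(pytest.main(["test/", "-v"]))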
utils.py  CHANGED
@@ -17,16 +17,21 @@ from constants import API, SUBMISSIONS_REPO, RESULTS_REPO, ASSAY_RENAME
 # link =f'https://huggingface.co/datasets/proxima-fusion/constellaration-bench-results/blob/main/{filename}'
 # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'

+
 def show_output_box(message):
     return gr.update(value=message, visible=True)

+
 def fetch_hf_results():
+    ds = load_dataset(
+        RESULTS_REPO, split="no_low_spearman", download_mode="force_redownload"
+    )
     df = pd.DataFrame(ds).drop_duplicates(subset=["model", "assay"])
     df["property"] = df["assay"].map(ASSAY_RENAME)
     print(df.head())
     return df

+
 def read_result_from_hub(filename):
     local_path = hf_hub_download(
         repo_id=RESULTS_REPO,
@@ -35,6 +40,7 @@ def read_result_from_hub(filename):
     )
     return local_path

+
 def read_submission_from_hub(filename):
     local_path = hf_hub_download(
         repo_id=SUBMISSIONS_REPO,
@@ -43,37 +49,41 @@ def read_submission_from_hub(filename):
     )
     return local_path

+
 def write_results(record, result):
     record.update(result)
+    record["result_filename"] = (
+        record["submission_filename"].rstrip(".json") + "_results.json"
+    )
+    print(record["result_filename"])
+    record["evaluated"] = True

     record["objectives"] = json.dumps(record.get("objectives", []))
     record["feasibilities"] = json.dumps(record.get("feasibility", []))
+
+    if "objective" not in record.keys():
+        record["objective"] = 0.0
+        record["minimize_objective"] = True
+        record["feasibility"] = sum(record["feasibility"]) / len(record["feasibility"])

     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
         json.dump(record, tmp, indent=2)
         tmp.flush()
         tmp_name = tmp.name
+
     API.upload_file(
+        path_or_fileobj=tmp_name,
+        path_in_repo=record["result_filename"],
+        repo_id=RESULTS_REPO,
+        repo_type="dataset",
+        commit_message=f"Add result data for {record['result_filename']}",
+    )
+
     pathlib.Path(tmp_name).unlink()
     return

+
 def get_user(profile: gr.OAuthProfile | None) -> str:
     if profile is None:
         return "Please login to submit a boundary for evaluation."
-    return profile.username
+    return profile.username
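A short sketch of how the reformatted `fetch_hf_results` is consumed; it needs access to the RESULTS_REPO dataset, and the column names below are the ones selected by `format_leaderboard_table` in app.py:

    from utils import fetch_hf_results

    # One row per (model, assay) after de-duplication; "property" is the
    # display name produced by mapping "assay" through ASSAY_RENAME.
    df = fetch_hf_results()
    print(df[["model", "property", "spearman", "spearman_cross_val"]].head())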
validation.py  CHANGED
@@ -3,20 +3,21 @@ import io
 import gradio as gr
 from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS

+
 def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
     """
     Validate that the CSV file can be read and parsed.
+
     Parameters
     ----------
     file_content: str
         The content of the uploaded CSV file.
+
     Returns
     -------
     pd.DataFrame
         The parsed DataFrame if successful.
+
     Raises
     ------
     gr.Error: If CSV cannot be read or parsed
@@ -25,16 +26,11 @@ def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
         # Read CSV content
         df = pd.read_csv(io.StringIO(file_content))
         return df
+
     except pd.errors.EmptyDataError:
-        raise gr.Error(
-            "β CSV file is empty or contains no valid data"
-        )
+        raise gr.Error("β CSV file is empty or contains no valid data")
     except pd.errors.ParserError as e:
-        raise gr.Error(
-            f"β Invalid CSV format<br><br>"
-            f"Error: {str(e)}"
-        )
+        raise gr.Error(f"β Invalid CSV format<br><br>" f"Error: {str(e)}")
     except UnicodeDecodeError:
         raise gr.Error(
             "β File encoding error<br><br>"
@@ -42,15 +38,16 @@
             "Please save your CSV file with UTF-8 encoding and try again."
         )

+
 def validate_dataframe(df: pd.DataFrame) -> None:
     """
     Validate the DataFrame content and structure.
+
     Parameters
     ----------
     df: pd.DataFrame
         The DataFrame to validate.
+
     Raises
     ------
     gr.Error: If validation fails
@@ -58,41 +55,36 @@
     # Required columns should be present
     missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
     if missing_columns:
+        raise gr.Error(f"β Missing required columns: {', '.join(missing_columns)}")
+
     # Data should not be empty
     if df.empty:
+        raise gr.Error("β CSV file is empty")
+
     # Check for missing values in required columns
     for col in REQUIRED_COLUMNS:
         missing_count = df[col].isnull().sum()
         if missing_count > 0:
+            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
+
     # Check for reasonable number of rows
     if len(df) < MINIMAL_NUMBER_OF_ROWS:
+        raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
+
+    print(
+        f"β CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}"
+    )
+

 def validate_csv_file(file_content: str) -> None:
     """
     Validate the uploaded CSV file.
+
     Parameters
     ----------
     file_content: str
         The content of the uploaded CSV file.
+
     Raises
     ------
     gr.Error: If validation fails
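Finally, a minimal sketch of calling the validation entry point on a local file before uploading it (the CSV path is a stand-in; validation failures surface as gr.Error, which the Space renders in the UI):

    from pathlib import Path

    import gradio as gr

    from validation import validate_csv_file

    try:
        # Raises gr.Error if the CSV cannot be parsed, lacks required columns,
        # contains missing values, or has too few rows.
        validate_csv_file(Path("predictions.csv").read_text(encoding="utf-8"))
        print("CSV looks valid")
    except gr.Error as err:
        print(f"Rejected: {err}")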