precommit

Changed files:
- .pre-commit-config.yaml +34 -0
- about.py +2 -3
- app.py +36 -22
- constants.py +5 -5
- evaluation.py +4 -6
- requirements.txt +1 -1
- submit.py +11 -16
- test/conftest.py +9 -2
- test/test_validation.py +28 -27
- utils.py +28 -18
- validation.py +24 -32
.pre-commit-config.yaml  ADDED
@@ -0,0 +1,34 @@
+exclude: '^.*\.(ipynb|json)$'
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-added-large-files
+      - id: check-ast
+      - id: check-builtin-literals
+      - id: check-case-conflict
+      - id: check-docstring-first
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-shebang-scripts-are-executable
+      - id: check-symlinks
+      - id: check-toml
+      - id: check-xml
+      - id: check-yaml
+      - id: debug-statements
+      - id: detect-private-key
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+        exclude: |
+          (?x)^(
+              .bumpversion.cfg
+          )$
+  # Fast Python linter and formatter - replaces flake8, isort, and black
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.6.9
+    hooks:
+      # Run the Ruff linter
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      # Run the Ruff formatter
+      - id: ruff-format
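Once this config is committed, the hooks above run on every `git commit`. A minimal sketch of exercising them from Python rather than the shell (an illustration only, assuming the `pre-commit` package is installed; both calls simply shell out to the standard `pre-commit` CLI):

    import subprocess

    # Register the git hook once per clone, then run every configured hook
    # against the whole repository rather than only the staged files.
    subprocess.run(["pre-commit", "install"], check=True)
    subprocess.run(["pre-commit", "run", "--all-files"], check=False)

As the comment in the config notes, the two Ruff hooks replace flake8, isort, and black; most of the Python diffs below are the result of that formatter pass.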
about.py  CHANGED
@@ -1,4 +1,3 @@
-
 ABOUT_TEXT = """
 ## About this challenge

@@ -6,7 +5,7 @@ We're inviting the ML/bio community to predict developability properties for 244
 
 **What is antibody developability?**
 
-Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
+Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
 Properties such as these can often hinder the progression of an antibody to the clinic, and are collectively referred to as 'developability'.
 Here we show 5 of these properties and invite the community to submit and develop better predictors, which will be tested out on a heldout private set to assess model generalization.
 
@@ -28,4 +27,4 @@ We'd like to add some more existing models to the leaderboard. Some examples of
 """
 FAQS = {
     "Example FAQ with dropdown": """Full answer to this question""",
-}
+}
app.py  CHANGED
@@ -7,7 +7,8 @@ from utils import fetch_hf_results, show_output_box
 from constants import ASSAY_LIST, ASSAY_RENAME, ASSAY_EMOJIS, ASSAY_DESCRIPTION
 from about import ABOUT_TEXT, FAQS
 from submit import make_submission
+
+
 def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
     # Previous things that were nice in the constellaration leaderboard:
     # Having a submission time column, and a user column where the username is clickable (this is a pro for usability but con for anonymity)
@@ -16,10 +17,11 @@ def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None)
     column_order = ["model", "property", "spearman", "spearman_cross_val"]
     df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
     if assay is not None:
+        df = df[df["assay"] == assay]
     df = df[column_order]
     return df.sort_values(by="spearman", ascending=False)

+
 def get_leaderboard_object(assay: str | None = None):
     filter_columns = ["model"]
     if assay is None:
@@ -33,15 +35,16 @@ def get_leaderboard_object(assay: str | None = None):
         search_columns=["model"],
         filter_columns=filter_columns,
         every=60,
-        render=True
+        render=True,
     )

+
 with gr.Blocks() as demo:
     gr.Markdown("""
    ## Welcome to the Ginkgo Antibody Developability Benchmark!
+
    **Beta version, not publicly launched yet**
+
    Participants can submit their model to the leaderboard by uploading a CSV file (see the "✏️ Submit" tab).
    See more details in the "βAbout" tab.
    """)
@@ -58,15 +61,20 @@ with gr.Blocks() as demo:
         for question, answer in FAQS.items():
             with gr.Accordion(question):
                 gr.Markdown(answer)
+
     # Procedurally make these 5 tabs
     for assay in ASSAY_LIST:
+        with gr.TabItem(
+            f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}",
+            elem_id="abdev-benchmark-tab-table",
+        ):
             gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
             get_leaderboard_object(assay=assay)
-    with gr.TabItem("π Overall", elem_id="abdev-benchmark-tab-table"):
+
+    with gr.TabItem("π Overall", elem_id="abdev-benchmark-tab-table"):
+        gr.Markdown(
+            "# Antibody Developability Benchmark Leaderboard over all properties"
+        )
         get_leaderboard_object()

     with gr.TabItem("✏️ Submit", elem_id="boundary-benchmark-tab-table"):
@@ -74,27 +82,31 @@ with gr.Blocks() as demo:
            """
            # Antibody Developability Submission
            Upload a CSV to get a score!
+
            Please use your Hugging Face account name to submit your model - we use this to track separate submissions, but only Hugging Face/Ginkgo will see these usernames (unless you choose to make them public).
            Your submission will be evaluated and added to the leaderboard.
            """
         )
-        filename = gr.State(value=None)
-        eval_state = gr.State(value=None)
+        filename = gr.State(value=None)
+        eval_state = gr.State(value=None)
         user_state = gr.State(value=None)
         anonymous_state = gr.State(value=False)

+        login_button = gr.LoginButton(
+            value="Sign in with Hugging Face to see account name"
+        )  # Note(Lood): Is this mandatory?

         with gr.Row():
             with gr.Column():
                 username_input = gr.Textbox(
-                    label="Username",
+                    label="Username",
                     placeholder="Enter your Hugging Face username",
-                    info="This will be displayed on the leaderboard."
+                    info="This will be displayed on the leaderboard.",
                 )

+                anonymous_checkbox = gr.Checkbox(
+                    label="Would you like to keep your submission anonymous?"
+                )  # Can make this ticked by default
             with gr.Column():
                 submission_file = gr.File(label="Submission CSV")

@@ -102,14 +114,16 @@ with gr.Blocks() as demo:
         username_input.change(
             fn=lambda x: x if x.strip() else None,
             inputs=username_input,
-            outputs=user_state
-        )
+            outputs=user_state,
+        )

         submit_btn = gr.Button("Evaluate")
         message = gr.Textbox(label="Status", lines=1, visible=False)
         # help message
+        gr.Markdown(
+            "If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space."
+        )
+
         submit_btn.click(
             make_submission,
             inputs=[submission_file, user_state, anonymous_state],
@@ -126,7 +140,7 @@ with gr.Blocks() as demo:
        💬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or visit the Community tab at the top of this page.
        </div>
        """,
-        elem_id="contact-footer"
+        elem_id="contact-footer",
    )

 if __name__ == "__main__":
constants.py  CHANGED
@@ -36,14 +36,14 @@ REQUIRED_COLUMNS: list[str] = [
     "antibody_name",
     "vh_protein_sequence",
     "vl_protein_sequence",
-]
+] + ASSAY_LIST

 # Huggingface API
 TOKEN = os.environ.get("HF_TOKEN")
-CACHE_PATH=os.getenv("HF_HOME", ".")
+CACHE_PATH = os.getenv("HF_HOME", ".")
 API = HfApi(token=TOKEN)

 # Huggingface repos
-ORGANIZATION="ginkgo-datapoints"
+ORGANIZATION = "ginkgo-datapoints"
+SUBMISSIONS_REPO = f"{ORGANIZATION}/abdev-bench-submissions"
+RESULTS_REPO = f"{ORGANIZATION}/abdev-bench-results"
evaluation.py  CHANGED
@@ -1,9 +1,7 @@
-import json
-from pathlib import Path
-
 def evaluate_problem(
-    problem_type: str,
+    problem_type: str,
+    input_file: str,
+    # ) -> problems.EvaluationSingleObjective | problems.EvaluationMultiObjective:
 ):
     pass
     # with Path(input_file).open("r") as f:
@@ -25,6 +23,6 @@
     # result = problems.MHDStableQIStellarator().evaluate(boundaries)
     # case _:
     #     raise ValueError(f"Unknown problem type: {problem_type}")
+
     # print("Finished evaluation.")
     # return result
requirements.txt  CHANGED
@@ -3,4 +3,4 @@ datasets
 huggingface_hub
 gradio-leaderboard
 gradio[oauth]
-# plotly
+# plotly
submit.py  CHANGED
@@ -2,8 +2,6 @@ from pathlib import Path
 import tempfile
 from typing import BinaryIO
 import json
-import pandas as pd
-import io

 import gradio as gr
 from datetime import datetime
@@ -12,33 +10,30 @@ import uuid
 from constants import API, SUBMISSIONS_REPO
 from validation import validate_csv_file

-def make_submission(
-    submitted_file: BinaryIO,
-    user_state,
-    anonymous_state):

+def make_submission(submitted_file: BinaryIO, user_state, anonymous_state):
     if user_state is None:
         raise gr.Error("You must submit your username to submit a file.")
+
     if submitted_file is None:
         raise gr.Error("Please upload a CSV file before submitting.")
+
     file_path = submitted_file.name

     if not file_path:
         raise gr.Error("Uploaded file object does not have a valid file path.")
+
     path_obj = Path(file_path)

+    if path_obj.suffix.lower() != ".csv":
         raise gr.Error("File must be a CSV file. Please upload a .csv file.")
+
     timestamp = datetime.utcnow().isoformat()
     submission_id = str(uuid.uuid4())

+    with path_obj.open("rb") as f_in:
         file_content = f_in.read().decode("utf-8")
+
     validate_csv_file(file_content)

     # write to dataset
@@ -51,7 +46,7 @@ def make_submission(
         "evaluated": False,
         "user": user_state,
         "anonymous": anonymous_state,
-        "csv_content": file_content
+        "csv_content": file_content,
     }
     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
         json.dump(record, tmp, indent=2)
@@ -59,11 +54,11 @@ def make_submission(
         tmp_name = tmp.name

     API.upload_file(
-        path_or_fileobj=tmp_name,
+        path_or_fileobj=tmp_name,
         path_in_repo=filename,
         repo_id=SUBMISSIONS_REPO,
         repo_type="dataset",
-        commit_message=f"Add submission for {user_state} at {timestamp}"
+        commit_message=f"Add submission for {user_state} at {timestamp}",
     )
     Path(tmp_name).unlink()
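For orientation, `make_submission` wraps the validated CSV text in a JSON record and uploads it to SUBMISSIONS_REPO. A local sketch of the staging step, limited to the fields visible in this diff (the values are illustrative, and the real record is assembled inside `make_submission`, which also computes a timestamp and submission id):

    import json
    import tempfile

    # Illustrative field values only.
    record = {
        "evaluated": False,
        "user": "my-hf-username",
        "anonymous": False,
        "csv_content": "antibody_name,vh_protein_sequence,vl_protein_sequence\n...",
    }

    with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
        json.dump(record, tmp, indent=2)
        print("Staged submission record at", tmp.name)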
test/conftest.py  CHANGED
@@ -9,8 +9,14 @@ def valid_csv_data():
     return {
         "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
         "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
+        "vh_protein_sequence": [
+            "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
+        ]
+        * MINIMAL_NUMBER_OF_ROWS,
+        "vl_protein_sequence": [
+            "DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"
+        ]
+        * MINIMAL_NUMBER_OF_ROWS,
         **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
     }

@@ -20,6 +26,7 @@ def valid_input_dataframe(valid_csv_data):
     """Fixture providing a valid input dataframe"""
     return pd.DataFrame(valid_csv_data)

+
 @pytest.fixture
 def valid_csv_content(valid_input_dataframe):
     """Fixture providing valid CSV content as string"""
test/test_validation.py  CHANGED
@@ -7,40 +7,40 @@ from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS

 class TestValidateCsvCanBeRead:
     """Test cases for validate_csv_can_be_read function"""
+
     def test_valid_csv_can_be_read(self, valid_csv_content):
         """Test that valid CSV content can be read"""
         df = validate_csv_can_be_read(valid_csv_content)
         assert isinstance(df, pd.DataFrame)
         assert len(df) == MINIMAL_NUMBER_OF_ROWS
         assert list(df.columns) == list(REQUIRED_COLUMNS)
+
     def test_empty_csv_raises_error(self):
         """Test that empty CSV raises an error"""
         empty_csv = ""
+
         with pytest.raises(gr.Error) as exc_info:
             validate_csv_can_be_read(empty_csv)
+
         assert "empty or contains no valid data" in str(exc_info.value)
+
     def test_invalid_csv_format_raises_error(self):
         """Test that invalid CSV format raises an error"""
         # Create a CSV with malformed structure that pandas cannot parse
+        malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
+
         with pytest.raises(gr.Error) as exc_info:
             validate_csv_can_be_read(malformed_csv)
+
         assert "Invalid CSV format" in str(exc_info.value)
+
     def test_csv_with_quoted_fields_can_be_read(self):
         """Test that CSV with quoted fields can be read"""
         # Create CSV with quoted fields and enough rows
         base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
         csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
         csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
+
         df = validate_csv_can_be_read(csv_content)
         assert isinstance(df, pd.DataFrame)
         assert len(df) == MINIMAL_NUMBER_OF_ROWS
@@ -48,40 +48,41 @@ class TestValidateCsvCanBeRead:

 class TestValidateDataframe:
     """Test cases for validate_dataframe function"""
+
     def test_valid_dataframe_passes(self, valid_input_dataframe):
         """Test that valid DataFrame passes validation"""
         validate_dataframe(valid_input_dataframe)
+
     def test_missing_columns_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with missing columns raises an error"""
         missing_column = REQUIRED_COLUMNS[0]
         df = valid_input_dataframe.copy()
         df.drop(columns=[missing_column], inplace=True)
+
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
+
         assert f"Missing required columns: {missing_column}" in str(exc_info.value)
+
     def test_empty_dataframe_raises_error(self, valid_input_dataframe):
         """Test that empty DataFrame raises an error"""
         empty_df = valid_input_dataframe.head(0)
+
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(empty_df)
+
         assert "CSV file is empty" in str(exc_info.value)
+
     def test_insufficient_rows_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with insufficient rows raises an error"""
         df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
-        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
+
+        assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
+            exc_info.value
+        )
+
     def test_missing_values_raises_error(self, valid_input_dataframe):
         """Test that DataFrame with missing values raises an error"""
         bad_column = REQUIRED_COLUMNS[0]
@@ -89,9 +90,9 @@ class TestValidateDataframe:
         df[bad_column] = [None] * len(df)
         with pytest.raises(gr.Error) as exc_info:
             validate_dataframe(df)
+
         assert f"contains {len(df)} missing values" in str(exc_info.value)
+
     def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
         """Test that DataFrame with extra columns passes validation"""
         extra_column = "extra_column_1"
@@ -99,11 +100,11 @@
         df[extra_column] = ["extra1"] * len(df)
         df[extra_column] = ["extra2"] * len(df)
         validate_dataframe(df)
+

 class TestValidateCsvFile:
     """Test cases for the combined validate_csv_file function"""
+
     def test_valid_csv_passes(self, valid_csv_content):
         """Test that a valid CSV with all required columns passes validation"""
-        validate_csv_file(valid_csv_content)
+        validate_csv_file(valid_csv_content)
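These tests, together with the fixtures in test/conftest.py, run under plain pytest. A short sketch of invoking them programmatically, equivalent to running `pytest test/ -v` from the repository root (the test directory name is taken from the paths in this commit):

    import pytest

    # Propagate pytest's exit code so CI treats failures correctly.
    raise SystemExit(pytest.main(["test/", "-v"]))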
utils.py  CHANGED
@@ -17,16 +17,21 @@ from constants import API, SUBMISSIONS_REPO, RESULTS_REPO, ASSAY_RENAME
 # link =f'https://huggingface.co/datasets/proxima-fusion/constellaration-bench-results/blob/main/{filename}'
 # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'

+
 def show_output_box(message):
     return gr.update(value=message, visible=True)

+
 def fetch_hf_results():
+    ds = load_dataset(
+        RESULTS_REPO, split="no_low_spearman", download_mode="force_redownload"
+    )
     df = pd.DataFrame(ds).drop_duplicates(subset=["model", "assay"])
     df["property"] = df["assay"].map(ASSAY_RENAME)
     print(df.head())
     return df

+
 def read_result_from_hub(filename):
     local_path = hf_hub_download(
         repo_id=RESULTS_REPO,
@@ -35,6 +40,7 @@ def read_result_from_hub(filename):
     )
     return local_path

+
 def read_submission_from_hub(filename):
     local_path = hf_hub_download(
         repo_id=SUBMISSIONS_REPO,
@@ -43,37 +49,41 @@ def read_submission_from_hub(filename):
     )
     return local_path

+
 def write_results(record, result):
     record.update(result)
+    record["result_filename"] = (
+        record["submission_filename"].rstrip(".json") + "_results.json"
+    )
+    print(record["result_filename"])
+    record["evaluated"] = True

     record["objectives"] = json.dumps(record.get("objectives", []))
     record["feasibilities"] = json.dumps(record.get("feasibility", []))
+
+    if "objective" not in record.keys():
+        record["objective"] = 0.0
+        record["minimize_objective"] = True
+        record["feasibility"] = sum(record["feasibility"]) / len(record["feasibility"])

     with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
         json.dump(record, tmp, indent=2)
         tmp.flush()
         tmp_name = tmp.name
+
     API.upload_file(
+        path_or_fileobj=tmp_name,
+        path_in_repo=record["result_filename"],
+        repo_id=RESULTS_REPO,
+        repo_type="dataset",
+        commit_message=f"Add result data for {record['result_filename']}",
+    )
+
     pathlib.Path(tmp_name).unlink()
     return

+
 def get_user(profile: gr.OAuthProfile | None) -> str:
     if profile is None:
         return "Please login to submit a boundary for evaluation."
-    return profile.username
+    return profile.username
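A short sketch of how the reformatted `fetch_hf_results` is consumed; it needs access to the RESULTS_REPO dataset, and the column names below are the ones selected by `format_leaderboard_table` in app.py:

    from utils import fetch_hf_results

    # One row per (model, assay) after de-duplication; "property" is the
    # display name produced by mapping "assay" through ASSAY_RENAME.
    df = fetch_hf_results()
    print(df[["model", "property", "spearman", "spearman_cross_val"]].head())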
validation.py  CHANGED
@@ -3,20 +3,21 @@ import io
 import gradio as gr
 from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS

+
 def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
     """
     Validate that the CSV file can be read and parsed.
+
     Parameters
     ----------
     file_content: str
         The content of the uploaded CSV file.
+
     Returns
     -------
     pd.DataFrame
         The parsed DataFrame if successful.
+
     Raises
     ------
     gr.Error: If CSV cannot be read or parsed
@@ -25,16 +26,11 @@ def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
         # Read CSV content
         df = pd.read_csv(io.StringIO(file_content))
         return df
+
     except pd.errors.EmptyDataError:
-        raise gr.Error(
-            "β CSV file is empty or contains no valid data"
-        )
+        raise gr.Error("β CSV file is empty or contains no valid data")
     except pd.errors.ParserError as e:
-        raise gr.Error(
-            f"β Invalid CSV format<br><br>"
-            f"Error: {str(e)}"
-        )
+        raise gr.Error(f"β Invalid CSV format<br><br>" f"Error: {str(e)}")
     except UnicodeDecodeError:
         raise gr.Error(
             "β File encoding error<br><br>"
@@ -42,15 +38,16 @@
             "Please save your CSV file with UTF-8 encoding and try again."
         )

+
 def validate_dataframe(df: pd.DataFrame) -> None:
     """
     Validate the DataFrame content and structure.
+
     Parameters
     ----------
     df: pd.DataFrame
         The DataFrame to validate.
+
     Raises
     ------
     gr.Error: If validation fails
@@ -58,41 +55,36 @@
     # Required columns should be present
     missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
     if missing_columns:
+        raise gr.Error(f"β Missing required columns: {', '.join(missing_columns)}")
+
     # Data should not be empty
     if df.empty:
+        raise gr.Error("β CSV file is empty")
+
     # Check for missing values in required columns
     for col in REQUIRED_COLUMNS:
         missing_count = df[col].isnull().sum()
         if missing_count > 0:
+            raise gr.Error(f"β Column '{col}' contains {missing_count} missing values")
+
     # Check for reasonable number of rows
     if len(df) < MINIMAL_NUMBER_OF_ROWS:
+        raise gr.Error(f"β CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
+
+    print(
+        f"β CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}"
+    )
+

 def validate_csv_file(file_content: str) -> None:
     """
     Validate the uploaded CSV file.
+
     Parameters
     ----------
     file_content: str
         The content of the uploaded CSV file.
+
     Raises
     ------
     gr.Error: If validation fails
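Finally, a minimal sketch of calling the validation entry point on a local file before uploading it (the CSV path is a stand-in; validation failures surface as gr.Error, which the Space renders in the UI):

    from pathlib import Path

    import gradio as gr

    from validation import validate_csv_file

    try:
        # Raises gr.Error if the CSV cannot be parsed, lacks required columns,
        # contains missing values, or has too few rows.
        validate_csv_file(Path("predictions.csv").read_text(encoding="utf-8"))
        print("CSV looks valid")
    except gr.Error as err:
        print(f"Rejected: {err}")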