pquintero committed on
Commit
8f9985e
·
1 Parent(s): eb50e2e
Files changed (11) hide show
  1. .pre-commit-config.yaml +34 -0
  2. about.py +2 -3
  3. app.py +36 -22
  4. constants.py +5 -5
  5. evaluation.py +4 -6
  6. requirements.txt +1 -1
  7. submit.py +11 -16
  8. test/conftest.py +9 -2
  9. test/test_validation.py +28 -27
  10. utils.py +28 -18
  11. validation.py +24 -32
.pre-commit-config.yaml ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ exclude: '^.*\.(ipynb|json)$'
2
+ repos:
3
+ - repo: https://github.com/pre-commit/pre-commit-hooks
4
+ rev: v5.0.0
5
+ hooks:
6
+ - id: check-added-large-files
7
+ - id: check-ast
8
+ - id: check-builtin-literals
9
+ - id: check-case-conflict
10
+ - id: check-docstring-first
11
+ - id: check-json
12
+ - id: check-merge-conflict
13
+ - id: check-shebang-scripts-are-executable
14
+ - id: check-symlinks
15
+ - id: check-toml
16
+ - id: check-xml
17
+ - id: check-yaml
18
+ - id: debug-statements
19
+ - id: detect-private-key
20
+ - id: end-of-file-fixer
21
+ - id: trailing-whitespace
22
+ exclude: |
23
+ (?x)^(
24
+ .bumpversion.cfg
25
+ )$
26
+ # Fast Python linter and formatter - replaces flake8, isort, and black
27
+ - repo: https://github.com/astral-sh/ruff-pre-commit
28
+ rev: v0.6.9
29
+ hooks:
30
+ # Run the Ruff linter
31
+ - id: ruff
32
+ args: [--fix, --exit-non-zero-on-fix]
33
+ # Run the Ruff formatter
34
+ - id: ruff-format
about.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  ABOUT_TEXT = """
3
  ## About this challenge
4
 
@@ -6,7 +5,7 @@ We're inviting the ML/bio community to predict developability properties for 244
6
 
7
  **What is antibody developability?**
8
 
9
- Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
10
  Properties such as these can often hinder the progression of an antibody to the clinic, and are collectively referred to as 'developability'.
11
  Here we show 5 of these properties and invite the community to submit and develop better predictors, which will be tested out on a heldout private set to assess model generalization.
12
 
@@ -28,4 +27,4 @@ We'd like to add some more existing models to the leaderboard. Some examples of
28
  """
29
  FAQS = {
30
  "Example FAQ with dropdown": """Full answer to this question""",
31
- }
 
 
1
  ABOUT_TEXT = """
2
  ## About this challenge
3
 
 
5
 
6
  **What is antibody developability?**
7
 
8
+ Antibodies have to be manufacturable, stable in high concentrations, and have low off-target effects.
9
  Properties such as these can often hinder the progression of an antibody to the clinic, and are collectively referred to as 'developability'.
10
  Here we show 5 of these properties and invite the community to submit and develop better predictors, which will be tested out on a heldout private set to assess model generalization.
11
 
 
27
  """
28
  FAQS = {
29
  "Example FAQ with dropdown": """Full answer to this question""",
30
+ }
app.py CHANGED
@@ -7,7 +7,8 @@ from utils import fetch_hf_results, show_output_box
7
  from constants import ASSAY_LIST, ASSAY_RENAME, ASSAY_EMOJIS, ASSAY_DESCRIPTION
8
  from about import ABOUT_TEXT, FAQS
9
  from submit import make_submission
10
-
 
11
  def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
12
  # Previous things that were nice in the constellaration leaderboard:
13
  # Having a submission time column, and a user column where the username is clickable (this is a pro for usability but con for anonymity)
@@ -16,10 +17,11 @@ def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None)
16
  column_order = ["model", "property", "spearman", "spearman_cross_val"]
17
  df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
18
  if assay is not None:
19
- df = df[df['assay'] == assay]
20
  df = df[column_order]
21
  return df.sort_values(by="spearman", ascending=False)
22
 
 
23
  def get_leaderboard_object(assay: str | None = None):
24
  filter_columns = ["model"]
25
  if assay is None:
@@ -33,15 +35,16 @@ def get_leaderboard_object(assay: str | None = None):
33
  search_columns=["model"],
34
  filter_columns=filter_columns,
35
  every=60,
36
- render=True
37
  )
38
 
 
39
  with gr.Blocks() as demo:
40
  gr.Markdown("""
41
  ## Welcome to the Ginkgo Antibody Developability Benchmark!
42
-
43
  **Beta version, not publicly launched yet**
44
-
45
  Participants can submit their model to the leaderboard by uploading a CSV file (see the "✉️ Submit" tab).
46
  See more details in the "❔About" tab.
47
  """)
@@ -58,15 +61,20 @@ with gr.Blocks() as demo:
58
  for question, answer in FAQS.items():
59
  with gr.Accordion(question):
60
  gr.Markdown(answer)
61
-
62
  # Procedurally make these 5 tabs
63
  for assay in ASSAY_LIST:
64
- with gr.TabItem(f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}", elem_id=f"abdev-benchmark-tab-table"):
 
 
 
65
  gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
66
  get_leaderboard_object(assay=assay)
67
-
68
- with gr.TabItem("🚀 Overall", elem_id="abdev-benchmark-tab-table"):
69
- gr.Markdown("# Antibody Developability Benchmark Leaderboard over all properties")
 
 
70
  get_leaderboard_object()
71
 
72
  with gr.TabItem("✉️ Submit", elem_id="boundary-benchmark-tab-table"):
@@ -74,27 +82,31 @@ with gr.Blocks() as demo:
74
  """
75
  # Antibody Developability Submission
76
  Upload a CSV to get a score!
77
-
78
  Please use your Hugging Face account name to submit your model - we use this to track separate submissions, but only Hugging Face/Ginkgo will see these usernames (unless you choose to make them public).
79
  Your submission will be evaluated and added to the leaderboard.
80
  """
81
  )
82
- filename = gr.State(value=None)
83
- eval_state = gr.State(value=None)
84
  user_state = gr.State(value=None)
85
  anonymous_state = gr.State(value=False)
86
 
87
- login_button = gr.LoginButton(value="Sign in with Hugging Face to see account name") # Note(Lood): Is this mandatory?
 
 
88
 
89
  with gr.Row():
90
  with gr.Column():
91
  username_input = gr.Textbox(
92
- label="Username",
93
  placeholder="Enter your Hugging Face username",
94
- info="This will be displayed on the leaderboard."
95
  )
96
 
97
- anonymous_checkbox = gr.Checkbox(label="Would you like to keep your submission anonymous?") # Can make this ticked by default
 
 
98
  with gr.Column():
99
  submission_file = gr.File(label="Submission CSV")
100
 
@@ -102,14 +114,16 @@ with gr.Blocks() as demo:
102
  username_input.change(
103
  fn=lambda x: x if x.strip() else None,
104
  inputs=username_input,
105
- outputs=user_state
106
- )
107
 
108
  submit_btn = gr.Button("Evaluate")
109
  message = gr.Textbox(label="Status", lines=1, visible=False)
110
  # help message
111
- gr.Markdown("If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space.")
112
-
 
 
113
  submit_btn.click(
114
  make_submission,
115
  inputs=[submission_file, user_state, anonymous_state],
@@ -126,7 +140,7 @@ with gr.Blocks() as demo:
126
  📬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or visit the Community tab at the top of this page.
127
  </div>
128
  """,
129
- elem_id="contact-footer"
130
  )
131
 
132
  if __name__ == "__main__":
 
7
  from constants import ASSAY_LIST, ASSAY_RENAME, ASSAY_EMOJIS, ASSAY_DESCRIPTION
8
  from about import ABOUT_TEXT, FAQS
9
  from submit import make_submission
10
+
11
+
12
  def format_leaderboard_table(df_results: pd.DataFrame, assay: str | None = None):
13
  # Previous things that were nice in the constellaration leaderboard:
14
  # Having a submission time column, and a user column where the username is clickable (this is a pro for usability but con for anonymity)
 
17
  column_order = ["model", "property", "spearman", "spearman_cross_val"]
18
  df = df_results.query("assay.isin(@ASSAY_RENAME.keys())").copy()
19
  if assay is not None:
20
+ df = df[df["assay"] == assay]
21
  df = df[column_order]
22
  return df.sort_values(by="spearman", ascending=False)
23
 
24
+
25
  def get_leaderboard_object(assay: str | None = None):
26
  filter_columns = ["model"]
27
  if assay is None:
 
35
  search_columns=["model"],
36
  filter_columns=filter_columns,
37
  every=60,
38
+ render=True,
39
  )
40
 
41
+
42
  with gr.Blocks() as demo:
43
  gr.Markdown("""
44
  ## Welcome to the Ginkgo Antibody Developability Benchmark!
45
+
46
  **Beta version, not publicly launched yet**
47
+
48
  Participants can submit their model to the leaderboard by uploading a CSV file (see the "✉️ Submit" tab).
49
  See more details in the "❔About" tab.
50
  """)
 
61
  for question, answer in FAQS.items():
62
  with gr.Accordion(question):
63
  gr.Markdown(answer)
64
+
65
  # Procedurally make these 5 tabs
66
  for assay in ASSAY_LIST:
67
+ with gr.TabItem(
68
+ f"{ASSAY_EMOJIS[assay]} {ASSAY_RENAME[assay]}",
69
+ elem_id="abdev-benchmark-tab-table",
70
+ ):
71
  gr.Markdown(f"# {ASSAY_DESCRIPTION[assay]}")
72
  get_leaderboard_object(assay=assay)
73
+
74
+ with gr.TabItem("🚀 Overall", elem_id="abdev-benchmark-tab-table"):
75
+ gr.Markdown(
76
+ "# Antibody Developability Benchmark Leaderboard over all properties"
77
+ )
78
  get_leaderboard_object()
79
 
80
  with gr.TabItem("✉️ Submit", elem_id="boundary-benchmark-tab-table"):
 
82
  """
83
  # Antibody Developability Submission
84
  Upload a CSV to get a score!
85
+
86
  Please use your Hugging Face account name to submit your model - we use this to track separate submissions, but only Hugging Face/Ginkgo will see these usernames (unless you choose to make them public).
87
  Your submission will be evaluated and added to the leaderboard.
88
  """
89
  )
90
+ filename = gr.State(value=None)
91
+ eval_state = gr.State(value=None)
92
  user_state = gr.State(value=None)
93
  anonymous_state = gr.State(value=False)
94
 
95
+ login_button = gr.LoginButton(
96
+ value="Sign in with Hugging Face to see account name"
97
+ ) # Note(Lood): Is this mandatory?
98
 
99
  with gr.Row():
100
  with gr.Column():
101
  username_input = gr.Textbox(
102
+ label="Username",
103
  placeholder="Enter your Hugging Face username",
104
+ info="This will be displayed on the leaderboard.",
105
  )
106
 
107
+ anonymous_checkbox = gr.Checkbox(
108
+ label="Would you like to keep your submission anonymous?"
109
+ ) # Can make this ticked by default
110
  with gr.Column():
111
  submission_file = gr.File(label="Submission CSV")
112
 
 
114
  username_input.change(
115
  fn=lambda x: x if x.strip() else None,
116
  inputs=username_input,
117
+ outputs=user_state,
118
+ )
119
 
120
  submit_btn = gr.Button("Evaluate")
121
  message = gr.Textbox(label="Status", lines=1, visible=False)
122
  # help message
123
+ gr.Markdown(
124
+ "If you have issues with submission or using the leaderboard, please start a discussion in the Community tab of this Space."
125
+ )
126
+
127
  submit_btn.click(
128
  make_submission,
129
  inputs=[submission_file, user_state, anonymous_state],
 
140
  📬 For questions or feedback, contact <a href="mailto:[email protected]">[email protected]</a> or visit the Community tab at the top of this page.
141
  </div>
142
  """,
143
+ elem_id="contact-footer",
144
  )
145
 
146
  if __name__ == "__main__":
constants.py CHANGED
@@ -36,14 +36,14 @@ REQUIRED_COLUMNS: list[str] = [
36
  "antibody_name",
37
  "vh_protein_sequence",
38
  "vl_protein_sequence",
39
- ] + ASSAY_LIST
40
 
41
  # Huggingface API
42
  TOKEN = os.environ.get("HF_TOKEN")
43
- CACHE_PATH=os.getenv("HF_HOME", ".")
44
  API = HfApi(token=TOKEN)
45
 
46
  # Huggingface repos
47
- ORGANIZATION="ginkgo-datapoints"
48
- SUBMISSIONS_REPO = f'{ORGANIZATION}/abdev-bench-submissions'
49
- RESULTS_REPO = f'{ORGANIZATION}/abdev-bench-results'
 
36
  "antibody_name",
37
  "vh_protein_sequence",
38
  "vl_protein_sequence",
39
+ ] + ASSAY_LIST
40
 
41
  # Huggingface API
42
  TOKEN = os.environ.get("HF_TOKEN")
43
+ CACHE_PATH = os.getenv("HF_HOME", ".")
44
  API = HfApi(token=TOKEN)
45
 
46
  # Huggingface repos
47
+ ORGANIZATION = "ginkgo-datapoints"
48
+ SUBMISSIONS_REPO = f"{ORGANIZATION}/abdev-bench-submissions"
49
+ RESULTS_REPO = f"{ORGANIZATION}/abdev-bench-results"
evaluation.py CHANGED
@@ -1,9 +1,7 @@
1
- import json
2
- from pathlib import Path
3
-
4
  def evaluate_problem(
5
- problem_type: str, input_file: str
6
- # ) -> problems.EvaluationSingleObjective | problems.EvaluationMultiObjective:
 
7
  ):
8
  pass
9
  # with Path(input_file).open("r") as f:
@@ -25,6 +23,6 @@ def evaluate_problem(
25
  # result = problems.MHDStableQIStellarator().evaluate(boundaries)
26
  # case _:
27
  # raise ValueError(f"Unknown problem type: {problem_type}")
28
-
29
  # print("Finished evaluation.")
30
  # return result
 
 
 
 
1
  def evaluate_problem(
2
+ problem_type: str,
3
+ input_file: str,
4
+ # ) -> problems.EvaluationSingleObjective | problems.EvaluationMultiObjective:
5
  ):
6
  pass
7
  # with Path(input_file).open("r") as f:
 
23
  # result = problems.MHDStableQIStellarator().evaluate(boundaries)
24
  # case _:
25
  # raise ValueError(f"Unknown problem type: {problem_type}")
26
+
27
  # print("Finished evaluation.")
28
  # return result
requirements.txt CHANGED
@@ -3,4 +3,4 @@ datasets
3
  huggingface_hub
4
  gradio-leaderboard
5
  gradio[oauth]
6
- # plotly
 
3
  huggingface_hub
4
  gradio-leaderboard
5
  gradio[oauth]
6
+ # plotly
submit.py CHANGED
@@ -2,8 +2,6 @@ from pathlib import Path
2
  import tempfile
3
  from typing import BinaryIO
4
  import json
5
- import pandas as pd
6
- import io
7
 
8
  import gradio as gr
9
  from datetime import datetime
@@ -12,33 +10,30 @@ import uuid
12
  from constants import API, SUBMISSIONS_REPO
13
  from validation import validate_csv_file
14
 
15
- def make_submission(
16
- submitted_file: BinaryIO,
17
- user_state,
18
- anonymous_state):
19
 
 
20
  if user_state is None:
21
  raise gr.Error("You must submit your username to submit a file.")
22
-
23
  if submitted_file is None:
24
  raise gr.Error("Please upload a CSV file before submitting.")
25
-
26
  file_path = submitted_file.name
27
 
28
  if not file_path:
29
  raise gr.Error("Uploaded file object does not have a valid file path.")
30
-
31
  path_obj = Path(file_path)
32
 
33
- if path_obj.suffix.lower() != '.csv':
34
  raise gr.Error("File must be a CSV file. Please upload a .csv file.")
35
-
36
  timestamp = datetime.utcnow().isoformat()
37
  submission_id = str(uuid.uuid4())
38
 
39
- with (path_obj.open("rb") as f_in):
40
  file_content = f_in.read().decode("utf-8")
41
-
42
  validate_csv_file(file_content)
43
 
44
  # write to dataset
@@ -51,7 +46,7 @@ def make_submission(
51
  "evaluated": False,
52
  "user": user_state,
53
  "anonymous": anonymous_state,
54
- "csv_content": file_content
55
  }
56
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
57
  json.dump(record, tmp, indent=2)
@@ -59,11 +54,11 @@ def make_submission(
59
  tmp_name = tmp.name
60
 
61
  API.upload_file(
62
- path_or_fileobj=tmp_name,
63
  path_in_repo=filename,
64
  repo_id=SUBMISSIONS_REPO,
65
  repo_type="dataset",
66
- commit_message=f"Add submission for {user_state} at {timestamp}"
67
  )
68
  Path(tmp_name).unlink()
69
 
 
2
  import tempfile
3
  from typing import BinaryIO
4
  import json
 
 
5
 
6
  import gradio as gr
7
  from datetime import datetime
 
10
  from constants import API, SUBMISSIONS_REPO
11
  from validation import validate_csv_file
12
 
 
 
 
 
13
 
14
+ def make_submission(submitted_file: BinaryIO, user_state, anonymous_state):
15
  if user_state is None:
16
  raise gr.Error("You must submit your username to submit a file.")
17
+
18
  if submitted_file is None:
19
  raise gr.Error("Please upload a CSV file before submitting.")
20
+
21
  file_path = submitted_file.name
22
 
23
  if not file_path:
24
  raise gr.Error("Uploaded file object does not have a valid file path.")
25
+
26
  path_obj = Path(file_path)
27
 
28
+ if path_obj.suffix.lower() != ".csv":
29
  raise gr.Error("File must be a CSV file. Please upload a .csv file.")
30
+
31
  timestamp = datetime.utcnow().isoformat()
32
  submission_id = str(uuid.uuid4())
33
 
34
+ with path_obj.open("rb") as f_in:
35
  file_content = f_in.read().decode("utf-8")
36
+
37
  validate_csv_file(file_content)
38
 
39
  # write to dataset
 
46
  "evaluated": False,
47
  "user": user_state,
48
  "anonymous": anonymous_state,
49
+ "csv_content": file_content,
50
  }
51
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
52
  json.dump(record, tmp, indent=2)
 
54
  tmp_name = tmp.name
55
 
56
  API.upload_file(
57
+ path_or_fileobj=tmp_name,
58
  path_in_repo=filename,
59
  repo_id=SUBMISSIONS_REPO,
60
  repo_type="dataset",
61
+ commit_message=f"Add submission for {user_state} at {timestamp}",
62
  )
63
  Path(tmp_name).unlink()
64
 
test/conftest.py CHANGED
@@ -9,8 +9,14 @@ def valid_csv_data():
9
  return {
10
  "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
11
  "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
12
- "vh_protein_sequence": ["EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"] * MINIMAL_NUMBER_OF_ROWS,
13
- "vl_protein_sequence": ["DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"] * MINIMAL_NUMBER_OF_ROWS,
 
 
 
 
 
 
14
  **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
15
  }
16
 
@@ -20,6 +26,7 @@ def valid_input_dataframe(valid_csv_data):
20
  """Fixture providing a valid input dataframe"""
21
  return pd.DataFrame(valid_csv_data)
22
 
 
23
  @pytest.fixture
24
  def valid_csv_content(valid_input_dataframe):
25
  """Fixture providing valid CSV content as string"""
 
9
  return {
10
  "antibody_id": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
11
  "antibody_name": ["AB001"] * MINIMAL_NUMBER_OF_ROWS,
12
+ "vh_protein_sequence": [
13
+ "EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS"
14
+ ]
15
+ * MINIMAL_NUMBER_OF_ROWS,
16
+ "vl_protein_sequence": [
17
+ "DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK"
18
+ ]
19
+ * MINIMAL_NUMBER_OF_ROWS,
20
  **{assay: [0.85] * MINIMAL_NUMBER_OF_ROWS for assay in ASSAY_LIST},
21
  }
22
 
 
26
  """Fixture providing a valid input dataframe"""
27
  return pd.DataFrame(valid_csv_data)
28
 
29
+
30
  @pytest.fixture
31
  def valid_csv_content(valid_input_dataframe):
32
  """Fixture providing valid CSV content as string"""
test/test_validation.py CHANGED
@@ -7,40 +7,40 @@ from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
7
 
8
  class TestValidateCsvCanBeRead:
9
  """Test cases for validate_csv_can_be_read function"""
10
-
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
12
  """Test that valid CSV content can be read"""
13
  df = validate_csv_can_be_read(valid_csv_content)
14
  assert isinstance(df, pd.DataFrame)
15
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
16
  assert list(df.columns) == list(REQUIRED_COLUMNS)
17
-
18
  def test_empty_csv_raises_error(self):
19
  """Test that empty CSV raises an error"""
20
  empty_csv = ""
21
-
22
  with pytest.raises(gr.Error) as exc_info:
23
  validate_csv_can_be_read(empty_csv)
24
-
25
  assert "empty or contains no valid data" in str(exc_info.value)
26
-
27
  def test_invalid_csv_format_raises_error(self):
28
  """Test that invalid CSV format raises an error"""
29
  # Create a CSV with malformed structure that pandas cannot parse
30
- malformed_csv = "column1,column2\nvalue1,\"unclosed quote\nvalue4,value5"
31
-
32
  with pytest.raises(gr.Error) as exc_info:
33
  validate_csv_can_be_read(malformed_csv)
34
-
35
  assert "Invalid CSV format" in str(exc_info.value)
36
-
37
  def test_csv_with_quoted_fields_can_be_read(self):
38
  """Test that CSV with quoted fields can be read"""
39
  # Create CSV with quoted fields and enough rows
40
  base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
41
  csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
42
  csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
43
-
44
  df = validate_csv_can_be_read(csv_content)
45
  assert isinstance(df, pd.DataFrame)
46
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
@@ -48,40 +48,41 @@ class TestValidateCsvCanBeRead:
48
 
49
  class TestValidateDataframe:
50
  """Test cases for validate_dataframe function"""
51
-
52
  def test_valid_dataframe_passes(self, valid_input_dataframe):
53
  """Test that valid DataFrame passes validation"""
54
  validate_dataframe(valid_input_dataframe)
55
-
56
  def test_missing_columns_raises_error(self, valid_input_dataframe):
57
  """Test that DataFrame with missing columns raises an error"""
58
  missing_column = REQUIRED_COLUMNS[0]
59
  df = valid_input_dataframe.copy()
60
  df.drop(columns=[missing_column], inplace=True)
61
-
62
  with pytest.raises(gr.Error) as exc_info:
63
  validate_dataframe(df)
64
-
65
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
66
-
67
-
68
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
69
  """Test that empty DataFrame raises an error"""
70
  empty_df = valid_input_dataframe.head(0)
71
-
72
  with pytest.raises(gr.Error) as exc_info:
73
  validate_dataframe(empty_df)
74
-
75
  assert "CSV file is empty" in str(exc_info.value)
76
-
77
  def test_insufficient_rows_raises_error(self, valid_input_dataframe):
78
  """Test that DataFrame with insufficient rows raises an error"""
79
  df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
80
  with pytest.raises(gr.Error) as exc_info:
81
  validate_dataframe(df)
82
-
83
- assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(exc_info.value)
84
-
 
 
85
  def test_missing_values_raises_error(self, valid_input_dataframe):
86
  """Test that DataFrame with missing values raises an error"""
87
  bad_column = REQUIRED_COLUMNS[0]
@@ -89,9 +90,9 @@ class TestValidateDataframe:
89
  df[bad_column] = [None] * len(df)
90
  with pytest.raises(gr.Error) as exc_info:
91
  validate_dataframe(df)
92
-
93
  assert f"contains {len(df)} missing values" in str(exc_info.value)
94
-
95
  def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
96
  """Test that DataFrame with extra columns passes validation"""
97
  extra_column = "extra_column_1"
@@ -99,11 +100,11 @@ class TestValidateDataframe:
99
  df[extra_column] = ["extra1"] * len(df)
100
  df[extra_column] = ["extra2"] * len(df)
101
  validate_dataframe(df)
102
-
103
 
104
  class TestValidateCsvFile:
105
  """Test cases for the combined validate_csv_file function"""
106
-
107
  def test_valid_csv_passes(self, valid_csv_content):
108
  """Test that a valid CSV with all required columns passes validation"""
109
- validate_csv_file(valid_csv_content)
 
7
 
8
  class TestValidateCsvCanBeRead:
9
  """Test cases for validate_csv_can_be_read function"""
10
+
11
  def test_valid_csv_can_be_read(self, valid_csv_content):
12
  """Test that valid CSV content can be read"""
13
  df = validate_csv_can_be_read(valid_csv_content)
14
  assert isinstance(df, pd.DataFrame)
15
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
16
  assert list(df.columns) == list(REQUIRED_COLUMNS)
17
+
18
  def test_empty_csv_raises_error(self):
19
  """Test that empty CSV raises an error"""
20
  empty_csv = ""
21
+
22
  with pytest.raises(gr.Error) as exc_info:
23
  validate_csv_can_be_read(empty_csv)
24
+
25
  assert "empty or contains no valid data" in str(exc_info.value)
26
+
27
  def test_invalid_csv_format_raises_error(self):
28
  """Test that invalid CSV format raises an error"""
29
  # Create a CSV with malformed structure that pandas cannot parse
30
+ malformed_csv = 'column1,column2\nvalue1,"unclosed quote\nvalue4,value5'
31
+
32
  with pytest.raises(gr.Error) as exc_info:
33
  validate_csv_can_be_read(malformed_csv)
34
+
35
  assert "Invalid CSV format" in str(exc_info.value)
36
+
37
  def test_csv_with_quoted_fields_can_be_read(self):
38
  """Test that CSV with quoted fields can be read"""
39
  # Create CSV with quoted fields and enough rows
40
  base_row = 'AB001,"EVQLVESGGGLVQPGGSLRLSCAASGFTFSSYAMHWVRQAPGKGLEWVSAISGSGGSTYYADSVKGRFTISRDNSKNTLYLQMNSLRAEDTAVYYCARDYGDGYYFDYWGQGTLVTVSS","DIQMTQSPSSLSASVGDRVTITCRASQSISSYLNWYQQKPGKAPKLLIYAASTLQSGVPSRFSGSGSGTDFTLTISSLQPEDFATYYCQQSYSTPFTFGQGTKVEIK",95.2,0.85,0.92,0.78,0.81,72.5'
41
  csv_content = "antibody_id,vh_protein_sequence,vl_protein_sequence,SEC %Monomer,HIC,PR_CHO,AC-SINS_pH6.0,AC-SINS_pH7.4,Tm\n"
42
  csv_content += "\n".join([base_row] * MINIMAL_NUMBER_OF_ROWS)
43
+
44
  df = validate_csv_can_be_read(csv_content)
45
  assert isinstance(df, pd.DataFrame)
46
  assert len(df) == MINIMAL_NUMBER_OF_ROWS
 
48
 
49
  class TestValidateDataframe:
50
  """Test cases for validate_dataframe function"""
51
+
52
  def test_valid_dataframe_passes(self, valid_input_dataframe):
53
  """Test that valid DataFrame passes validation"""
54
  validate_dataframe(valid_input_dataframe)
55
+
56
  def test_missing_columns_raises_error(self, valid_input_dataframe):
57
  """Test that DataFrame with missing columns raises an error"""
58
  missing_column = REQUIRED_COLUMNS[0]
59
  df = valid_input_dataframe.copy()
60
  df.drop(columns=[missing_column], inplace=True)
61
+
62
  with pytest.raises(gr.Error) as exc_info:
63
  validate_dataframe(df)
64
+
65
  assert f"Missing required columns: {missing_column}" in str(exc_info.value)
66
+
 
67
  def test_empty_dataframe_raises_error(self, valid_input_dataframe):
68
  """Test that empty DataFrame raises an error"""
69
  empty_df = valid_input_dataframe.head(0)
70
+
71
  with pytest.raises(gr.Error) as exc_info:
72
  validate_dataframe(empty_df)
73
+
74
  assert "CSV file is empty" in str(exc_info.value)
75
+
76
  def test_insufficient_rows_raises_error(self, valid_input_dataframe):
77
  """Test that DataFrame with insufficient rows raises an error"""
78
  df = valid_input_dataframe.head(MINIMAL_NUMBER_OF_ROWS - 1)
79
  with pytest.raises(gr.Error) as exc_info:
80
  validate_dataframe(df)
81
+
82
+ assert f"CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows" in str(
83
+ exc_info.value
84
+ )
85
+
86
  def test_missing_values_raises_error(self, valid_input_dataframe):
87
  """Test that DataFrame with missing values raises an error"""
88
  bad_column = REQUIRED_COLUMNS[0]
 
90
  df[bad_column] = [None] * len(df)
91
  with pytest.raises(gr.Error) as exc_info:
92
  validate_dataframe(df)
93
+
94
  assert f"contains {len(df)} missing values" in str(exc_info.value)
95
+
96
  def test_csv_with_extra_columns_passes(self, valid_input_dataframe):
97
  """Test that DataFrame with extra columns passes validation"""
98
  extra_column = "extra_column_1"
 
100
  df[extra_column] = ["extra1"] * len(df)
101
  df[extra_column] = ["extra2"] * len(df)
102
  validate_dataframe(df)
103
+
104
 
105
  class TestValidateCsvFile:
106
  """Test cases for the combined validate_csv_file function"""
107
+
108
  def test_valid_csv_passes(self, valid_csv_content):
109
  """Test that a valid CSV with all required columns passes validation"""
110
+ validate_csv_file(valid_csv_content)
utils.py CHANGED
@@ -17,16 +17,21 @@ from constants import API, SUBMISSIONS_REPO, RESULTS_REPO, ASSAY_RENAME
17
  # link =f'https://huggingface.co/datasets/proxima-fusion/constellaration-bench-results/blob/main/{filename}'
18
  # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
19
 
 
20
  def show_output_box(message):
21
  return gr.update(value=message, visible=True)
22
 
 
23
  def fetch_hf_results():
24
- ds = load_dataset(RESULTS_REPO, split='no_low_spearman', download_mode="force_redownload")
 
 
25
  df = pd.DataFrame(ds).drop_duplicates(subset=["model", "assay"])
26
  df["property"] = df["assay"].map(ASSAY_RENAME)
27
  print(df.head())
28
  return df
29
 
 
30
  def read_result_from_hub(filename):
31
  local_path = hf_hub_download(
32
  repo_id=RESULTS_REPO,
@@ -35,6 +40,7 @@ def read_result_from_hub(filename):
35
  )
36
  return local_path
37
 
 
38
  def read_submission_from_hub(filename):
39
  local_path = hf_hub_download(
40
  repo_id=SUBMISSIONS_REPO,
@@ -43,37 +49,41 @@ def read_submission_from_hub(filename):
43
  )
44
  return local_path
45
 
 
46
  def write_results(record, result):
47
  record.update(result)
48
- record['result_filename'] = record['submission_filename'].rstrip('.json') + '_results.json'
49
- print(record['result_filename'])
50
- record['evaluated'] = True
 
 
51
 
52
  record["objectives"] = json.dumps(record.get("objectives", []))
53
  record["feasibilities"] = json.dumps(record.get("feasibility", []))
54
-
55
- if 'objective' not in record.keys():
56
- record['objective'] = 0.0
57
- record['minimize_objective'] = True
58
- record['feasibility'] = sum(record['feasibility'])/len(record['feasibility'])
59
 
60
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
61
  json.dump(record, tmp, indent=2)
62
  tmp.flush()
63
  tmp_name = tmp.name
64
-
65
  API.upload_file(
66
- path_or_fileobj=tmp_name,
67
- path_in_repo=record['result_filename'],
68
- repo_id=RESULTS_REPO,
69
- repo_type="dataset",
70
- commit_message=f"Add result data for {record['result_filename']}"
71
- )
72
-
73
  pathlib.Path(tmp_name).unlink()
74
  return
75
 
 
76
  def get_user(profile: gr.OAuthProfile | None) -> str:
77
  if profile is None:
78
  return "Please login to submit a boundary for evaluation."
79
- return profile.username
 
17
  # link =f'https://huggingface.co/datasets/proxima-fusion/constellaration-bench-results/blob/main/{filename}'
18
  # return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">link</a>'
19
 
20
+
21
  def show_output_box(message):
22
  return gr.update(value=message, visible=True)
23
 
24
+
25
  def fetch_hf_results():
26
+ ds = load_dataset(
27
+ RESULTS_REPO, split="no_low_spearman", download_mode="force_redownload"
28
+ )
29
  df = pd.DataFrame(ds).drop_duplicates(subset=["model", "assay"])
30
  df["property"] = df["assay"].map(ASSAY_RENAME)
31
  print(df.head())
32
  return df
33
 
34
+
35
  def read_result_from_hub(filename):
36
  local_path = hf_hub_download(
37
  repo_id=RESULTS_REPO,
 
40
  )
41
  return local_path
42
 
43
+
44
  def read_submission_from_hub(filename):
45
  local_path = hf_hub_download(
46
  repo_id=SUBMISSIONS_REPO,
 
49
  )
50
  return local_path
51
 
52
+
53
  def write_results(record, result):
54
  record.update(result)
55
+ record["result_filename"] = (
56
+ record["submission_filename"].rstrip(".json") + "_results.json"
57
+ )
58
+ print(record["result_filename"])
59
+ record["evaluated"] = True
60
 
61
  record["objectives"] = json.dumps(record.get("objectives", []))
62
  record["feasibilities"] = json.dumps(record.get("feasibility", []))
63
+
64
+ if "objective" not in record.keys():
65
+ record["objective"] = 0.0
66
+ record["minimize_objective"] = True
67
+ record["feasibility"] = sum(record["feasibility"]) / len(record["feasibility"])
68
 
69
  with tempfile.NamedTemporaryFile(mode="w", suffix=".json", delete=False) as tmp:
70
  json.dump(record, tmp, indent=2)
71
  tmp.flush()
72
  tmp_name = tmp.name
73
+
74
  API.upload_file(
75
+ path_or_fileobj=tmp_name,
76
+ path_in_repo=record["result_filename"],
77
+ repo_id=RESULTS_REPO,
78
+ repo_type="dataset",
79
+ commit_message=f"Add result data for {record['result_filename']}",
80
+ )
81
+
82
  pathlib.Path(tmp_name).unlink()
83
  return
84
 
85
+
86
  def get_user(profile: gr.OAuthProfile | None) -> str:
87
  if profile is None:
88
  return "Please login to submit a boundary for evaluation."
89
+ return profile.username
validation.py CHANGED
@@ -3,20 +3,21 @@ import io
3
  import gradio as gr
4
  from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
5
 
 
6
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
7
  """
8
  Validate that the CSV file can be read and parsed.
9
-
10
  Parameters
11
  ----------
12
  file_content: str
13
  The content of the uploaded CSV file.
14
-
15
  Returns
16
  -------
17
  pd.DataFrame
18
  The parsed DataFrame if successful.
19
-
20
  Raises
21
  ------
22
  gr.Error: If CSV cannot be read or parsed
@@ -25,16 +26,11 @@ def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
25
  # Read CSV content
26
  df = pd.read_csv(io.StringIO(file_content))
27
  return df
28
-
29
  except pd.errors.EmptyDataError:
30
- raise gr.Error(
31
- "❌ CSV file is empty or contains no valid data"
32
- )
33
  except pd.errors.ParserError as e:
34
- raise gr.Error(
35
- f"❌ Invalid CSV format<br><br>"
36
- f"Error: {str(e)}"
37
- )
38
  except UnicodeDecodeError:
39
  raise gr.Error(
40
  "❌ File encoding error<br><br>"
@@ -42,15 +38,16 @@ def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
42
  "Please save your CSV file with UTF-8 encoding and try again."
43
  )
44
 
 
45
  def validate_dataframe(df: pd.DataFrame) -> None:
46
  """
47
  Validate the DataFrame content and structure.
48
-
49
  Parameters
50
  ----------
51
  df: pd.DataFrame
52
  The DataFrame to validate.
53
-
54
  Raises
55
  ------
56
  gr.Error: If validation fails
@@ -58,41 +55,36 @@ def validate_dataframe(df: pd.DataFrame) -> None:
58
  # Required columns should be present
59
  missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
60
  if missing_columns:
61
- raise gr.Error(
62
- f"❌ Missing required columns: {', '.join(missing_columns)}"
63
- )
64
-
65
  # Data should not be empty
66
  if df.empty:
67
- raise gr.Error(
68
- "❌ CSV file is empty"
69
- )
70
-
71
  # Check for missing values in required columns
72
  for col in REQUIRED_COLUMNS:
73
  missing_count = df[col].isnull().sum()
74
  if missing_count > 0:
75
- raise gr.Error(
76
- f"❌ Column '{col}' contains {missing_count} missing values"
77
- )
78
-
79
  # Check for reasonable number of rows
80
  if len(df) < MINIMAL_NUMBER_OF_ROWS:
81
- raise gr.Error(
82
- f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows"
83
- )
84
-
85
- print(f"βœ… CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}")
 
86
 
87
  def validate_csv_file(file_content: str) -> None:
88
  """
89
  Validate the uploaded CSV file.
90
-
91
  Parameters
92
  ----------
93
  file_content: str
94
  The content of the uploaded CSV file.
95
-
96
  Raises
97
  ------
98
  gr.Error: If validation fails
 
3
  import gradio as gr
4
  from constants import REQUIRED_COLUMNS, MINIMAL_NUMBER_OF_ROWS
5
 
6
+
7
  def validate_csv_can_be_read(file_content: str) -> pd.DataFrame:
8
  """
9
  Validate that the CSV file can be read and parsed.
10
+
11
  Parameters
12
  ----------
13
  file_content: str
14
  The content of the uploaded CSV file.
15
+
16
  Returns
17
  -------
18
  pd.DataFrame
19
  The parsed DataFrame if successful.
20
+
21
  Raises
22
  ------
23
  gr.Error: If CSV cannot be read or parsed
 
26
  # Read CSV content
27
  df = pd.read_csv(io.StringIO(file_content))
28
  return df
29
+
30
  except pd.errors.EmptyDataError:
31
+ raise gr.Error("❌ CSV file is empty or contains no valid data")
 
 
32
  except pd.errors.ParserError as e:
33
+ raise gr.Error(f"❌ Invalid CSV format<br><br>" f"Error: {str(e)}")
 
 
 
34
  except UnicodeDecodeError:
35
  raise gr.Error(
36
  "❌ File encoding error<br><br>"
 
38
  "Please save your CSV file with UTF-8 encoding and try again."
39
  )
40
 
41
+
42
  def validate_dataframe(df: pd.DataFrame) -> None:
43
  """
44
  Validate the DataFrame content and structure.
45
+
46
  Parameters
47
  ----------
48
  df: pd.DataFrame
49
  The DataFrame to validate.
50
+
51
  Raises
52
  ------
53
  gr.Error: If validation fails
 
55
  # Required columns should be present
56
  missing_columns = set(REQUIRED_COLUMNS) - set(df.columns)
57
  if missing_columns:
58
+ raise gr.Error(f"❌ Missing required columns: {', '.join(missing_columns)}")
59
+
 
 
60
  # Data should not be empty
61
  if df.empty:
62
+ raise gr.Error("❌ CSV file is empty")
63
+
 
 
64
  # Check for missing values in required columns
65
  for col in REQUIRED_COLUMNS:
66
  missing_count = df[col].isnull().sum()
67
  if missing_count > 0:
68
+ raise gr.Error(f"❌ Column '{col}' contains {missing_count} missing values")
69
+
 
 
70
  # Check for reasonable number of rows
71
  if len(df) < MINIMAL_NUMBER_OF_ROWS:
72
+ raise gr.Error(f"❌ CSV should have at least {MINIMAL_NUMBER_OF_ROWS} rows")
73
+
74
+ print(
75
+ f"βœ… CSV validation passed! Found {len(df)} rows with columns: {', '.join(df.columns)}"
76
+ )
77
+
78
 
79
  def validate_csv_file(file_content: str) -> None:
80
  """
81
  Validate the uploaded CSV file.
82
+
83
  Parameters
84
  ----------
85
  file_content: str
86
  The content of the uploaded CSV file.
87
+
88
  Raises
89
  ------
90
  gr.Error: If validation fails