avantol committed on
Commit
68d7e91
·
1 Parent(s): 800f2c1

fix(app): simplify usage, fix samples

Browse files
Files changed (16) hide show
  1. .secrets.baseline +12 -6
  2. app.py +11 -46
  3. requirements.txt +2 -1
  4. schema_to_sql.py +1 -1
  5. serialized_file_creation_demo/README.md +5 -0
  6. serialized_file_creation_demo/gen3_dm_scaffold.json +0 -0
  7. serialized_file_creation_demo/serialized_file_creation_demo.ipynb +1 -1
  8. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json +598 -1
  9. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv +1 -1
  10. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv +1 -1
  11. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv +1 -1
  12. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv +1 -1
  13. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv +1 -1
  14. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv +1 -1
  15. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json +34 -1
  16. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv +1 -1
.secrets.baseline CHANGED
@@ -14,10 +14,6 @@
14
  "name": "Base64HighEntropyString",
15
  "limit": 4.5
16
  },
17
- {
18
- "name": "HuggingFaceTokenDetector",
19
- "path": "file://hf_token_plugin.py"
20
- },
21
  {
22
  "name": "BasicAuthDetector"
23
  },
@@ -126,6 +122,16 @@
126
  "path": "detect_secrets.filters.heuristic.is_templated_secret"
127
  }
128
  ],
129
- "results": {},
130
- "generated_at": "2025-07-14T21:35:18Z"
 
 
 
 
 
 
 
 
 
 
131
  }
 
14
  "name": "Base64HighEntropyString",
15
  "limit": 4.5
16
  },
 
 
 
 
17
  {
18
  "name": "BasicAuthDetector"
19
  },
 
122
  "path": "detect_secrets.filters.heuristic.is_templated_secret"
123
  }
124
  ],
125
+ "results": {
126
+ "serialized_file_creation_demo/gen3_dm_scaffold.json": [
127
+ {
128
+ "type": "Hex High Entropy String",
129
+ "filename": "serialized_file_creation_demo/gen3_dm_scaffold.json",
130
+ "hashed_secret": "0bf50a968d39c25aaf2ac4636505adda571f17bd",
131
+ "is_verified": false,
132
+ "line_number": 683
133
+ }
134
+ ]
135
+ },
136
+ "generated_at": "2025-07-21T13:56:51Z"
137
  }
app.py CHANGED
@@ -10,7 +10,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
10
 
11
  from schema_to_sql import dd_to_sql
12
  from utils import (
13
- create_graph_image_from_json,
14
  create_summary_tables,
15
  get_example_ai_model_output,
16
  get_prompt_with_files_uploaded,
@@ -26,7 +25,7 @@ AUTH_TOKEN = os.environ.get("HF_TOKEN", False)
26
  BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
27
  LORA_ADAPTER = "uc-ctds/data-model-curator"
28
 
29
- MAX_RETRY_ATTEMPTS = 1
30
 
31
  print(f"Is CUDA available: {torch.cuda.is_available()}")
32
  print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
@@ -50,7 +49,7 @@ except Exception:
50
  # continue on so setup instructions load
51
 
52
 
53
- @spaces.GPU(duration=360)
54
  def run_llm_inference(model_prompt):
55
  retry_count = 1
56
 
@@ -125,11 +124,12 @@ def gen_output_from_files_uploaded(filepaths: list[str] = None):
125
  model_response = run_llm_inference(prompt_from_tsv_upload)
126
  model_response_json = json.loads(model_response)
127
 
128
- # Create Graph Network Image
129
- graph_network_img = create_graph_image_from_json(model_response_json)
130
-
131
  # Create SQL Code
132
- sql, validation = dd_to_sql(model_response_json)
 
 
 
 
133
 
134
  # Create Summary Table
135
  nodes_df, properties_df = {}, {}
@@ -138,16 +138,15 @@ def gen_output_from_files_uploaded(filepaths: list[str] = None):
138
  except Exception as exc:
139
  print(f"summary table creation failed: {exc}")
140
 
141
- return model_response, graph_network_img, sql, nodes_df, properties_df
142
 
143
 
144
  def gen_output_from_example():
145
  model_response = get_example_ai_model_output()
146
  model_response_json = json.loads(model_response)
147
- graph_network_img = create_graph_image_from_json(model_response_json)
148
  sql, validation = dd_to_sql(model_response_json)
149
 
150
- return model_response, graph_network_img, sql
151
 
152
 
153
  def zip_tsvs():
@@ -166,36 +165,6 @@ def zip_tsvs():
166
  with gr.Blocks() as demo:
167
  gr.Markdown("# Demonstration of Llama Data Model Generator")
168
 
169
- gr.Markdown("## IMPORTANT Setup")
170
-
171
- gr.Markdown(
172
- "This demonstrates usage of our [Llama Data Model Generator](https://huggingface.co/uc-ctds/llama-data-model-generator). "
173
- "We fine-tuned the base [Llama 3.1 8B Instruct model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), "
174
- "so you must be approved to access it to use this space. Please follow the previous links and gain access to the "
175
- "gated models before proceeding."
176
- )
177
-
178
- gr.Markdown(
179
- "After gaining access, you must **duplicate this space** and add a secret variable HF_TOKEN in the settings. "
180
- "See [Official Huggingface docs](https://huggingface.co/docs/hub/security-tokens) on how to generate a token. "
181
- "It should only have `read` access. Note: this is due to a limitation in Huggingface Spaces and end-user "
182
- "access to gated models."
183
- )
184
-
185
- gr.Image("duplicate.png", label="How to duplicate this space")
186
-
187
- gr.Markdown(
188
- "Ensure you set your duplicated space to **private** and enter your HF_TOKEN. See below: "
189
- )
190
-
191
- gr.Image("duplicate_dialog.png", label="Duplicate instructions")
192
-
193
- gr.Markdown("**IMPORTANT:** Only continue after doing the above.")
194
-
195
- gr.Markdown("## Already do the above? Is this your duplicated space?")
196
-
197
- gr.Markdown("Awesome! Let's test this out!")
198
-
199
  gr.Markdown("## (Optional) Get Sample TSV(s) to Upload")
200
 
201
  gr.Markdown("### Example 1: A single TSV")
@@ -245,23 +214,19 @@ with gr.Blocks() as demo:
245
  container=True,
246
  )
247
 
248
- with gr.Row():
249
- with gr.Column(scale=7):
250
- graph_out = gr.Image(label="Network Graph Representation", type="pil")
251
-
252
  # If files are uploaded, generate prompt and run model
253
  if model_loaded:
254
  files.upload(
255
  fn=gen_output_from_files_uploaded,
256
  inputs=files,
257
- outputs=[json_out, graph_out, sql_out],
258
  )
259
 
260
  gr.Markdown("Run out of FreeGPU or having issues? Try the example output!")
261
  demo_btn = gr.Button("Manually Load Example Output from Previous Run")
262
  demo_btn.click(
263
  fn=gen_output_from_example,
264
- outputs=[json_out, graph_out, sql_out],
265
  )
266
 
267
  if __name__ == "__main__":
 
10
 
11
  from schema_to_sql import dd_to_sql
12
  from utils import (
 
13
  create_summary_tables,
14
  get_example_ai_model_output,
15
  get_prompt_with_files_uploaded,
 
25
  BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
26
  LORA_ADAPTER = "uc-ctds/data-model-curator"
27
 
28
+ MAX_RETRY_ATTEMPTS = 3
29
 
30
  print(f"Is CUDA available: {torch.cuda.is_available()}")
31
  print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
 
49
  # continue on so setup instructions load
50
 
51
 
52
+ @spaces.GPU(duration=450)
53
  def run_llm_inference(model_prompt):
54
  retry_count = 1
55
 
 
124
  model_response = run_llm_inference(prompt_from_tsv_upload)
125
  model_response_json = json.loads(model_response)
126
 
 
 
 
127
  # Create SQL Code
128
+ try:
129
+ sql, validation = dd_to_sql(model_response_json)
130
+ except Exception:
131
+ print(f"Errors converting to SQL, skipping...")
132
+ sql = ""
133
 
134
  # Create Summary Table
135
  nodes_df, properties_df = {}, {}
 
138
  except Exception as exc:
139
  print(f"summary table creation failed: {exc}")
140
 
141
+ return model_response, sql, nodes_df, properties_df
142
 
143
 
144
  def gen_output_from_example():
145
  model_response = get_example_ai_model_output()
146
  model_response_json = json.loads(model_response)
 
147
  sql, validation = dd_to_sql(model_response_json)
148
 
149
+ return model_response, sql
150
 
151
 
152
  def zip_tsvs():
 
165
  with gr.Blocks() as demo:
166
  gr.Markdown("# Demonstration of Llama Data Model Generator")
167
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
168
  gr.Markdown("## (Optional) Get Sample TSV(s) to Upload")
169
 
170
  gr.Markdown("### Example 1: A single TSV")
 
214
  container=True,
215
  )
216
 
 
 
 
 
217
  # If files are uploaded, generate prompt and run model
218
  if model_loaded:
219
  files.upload(
220
  fn=gen_output_from_files_uploaded,
221
  inputs=files,
222
+ outputs=[json_out, sql_out],
223
  )
224
 
225
  gr.Markdown("Run out of FreeGPU or having issues? Try the example output!")
226
  demo_btn = gr.Button("Manually Load Example Output from Previous Run")
227
  demo_btn.click(
228
  fn=gen_output_from_example,
229
+ outputs=[json_out, sql_out],
230
  )
231
 
232
  if __name__ == "__main__":
requirements.txt CHANGED
@@ -1,4 +1,3 @@
1
- torch==2.5.1
2
  transformers==4.50.0
3
  pydantic==2.10.6
4
  gradio==5.35.0
@@ -8,3 +7,5 @@ vllm==0.6.4.post1
8
  peft
9
  Pillow
10
  spaces
 
 
 
 
1
  transformers==4.50.0
2
  pydantic==2.10.6
3
  gradio==5.35.0
 
7
  peft
8
  Pillow
9
  spaces
10
+ --extra-index-url https://download.pytorch.org/whl/cu113
11
+ torch==2.5.1
schema_to_sql.py CHANGED
@@ -135,7 +135,7 @@ def generate_create_table(node, table_lookup):
135
  parent_cols = table_lookup.get(parent, {})
136
  if parent_field in parent_cols:
137
  fk_constraints.append(
138
- f' FOREIGN KEY ("{sql_col}") REFERENCES {parent}("{ref_col}")'
139
  )
140
  else:
141
  fk_constraints.append(
 
135
  parent_cols = table_lookup.get(parent, {})
136
  if parent_field in parent_cols:
137
  fk_constraints.append(
138
+ f' FOREIGN KEY ("{sql_col}") REFERENCES "{parent}"("{ref_col}")'
139
  )
140
  else:
141
  fk_constraints.append(
serialized_file_creation_demo/README.md ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ # Serialized File Creation Demo
2
+
3
+ [This Jupyter notebook](./serialized_file_creation_demo.ipynb) demonstrates how to convert the AI-assisted data model output (originally just a collection of TSV files) into a serialized file, a [PFB (Portable Format for Bioinformatics)](https://pmc.ncbi.nlm.nih.gov/articles/PMC10035862/) file.
4
+
5
+ PFB is widely used within NIH-funded initiatives that our center is a part of, as a means for efficient storage and transfer of data between systems.
serialized_file_creation_demo/gen3_dm_scaffold.json CHANGED
The diff for this file is too large to render. See raw diff
 
serialized_file_creation_demo/serialized_file_creation_demo.ipynb CHANGED
@@ -280,7 +280,7 @@
280
  "pfb_data = os.path.join(sdm_dir, Path(out_file).stem + \"_data.avro\")\n",
281
  "!pfb from -o $pfb_data json -s $pfb_schema --program DEV --project test $sdm_dir/tsv_metadata\n",
282
  "if Path(pfb_data).exists():\n",
283
- " print(f\"PFB containing TSV files written to:\\n{pfb_data}.\")"
284
  ]
285
  },
286
  {
 
280
  "pfb_data = os.path.join(sdm_dir, Path(out_file).stem + \"_data.avro\")\n",
281
  "!pfb from -o $pfb_data json -s $pfb_schema --program DEV --project test $sdm_dir/tsv_metadata\n",
282
  "if Path(pfb_data).exists():\n",
283
+ " print(f\"PFB containing TSV files written to:\\n{pfb_data}\")"
284
  ]
285
  },
286
  {
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json CHANGED
@@ -1 +1,598 @@
1
- Invalid username or password.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nodes": [
3
+ {
4
+ "name": "project",
5
+ "description": "Any specifically defined piece of work that is undertaken or attempted to meet a single requirement. (NCIt C47885)",
6
+ "links": [],
7
+ "required": [
8
+ "availability_type",
9
+ "project.id"
10
+ ],
11
+ "properties": [
12
+ {
13
+ "name": "address",
14
+ "description": "",
15
+ "type": "string"
16
+ },
17
+ {
18
+ "name": "availability_type",
19
+ "description": "Is the project open or restricted?",
20
+ "type": "enum"
21
+ },
22
+ {
23
+ "name": "brief_summary",
24
+ "description": "A short description of the clinical study, including a brief statement of the clinical study's hypothesis, written in language intended for the lay public.",
25
+ "type": "string"
26
+ },
27
+ {
28
+ "name": "collaborators",
29
+ "description": "Other organizations (if any) providing support. Support may include funding, design, implementation, data analysis or reporting. The responsible party is responsible for confirming all collaborators before listing them.",
30
+ "type": "string"
31
+ },
32
+ {
33
+ "name": "coverage",
34
+ "description": "The spatial or temporal topic of the resource, the spatial applicability of the resource, or the jurisdiction under which the resource is relevant. Spatial topic and spatial applicability may be a named place or a location specified by its geographic coordinates. Temporal topic may be a named period, date, or date range. A jurisdiction may be a named administrative entity or a geographic place to which the resource applies. Recommended best practice is to use a controlled vocabulary such as the Thesaurus of Geographic Names [TGN] (http://www.getty.edu/research/tools/vocabulary/tgn/index.html). Where appropriate, named places or time periods can be used in preference to numeric identifiers such as sets of coordinates or date ranges.",
35
+ "type": "string"
36
+ },
37
+ {
38
+ "name": "data_contributor",
39
+ "description": "The name of the organization or individual that the contributed dataset belongs to.",
40
+ "type": "string"
41
+ },
42
+ {
43
+ "name": "data_type",
44
+ "description": "The general classification of the approach used for the study, i.e. GSA, GDA, RNA-seq.",
45
+ "type": "array"
46
+ },
47
+ {
48
+ "name": "data_url_doi",
49
+ "description": "A URL or DOI for the source of the dataset or the contributing organization's website.",
50
+ "type": "string"
51
+ },
52
+ {
53
+ "name": "disclaimer",
54
+ "description": "The disclaimers that are needed to use the following dataset outside of its source location.",
55
+ "type": "string"
56
+ },
57
+ {
58
+ "name": "estimated_study_completion",
59
+ "description": "The estimated date that the study will be completed/published.",
60
+ "type": "string"
61
+ },
62
+ {
63
+ "name": "institution",
64
+ "description": "Public or Private entity, including Government Agencies.",
65
+ "type": "array"
66
+ },
67
+ {
68
+ "name": "primary_site",
69
+ "description": "The primary body site studied in this dataset.",
70
+ "type": "string"
71
+ },
72
+ {
73
+ "name": "project.id",
74
+ "description": "A unique identifier for records in this 'project' table.",
75
+ "type": "string"
76
+ },
77
+ {
78
+ "name": "project_sponsor",
79
+ "description": "the name of an agency, institution, consortium, or other body that oversees the projects and resources. For academic programs that center around a lab or individual use the department or consortium name.",
80
+ "type": "string"
81
+ },
82
+ {
83
+ "name": "protocol",
84
+ "description": "If a JCOIN hub study, the category of study as defined by Ducharme et al., 2021. Journal of Substance Abuse Treatment publication.",
85
+ "type": "enum"
86
+ },
87
+ {
88
+ "name": "release_requested",
89
+ "description": "User requests that the GDC release the project. Release can only be requested if the project is releasable.",
90
+ "type": "boolean"
91
+ },
92
+ {
93
+ "name": "release_status",
94
+ "description": "Release status of the study.",
95
+ "type": "enum"
96
+ },
97
+ {
98
+ "name": "research_program",
99
+ "description": "Name of the NIH-registered Research Program.",
100
+ "type": "string"
101
+ },
102
+ {
103
+ "name": "submission_enabled",
104
+ "description": "Indicates if submission to a project is allowed.",
105
+ "type": "boolean"
106
+ },
107
+ {
108
+ "name": "support_id",
109
+ "description": "The ID of the source providing support/grant resources.",
110
+ "type": "string"
111
+ }
112
+ ]
113
+ },
114
+ {
115
+ "name": "study",
116
+ "description": "A coordinated set of actions and observations designed to generate data, with the ultimate goal of discovery or hypothesis testing.",
117
+ "links": [
118
+ "project"
119
+ ],
120
+ "required": [
121
+ "project.id",
122
+ "study.id",
123
+ "study_description"
124
+ ],
125
+ "properties": [
126
+ {
127
+ "name": "data_description",
128
+ "description": "Brief description of the data being provided for this study. Free text",
129
+ "type": "string"
130
+ },
131
+ {
132
+ "name": "study_completeness",
133
+ "description": "Description of data status. 0=Descriptive data and results as originally received from the data provider. 1=Includes updates to the original data submission short of completeness. 2=Complete set of descriptive data and results, as ascertained by curator.",
134
+ "type": "enum"
135
+ },
136
+ {
137
+ "name": "study_description",
138
+ "description": "A brief description of the study being performed. Free text",
139
+ "type": "string"
140
+ },
141
+ {
142
+ "name": "study_doi",
143
+ "description": "Digital object identifier (DOI) is a type of persistent identifier used to uniquely identify objects",
144
+ "type": "string"
145
+ },
146
+ {
147
+ "name": "study_organization",
148
+ "description": "Name of the primary organization that oversees implementation of the study",
149
+ "type": "string"
150
+ },
151
+ {
152
+ "name": "project.id",
153
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'study' table.",
154
+ "type": "string"
155
+ },
156
+ {
157
+ "name": "study.id",
158
+ "description": "A unique identifier for records in this 'study' table.",
159
+ "type": "string"
160
+ },
161
+ {
162
+ "name": "data_url_doi",
163
+ "description": "A URL or DOI for the source of the dataset or the contributing organization's website.",
164
+ "type": "string"
165
+ },
166
+ {
167
+ "name": "full_name",
168
+ "description": "The full name or title of the dataset or publication.",
169
+ "type": "string"
170
+ }
171
+ ]
172
+ },
173
+ {
174
+ "name": "case",
175
+ "description": "The collection of all data related to a specific subject in the context of a specific project.",
176
+ "links": [
177
+ "project",
178
+ "study"
179
+ ],
180
+ "required": [
181
+ "case.id",
182
+ "project.id"
183
+ ],
184
+ "properties": [
185
+ {
186
+ "name": "AnchorDate",
187
+ "description": "The reference or anchor date used during date obfuscation, where a single date is obscurred by creating one or more date ranges in relation to this date.",
188
+ "type": "enum"
189
+ },
190
+ {
191
+ "name": "project.id",
192
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'case' table.",
193
+ "type": "string"
194
+ },
195
+ {
196
+ "name": "case.id",
197
+ "description": "A unique identifier for records in this 'case' table.",
198
+ "type": "string"
199
+ },
200
+ {
201
+ "name": "cohort_id",
202
+ "description": "Cohort identifier",
203
+ "type": "integer"
204
+ },
205
+ {
206
+ "name": "cohort_name",
207
+ "description": "Cohort name",
208
+ "type": "string"
209
+ },
210
+ {
211
+ "name": "geographic_location",
212
+ "description": "Location of experiment.",
213
+ "type": "string"
214
+ },
215
+ {
216
+ "name": "index_date",
217
+ "description": "The reference or anchor date used during date obfuscation, where a single date is obscurred by creating one or more date ranges in relation to this date.",
218
+ "type": "enum"
219
+ },
220
+ {
221
+ "name": "species",
222
+ "description": "Taxonomic species of the subject.",
223
+ "type": "enum"
224
+ },
225
+ {
226
+ "name": "study.id",
227
+ "description": "Unique identifiers for records in the 'study' table that relate via this foreign key to records in this 'case' table.",
228
+ "type": "string"
229
+ }
230
+ ]
231
+ },
232
+ {
233
+ "name": "cell_subject",
234
+ "description": "description of subject.",
235
+ "links": [
236
+ "study",
237
+ "case"
238
+ ],
239
+ "required": [
240
+ "cell_subject.id"
241
+ ],
242
+ "properties": [
243
+ {
244
+ "name": "cell_type",
245
+ "description": "",
246
+ "type": "string"
247
+ },
248
+ {
249
+ "name": "isolation_protocol",
250
+ "description": "",
251
+ "type": "string"
252
+ },
253
+ {
254
+ "name": "karyotype",
255
+ "description": "",
256
+ "type": "string"
257
+ },
258
+ {
259
+ "name": "provenance",
260
+ "description": "",
261
+ "type": "string"
262
+ },
263
+ {
264
+ "name": "source_organ",
265
+ "description": "",
266
+ "type": "string"
267
+ },
268
+ {
269
+ "name": "study.id",
270
+ "description": "Unique identifiers for records in the 'study' table that relate via this foreign key to records in this 'cell_subject' table.",
271
+ "type": "string"
272
+ },
273
+ {
274
+ "name": "case.id",
275
+ "description": "Unique identifiers for records in the 'case' table that relate via this foreign key to records in this 'cell_case' table.",
276
+ "type": "string"
277
+ },
278
+ {
279
+ "name": "cell_subject.id",
280
+ "description": "A unique identifier for records in this 'cell_subject' table.",
281
+ "type": "string"
282
+ }
283
+ ]
284
+ },
285
+ {
286
+ "name": "actionable_mutation",
287
+ "description": "Mutations extracted from a patient's sequencing results.",
288
+ "links": [
289
+ "case"
290
+ ],
291
+ "required": [
292
+ "case.id",
293
+ "actionable_mutation.id"
294
+ ],
295
+ "properties": [
296
+ {
297
+ "name": "ClinicallyActionable",
298
+ "description": "ClinicallyActionable",
299
+ "type": "number"
300
+ },
301
+ {
302
+ "name": "DaysFromAnchorDateToBxResultsDate",
303
+ "description": "DaysFromAnchorDateToBxResultsDate",
304
+ "type": "integer"
305
+ },
306
+ {
307
+ "name": "Lab",
308
+ "description": "Lab",
309
+ "type": "string"
310
+ },
311
+ {
312
+ "name": "MutantFraction",
313
+ "description": "MutantFraction",
314
+ "type": "number"
315
+ },
316
+ {
317
+ "name": "TrialPhase",
318
+ "description": "TrialPhase",
319
+ "type": "number"
320
+ },
321
+ {
322
+ "name": "case.id",
323
+ "description": "Unique identifiers for records in the 'case' table that relate via this foreign key to records in this 'actionable_mutation' table.",
324
+ "type": "string"
325
+ },
326
+ {
327
+ "name": "actionable_mutation.id",
328
+ "description": "A unique identifier for records in this 'actionable_mutation' table.",
329
+ "type": "string"
330
+ }
331
+ ]
332
+ },
333
+ {
334
+ "name": "sample",
335
+ "description": "description of sample.",
336
+ "links": [
337
+ "case",
338
+ "cell_subject"
339
+ ],
340
+ "required": [
341
+ "sample.id"
342
+ ],
343
+ "properties": [
344
+ {
345
+ "name": "provenance",
346
+ "description": "template version",
347
+ "type": "string"
348
+ },
349
+ {
350
+ "name": "weight",
351
+ "description": "The weight of the patient measured in kilograms.",
352
+ "type": "number"
353
+ },
354
+ {
355
+ "name": "case.id",
356
+ "description": "Unique identifiers for records in the 'case' table that relate via this foreign key to records in this 'sample' table.",
357
+ "type": "string"
358
+ },
359
+ {
360
+ "name": "cell_subject.id",
361
+ "description": "Unique identifiers for records in the 'cell_subject' table that relate via this foreign key to records in this 'sample' table.",
362
+ "type": "string"
363
+ },
364
+ {
365
+ "name": "sample.id",
366
+ "description": "A unique identifier for records in this 'sample' table.",
367
+ "type": "string"
368
+ }
369
+ ]
370
+ },
371
+ {
372
+ "name": "aliquot",
373
+ "description": "Pertaining to a portion of the whole; any one of two or more samples of something, of the same volume or weight.",
374
+ "links": [
375
+ "sample"
376
+ ],
377
+ "required": [
378
+ "sample.id",
379
+ "aliquot.id"
380
+ ],
381
+ "properties": [
382
+ {
383
+ "name": "a260_a280_ratio",
384
+ "description": "Numeric value that represents the sample ratio of nucleic acid absorbance at 260 nm and 280 nm, used to determine a measure of DNA purity.",
385
+ "type": "number"
386
+ },
387
+ {
388
+ "name": "derivitization",
389
+ "description": "",
390
+ "type": "string"
391
+ },
392
+ {
393
+ "name": "provenance",
394
+ "description": "template version",
395
+ "type": "string"
396
+ },
397
+ {
398
+ "name": "sample.id",
399
+ "description": "Unique identifiers for records in the 'sample' table that relate via this foreign key to records in this 'aliquot' table.",
400
+ "type": "string"
401
+ },
402
+ {
403
+ "name": "aliquot.id",
404
+ "description": "A unique identifier for records in this 'aliquot' table.",
405
+ "type": "string"
406
+ }
407
+ ]
408
+ },
409
+ {
410
+ "name": "submitted_genotyping_array",
411
+ "description": "Data file containing raw data from a genotyping array.",
412
+ "links": [
413
+ "aliquot"
414
+ ],
415
+ "required": [
416
+ "submitted_genotyping_array.id",
417
+ "file_name",
418
+ "data_category",
419
+ "aliquot.id"
420
+ ],
421
+ "properties": [
422
+ {
423
+ "name": "data_category",
424
+ "description": "Broad categorization of the contents of the data file.",
425
+ "type": "enum"
426
+ },
427
+ {
428
+ "name": "aliquot.id",
429
+ "description": "Unique identifiers for records in the 'aliquot' table that relate via this foreign key to records in this 'submitted_genotyping_array' table.",
430
+ "type": "string"
431
+ },
432
+ {
433
+ "name": "file_name",
434
+ "description": "The name (or part of a name) of a file (of any type).",
435
+ "type": "string"
436
+ },
437
+ {
438
+ "name": "submitted_genotyping_array.id",
439
+ "description": "A unique identifier for records in this 'submitted_genotyping_array' table.",
440
+ "type": "string"
441
+ }
442
+ ]
443
+ },
444
+ {
445
+ "name": "analyte",
446
+ "description": "Any aspect of an aliquot used in an analysis or assay to characterize the sample. These aspects range from molecules, such as DNA and RNA, that can be extracted from the aliquot to general descriptions of the aliquot's components, such as cell count and morphology.",
447
+ "links": [
448
+ "aliquot",
449
+ "study"
450
+ ],
451
+ "required": [
452
+ "analyte_type",
453
+ "analyte_isolation_method",
454
+ "analyte.id"
455
+ ],
456
+ "properties": [
457
+ {
458
+ "name": "analyte_isolation_method",
459
+ "description": "The name or general description of the method used to isolate the analyte. Alternatively, if you have provided a protocol, put the file_name here.",
460
+ "type": "string"
461
+ },
462
+ {
463
+ "name": "analyte_type",
464
+ "description": "Text term that represents the kind of molecular specimen analyte.",
465
+ "type": "enum"
466
+ },
467
+ {
468
+ "name": "frame_identifier",
469
+ "description": "In an analysis of a slide, the frame denotes the region of the slide that is being examined. Within a frame are multiple cells.",
470
+ "type": "string"
471
+ },
472
+ {
473
+ "name": "run_identifier",
474
+ "description": "The identifier given to the run during which this particular analyte was tested or evaluated. If you analyze multiple analytes through the same experimentation run, this is a good way to keep track.",
475
+ "type": "string"
476
+ },
477
+ {
478
+ "name": "specificity_other",
479
+ "description": "If the true negative rate is something other than a number (for example, 'WT'), enter the text here.",
480
+ "type": "string"
481
+ },
482
+ {
483
+ "name": "aliquot.id",
484
+ "description": "Unique identifiers for records in the 'aliquot' table that relate via this foreign key to records in this 'analyte' table.",
485
+ "type": "string"
486
+ },
487
+ {
488
+ "name": "study.id",
489
+ "description": "Unique identifiers for records in the 'study' table that relate via this foreign key to records in this 'analyte' table.",
490
+ "type": "string"
491
+ },
492
+ {
493
+ "name": "analyte.id",
494
+ "description": "A unique identifier for records in this 'analyte' table.",
495
+ "type": "string"
496
+ }
497
+ ]
498
+ },
499
+ {
500
+ "name": "mass_cytometry_assay",
501
+ "description": "Mass cytometry is a variation of flow cytometry in which antibodies are labeled with heavy metal ion tags rather than fluorochromes. Readout is by time-of-flight mass spectrometry. This allows for the combination of many more antibody specificities in a single samples, without significant spillover between channels.",
502
+ "links": [
503
+ "analyte"
504
+ ],
505
+ "required": [
506
+ "mass_cytometry_assay.id",
507
+ "analyte.id",
508
+ "assay_method",
509
+ "md5sum"
510
+ ],
511
+ "properties": [
512
+ {
513
+ "name": "assay_method",
514
+ "description": "General methodology used to perform the assay.",
515
+ "type": "enum"
516
+ },
517
+ {
518
+ "name": "md5sum",
519
+ "description": "The 128-bit hash value expressed as a 32 digit hexadecimal number used as a file's digital fingerprint.",
520
+ "type": "string"
521
+ },
522
+ {
523
+ "name": "protocol_used",
524
+ "description": "The name or general description of the protocol used to run the mass cytometry assay. Alternatively, if you have provided a protocol, enter its file_name here.",
525
+ "type": "string"
526
+ },
527
+ {
528
+ "name": "analyte.id",
529
+ "description": "Unique identifiers for records in the 'analyte' table that relate via this foreign key to records in this 'mass_cytometry_assay' table.",
530
+ "type": "string"
531
+ },
532
+ {
533
+ "name": "mass_cytometry_assay.id",
534
+ "description": "A unique identifier for records in this 'mass_cytometry_assay' table.",
535
+ "type": "string"
536
+ }
537
+ ]
538
+ },
539
+ {
540
+ "name": "mass_cytometry_image",
541
+ "description": "Following an imaging mass cytometry experiment, the raw data output can be converted into antibody-specific images.",
542
+ "links": [
543
+ "mass_cytometry_assay"
544
+ ],
545
+ "required": [
546
+ "assay_target",
547
+ "file_name",
548
+ "mass_cytometry_assay.id",
549
+ "data_type",
550
+ "data_category",
551
+ "data_format",
552
+ "mass_cytometry_image.id"
553
+ ],
554
+ "properties": [
555
+ {
556
+ "name": "assay_target",
557
+ "description": "Target for the assay: can be a specific gene, protein, or otherwise.",
558
+ "type": "string"
559
+ },
560
+ {
561
+ "name": "data_category",
562
+ "description": "Broad categorization of the contents of the data file.",
563
+ "type": "enum"
564
+ },
565
+ {
566
+ "name": "data_format",
567
+ "description": "Format of the data files.",
568
+ "type": "enum"
569
+ },
570
+ {
571
+ "name": "data_type",
572
+ "description": "Specific content type of the data file.",
573
+ "type": "enum"
574
+ },
575
+ {
576
+ "name": "file_name",
577
+ "description": "The name (or part of a name) of a file (of any type).",
578
+ "type": "string"
579
+ },
580
+ {
581
+ "name": "object_id",
582
+ "description": "The GUID of the object in the index service.",
583
+ "type": "string"
584
+ },
585
+ {
586
+ "name": "mass_cytometry_assay.id",
587
+ "description": "Unique identifiers for records in the 'mass_cytometry_assay' table that relate via this foreign key to records in this 'mass_cytometry_image' table.",
588
+ "type": "string"
589
+ },
590
+ {
591
+ "name": "mass_cytometry_image.id",
592
+ "description": "A unique identifier for records in this 'mass_cytometry_image' table.",
593
+ "type": "string"
594
+ }
595
+ ]
596
+ }
597
+ ]
598
+ }
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ actionable_mutation.id ClinicallyActionable DaysFromAnchorDateToBxResultsDate Lab MutantFraction TrialPhase case.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ aliquot.id a260_a280_ratio cell_type derivitization isolation_protocol karyotype provenance source_organ weight study.id case.id cell_subject.id sample.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ analyte.id analyte_isolation_method analyte_type frame_identifier run_identifier specificity_other study.id aliquot.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ case.id AnchorDate address availability_type brief_summary cohort_id cohort_name collaborators coverage data_contributor data_description data_type data_url_doi disclaimer estimated_study_completion full_name geographic_location index_date institution primary_site project_sponsor protocol release_requested release_status research_program species study_completeness study_description study_doi study_organization submission_enabled support_id project.id study.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ mass_cytometry_assay.id assay_method md5sum protocol_used analyte.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ mass_cytometry_image.id assay_target data_category data_format data_type file_name object_id mass_cytometry_assay.id
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json CHANGED
@@ -1 +1,34 @@
1
- Invalid username or password.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "submitted_genotyping_array": [
3
+ "project",
4
+ "study",
5
+ "case",
6
+ "cell_subject",
7
+ "sample",
8
+ "aliquot"
9
+ ],
10
+ "mass_cytometry_image": [
11
+ "project",
12
+ "study",
13
+ "case",
14
+ "cell_subject",
15
+ "sample",
16
+ "aliquot",
17
+ "analyte",
18
+ "mass_cytometry_assay"
19
+ ],
20
+ "actionable_mutation": [
21
+ "project",
22
+ "study",
23
+ "case"
24
+ ],
25
+ "mass_cytometry_assay": [
26
+ "project",
27
+ "study",
28
+ "case",
29
+ "cell_subject",
30
+ "sample",
31
+ "aliquot",
32
+ "analyte"
33
+ ]
34
+ }
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv CHANGED
@@ -1 +1 @@
1
- Invalid username or password.
 
1
+ submitted_genotyping_array.id data_category file_name aliquot.id