avantol committed on
Commit c567880 · 1 Parent(s): b65b037

feat(app): Initial release
Files changed (29)
  1. README.md +5 -19
  2. app.py +249 -17
  3. duplicate.png +0 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +0 -17
  6. requirements.txt +10 -0
  7. sample_metadata.tsv +1 -0
  8. schema_to_sql.py +206 -0
  9. secret.png +0 -0
  10. serialized_file_creation_demo/gen3_dm_scaffold.json +1 -0
  11. serialized_file_creation_demo/serialized_file_creation_demo.ipynb +360 -0
  12. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json +1 -0
  13. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv +1 -0
  14. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv +1 -0
  15. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv +1 -0
  16. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv +1 -0
  17. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv +1 -0
  18. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv +1 -0
  19. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json +1 -0
  20. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv +1 -0
  21. serialized_file_creation_demo/utils.py +566 -0
  22. tsvs/alias_metadata.tsv +1 -0
  23. tsvs/center_metadata.tsv +1 -0
  24. tsvs/diagnosis_metadata.tsv +1 -0
  25. tsvs/exposure_metadata.tsv +1 -0
  26. tsvs/participant_metadata.tsv +1 -0
  27. tsvs/summary_file_file_manifest.tsv +1 -0
  28. tsvs/visit_metadata.tsv +1 -0
  29. utils.py +485 -0
README.md CHANGED
@@ -1,26 +1,12 @@
1
  ---
2
- title: Data Model Curator Demo
3
  emoji: 📝
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.37.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Demonstration of basic usage of the data model curator tool
12
-
13
- hf_oauth: true
14
- # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
15
- hf_oauth_expiration_minutes: 480
16
- # optional, see "Scopes" below. "openid profile" is always included.
17
- hf_oauth_scopes:
18
- - read-repos
19
- - inference-api
20
- # optional, restrict access to members of specific organizations
21
- # hf_oauth_authorized_org:
22
- # - ORG_NAME1
23
- # - ORG_NAME2
24
  ---
25
-
26
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Llama Data Model Generator Demo
3
  emoji: 📝
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Demo of our Llama data model generator model
12
  ---
 
 
app.py CHANGED
@@ -1,27 +1,259 @@
1
- from __future__ import annotations
 
 
2
 
 
3
  import gradio as gr
4
- from huggingface_hub import whoami
 
 
5
 
 
 
 
 
 
 
 
6
 
7
- def hello(profile: gr.OAuthProfile | None) -> str:
8
- if profile is None:
9
- return "I don't know you."
10
- return f"Hello {profile.name}"
11
 
 
 
12
 
13
- def list_organizations(oauth_token: gr.OAuthToken | None) -> str:
14
- if oauth_token is None:
15
- return "Please deploy this on Spaces and log in to list organizations."
16
- org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
17
- return f"You belong to {', '.join(org_names)}."
18
 
19
 
20
  with gr.Blocks() as demo:
21
- gr.LoginButton()
22
- m1 = gr.Markdown()
23
- m2 = gr.Markdown()
24
- demo.load(hello, inputs=None, outputs=m1)
25
- demo.load(list_organizations, inputs=None, outputs=m2)
26
 
27
- demo.launch()
 
 
1
+ import json
2
+ import os
3
+ import zipfile
4
 
5
+ import torch
6
  import gradio as gr
7
+ import spaces
8
+ from peft import PeftConfig, PeftModel
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
 
11
+ from schema_to_sql import dd_to_sql
12
+ from utils import (
13
+ create_graph_image_from_json,
14
+ create_summary_tables,
15
+ get_example_ai_model_output,
16
+ get_prompt_with_files_uploaded,
17
+ )
18
 
19
+ from utils import MAX_NEW_TOKENS, TEMPERATURE
 
 
 
20
 
21
+ LOCAL_DIR = "tsvs"  # directory of sample TSVs included in this Space
22
+ ZIP_PATH = "tsvs.zip"
23
 
24
+ AUTH_TOKEN = os.environ.get("HF_TOKEN", False)
25
+
26
+ BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
27
+ LORA_ADAPTER = "uc-ctds/data-model-curator"
28
+
29
+ MAX_RETRY_ATTEMPTS = 1
30
+
31
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
32
+ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained(
35
+ BASE_MODEL, token=AUTH_TOKEN
36
+ )
37
+ model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, token=AUTH_TOKEN)
38
+ model = model.to("cuda")
39
+ model = model.eval()
40
+
41
+ peft_config = PeftConfig.from_pretrained(LORA_ADAPTER, token=AUTH_TOKEN)
42
+ model = PeftModel.from_pretrained(model, LORA_ADAPTER, token=AUTH_TOKEN)
43
+
44
+
45
+ @spaces.GPU(duration=360)
46
+ def run_llm_inference(model_prompt):
47
+ retry_count = 1
48
+
49
+ print("Tokenizing Input")
50
+ inputs = tokenizer(model_prompt, return_tensors="pt")
51
+ inputs = inputs.to(model.device)
52
+ prompt_length = inputs["input_ids"].shape[1]
53
+
54
+ print("Generating Initial Response")
55
+ outputs = model.generate(
56
+ **inputs,
57
+ max_new_tokens=MAX_NEW_TOKENS,
58
+ temperature=TEMPERATURE,
59
+ )
60
+
61
+ # Decode and parse output
62
+ print("Decoding output")
63
+ output_data_model = tokenizer.decode(outputs[0][prompt_length:])
64
+ output_data_model = output_data_model.split("<|eot_id|>")[0]
65
+ print(output_data_model)
66
+
67
+ # Test output for JSON schema validity
68
+ try:
69
+ json.loads(output_data_model)
70
+ valid_output = True
71
+ print("Yay - model passed")
72
+ return output_data_model
73
+
74
+ except json.JSONDecodeError:
75
+ valid_output = False
76
+
77
+ while (valid_output is False) and (retry_count <= MAX_RETRY_ATTEMPTS):
78
+
79
+ print(
80
+ f"Attempt {retry_count} did not generate a proper JSON output, proceeding to attempt {retry_count+1} of {MAX_RETRY_ATTEMPTS+1}"
81
+ )
82
+ retry_count += 1
83
+
84
+ # Try generating a new response
85
+ outputs = model.generate(
86
+ **inputs,
87
+ max_new_tokens=MAX_NEW_TOKENS,
88
+ temperature=TEMPERATURE,
89
+ )
90
+
91
+ output_data_model = tokenizer.decode(outputs[0][prompt_length:])
92
+ output_data_model = output_data_model.split("<|eot_id|>")[0]
93
+ print(output_data_model)
94
+ # Test output for JSON schema validity
95
+ try:
96
+ json.loads(output_data_model)
97
+ valid_output = True
98
+ print("Yay - model passed")
99
+ return output_data_model
100
+ except json.JSONDecodeError:
101
+ valid_output = False
102
+
103
+ # Handle cases when the model fails to generate a proper json schema
104
+ if (valid_output is False) and (retry_count > MAX_RETRY_ATTEMPTS):
105
+ print(
106
+ "Failed To Generate a Proper Schema, try checking the prompt or input TSVs and running again"
107
+ )
108
+ output_data_model = '{"nodes": [{"name": "Attempt Failed - Check logs for suggested next steps", "links": []}]}'
109
+
110
+ return output_data_model
111
+
112
+
113
+ def gen_output_from_files_uploaded(filepaths: list[str] = None):
114
+ prompt_from_tsv_upload = get_prompt_with_files_uploaded(filepaths)
115
+
116
+ # Run model to get model response (model_response is a string that needs to be loaded to json)
117
+ model_response = run_llm_inference(prompt_from_tsv_upload)
118
+ model_response_json = json.loads(model_response)
119
+
120
+ # Create Graph Network Image
121
+ graph_network_img = create_graph_image_from_json(model_response_json)
122
+
123
+ # Create SQL Code
124
+ sql, validation = dd_to_sql(model_response_json)
125
+
126
+ # Create Summary Table
127
+ nodes_df, properties_df = {}, {}
128
+ try:
129
+ nodes_df, properties_df = create_summary_tables(model_response_json)
130
+ except Exception as exc:
131
+ print(f"summary table creation failed: {exc}")
132
+
133
+ return model_response, graph_network_img, sql, nodes_df, properties_df
134
+
135
+
136
+ def gen_output_from_example():
137
+ model_response = get_example_ai_model_output()
138
+ model_response_json = json.loads(model_response)
139
+ graph_network_img = create_graph_image_from_json(model_response_json)
140
+ sql, validation = dd_to_sql(model_response_json)
141
+
142
+ return model_response, graph_network_img, sql
143
+
144
+
145
+ def zip_tsvs():
146
+ tsv_files = [f for f in os.listdir(LOCAL_DIR) if f.endswith(".tsv")]
147
+ if not tsv_files:
148
+ return None
149
+
150
+ with zipfile.ZipFile(ZIP_PATH, "w") as zipf:
151
+ for file in tsv_files:
152
+ file_path = os.path.join(LOCAL_DIR, file)
153
+ zipf.write(file_path, arcname=file)
154
+
155
+ return ZIP_PATH
156
 
157
 
158
  with gr.Blocks() as demo:
159
+ gr.Markdown("# Demonstration of Llama Data Model Generator")
160
+
161
+ gr.Markdown("## IMPORTANT Setup")
162
+
163
+ gr.Markdown(
164
+ "This demonstrates usage of our [Llama Data Model Generator](https://huggingface.co/uc-ctds/llama-data-model-generator). "
165
+ "We fine-tuned the base [Llama 3.1 8B Instruct model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), "
166
+ "so you must be approved to access it to use this space. Please follow the previous links and gain access to the "
167
+ "gated models before proceeding."
168
+ )
169
+
170
+ gr.Markdown(
171
+ "After gaining access, you must **duplicate this space** and add a secret variable HF_TOKEN in the settings. "
172
+ "See [Official Huggingface docs](https://huggingface.co/docs/hub/security-tokens) on how to generate a token. "
173
+ "It should only have `read` access. Note: this is due to a limitation in Huggingface Spaces and end-user "
174
+ "access to gated models."
175
+ )
176
+
177
+ gr.Image("duplicate.png", label="How to duplicate this space")
178
+
179
+ gr.Markdown(
180
+ "In your duplicated space, press the `Settings` gear button on the top right. "
181
+ "Then in the `Variables and Secrets` section, create a new **secret** named `HF_TOKEN`. "
182
+ "Ensure you add your token created in the previous step by following the official Huggingface docs."
183
+ )
184
+
185
+ gr.Image("secret.png", label="Add your Huggingface token to the secret")
186
+
187
+ gr.Markdown(
188
+ "**IMPORTANT:** Only continue after doing the above or you will get errors."
189
+ )
190
+
191
+ gr.Markdown("## (Optional) Get Sample TSV(s) to Upload")
192
+
193
+ gr.Markdown("### Example 1: A single TSV")
194
+ download_btn = gr.DownloadButton(
195
+ label="Download Single TSV", value="sample_metadata.tsv"
196
+ )
197
+ gr.Markdown("### Example 2: Many TSVs in a single .zip file.")
198
+ download_btn = gr.DownloadButton(
199
+ label="Download All Sample TSVs as .zip", value=zip_tsvs
200
+ )
201
+ gr.Markdown("You need to extract the .zip if you want to use them.")
202
+
203
+ gr.Markdown("## Upload TSVs With Headers (No Data Rows Required)")
204
+ files = gr.Files(
205
+ label="Upload TSVs",
206
+ file_types=[".tsv"],
207
+ type="filepath",
208
+ )
209
+
210
+ gr.Markdown(
211
+ "Depending on your Huggingface subscription and availability of free GPUs, this can take a few minutes to complete."
212
+ )
213
+ gr.Markdown(
214
+ "Behind the scenes, our [Llama Data Model Generator](https://huggingface.co/uc-ctds/llama-data-model-generator) AI model is being loaded "
215
+ "onto GPUs and the TSVs uploaded are being sent to the model in a specialized prompt. "
216
+ "For information about the model, please see the model card itself by clicking "
217
+ "the link above."
218
+ )
219
+
220
+ # Define Outputs
221
+ with gr.Row():
222
+ with gr.Column(scale=7):
223
+ json_out = gr.Code(
224
+ label="Generated Data Model Output",
225
+ value=json.dumps([]),
226
+ language="json",
227
+ interactive=True,
228
+ show_label=True,
229
+ container=True,
230
+ elem_id="json-output",
231
+ )
232
+ with gr.Column(scale=7):
233
+ sql_out = gr.Textbox(
234
+ label="SQL Defined Relational Schema",
235
+ value="",
236
+ show_label=True,
237
+ container=True,
238
+ )
239
+
240
+ with gr.Row():
241
+ with gr.Column(scale=7):
242
+ graph_out = gr.Image(label="Network Graph Representation", type="pil")
243
+
244
+ # If files are uploaded, generate prompt and run model
245
+ files.upload(
246
+ fn=gen_output_from_files_uploaded,
247
+ inputs=files,
248
+ outputs=[json_out, graph_out, sql_out],
249
+ )
250
+
251
+ gr.Markdown("Run out of FreeGPU or having issues? Try the example output!")
252
+ demo_btn = gr.Button("Manually Load Example Output from Previous Run")
253
+ demo_btn.click(
254
+ fn=gen_output_from_example,
255
+ outputs=[json_out, graph_out, sql_out],
256
+ )
257
 
258
+ if __name__ == "__main__":
259
+ demo.launch()
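The retry loop in `run_llm_inference` above boils down to: generate, try `json.loads`, and regenerate up to `MAX_RETRY_ATTEMPTS` more times before falling back to a placeholder data model. Below is a minimal sketch of that pattern (not part of this commit); `generate_fn` is a stub standing in for the actual model call.

```python
import json

MAX_RETRY_ATTEMPTS = 1  # mirrors the constant defined in app.py


def generate_until_valid_json(generate_fn, max_retries=MAX_RETRY_ATTEMPTS):
    """Call generate_fn until it returns parseable JSON or retries run out."""
    for attempt in range(max_retries + 1):
        candidate = generate_fn()
        try:
            json.loads(candidate)  # validity check only; the raw string is returned
            return candidate
        except json.JSONDecodeError:
            print(f"Attempt {attempt + 1} did not produce valid JSON")
    # fall back to a placeholder data model, as the app does
    return '{"nodes": [{"name": "Attempt Failed - Check logs for suggested next steps", "links": []}]}'


# stubbed generator that succeeds on the second call
responses = iter(["not json", '{"nodes": []}'])
print(generate_until_valid_json(lambda: next(responses)))
```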
duplicate.png ADDED
poetry.lock DELETED
The diff for this file is too large to render. See raw diff
 
pyproject.toml DELETED
@@ -1,17 +0,0 @@
1
- [project]
2
- name = "data-model-curator-demo"
3
- version = "0.1.0"
4
- description = ""
5
- authors = []
6
- license = {text = "Apache 2.0"}
7
- readme = "README.md"
8
- requires-python = ">=3.10"
9
- dependencies = [
10
- "gradio (>=5.37.0,<6.0.0)"
11
- ]
12
- package-mode = false
13
-
14
-
15
- [build-system]
16
- requires = ["poetry-core>=2.0.0,<3.0.0"]
17
- build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch==2.5.1
2
+ transformers==4.50.0
3
+ pydantic==2.10.6
4
+ gradio==5.35.0
5
+ networkx==3.4.2
6
+ matplotlib==3.10.0
7
+ vllm==0.6.4.post1
8
+ peft
9
+ Pillow
10
+ spaces
sample_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ sample.id CancerRegistry_PatientID Class_of_Case_Desc Ethnicity Last_Name Sex_Desc awg_review body_fluid_code data_citation data_contributor data_description date_of_death dbgap_accession_number dna_260_280 dna_concentration full_name in_review intended_release_date longitudinal procedure_date protocol_number releasable request_submission research_design research_objective research_setup project.id dataset.id subject.id
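The sample TSV above contains only a header row; the app derives the model prompt from these column names via `get_prompt_with_files_uploaded` in the root `utils.py`. The sketch below is illustrative only (it is not the app's actual prompt-building code) and just shows how such a headers-only TSV can be read.

```python
import csv

# Read the header row of a headers-only TSV such as sample_metadata.tsv.
with open("sample_metadata.tsv", newline="") as f:
    header = next(csv.reader(f, delimiter="\t"))

print(f"{len(header)} columns, e.g. {header[:3]} ... {header[-3:]}")
```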
schema_to_sql.py ADDED
@@ -0,0 +1,206 @@
1
+ import sqlite3
2
+
3
+
4
+ def convert_type(type):
5
+ """
6
+ Returns SQL type for given AI generated type
7
+
8
+ This function takes AI generated type and returns SQL type.
9
+ For the simplified data dictionary, enums are converted to the TEXT data type
+ and arrays to TEXT arrays.
11
+
12
+ Parameters:
13
+ type (str): AI generated type
14
+
15
+ Returns:
16
+ sql_type (str): SQL type
17
+ """
18
+ sql_match = {
19
+ "string": "TEXT",
20
+ "integer": "INTEGER",
21
+ "number": "REAL",
22
+ "boolean": "BOOLEAN",
23
+ "array": "TEXT[]",
24
+ "enum": "TEXT",
25
+ }
26
+ sql_type = sql_match.get(type, "TEXT")
27
+ return sql_type
28
+
29
+
30
+ def get_pk_field(node):
31
+ """
32
+ Returns primary key field for given AI generated node
33
+
34
+ This function takes AI generated node dictionary and returns primary key field.
35
+
36
+ Parameters:
37
+ node (dict): AI generated node dictionary
38
+
39
+ Returns:
40
+ pk_field (str): Primary key field
41
+ """
42
+ # Look for a typical PK pattern: <table>.id
43
+ for prop in node["properties"]:
44
+ if prop["name"] == f"{node['name']}.id":
45
+ return prop["name"]
46
+ # Fallback
47
+ return None
48
+
49
+
50
+ def get_all_columns(node):
51
+ """
52
+ Returns all columns for given AI generated node
53
+
54
+ This function takes AI generated node dictionary and returns all columns.
55
+
56
+ Parameters:
57
+ node (dict): AI generated node dictionary
58
+
59
+ Returns:
60
+ columns (list): List of column names
61
+ """
62
+ return [prop["name"] for prop in node["properties"]]
63
+
64
+
65
+ def as_sql_col(prop_name):
66
+ """
67
+ Returns property name as a sql column name with "." replaced with "__"
68
+
69
+ This function takes AI generated DD node property name and replaces "." with "__".
70
+ Dot in the field name may cause issues during the SQL table creation.
71
+
72
+ Parameters:
73
+ prop_name (str): property name
74
+
75
+ Returns:
76
+ col_name (str): Column name with "." replaced with "__"
77
+ """
78
+ return prop_name.replace(".", "__")
79
+
80
+
81
+ def get_foreign_table_and_field(prop_name, node_name):
82
+ """
83
+ Returns foreign table and field for given property name and node_name
84
+
85
+ This function takes AI generated DD node name and property name and returns foreign table and field.
86
+
87
+ Parameters:
88
+ prop_name (str): property name
89
+ node_name (str): node name
90
+
91
+ Returns:
92
+ foreign_table (str): Foreign table name
93
+ foreign_field (str): Foreign field name
94
+ """
95
+ # Looks for pattern: e.g. project.id when not in 'project'
96
+ if prop_name.endswith(".id") and not prop_name.startswith(node_name + "."):
97
+ parent = prop_name.split(".")[0]
98
+ return parent, prop_name
99
+ return None, None
100
+
101
+
102
+ def generate_create_table(node, table_lookup):
103
+ """
104
+ Returns SQL for the given AI generated node
105
+
106
+ This function takes AI generated node dictionary and returns SQL for the node.
107
+
108
+ Parameters:
109
+ node (dict): AI generated node dictionary
110
+ table_lookup (dict): Dictionary of tables and their columns
111
+
112
+ Returns:
113
+ sql (str): SQL for the node
114
+ """
115
+ col_lines = []
116
+ fk_constraints = []
117
+ pk_fields = []
118
+ pk_field = get_pk_field(node)
119
+ required = node.get("required", [])
120
+
121
+ for prop in node["properties"]:
122
+ col = prop["name"]
123
+ coltype = convert_type(prop["type"])
124
+ sql_col = as_sql_col(col)
125
+ line = f' "{sql_col}" {coltype}'
126
+ if pk_field and col == pk_field:
127
+ pk_fields.append(sql_col)
128
+ if col in required or (pk_field and col == pk_field):
129
+ line += " NOT NULL"
130
+ col_lines.append(line)
131
+ # Foreign Keys
132
+ parent, parent_field = get_foreign_table_and_field(col, node["name"])
133
+ if parent:
134
+ ref_col = as_sql_col(parent_field)
135
+ parent_cols = table_lookup.get(parent, {})
136
+ if parent_field in parent_cols:
137
+ fk_constraints.append(
138
+ f' FOREIGN KEY ("{sql_col}") REFERENCES {parent}("{ref_col}")'
139
+ )
140
+ else:
141
+ fk_constraints.append(
142
+ f" -- WARNING: {parent} does not have field {parent_field}"
143
+ )
144
+
145
+ # Primary Keys
146
+ constraints = []
147
+ if pk_fields:
148
+ constraint_sql = ", ".join(f'"{c}"' for c in pk_fields)
149
+ constraints.append(f" PRIMARY KEY ({constraint_sql})")
150
+
151
+ lines = col_lines + constraints + fk_constraints
152
+ return f'CREATE TABLE "{node["name"]}" (\n' + ",\n".join(lines) + "\n);"
153
+
154
+
155
+ def validate_sql(sql, node_name):
156
+ """
157
+ Returns validation result for the given SQL
158
+
159
+ This function takes SQL and node name and returns validation result.
160
+
161
+ Parameters:
162
+ sql (str): SQL
163
+ node_name (str): Node name
164
+
165
+ Returns:
166
+ validation_result (str): Validation result
167
+ """
168
+ conn = sqlite3.connect(":memory:")
169
+ try:
170
+ conn.execute(sql)
171
+ validation_result = f'Valid SQL for table "{node_name}"\n'
172
+ except sqlite3.Error as e:
173
+ validation_result = f'Invalid SQL for table "{node_name}":\n{e}\n'
174
+ finally:
175
+ conn.close()
176
+ return validation_result
177
+
178
+
179
+ def dd_to_sql(dd):
180
+ """
181
+ Returns SQL for the given AI generated DD
182
+
183
+ This function takes AI generated DD and returns SQL for the DD.
184
+
185
+ Parameters:
186
+ dd (dict): AI generated DD
187
+
188
+ Returns:
189
+ sql (str): SQL
190
+ validation (str): Validation result
191
+ """
192
+ # Build a lookup for table columns in all nodes
193
+ table_lookup = {}
194
+ for node in dd["nodes"]:
195
+ table_lookup[node["name"]] = get_all_columns(node)
196
+ # pprint.pprint(table_lookup)
197
+
198
+ # Generate SQL
199
+ combined_sql = ""
200
+ validation = "Validation notes:\n"
201
+ for node in dd["nodes"]:
202
+ sql = generate_create_table(node, table_lookup) + "\n\n"
203
+ validation = validation + validate_sql(sql, node["name"])
204
+ combined_sql = combined_sql + sql
205
+
206
+ return combined_sql, validation
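`dd_to_sql` expects the generated data model as a dict with a `nodes` list, each node carrying `name`, `properties` (each with `name` and `type`), and optionally `required`. A usage sketch with a tiny made-up two-node model (the node and property names are illustrative, not taken from this repo):

```python
from schema_to_sql import dd_to_sql

toy_dd = {
    "nodes": [
        {
            "name": "project",
            "required": ["project.id"],
            "properties": [
                {"name": "project.id", "type": "string"},
                {"name": "name", "type": "string"},
            ],
        },
        {
            "name": "subject",
            "required": ["subject.id", "project.id"],
            "properties": [
                {"name": "subject.id", "type": "string"},
                {"name": "age", "type": "integer"},
                {"name": "project.id", "type": "string"},  # becomes a foreign key to project
            ],
        },
    ]
}

sql, validation = dd_to_sql(toy_dd)
print(sql)         # CREATE TABLE statements, one per node
print(validation)  # per-table SQLite validation notes
```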
secret.png ADDED
serialized_file_creation_demo/gen3_dm_scaffold.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/serialized_file_creation_demo.ipynb ADDED
@@ -0,0 +1,360 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Creation of Serialized File From AI Model Output\n",
9
+ "---\n",
10
+ "This notebook demonstrates how to use the AI-assited data model output (originally just a collection of TSV files) to a serialized file, a [PFB (Portable Format for Bioinformatics)](https://pmc.ncbi.nlm.nih.gov/articles/PMC10035862/) file. \n",
11
+ "\n",
12
+ "PFB is widely used within NIH-funded initiativies that our center is a part of, as a means for efficient storage and transfer of data between systems.\n",
13
+ " "
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "id": "1",
19
+ "metadata": {},
20
+ "source": [
21
+ "### Setup"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "2",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "%pip install pandas gen3"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "3",
37
+ "metadata": {},
38
+ "source": [
39
+ "We need some helper files to demonstrate this, so pull them in from Huggingface."
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "4",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "!git clone https://huggingface.co/spaces/uc-ctds/arpa-h-demo-test\n",
50
+ "!cd arpa-h-demo-test/serialized_file_creation_demo"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "5",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Imports and Initial Loading"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "6",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from utils import *\n",
69
+ "import os\n",
70
+ "from pathlib import Path\n",
71
+ "import pandas as pd"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "id": "7",
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "# read in the minimal Gen3 data model scaffold\n",
82
+ "scaffold_file = \"./gen3_dm_scaffold.json\"\n",
83
+ "scaffold = read_schema(scaffold_file)"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "id": "8",
89
+ "metadata": {},
90
+ "source": [
91
+ "We are demonstrating the ability to use this against an AI-generated model, but not directly inferencing to get the data model. Instead we're using a Sythnetic Data Contribution (a sample of what a data contributor would provide AND the expected simplified data model). We use these to train and test the AI model. For simplicity, we're using the model here."
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "9",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "# Find the simplified data model in a Synthetic Data Contribution directory\n",
102
+ "sdm_dir = \"./submitted_genotyping_array.mass_cytometry_image.actionable_mutation\"\n",
103
+ "sdm_file = next(\n",
104
+ " (f for f in os.listdir(sdm_dir) if f.endswith(\"_jsonschema_dd.json\")), None\n",
105
+ ")\n",
106
+ "sdm_path = os.path.join(sdm_dir, sdm_file)\n",
107
+ "print(sdm_path)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "10",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "sdm = read_schema(schema=sdm_path)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "id": "11",
123
+ "metadata": {},
124
+ "source": [
125
+ "### Creation of Serialized File"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "markdown",
130
+ "id": "12",
131
+ "metadata": {},
132
+ "source": [
133
+ "As of writing, PFB requires a Gen3-style data model, so the next steps are to ensure we can go from the simplified AI model output to a Gen3 data model. Note that in the future we may allow alternative, non-Gen3 models to create such PFBs."
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "id": "13",
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "## Create a Gen3 data model from the simplified data model\n",
144
+ "\n",
145
+ "gdm = sdm_to_gen3(sdm) # convert simplified data model nodes into the Gen3-style nodes\n",
146
+ "gdm = merge_scaffold_into_gdm(\n",
147
+ " gdm, scaffold\n",
148
+ ") # merge the scaffold into the Gen3-style data model\n",
149
+ "gdm = fix_project(gdm) # ensure project links to program and has req'd props\n",
150
+ "gdm = add_gen3_required_properties(\n",
151
+ " gdm\n",
152
+ ") # add required Gen3 properties to the project node\n",
153
+ "gdm = add_yaml_suffix_to_nodes(gdm) # ensure all nodes have .yaml suffix"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "id": "14",
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "## Write the Gen3-style data model to a JSON file\n",
164
+ "sdm_name = Path(\n",
165
+ " sdm_path\n",
166
+ ").stem # get the stem (basename without extension) of the sdm file\n",
167
+ "out_file = os.path.join(sdm_dir, f\"Gen3_{sdm_name}_pfb.json\")\n",
168
+ "write_schema(gdm, out_file) # write the schema to a file"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "markdown",
173
+ "id": "15",
174
+ "metadata": {},
175
+ "source": [
176
+ "Now we have the data model in proper format, we can serialize it into a PFB."
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "16",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "# Convert the Gen3-style data model to PFB format schema\n",
187
+ "pfb_schema = os.path.join(sdm_dir, Path(out_file).stem + \".avro\")\n",
188
+ "!pfb from -o $pfb_schema dict $out_file"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "id": "17",
194
+ "metadata": {},
195
+ "source": [
196
+ "### PFB Utilities"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "markdown",
201
+ "id": "18",
202
+ "metadata": {},
203
+ "source": [
204
+ "Now we can demonstrate creation of a PFB when you have content for it (in this case in the form of TSV metadata). The above is a PFB which contains only the data model."
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "19",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "# Get a list of TSV files in the sdm_dir\n",
215
+ "tsv_files = [f for f in os.listdir(sdm_dir) if f.endswith(\".tsv\")]\n",
216
+ "tsv_files"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "20",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "# calculate tsv file size and md5sum for each tsv_files\n",
227
+ "for tsv_file in tsv_files:\n",
228
+ " tsv_path = os.path.join(sdm_dir, tsv_file)\n",
229
+ " file_size = os.path.getsize(tsv_path)\n",
230
+ " # get the md5sum of the TSV file using md5 bash command\n",
231
+ " md5sum = get_md5sum(tsv_path)\n",
232
+ " tsv_metadata = {\n",
233
+ " \"submitter_id\": \"actionable_mutation_metadata.tsv\",\n",
234
+ " \"file_format\": \"TSV\",\n",
235
+ " \"file_name\": \"actionable_mutation_metadata.tsv\",\n",
236
+ " \"file_size\": file_size,\n",
237
+ " \"md5sum\": md5sum,\n",
238
+ " }\n",
239
+ " os.makedirs(\n",
240
+ " os.path.join(sdm_dir, \"tsv_metadata\"), exist_ok=True\n",
241
+ " ) # create the tsv_metadata directory if it doesn't exist\n",
242
+ " tsv_metadata_stem = Path(tsv_file).stem\n",
243
+ " if tsv_metadata_stem.endswith(\"_metadata\"):\n",
244
+ " tsv_metadata_stem = tsv_metadata_stem.replace(\"_metadata\", \".json\")\n",
245
+ " elif tsv_metadata_stem.endswith(\"_file_manifest\"):\n",
246
+ " tsv_metadata_stem = tsv_metadata_stem.replace(\"_file_manifest\", \".json\")\n",
247
+ " tsv_metadata_file = os.path.join(sdm_dir, \"tsv_metadata\", tsv_metadata_stem)\n",
248
+ " with open(tsv_metadata_file, \"w\") as f:\n",
249
+ " json.dump(tsv_metadata, f, indent=4)\n",
250
+ " print(f\"\\tTSV metadata written to {tsv_metadata_file}\")"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "id": "21",
257
+ "metadata": {},
258
+ "outputs": [],
259
+ "source": [
260
+ "%ls -l $sdm_dir/tsv_metadata"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "22",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "tsv_metadata"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "id": "23",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "pfb_data = os.path.join(sdm_dir, Path(out_file).stem + \"_data.avro\")\n",
281
+ "!pfb from -o $pfb_data json -s $pfb_schema --program DEV --project test $sdm_dir/tsv_metadata\n",
282
+ "if Path(pfb_data).exists():\n",
283
+ " print(f\"PFB containing TSV files written to:\\n{pfb_data}.\")"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "markdown",
288
+ "id": "24",
289
+ "metadata": {},
290
+ "source": [
291
+ "PFB contains a utility to convert from the serialized format to more readable and workable files, including TSVs. Here we demonstrate that utility:"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "25",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "!gen3 pfb to -i $pfb_data tsv # convert the PFB file to TSV format"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "id": "26",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "!gen3 pfb show -i $pfb_data # show the contents of the PFB file"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "id": "27",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "!gen3 pfb show -i $pfb_data schema | jq # show the schema of the PFB file"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "markdown",
326
+ "id": "28",
327
+ "metadata": {},
328
+ "source": [
329
+ "Now we've gone all the way from a dump of data contribution files, to a simple structured data model, to a serialized PFB, and back to usable files!"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "markdown",
334
+ "id": "29",
335
+ "metadata": {},
336
+ "source": []
337
+ }
338
+ ],
339
+ "metadata": {
340
+ "kernelspec": {
341
+ "display_name": "Python 3",
342
+ "language": "python",
343
+ "name": "python3"
344
+ },
345
+ "language_info": {
346
+ "codemirror_mode": {
347
+ "name": "ipython",
348
+ "version": 3
349
+ },
350
+ "file_extension": ".py",
351
+ "mimetype": "text/x-python",
352
+ "name": "python",
353
+ "nbconvert_exporter": "python",
354
+ "pygments_lexer": "ipython3",
355
+ "version": "3.9.6"
356
+ }
357
+ },
358
+ "nbformat": 4,
359
+ "nbformat_minor": 5
360
+ }
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
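The contents of this JSON file are not rendered on this page. Based on how `read_schema` and `convert_sdm_node_to_gen3` in `serialized_file_creation_demo/utils.py` consume it, a simplified data model roughly takes the shape sketched below; the node and property names here are hypothetical, not the actual file contents.

```python
# Hypothetical illustration of a simplified data model ("nodes" list, each node with
# name, description, links, required, and properties carrying name/type/description).
example_sdm = {
    "nodes": [
        {
            "name": "case",
            "description": "A research subject",
            "links": ["project"],
            "required": ["case.id", "project.id"],
            "properties": [
                {"name": "case.id", "type": "string", "description": "Case identifier"},
                {"name": "project.id", "type": "string", "description": "Link to the parent project"},
                {"name": "age_at_enrollment", "type": "integer", "description": ""},
            ],
        }
    ]
}
```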
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/utils.py ADDED
@@ -0,0 +1,566 @@
1
+ ## This script will take a simplified data model and convert it to a Gen3-style data model
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import shutil
7
+ import copy
8
+ import random
9
+ import datetime
10
+ from pathlib import Path
11
+ import pandas as pd
12
+ import hashlib
13
+
14
+ ###################################################################################################################################################
15
+ ### PSEUDO CODE
16
+ # 1. Read the simplified data model from a JSON file.
17
+ # 2. Create a new dictionary to hold the Gen3-style data model.
18
+ # 3. Add a generic program, CMC nodes, _terms, _settings, _definitions dict (reference https://github.com/uc-cdis/pypfb/blob/master/examples/minimal-pfb/minimal_file.json to see what's required)
19
+ # 4. For each node in the simplified data model:
20
+ # - Convert the node to the Gen3 format.
21
+ # - Add it to the new dictionary.
22
+ # 5. Write the new dictionary to a JSON file.
23
+ ###################################################################################################################################################
24
+
25
+
26
+ ######## SCRIPTS
27
+
28
+
29
+ def read_schema(schema):
30
+ """Reads in a schema.json file for a simplified / Gen3 data dictionary and returns a Python dictionary."""
31
+ with open(schema) as json_file:
32
+ try:
33
+ dm = json.load(json_file)
34
+ except json.JSONDecodeError:
35
+ print(f"Error reading schema file: {schema}")
36
+ return {}
37
+ return dm
38
+
39
+
40
+ # # read in the simplified data model
41
+ # sdm_file = "./Documents/Notes/AI/AI_data_curation/SDC/sdc_v3_nmax1_nmin1_pmax75_pmin25_limit20_dmax1000_20250423/SDM_0_20250423_tsvs/actionable_mutation/SDM_0__actionable_mutation__jsonschema_dd.json"
42
+ # sdm = read_schema(schema=sdm_file)
43
+
44
+
45
+ def write_schema(data, out_file):
46
+ with open(out_file, "w") as f:
47
+ json.dump(data, f, indent=4)
48
+ print(f"\tData model written to {out_file}")
49
+
50
+
51
+ def create_gen3_dm_scaffold():
52
+ """Creates a minimal Gen3 data model scaffold with the required nodes and properties.
53
+ Returns a dictionary representing the scaffold.
54
+ '_definitions',
55
+ '_settings',
56
+ '_terms',
57
+ 'core_metadata_collection',
58
+ 'data_release',
59
+ 'metaschema',
60
+ 'program',
61
+ 'root']
62
+ """
63
+ scaffold = {}
64
+ ## Add the _terms, _definitions, _settings using the minimal file in pypfb
65
+ mdfile = "minimal_file.json"
66
+ md = read_schema(mdfile)
67
+ gdm = copy.deepcopy(
68
+ {n: md[n] for n in md if n in ["_definitions", "_settings", "_terms"]}
69
+ )
70
+ ########### read in MIDRC data model to create node templates for program, project, and CMC
71
+ ## Add program/project/CMC node
72
+ midrc_dm = "./Documents/Notes/AI/AI_data_curation/input_schemas/gen3_schemas/data.midrc.org_schema.json"
73
+ mdm = read_schema(midrc_dm)
74
+ keep_nodes = [
75
+ "data_release",
76
+ "metaschema",
77
+ "program",
78
+ "project",
79
+ "core_metadata_collection",
80
+ "root",
81
+ ]
82
+ gdm = gdm | copy.deepcopy(
83
+ {n: mdm[n] for n in mdm if n in keep_nodes}
84
+ ) # start with the program node
85
+ gdm["data_release"]["namespace"] = "https://gen3.org"
86
+ gdm["core_metadata_collection"]["namespace"] = "https://gen3.org"
87
+ del gdm["core_metadata_collection"]["properties"]["case_ids"]
88
+ gdm["metaschema"]["properties"]["links"][
89
+ "title"
90
+ ] = "Define a link to other GDC entities" # remove the title from the links property
91
+ # write the minimal dict scaffold to a file:
92
+ out_dir = "./Gen3/dd/synthetic_data_for_AI"
93
+ out_file = os.path.join(out_dir, "gen3_dm_scaffold.json")
94
+ write_schema(gdm, out_file)
95
+ # read in the minimal Gen3 data model scaffold
96
+ scaffold_file = "./Gen3/dd/synthetic_data_for_AI/gen3_dm_scaffold.json"
97
+ scaffold = read_schema(scaffold_file)
98
+ return scaffold
99
+
100
+
101
+ def convert_sdm_node_to_gen3(node, sdm):
102
+ """Convert a simplified data model node to a Gen3-style data model node dict with these keys:
103
+ ['$schema',
104
+ 'additionalProperties',
105
+ 'category',
106
+ 'description',
107
+ 'id',
108
+ 'links',
109
+ 'namespace',
110
+ 'nodeTerms',
111
+ 'program',
112
+ 'project',
113
+ 'properties',
114
+ 'required',
115
+ 'submittable',
116
+ 'systemProperties',
117
+ 'title',
118
+ 'type',
119
+ 'uniqueKeys',
120
+ 'validators']
121
+ """
122
+ node_def = [n for n in sdm["nodes"] if n["name"] == node][
123
+ 0
124
+ ] # get the node definition from the simplified data model
125
+ gen3_node = {}
126
+ gen3_node["$schema"] = "http://json-schema.org/draft-04/schema#"
127
+ gen3_node["id"] = node_def.get("name", node)
128
+ gen3_node["title"] = node_def.get(
129
+ "name", node
130
+ ) # TODO: original title is lost; use the node name as the title
131
+ gen3_node["type"] = "object" # all object
132
+ gen3_node["nodeTerms"] = None
133
+ gen3_node["namespace"] = "https://gen3.org"
134
+ gen3_node["category"] = node_def.get(
135
+ "category", "default"
136
+ ) # # TODO: original category lost, get category from a Gen3 model
137
+ gen3_node["program"] = "*"
138
+ gen3_node["project"] = "*"
139
+ gen3_node["description"] = node_def.get("description", "")
140
+ gen3_node["additionalProperties"] = False
141
+ gen3_node["submittable"] = True
142
+ gen3_node["validators"] = None
143
+ gen3_node["systemProperties"] = [
144
+ "id",
145
+ "project_id",
146
+ "state",
147
+ "created_datetime",
148
+ "updated_datetime",
149
+ ] # TODO: if it's a file node, should add "file_state", "error_type"
150
+ gen3_node["links"] = []
151
+ for link in node_def.get("links", []):
152
+ # a link is required if any required property refers to it (e.g. "project.id")
+ required = any(r.startswith(link) for r in node_def["required"])
156
+ name = link  # the original if/else branches were identical; use the link target as the name
166
+ gen3_node["links"].append(
167
+ {
168
+ "backref": node_def.get("name", node), # TODO: backref is lost
169
+ "label": "child_of", # TODO: original label is lost; use 'child_of' as a default
170
+ "multiplicity": "many_to_many", # TODO: original multiplicity lost; get multiplicity from a Gen3 model
171
+ "name": name, # TODO: original name is lost; use the link target as the name
172
+ "required": required,
173
+ "target_type": link,
174
+ }
175
+ )
176
+ gen3_node["required"] = node_def.get("required", [])
177
+ gen3_node["uniqueKeys"] = [
178
+ ["id"],
179
+ ["project_id", "submitter_id"],
180
+ ] # TODO: original uniqueKeys lost; same for all nodes
181
+ gen3_node["properties"] = {}
182
+ # Add properties to the Gen3 node
183
+ for prop_def in node_def.get("properties", []):
184
+ prop = prop_def["name"]
185
+ if prop.endswith(
186
+ ".id"
187
+ ): # it's a link property, so add all the things for links
188
+ prop = prop[:-3] # remove the '.id' suffix
189
+ gen3_node["properties"][prop] = {}
190
+ gen3_node["properties"][prop]["description"] = prop_def.get(
191
+ "description", ""
192
+ )
193
+ gen3_node["properties"][prop]["anyOf"] = [
194
+ {
195
+ "items": {
196
+ "additionalProperties": True,
197
+ "maxItems": 1,
198
+ "minItems": 1,
199
+ "properties": {
200
+ "id": {
201
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
202
+ "term": {
203
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
204
+ "termDef": {
205
+ "cde_id": "C54100",
206
+ "cde_version": None,
207
+ "source": "NCIt",
208
+ "term": "Universally Unique Identifier",
209
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
210
+ },
211
+ },
212
+ "type": "string",
213
+ },
214
+ "submitter_id": {"type": "string"},
215
+ },
216
+ "type": "object",
217
+ },
218
+ "type": "array",
219
+ },
220
+ {
221
+ "additionalProperties": True,
222
+ "properties": {
223
+ "id": {
224
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
225
+ "term": {
226
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
227
+ "termDef": {
228
+ "cde_id": "C54100",
229
+ "cde_version": None,
230
+ "source": "NCIt",
231
+ "term": "Universally Unique Identifier",
232
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
233
+ },
234
+ },
235
+ "type": "string",
236
+ },
237
+ "submitter_id": {"type": "string"},
238
+ },
239
+ "type": "object",
240
+ },
241
+ ]
242
+ else:
243
+ gen3_node["properties"][prop] = {}
244
+ gen3_node["properties"][prop]["description"] = prop_def.get(
245
+ "description", ""
246
+ )
247
+ if "type" in prop_def and prop_def["type"] == "enum":
248
+ prop_type = "string" # TODO: if the type is enum, we need to specify the enum values, which is lost in simplified data models
249
+ elif "type" in prop_def and prop_def["type"] == "array":
250
+ prop_type = "array" # TODO: if the type is array, we need to specify items type, which is lost in simplified data models
251
+ gen3_node["properties"][prop]["items"] = {
252
+ "type": "string"
253
+ } # default to string if not specified
254
+ else:
255
+ prop_type = prop_def.get(
256
+ "type", "string"
257
+ ) # default to string if not specified
258
+ gen3_node["properties"][prop]["type"] = prop_type
259
+
260
+ return gen3_node
261
+
262
+
263
+ # convert_sdm_node_to_gen3('case',sdm)
264
+
265
+
266
+ def sdm_to_gen3(sdm):
267
+ """Converts nodes in a simplified data model into the Gen3-style data model nodes.
268
+ Returns the Gen3-ified data model as a dictionary.
269
+ """
270
+ gdm = {}
271
+ sdm_nodes = [
272
+ n["name"] for n in sdm["nodes"]
273
+ ] # get the list of node names from the simplified data model
274
+ for node in sdm_nodes:
275
+ gen3_node = convert_sdm_node_to_gen3(node, sdm)
276
+ gdm[node] = gen3_node
277
+ return gdm
278
+
279
+
280
+ # gdm = sdm_to_gen3(sdm) # convert simplified data model nodes into the Gen3-style nodes
281
+
282
+
283
+ # merge the scaffold with the Gen3-style data model
284
+ def merge_scaffold_into_gdm(gdm, scaffold):
285
+ """Merges the scaffold into the Gen3-style data model.
286
+ The scaffold contains the _terms, _definitions, _settings, program, project, and CMC nodes.
287
+ """
288
+ # Add the scaffold nodes to the Gen3-style data model
289
+ for node in scaffold:
290
+ if node not in gdm:
291
+ gdm[node] = scaffold[node]
292
+ return gdm
293
+
294
+
295
+ # gdm = merge_scaffold_into_gdm(gdm, scaffold) # merge the scaffold into the Gen3-style data model
296
+
297
+
298
+ def fix_project(gdm):
299
+ """Ensures that the project node links to the program node in the Gen3-style data model.
300
+ Adds a 'program' property to the project node if it doesn't already exist.
301
+ Adds required project node properties if they are missing.
302
+ """
303
+ if "project" in gdm and "program" in gdm:
304
+ # add link to programs
305
+ if "links" in gdm["project"] and not any(
306
+ link["target_type"] == "program" for link in gdm["project"]["links"]
307
+ ):
308
+ gdm["project"]["links"].append(
309
+ {
310
+ "backref": "project",
311
+ "label": "member_of",
312
+ "multiplicity": "many_to_one",
313
+ "name": "program",
314
+ "required": True,
315
+ "target_type": "program",
316
+ }
317
+ )
318
+ # add link name to properties
319
+ if (
320
+ "properties" in gdm["project"]
321
+ and "program" not in gdm["project"]["properties"]
322
+ ):
323
+ gdm["project"]["properties"]["program"] = {
324
+ "anyOf": [
325
+ {
326
+ "items": {
327
+ "additionalProperties": True,
328
+ "maxItems": 1,
329
+ "minItems": 1,
330
+ "properties": {
331
+ "id": {
332
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
333
+ "term": {
334
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
335
+ "termDef": {
336
+ "cde_id": "C54100",
337
+ "cde_version": None,
338
+ "source": "NCIt",
339
+ "term": "Universally Unique Identifier",
340
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
341
+ },
342
+ },
343
+ "type": "string",
344
+ },
345
+ "submitter_id": {"type": "string"},
346
+ },
347
+ "type": "object",
348
+ },
349
+ "type": "array",
350
+ },
351
+ {
352
+ "additionalProperties": True,
353
+ "properties": {
354
+ "id": {
355
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
356
+ "term": {
357
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
358
+ "termDef": {
359
+ "cde_id": "C54100",
360
+ "cde_version": None,
361
+ "source": "NCIt",
362
+ "term": "Universally Unique Identifier",
363
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
364
+ },
365
+ },
366
+ "type": "string",
367
+ },
368
+ "submitter_id": {"type": "string"},
369
+ },
370
+ "type": "object",
371
+ },
372
+ ],
373
+ "description": "The program to which the project belongs.\n",
374
+ }
375
+ # add link name to required properties (and any other Gen3-required project props)
376
+ if "required" in gdm["project"] and "program" not in gdm["project"]["required"]:
377
+ gdm["project"]["required"].append("program")
378
+
379
+ # Add required properties to project if missing
380
+ required_props = ["code", "name", "dbgap_accession_number"]
381
+ id_prop = {
382
+ "description": "UUID for the project.",
383
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
384
+ "systemAlias": "node_id",
385
+ "term": {
386
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
387
+ "termDef": {
388
+ "cde_id": "C54100",
389
+ "cde_version": None,
390
+ "source": "NCIt",
391
+ "term": "Universally Unique Identifier",
392
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
393
+ },
394
+ },
395
+ "type": "string",
396
+ }
397
+ name_prop = {
398
+ "description": "Display name/brief description for the project.",
399
+ "type": "string",
400
+ }
401
+ dbgap_accession_number_prop = {
402
+ "description": "The dbgap accession number provided for the project.",
403
+ "type": "string",
404
+ }
405
+ for prop in required_props:
406
+ if prop not in gdm["project"]["properties"]:
407
+ if prop == "id":
408
+ gdm["project"]["properties"][prop] = id_prop
409
+ elif prop == "name":
410
+ gdm["project"]["properties"][prop] = name_prop
411
+ elif prop == "dbgap_accession_number":
412
+ gdm["project"]["properties"][prop] = dbgap_accession_number_prop
413
+ if prop not in gdm["project"]["required"]:
414
+ gdm["project"]["required"].append(prop)
415
+ return gdm
416
+
417
+
418
+ # gdm = fix_project(gdm)
419
+
420
+
421
+ ## Add required Gen3 properties (submitter_id, id, code, name etc.) to the Gen3-style data model nodes
422
+ def add_gen3_required_properties(gdm):
423
+ """Adds the required Gen3 properties to the project node in the Gen3-style data model.
424
+ Ensures that the project node has: code, name, dbgap_accession_number, program.
425
+ Ensures other nodes have the required/system properties: id, submitter_id, created_datetime, updated_datetime, state, project_id,
426
+ Adds "submitter_id" and "type" to "required"
427
+ if "file_name", "object_id", "data_format", "data_category", "data_type", "md5sum" etc. are present, they need to add file_properties to properties/required as well.
428
+ """
429
+ # Define the required props that could be missing from simplified data models
430
+
431
+ required_props = {
432
+ "created_datetime": {
433
+ "oneOf": [{"format": "date-time", "type": "string"}, {"type": "null"}],
434
+ "term": {
435
+ "description": "A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n"
436
+ },
437
+ },
438
+ "id": {
439
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
440
+ "systemAlias": "node_id",
441
+ "term": {
442
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
443
+ "termDef": {
444
+ "cde_id": "C54100",
445
+ "cde_version": None,
446
+ "source": "NCIt",
447
+ "term": "Universally Unique Identifier",
448
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
449
+ },
450
+ },
451
+ "type": "string",
452
+ },
453
+ "project_id": {
454
+ "term": {
455
+ "description": "Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.\n"
456
+ },
457
+ "type": "string",
458
+ },
459
+ "state": {
460
+ "default": "validated",
461
+ "downloadable": [
462
+ "uploaded",
463
+ "md5summed",
464
+ "validating",
465
+ "validated",
466
+ "error",
467
+ "invalid",
468
+ "released",
469
+ ],
470
+ "oneOf": [
471
+ {
472
+ "enum": [
473
+ "uploading",
474
+ "uploaded",
475
+ "md5summing",
476
+ "md5summed",
477
+ "validating",
478
+ "error",
479
+ "invalid",
480
+ "suppressed",
481
+ "redacted",
482
+ "live",
483
+ ]
484
+ },
485
+ {"enum": ["validated", "submitted", "released"]},
486
+ ],
487
+ "public": ["live"],
488
+ "term": {"description": "The current state of the object.\n"},
489
+ },
490
+ "submitter_id": {
491
+ "description": "A human-readable, unique identifier for a record in the metadata database. It can be used in place of the UUID for identifying or recalling a record (e.g., in data queries or uploads/exports).",
492
+ "type": "string",
493
+ },
494
+ "type": {
495
+ "description": 'The node_id of the node in the data model; the name of the node used in queries and API requests (e.g., "aligned_reads_file" for the "Aligned Reads File" node).',
496
+ "type": "string",
497
+ },
498
+ "updated_datetime": {
499
+ "oneOf": [{"format": "date-time", "type": "string"}, {"type": "null"}],
500
+ "term": {
501
+ "description": "A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n"
502
+ },
503
+ },
504
+ }
505
+ for node in [
506
+ n
507
+ for n in gdm
508
+ if n
509
+ not in [
510
+ "_definitions",
511
+ "_settings",
512
+ "_terms",
513
+ "program",
514
+ "project",
515
+ "core_metadata_collection",
516
+ "data_release",
517
+ "metaschema",
518
+ "root",
519
+ ]
520
+ ]:
521
+ for req in list(required_props.keys()):
522
+ if req not in gdm[node]["properties"]:
523
+ gdm[node]["properties"][req] = required_props[req]
524
+ # add submitter_id and type to required_properties
525
+ if "submitter_id" not in gdm[node]["required"]:
526
+ gdm[node]["required"].append("submitter_id")
527
+ if "type" not in gdm[node]["required"]:
528
+ gdm[node]["required"].append("type")
529
+ if f"{node}.id" in gdm[node]["required"]:
530
+ gdm[node]["required"].remove(f"{node}.id")
531
+ # replace the link names with .id with the target_node name
532
+ link_targets = [link["target_type"] for link in gdm[node]["links"]]
533
+ for link in link_targets:
534
+ if f"{link}.id" in gdm[node]["required"]:
535
+ gdm[node]["required"].remove(f"{link}.id")
536
+ if link not in gdm[node]["required"]:
537
+ gdm[node]["required"].append(link)
538
+ return gdm
539
+
540
+
541
+ # gdm = add_gen3_required_properties(gdm)  # add required Gen3 properties to every data node
542
+
543
+
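+ # Illustrative sketch only (hypothetical one-node model, not shipped with the demo):
+ # gdm = {
+ #     "sample": {
+ #         "properties": {"sample.id": {"type": "string"}},
+ #         "required": ["sample.id", "case.id"],
+ #         "links": [{"target_type": "case"}],
+ #     }
+ # }
+ # gdm = add_gen3_required_properties(gdm)
+ # # "sample.id" and "case.id" are dropped from "required"; "submitter_id", "type",
+ # # and the link name "case" are appended, and the Gen3 system properties
+ # # (id, state, project_id, created/updated_datetime, ...) are merged into "properties".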
544
+ def add_yaml_suffix_to_nodes(schema):
545
+ """To ensure that the schema is compatible with Gen3's PFB format:
546
+ Adds a .yaml suffix to all nodes in the schema that do not already have it.
547
+ """
548
+ schema = {
549
+ f"{node}.yaml": schema[node] for node in schema if not node.endswith(".yaml")
550
+ }
551
+ return schema
552
+
553
+
554
+ # gdm = add_yaml_suffix_to_nodes(gdm) # ensure all nodes have .yaml suffix
555
+
556
+
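+ # For example (hypothetical node names), {"case": {...}, "sample.yaml": {...}}
+ # becomes {"case.yaml": {...}, "sample.yaml": {...}}.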
557
+ def get_md5sum(filename):
558
+ """Return the MD5 hash of a file."""
559
+ hash_md5 = hashlib.md5()
560
+ with open(filename, "rb") as f:
561
+ for chunk in iter(lambda: f.read(4096), b""):
562
+ hash_md5.update(chunk)
563
+ return hash_md5.hexdigest()
564
+
565
+
566
+ # get_md5sum(tsv_path)
tsvs/alias_metadata.tsv ADDED
@@ -0,0 +1 @@
+ alias.id participant.id
tsvs/center_metadata.tsv ADDED
@@ -0,0 +1 @@
+ center.id investigator_name name category code collaborators dbgap_accession_number publisher released study_design_allocation title verification_date project.id
tsvs/diagnosis_metadata.tsv ADDED
@@ -0,0 +1 @@
+ diagnosis.id ibd_affection_status age_at_diagnosis age_at_diagnosis_gt89 participant.id visit.id
tsvs/exposure_metadata.tsv ADDED
@@ -0,0 +1 @@
+ exposure.id nocigar_day_unknown smoking smoking_stop visit.id
tsvs/participant_metadata.tsv ADDED
@@ -0,0 +1 @@
+ participant.id consent_codes consented_for_data_sharing consortium_id_of_affected_spouse initials mothers_consortium_id center.id
tsvs/summary_file_file_manifest.tsv ADDED
@@ -0,0 +1 @@
+ summary_file.id data_format file_size center.id
tsvs/visit_metadata.tsv ADDED
@@ -0,0 +1 @@
+ visit.id age_at_visit bmi days_to_follow_up ever_transferred harmonized_visit_number health_insurance review_yr visit_date visit_number visit_type weight participant.id
utils.py ADDED
@@ -0,0 +1,485 @@
+ import os
2
+ import csv
3
+ from string import Template
4
+ import networkx as nx
5
+ import matplotlib.pyplot as plt
6
+ from io import BytesIO
7
+ from PIL import Image
8
+
9
+ MAX_INPUT_TOKEN_LENGTH = 1024
10
+ TEMPERATURE = 0.1
11
+ MAX_NEW_TOKENS = 8192
12
+
13
+
14
+ def remove_chars_loop(text, chars_to_remove):
15
+ for char in chars_to_remove:
16
+ text = text.replace(char, "")
17
+ return text
18
+
19
+
20
+ def get_prompt_with_files_uploaded(filepaths: list[str] | None = None) -> str:
21
+ if not filepaths:
22
+ return "No files uploaded yet."
23
+
24
+ prompt_begin = """
25
+ You are a data structuring expert tasked with analyzing data files (CSV, TXT, TSV, XML) to identify their schema and
26
+ generate output in the Gen3 Data Dictionary format (JSON). Review the data files for column names, data types,
27
+ and relationships, and if a data dictionary is provided, ensure strict alignment with its metadata.
28
+ Column names may have embedded information to infer the type and/or units from.
29
+
30
+ Follow these steps:
31
+ - Examine each data file to define the schema
32
+ - Cross-reference with the data dictionary, if available, to match all column definitions and metadata exactly
33
+ - Generate an output schema that mirrors the provided data structure WITHOUT adding any new entities or attributes
34
+ - Limit your output to the smallest amount of JSON possible that captures the necessary information. DO NOT BE VERBOSE
35
+
36
+ The output must include nodes, properties of those nodes, descriptions of those properties, and links to other nodes.
37
+ The output must be formatted as JSON ONLY; do not include additional text, and please be concise. Limit your output to only what is
+ necessary (nodes, properties, descriptions, relationships / links).
39
+ """
40
+
41
+ file_template = Template(
42
+ """
43
+ File name: `$file_name`
44
+ File contents:
45
+
46
+ ```
47
+ $file_contents```
48
+ """
49
+ )
50
+
51
+ prompt_end = """
52
+ Please generate the Gen3 Data Dictionary in JSON format:
53
+ """
54
+
55
+ # Start prompt
56
+ prompt = prompt_begin
57
+
58
+ for path in filepaths:
59
+ file_name = os.path.basename(path)
60
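+ # Only the header row (column names) is added to the prompt; tab-delimited
+ # input is assumed, since the demo's sample files are TSVs.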
+ with open(path, "r", encoding="utf-8") as f:
61
+ reader = csv.DictReader(f, delimiter="\t")
62
+ file_contents = "\t".join(reader.fieldnames)
63
+ prompt += file_template.substitute(
64
+ file_name=file_name, file_contents=file_contents
65
+ )
66
+
67
+ prompt += prompt_end
68
+
69
+ print(f"prompt: {prompt}")
70
+
71
+ return prompt
72
+
73
+
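+ # Illustrative usage sketch, using one of the sample TSVs added in this commit:
+ # prompt = get_prompt_with_files_uploaded(["tsvs/participant_metadata.tsv"])
+ # The returned string embeds each file's name and header row between the
+ # instruction preamble and the closing request for the Gen3 JSON dictionary.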
74
+ def create_graph_image_from_json(json_response):
75
+ adj_dict = {}
76
+
77
+ if isinstance(json_response, dict) and "nodes" in json_response:
78
+ for node in json_response.get("nodes"):
79
+ adj_dict[node["name"]] = node["links"]
80
+
81
+ G = nx.from_dict_of_lists(adj_dict)
82
+
83
+ fig, ax = plt.subplots()
84
+ nx.draw_networkx(G, with_labels=True, node_color="lightblue", ax=ax)
85
+ buf = BytesIO()
86
+ fig.savefig(buf, format="png")
87
+ plt.close(fig)
88
+ buf.seek(0)
89
+ pil_img = Image.open(buf)
90
+
91
+ return pil_img
92
+
93
+
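+ # Expected input shape (the parsed form of get_example_ai_model_output below); a sketch:
+ # create_graph_image_from_json({"nodes": [
+ #     {"name": "project", "links": []},
+ #     {"name": "center", "links": ["project"]},
+ # ]})
+ # returns a PIL.Image of the node/link graph drawn with networkx and matplotlib.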
94
+ def create_summary_tables(json_response):
95
+
96
+ node_descriptions = {}
97
+ node_property_descriptions = {}
98
+
99
+ for node in json_response["nodes"]:
100
+ node_descriptions[node["name"]] = node["description"]
101
+
102
+ properties_dict = {}
103
+ for prop in node["properties"]:
104
+ properties_dict[prop["name"]] = prop["description"]
105
+
106
+ node_property_descriptions[node["name"]] = properties_dict
107
+
108
+ return node_descriptions, node_property_descriptions
109
+
110
+
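+ # For the parsed example output below this returns, roughly,
+ # ({"project": "Any specifically defined piece of work...", "center": "...", ...},
+ #  {"project": {"code": "Unique identifier for the project.", ...}, ...}):
+ # one node-to-description map and one node-to-{property: description} map.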
111
+ def get_example_ai_model_output():
112
+ return """
113
+ {
114
+ "nodes": [
115
+ {
116
+ "name": "project",
117
+ "description": "Any specifically defined piece of work that is undertaken or attempted to meet a single requirement. (NCIt C47885)",
118
+ "links": [],
119
+ "required": [
120
+ "code",
121
+ "dbgap_accession_number",
122
+ "name",
123
+ "project.id"
124
+ ],
125
+ "properties": [
126
+ {
127
+ "name": "category",
128
+ "description": "The nature of the investigation or investigational use for which clinical study information is being submitted.",
129
+ "type": "enum"
130
+ },
131
+ {
132
+ "name": "code",
133
+ "description": "Unique identifier for the project.",
134
+ "type": "string"
135
+ },
136
+ {
137
+ "name": "collaborators",
138
+ "description": "Other organizations (if any) providing support. Support may include funding, design, implementation, data analysis or reporting. The responsible party is responsible for confirming all collaborators before listing them.",
139
+ "type": "string"
140
+ },
141
+ {
142
+ "name": "dbgap_accession_number",
143
+ "description": "The dbgap accession number provided for the project.",
144
+ "type": "string"
145
+ },
146
+ {
147
+ "name": "investigator_name",
148
+ "description": "Name of the principal investigator for the project.",
149
+ "type": "string"
150
+ },
151
+ {
152
+ "name": "name",
153
+ "description": "Display name/brief description for the project.",
154
+ "type": "string"
155
+ },
156
+ {
157
+ "name": "publisher",
158
+ "description": "An entity responsible for making the resource available. Examples of a Publisher include a person, an organization, or a service. Typically, the name of a Publisher should be used to indicate the entity.",
159
+ "type": "string"
160
+ },
161
+ {
162
+ "name": "released",
163
+ "description": "To release a project is to tell the GDC to include all submitted entities in the next GDC index.",
164
+ "type": "boolean"
165
+ },
166
+ {
167
+ "name": "study_design_allocation",
168
+ "description": "The method by which participants are assigned to arms in a clinical trial.",
169
+ "type": "enum"
170
+ },
171
+ {
172
+ "name": "title",
173
+ "description": "The title of the clinical study, corresponding to the title of the protocol.",
174
+ "type": "string"
175
+ },
176
+ {
177
+ "name": "project.id",
178
+ "description": "A unique identifier for records in this 'project' table.",
179
+ "type": "string"
180
+ },
181
+ {
182
+ "name": "verification_date",
183
+ "description": "The date on which the responsible party last verified the clinical study information in the entire ClinicalTrials.gov record for the clinical study, even if no additional or updated information is being submitted.",
184
+ "type": "string"
185
+ }
186
+ ]
187
+ },
188
+ {
189
+ "name": "center",
190
+ "description": "Genetic Research Center (GRC) or other clinical center at which research participants are recruited.",
191
+ "links": [
192
+ "project"
193
+ ],
194
+ "required": [
195
+ "name",
196
+ "project.id",
197
+ "center.id"
198
+ ],
199
+ "properties": [
200
+ {
201
+ "name": "name",
202
+ "description": "Name of center at which participants were recruited and/or at which data were collected.",
203
+ "type": "string"
204
+ },
205
+ {
206
+ "name": "project.id",
207
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'center' table.",
208
+ "type": "string"
209
+ },
210
+ {
211
+ "name": "center.id",
212
+ "description": "A unique identifier for records in this 'center' table.",
213
+ "type": "string"
214
+ }
215
+ ]
216
+ },
217
+ {
218
+ "name": "participant",
219
+ "description": "The collection of all data related to a specific subject in the context of a specific project.",
220
+ "links": [
221
+ "project",
222
+ "center"
223
+ ],
224
+ "required": [
225
+ "participant.id",
226
+ "project.id"
227
+ ],
228
+ "properties": [
229
+ {
230
+ "name": "initials",
231
+ "description": "The participant's initials.",
232
+ "type": "string"
233
+ },
234
+ {
235
+ "name": "project.id",
236
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'participant' table.",
237
+ "type": "string"
238
+ },
239
+ {
240
+ "name": "participant.id",
241
+ "description": "A unique identifier for records in this 'participant' table.",
242
+ "type": "string"
243
+ },
244
+ {
245
+ "name": "consent_codes",
246
+ "description": "",
247
+ "type": "array"
248
+ },
249
+ {
250
+ "name": "consented_for_data_sharing",
251
+ "description": "The participant has consented to share their data.",
252
+ "type": "boolean"
253
+ },
254
+ {
255
+ "name": "consortium_id_of_affected_spouse",
256
+ "description": "TBD",
257
+ "type": "integer"
258
+ },
259
+ {
260
+ "name": "mothers_consortium_id",
261
+ "description": "TBD",
262
+ "type": "integer"
263
+ },
264
+ {
265
+ "name": "center.id",
266
+ "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this 'participant' table.",
267
+ "type": "string"
268
+ }
269
+ ]
270
+ },
271
+ {
272
+ "name": "summary_file",
273
+ "description": "A summary of the data file, including the number of rows, columns, and data types.",
274
+ "links": [
275
+ "center"
276
+ ],
277
+ "required": [
278
+ "data_format",
279
+ "file_size",
280
+ "center.id",
281
+ "summary_file.id"
282
+ ],
283
+ "properties": [
284
+ {
285
+ "name": "data_format",
286
+ "description": "Format of the data files.",
287
+ "type": "enum"
288
+ },
289
+ {
290
+ "name": "file_size",
291
+ "description": "The size of the data file (object) in bytes.",
292
+ "type": "integer"
293
+ },
294
+ {
295
+ "name": "center.id",
296
+ "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this'summary_file' table.",
297
+ "type": "string"
298
+ },
299
+ {
300
+ "name": "summary_file.id",
301
+ "description": "A unique identifier for records in this'summary_file' table.",
302
+ "type": "string"
303
+ }
304
+ ]
305
+ },
306
+ {
307
+ "name": "visit",
308
+ "description": "A visit by a patient or study participant to a medical professional. A clinical encounter that encompasses planned and unplanned trial interventions, procedures and assessments that may be performed on a participant. A visit has a start and an end, each described with a rule. The process by which information about the health status of an individual is obtained before and after a study has officially closed; an activity that continues something that has already begun or that repeats something that has already been done.",
309
+ "links": [
310
+ "participant"
311
+ ],
312
+ "required": [
313
+ "visit.id",
314
+ "participant.id"
315
+ ],
316
+ "properties": [
317
+ {
318
+ "name": "age_at_visit",
319
+ "description": "The study participant's age, in years, at the visit. If the age is greater than 89 years, use the age_at_visit_gt89 property instead.",
320
+ "type": "number"
321
+ },
322
+ {
323
+ "name": "bmi",
324
+ "description": "The body mass divided by the square of the body height expressed in units of kg/m^2.",
325
+ "type": "number"
326
+ },
327
+ {
328
+ "name": "days_to_follow_up",
329
+ "description": "Number of days between the date used for index and the date the patient was seen or contacted at follow-up.",
330
+ "type": "integer"
331
+ },
332
+ {
333
+ "name": "ever_transferred",
334
+ "description": "Participant ever transferred sites (changed ids)",
335
+ "type": "enum"
336
+ },
337
+ {
338
+ "name": "harmonized_visit_number",
339
+ "description": "The derived harmonized visit number for the studies MACS and WIHS.",
340
+ "type": "integer"
341
+ },
342
+ {
343
+ "name": "health_insurance",
344
+ "description": "Currently have any health insurance",
345
+ "type": "boolean"
346
+ },
347
+ {
348
+ "name": "review_yr",
349
+ "description": "Year in which the participant's visit was reviewed",
350
+ "type": "integer"
351
+ },
352
+ {
353
+ "name": "visit_date",
354
+ "description": "Year of the visit.",
355
+ "type": "integer"
356
+ },
357
+ {
358
+ "name": "visit_number",
359
+ "description": "Visit number",
360
+ "type": "integer"
361
+ },
362
+ {
363
+ "name": "visit_type",
364
+ "description": "Define if the visit is a follow-up or the baseline visit.",
365
+ "type": "enum"
366
+ },
367
+ {
368
+ "name": "weight",
369
+ "description": "The weight of the participant measured in grams.",
370
+ "type": "number"
371
+ },
372
+ {
373
+ "name": "participant.id",
374
+ "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'visit' table.",
375
+ "type": "string"
376
+ },
377
+ {
378
+ "name": "visit.id",
379
+ "description": "A unique identifier for records in this 'visit' table.",
380
+ "type": "string"
381
+ }
382
+ ]
383
+ },
384
+ {
385
+ "name": "alias",
386
+ "description": "An alias for the subject.",
387
+ "links": [
388
+ "participant"
389
+ ],
390
+ "required": [
391
+ "participant.id",
392
+ "alias.id"
393
+ ],
394
+ "properties": [
395
+ {
396
+ "name": "participant.id",
397
+ "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'alias' table.",
398
+ "type": "string"
399
+ },
400
+ {
401
+ "name": "alias.id",
402
+ "description": "A unique identifier for records in this 'alias' table.",
403
+ "type": "string"
404
+ }
405
+ ]
406
+ },
407
+ {
408
+ "name": "diagnosis",
409
+ "description": "Data from the investigation, analysis and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms; also, the scientific determination of any kind; the concise results of such an investigation.",
410
+ "links": [
411
+ "visit"
412
+ ],
413
+ "required": [
414
+ "visit.id",
415
+ "diagnosis.id"
416
+ ],
417
+ "properties": [
418
+ {
419
+ "name": "age_at_diagnosis",
420
+ "description": "The age of the patient at the time of diagnosis.",
421
+ "type": "number"
422
+ },
423
+ {
424
+ "name": "age_at_diagnosis_gt89",
425
+ "description": "Indicates if the age at diagnosis is greater than 89 years.",
426
+ "type": "enum"
427
+ },
428
+ {
429
+ "name": "ibd_affection_status",
430
+ "description": "The IBD Affection Status of the patient.",
431
+ "type": "enum"
432
+ },
433
+ {
434
+ "name": "visit.id",
435
+ "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'diagnosis' table.",
436
+ "type": "string"
437
+ },
438
+ {
439
+ "name": "diagnosis.id",
440
+ "description": "A unique identifier for records in this 'diagnosis' table.",
441
+ "type": "string"
442
+ }
443
+ ]
444
+ },
445
+ {
446
+ "name": "exposure",
447
+ "description": "Data related to exposure information.",
448
+ "links": [
449
+ "visit"
450
+ ],
451
+ "required": [
452
+ "visit.id",
453
+ "exposure.id"
454
+ ],
455
+ "properties": [
456
+ {
457
+ "name": "nocigar_day_unknown",
458
+ "description": "Unknown",
459
+ "type": "enum"
460
+ },
461
+ {
462
+ "name": "smoking",
463
+ "description": "Smoking",
464
+ "type": "enum"
465
+ },
466
+ {
467
+ "name": "smoking_stop",
468
+ "description": "Smoking stop",
469
+ "type": "enum"
470
+ },
471
+ {
472
+ "name": "visit.id",
473
+ "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'exposure' table.",
474
+ "type": "string"
475
+ },
476
+ {
477
+ "name": "exposure.id",
478
+ "description": "A unique identifier for records in this 'exposure' table.",
479
+ "type": "string"
480
+ }
481
+ ]
482
+ }
483
+ ]
484
+ }
485
+ """