avantol committed on
Commit c567880 · 1 Parent(s): b65b037

feat(app): Initial release
Files changed (29)
  1. README.md +5 -19
  2. app.py +249 -17
  3. duplicate.png +0 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +0 -17
  6. requirements.txt +10 -0
  7. sample_metadata.tsv +1 -0
  8. schema_to_sql.py +206 -0
  9. secret.png +0 -0
  10. serialized_file_creation_demo/gen3_dm_scaffold.json +1 -0
  11. serialized_file_creation_demo/serialized_file_creation_demo.ipynb +360 -0
  12. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json +1 -0
  13. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv +1 -0
  14. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv +1 -0
  15. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv +1 -0
  16. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv +1 -0
  17. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv +1 -0
  18. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv +1 -0
  19. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json +1 -0
  20. serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv +1 -0
  21. serialized_file_creation_demo/utils.py +566 -0
  22. tsvs/alias_metadata.tsv +1 -0
  23. tsvs/center_metadata.tsv +1 -0
  24. tsvs/diagnosis_metadata.tsv +1 -0
  25. tsvs/exposure_metadata.tsv +1 -0
  26. tsvs/participant_metadata.tsv +1 -0
  27. tsvs/summary_file_file_manifest.tsv +1 -0
  28. tsvs/visit_metadata.tsv +1 -0
  29. utils.py +485 -0
README.md CHANGED
@@ -1,26 +1,12 @@
1
  ---
2
- title: Data Model Curator Demo
3
  emoji: 📝
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 5.37.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
- short_description: Demonstration of basic usage of the data model curator tool
12
-
13
- hf_oauth: true
14
- # optional, default duration is 8 hours/480 minutes. Max duration is 30 days/43200 minutes.
15
- hf_oauth_expiration_minutes: 480
16
- # optional, see "Scopes" below. "openid profile" is always included.
17
- hf_oauth_scopes:
18
- - read-repos
19
- - inference-api
20
- # optional, restrict access to members of specific organizations
21
- # hf_oauth_authorized_org:
22
- # - ORG_NAME1
23
- # - ORG_NAME2
24
  ---
25
-
26
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Llama Data Model Generator Demo
3
  emoji: 📝
4
+ colorFrom: green
5
+ colorTo: blue
6
  sdk: gradio
7
+ sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ short_description: Demo of our Llama data model generator model
12
  ---
 
 
app.py CHANGED
@@ -1,27 +1,259 @@
1
- from __future__ import annotations
 
 
2
 
 
3
  import gradio as gr
4
- from huggingface_hub import whoami
 
 
5
 
 
 
 
 
 
 
 
6
 
7
- def hello(profile: gr.OAuthProfile | None) -> str:
8
- if profile is None:
9
- return "I don't know you."
10
- return f"Hello {profile.name}"
11
 
 
 
12
 
13
- def list_organizations(oauth_token: gr.OAuthToken | None) -> str:
14
- if oauth_token is None:
15
- return "Please deploy this on Spaces and log in to list organizations."
16
- org_names = [org["name"] for org in whoami(oauth_token.token)["orgs"]]
17
- return f"You belong to {', '.join(org_names)}."
18
 
19
 
20
  with gr.Blocks() as demo:
21
- gr.LoginButton()
22
- m1 = gr.Markdown()
23
- m2 = gr.Markdown()
24
- demo.load(hello, inputs=None, outputs=m1)
25
- demo.load(list_organizations, inputs=None, outputs=m2)
26
 
27
- demo.launch()
 
 
1
+ import json
2
+ import os
3
+ import zipfile
4
 
5
+ import torch
6
  import gradio as gr
7
+ import spaces
8
+ from peft import PeftConfig, PeftModel
9
+ from transformers import AutoModelForCausalLM, AutoTokenizer
10
 
11
+ from schema_to_sql import dd_to_sql
12
+ from utils import (
13
+ create_graph_image_from_json,
14
+ create_summary_tables,
15
+ get_example_ai_model_output,
16
+ get_prompt_with_files_uploaded,
17
+ )
18
 
19
+ from utils import MAX_NEW_TOKENS, TEMPERATURE
 
 
 
20
 
21
+ LOCAL_DIR = "tsvs"  # directory of sample TSVs included in this Space
22
+ ZIP_PATH = "tsvs.zip"
23
 
24
+ AUTH_TOKEN = os.environ.get("HF_TOKEN", False)
25
+
26
+ BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
27
+ LORA_ADAPTER = "uc-ctds/data-model-curator"
28
+
29
+ MAX_RETRY_ATTEMPTS = 1
30
+
31
+ print(f"Is CUDA available: {torch.cuda.is_available()}")
32
+ print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
33
+
34
+ tokenizer = AutoTokenizer.from_pretrained(
35
+ BASE_MODEL, token=AUTH_TOKEN
36
+ )
37
+ model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, token=AUTH_TOKEN)
38
+ model = model.to("cuda")
39
+ model = model.eval()
40
+
41
+ peft_config = PeftConfig.from_pretrained(LORA_ADAPTER, token=AUTH_TOKEN)
42
+ model = PeftModel.from_pretrained(model, LORA_ADAPTER, token=AUTH_TOKEN)
43
+
44
+
45
+ @spaces.GPU(duration=360)
46
+ def run_llm_inference(model_prompt):
47
+ retry_count = 1
48
+
49
+ print("Tokenizing Input")
50
+ inputs = tokenizer(model_prompt, return_tensors="pt")
51
+ inputs = inputs.to(model.device)
52
+ prompt_length = inputs["input_ids"].shape[1]
53
+
54
+ print("Generating Initial Response")
55
+ outputs = model.generate(
56
+ **inputs,
57
+ max_new_tokens=MAX_NEW_TOKENS,
58
+ temperature=TEMPERATURE,
59
+ )
60
+
61
+ # Decode and parse output
62
+ print("Decoding output")
63
+ output_data_model = tokenizer.decode(outputs[0][prompt_length:])
64
+ output_data_model = output_data_model.split("<|eot_id|>")[0]
65
+ print(output_data_model)
66
+
67
+ # Test output for JSON schema validity
68
+ try:
69
+ json.loads(output_data_model)
70
+ valid_output = True
71
+ print("Yay - model passed")
72
+ return output_data_model
73
+
74
+ except json.JSONDecodeError:
75
+ valid_output = False
76
+
77
+ while (valid_output is False) and (retry_count <= MAX_RETRY_ATTEMPTS):
78
+
79
+ print(
80
+ f"Attempt {retry_count} did not generate a proper JSON output, proceeding to attempt {retry_count+1} of {MAX_RETRY_ATTEMPTS+1}"
81
+ )
82
+ retry_count += 1
83
+
84
+ # Try generating a new response
85
+ outputs = model.generate(
86
+ **inputs,
87
+ max_new_tokens=MAX_NEW_TOKENS,
88
+ temperature=TEMPERATURE,
89
+ )
90
+
91
+ output_data_model = tokenizer.decode(outputs[0][prompt_length:])
92
+ output_data_model = output_data_model.split("<|eot_id|>")[0]
93
+ print(output_data_model)
94
+ # Test output for JSON schema validity
95
+ try:
96
+ json.loads(output_data_model)
97
+ valid_output = True
98
+ print("Yay - model passed")
99
+ return output_data_model
100
+ except json.JSONDecodeError:
101
+ valid_output = False
102
+
103
+ # Handle cases when the model fails to generate a proper json schema
104
+ if (valid_output is False) and (retry_count > MAX_RETRY_ATTEMPTS):
105
+ print(
106
+ "Failed To Generate a Proper Schema, try checking the prompt or input TSVs and running again"
107
+ )
108
+ output_data_model = '{"nodes": [{"name": "Attempt Failed - Check logs for suggested next steps", "links": []}]}'
109
+
110
+ return output_data_model
111
+
112
+
113
+ def gen_output_from_files_uploaded(filepaths: list[str] = None):
114
+ prompt_from_tsv_upload = get_prompt_with_files_uploaded(filepaths)
115
+
116
+ # Run model to get model response (model_response is a string that needs to be loaded to json)
117
+ model_response = run_llm_inference(prompt_from_tsv_upload)
118
+ model_response_json = json.loads(model_response)
119
+
120
+ # Create Graph Network Image
121
+ graph_network_img = create_graph_image_from_json(model_response_json)
122
+
123
+ # Create SQL Code
124
+ sql, validation = dd_to_sql(model_response_json)
125
+
126
+ # Create Summary Table
127
+ nodes_df, properties_df = {}, {}
128
+ try:
129
+ nodes_df, properties_df = create_summary_tables(model_response_json)
130
+ except Exception as exc:
131
+ print(f"summary table creation failed: {exc}")
132
+
133
+ return model_response, graph_network_img, sql, nodes_df, properties_df
134
+
135
+
136
+ def gen_output_from_example():
137
+ model_response = get_example_ai_model_output()
138
+ model_response_json = json.loads(model_response)
139
+ graph_network_img = create_graph_image_from_json(model_response_json)
140
+ sql, validation = dd_to_sql(model_response_json)
141
+
142
+ return model_response, graph_network_img, sql
143
+
144
+
145
+ def zip_tsvs():
146
+ tsv_files = [f for f in os.listdir(LOCAL_DIR) if f.endswith(".tsv")]
147
+ if not tsv_files:
148
+ return None
149
+
150
+ with zipfile.ZipFile(ZIP_PATH, "w") as zipf:
151
+ for file in tsv_files:
152
+ file_path = os.path.join(LOCAL_DIR, file)
153
+ zipf.write(file_path, arcname=file)
154
+
155
+ return ZIP_PATH
156
 
157
 
158
  with gr.Blocks() as demo:
159
+ gr.Markdown("# Demonstration of Llama Data Model Generator")
160
+
161
+ gr.Markdown("## IMPORTANT Setup")
162
+
163
+ gr.Markdown(
164
+ "This demonstrates usage of our [Llama Data Model Generator](https://huggingface.co/uc-ctds/llama-data-model-generator). "
165
+ "We fine-tuned the base [Llama 3.1 8B Instruct model](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct), "
166
+ "so you must be approved to access it to use this space. Please follow the previous links and gain access to the "
167
+ "gated models before proceeding."
168
+ )
169
+
170
+ gr.Markdown(
171
+ "After gaining access, you must **duplicate this space** and add a secret variable HF_TOKEN in the settings. "
172
+ "See [Official Huggingface docs](https://huggingface.co/docs/hub/security-tokens) on how to generate a token. "
173
+ "It should only have `read` access. Note: this is due to a limitation in Huggingface Spaces and end-user "
174
+ "access to gated models."
175
+ )
176
+
177
+ gr.Image("duplicate.png", label="How to duplicate this space")
178
+
179
+ gr.Markdown(
180
+ "In your duplicated space, press the `Settings` gear button on the top right. "
181
+ "Then in the `Variables and Secrets` section, create a new **secret** named `HF_TOKEN`. "
182
+ "Ensure you add your token created in the previous step by following the official Huggingface docs."
183
+ )
184
+
185
+ gr.Image("secret.png", label="Add your Huggingface token to the secret")
186
+
187
+ gr.Markdown(
188
+ "**IMPORTANT:** Only continue after doing the above or you will get errors."
189
+ )
190
+
191
+ gr.Markdown("## (Optional) Get Sample TSV(s) to Upload")
192
+
193
+ gr.Markdown("### Example 1: A single TSV")
194
+ download_btn = gr.DownloadButton(
195
+ label="Download Single TSV", value="sample_metadata.tsv"
196
+ )
197
+ gr.Markdown("### Example 2: Many TSVs in a single .zip file.")
198
+ download_btn = gr.DownloadButton(
199
+ label="Download All Sample TSVs as .zip", value=zip_tsvs
200
+ )
201
+ gr.Markdown("You need to extract the .zip if you want to use them.")
202
+
203
+ gr.Markdown("## Upload TSVs With Headers (No Data Rows Required)")
204
+ files = gr.Files(
205
+ label="Upload TSVs",
206
+ file_types=[".tsv"],
207
+ type="filepath",
208
+ )
209
+
210
+ gr.Markdown(
211
+ "Depending on your Huggingface subscription and availability of free GPUs, this can take a few minutes to complete."
212
+ )
213
+ gr.Markdown(
214
+ "Behind the scenes, our [Llama Data Model Generator](https://huggingface.co/uc-ctds/llama-data-model-generator) AI model is being loaded "
215
+ "onto GPUs and the TSVs uploaded are being sent to the model in a specialized prompt. "
216
+ "For information about the model, please see the model card itself by clicking "
217
+ "the link above."
218
+ )
219
+
220
+ # Define Outputs
221
+ with gr.Row():
222
+ with gr.Column(scale=7):
223
+ json_out = gr.Code(
224
+ label="Generated Data Model Output",
225
+ value=json.dumps([]),
226
+ language="json",
227
+ interactive=True,
228
+ show_label=True,
229
+ container=True,
230
+ elem_id="json-output",
231
+ )
232
+ with gr.Column(scale=7):
233
+ sql_out = gr.Textbox(
234
+ label="SQL Defined Relational Schema",
235
+ value="",
236
+ show_label=True,
237
+ container=True,
238
+ )
239
+
240
+ with gr.Row():
241
+ with gr.Column(scale=7):
242
+ graph_out = gr.Image(label="Network Graph Representation", type="pil")
243
+
244
+ # If files are uploaded, generate prompt and run model
245
+ files.upload(
246
+ fn=gen_output_from_files_uploaded,
247
+ inputs=files,
248
+ outputs=[json_out, graph_out, sql_out],
249
+ )
250
+
251
+ gr.Markdown("Run out of FreeGPU or having issues? Try the example output!")
252
+ demo_btn = gr.Button("Manually Load Example Output from Previous Run")
253
+ demo_btn.click(
254
+ fn=gen_output_from_example,
255
+ outputs=[json_out, graph_out, sql_out],
256
+ )
257
 
258
+ if __name__ == "__main__":
259
+ demo.launch()
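The retry loop in `run_llm_inference` above boils down to: generate, try `json.loads`, and regenerate up to `MAX_RETRY_ATTEMPTS` more times before falling back to a placeholder data model. Below is a minimal sketch of that pattern (not part of this commit); `generate_fn` is a stub standing in for the actual model call.

```python
import json

MAX_RETRY_ATTEMPTS = 1  # mirrors the constant defined in app.py


def generate_until_valid_json(generate_fn, max_retries=MAX_RETRY_ATTEMPTS):
    """Call generate_fn until it returns parseable JSON or retries run out."""
    for attempt in range(max_retries + 1):
        candidate = generate_fn()
        try:
            json.loads(candidate)  # validity check only; the raw string is returned
            return candidate
        except json.JSONDecodeError:
            print(f"Attempt {attempt + 1} did not produce valid JSON")
    # fall back to a placeholder data model, as the app does
    return '{"nodes": [{"name": "Attempt Failed - Check logs for suggested next steps", "links": []}]}'


# stubbed generator that succeeds on the second call
responses = iter(["not json", '{"nodes": []}'])
print(generate_until_valid_json(lambda: next(responses)))
```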
duplicate.png ADDED
poetry.lock DELETED
The diff for this file is too large to render. See raw diff
 
pyproject.toml DELETED
@@ -1,17 +0,0 @@
1
- [project]
2
- name = "data-model-curator-demo"
3
- version = "0.1.0"
4
- description = ""
5
- authors = []
6
- license = {text = "Apache 2.0"}
7
- readme = "README.md"
8
- requires-python = ">=3.10"
9
- dependencies = [
10
- "gradio (>=5.37.0,<6.0.0)"
11
- ]
12
- package-mode = false
13
-
14
-
15
- [build-system]
16
- requires = ["poetry-core>=2.0.0,<3.0.0"]
17
- build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,10 @@
1
+ torch==2.5.1
2
+ transformers==4.50.0
3
+ pydantic==2.10.6
4
+ gradio==5.35.0
5
+ networkx==3.4.2
6
+ matplotlib==3.10.0
7
+ vllm==0.6.4.post1
8
+ peft
9
+ Pillow
10
+ spaces
sample_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ sample.id CancerRegistry_PatientID Class_of_Case_Desc Ethnicity Last_Name Sex_Desc awg_review body_fluid_code data_citation data_contributor data_description date_of_death dbgap_accession_number dna_260_280 dna_concentration full_name in_review intended_release_date longitudinal procedure_date protocol_number releasable request_submission research_design research_objective research_setup project.id dataset.id subject.id
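The sample TSV above contains only a header row; the app derives the model prompt from these column names via `get_prompt_with_files_uploaded` in the root `utils.py`. The sketch below is illustrative only (it is not the app's actual prompt-building code) and just shows how such a headers-only TSV can be read.

```python
import csv

# Read the header row of a headers-only TSV such as sample_metadata.tsv.
with open("sample_metadata.tsv", newline="") as f:
    header = next(csv.reader(f, delimiter="\t"))

print(f"{len(header)} columns, e.g. {header[:3]} ... {header[-3:]}")
```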
schema_to_sql.py ADDED
@@ -0,0 +1,206 @@
1
+ import sqlite3
2
+
3
+
4
+ def convert_type(type):
5
+ """
6
+ Returns SQL type for given AI generated type
7
+
8
+ This function takes AI generated type and returns SQL type.
9
+ For the simplified data dictionary, enums are converted to the TEXT data type
+ and arrays to TEXT arrays.
11
+
12
+ Parameters:
13
+ type (str): AI generated type
14
+
15
+ Returns:
16
+ sql_type (str): SQL type
17
+ """
18
+ sql_match = {
19
+ "string": "TEXT",
20
+ "integer": "INTEGER",
21
+ "number": "REAL",
22
+ "boolean": "BOOLEAN",
23
+ "array": "TEXT[]",
24
+ "enum": "TEXT",
25
+ }
26
+ sql_type = sql_match.get(type, "TEXT")
27
+ return sql_type
28
+
29
+
30
+ def get_pk_field(node):
31
+ """
32
+ Returns primary key field for given AI generated node
33
+
34
+ This function takes AI generated node dictionary and returns primary key field.
35
+
36
+ Parameters:
37
+ node (dict): AI generated node dictionary
38
+
39
+ Returns:
40
+ pk_field (str): Primary key field
41
+ """
42
+ # Look for a typical PK pattern: <table>.id
43
+ for prop in node["properties"]:
44
+ if prop["name"] == f"{node['name']}.id":
45
+ return prop["name"]
46
+ # Fallback
47
+ return None
48
+
49
+
50
+ def get_all_columns(node):
51
+ """
52
+ Returns all columns for given AI generated node
53
+
54
+ This function takes AI generated node dictionary and returns all columns.
55
+
56
+ Parameters:
57
+ node (dict): AI generated node dictionary
58
+
59
+ Returns:
60
+ columns (list): List of column names
61
+ """
62
+ return [prop["name"] for prop in node["properties"]]
63
+
64
+
65
+ def as_sql_col(prop_name):
66
+ """
67
+ Returns property name as a sql column name with "." replaced with "__"
68
+
69
+ This function takes AI generated DD node property name and replaces "." with "__".
70
+ Dot in the field name may cause issues during the SQL table creation.
71
+
72
+ Parameters:
73
+ prop_name (str): property name
74
+
75
+ Returns:
76
+ col_name (str): Column name with "." replaced with "__"
77
+ """
78
+ return prop_name.replace(".", "__")
79
+
80
+
81
+ def get_foreign_table_and_field(prop_name, node_name):
82
+ """
83
+ Returns foreign table and field for given property name and node_name
84
+
85
+ This function takes AI generated DD node name and property name and returns foreign table and field.
86
+
87
+ Parameters:
88
+ prop_name (str): property name
89
+ node_name (str): node name
90
+
91
+ Returns:
92
+ foreign_table (str): Foreign table name
93
+ foreign_field (str): Foreign field name
94
+ """
95
+ # Looks for pattern: e.g. project.id when not in 'project'
96
+ if prop_name.endswith(".id") and not prop_name.startswith(node_name + "."):
97
+ parent = prop_name.split(".")[0]
98
+ return parent, prop_name
99
+ return None, None
100
+
101
+
102
+ def generate_create_table(node, table_lookup):
103
+ """
104
+ Returns SQL for the given AI generated node
105
+
106
+ This function takes AI generated node dictionary and returns SQL for the node.
107
+
108
+ Parameters:
109
+ node (dict): AI generated node dictionary
110
+ table_lookup (dict): Dictionary of tables and their columns
111
+
112
+ Returns:
113
+ sql (str): SQL for the node
114
+ """
115
+ col_lines = []
116
+ fk_constraints = []
117
+ pk_fields = []
118
+ pk_field = get_pk_field(node)
119
+ required = node.get("required", [])
120
+
121
+ for prop in node["properties"]:
122
+ col = prop["name"]
123
+ coltype = convert_type(prop["type"])
124
+ sql_col = as_sql_col(col)
125
+ line = f' "{sql_col}" {coltype}'
126
+ if pk_field and col == pk_field:
127
+ pk_fields.append(sql_col)
128
+ if col in required or (pk_field and col == pk_field):
129
+ line += " NOT NULL"
130
+ col_lines.append(line)
131
+ # Foreign Keys
132
+ parent, parent_field = get_foreign_table_and_field(col, node["name"])
133
+ if parent:
134
+ ref_col = as_sql_col(parent_field)
135
+ parent_cols = table_lookup.get(parent, {})
136
+ if parent_field in parent_cols:
137
+ fk_constraints.append(
138
+ f' FOREIGN KEY ("{sql_col}") REFERENCES {parent}("{ref_col}")'
139
+ )
140
+ else:
141
+ fk_constraints.append(
142
+ f" -- WARNING: {parent} does not have field {parent_field}"
143
+ )
144
+
145
+ # Primary Keys
146
+ constraints = []
147
+ if pk_fields:
148
+ constraint_sql = ", ".join(f'"{c}"' for c in pk_fields)
149
+ constraints.append(f" PRIMARY KEY ({constraint_sql})")
150
+
151
+ lines = col_lines + constraints + fk_constraints
152
+ return f'CREATE TABLE "{node["name"]}" (\n' + ",\n".join(lines) + "\n);"
153
+
154
+
155
+ def validate_sql(sql, node_name):
156
+ """
157
+ Returns validation result for the given SQL
158
+
159
+ This function takes SQL and node name and returns validation result.
160
+
161
+ Parameters:
162
+ sql (str): SQL
163
+ node_name (str): Node name
164
+
165
+ Returns:
166
+ validation_result (str): Validation result
167
+ """
168
+ conn = sqlite3.connect(":memory:")
169
+ try:
170
+ conn.execute(sql)
171
+ validation_result = f'Valid SQL for table "{node_name}"\n'
172
+ except sqlite3.Error as e:
173
+ validation_result = f'Invalid SQL for table "{node_name}":\n{e}\n'
174
+ finally:
175
+ conn.close()
176
+ return validation_result
177
+
178
+
179
+ def dd_to_sql(dd):
180
+ """
181
+ Returns SQL for the given AI generated DD
182
+
183
+ This function takes AI generated DD and returns SQL for the DD.
184
+
185
+ Parameters:
186
+ dd (dict): AI generated DD
187
+
188
+ Returns:
189
+ sql (str): SQL
190
+ validation (str): Validation result
191
+ """
192
+ # Build a lookup for table columns in all nodes
193
+ table_lookup = {}
194
+ for node in dd["nodes"]:
195
+ table_lookup[node["name"]] = get_all_columns(node)
196
+ # pprint.pprint(table_lookup)
197
+
198
+ # Generate SQL
199
+ combined_sql = ""
200
+ validation = "Validation notes:\n"
201
+ for node in dd["nodes"]:
202
+ sql = generate_create_table(node, table_lookup) + "\n\n"
203
+ validation = validation + validate_sql(sql, node["name"])
204
+ combined_sql = combined_sql + sql
205
+
206
+ return combined_sql, validation
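`dd_to_sql` expects the generated data model as a dict with a `nodes` list, each node carrying `name`, `properties` (each with `name` and `type`), and optionally `required`. A usage sketch with a tiny made-up two-node model (the node and property names are illustrative, not taken from this repo):

```python
from schema_to_sql import dd_to_sql

toy_dd = {
    "nodes": [
        {
            "name": "project",
            "required": ["project.id"],
            "properties": [
                {"name": "project.id", "type": "string"},
                {"name": "name", "type": "string"},
            ],
        },
        {
            "name": "subject",
            "required": ["subject.id", "project.id"],
            "properties": [
                {"name": "subject.id", "type": "string"},
                {"name": "age", "type": "integer"},
                {"name": "project.id", "type": "string"},  # becomes a foreign key to project
            ],
        },
    ]
}

sql, validation = dd_to_sql(toy_dd)
print(sql)         # CREATE TABLE statements, one per node
print(validation)  # per-table SQLite validation notes
```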
secret.png ADDED
serialized_file_creation_demo/gen3_dm_scaffold.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/serialized_file_creation_demo.ipynb ADDED
@@ -0,0 +1,360 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "0",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Creation of Serialized File From AI Model Output\n",
9
+ "---\n",
10
+ "This notebook demonstrates how to use the AI-assited data model output (originally just a collection of TSV files) to a serialized file, a [PFB (Portable Format for Bioinformatics)](https://pmc.ncbi.nlm.nih.gov/articles/PMC10035862/) file. \n",
11
+ "\n",
12
+ "PFB is widely used within NIH-funded initiativies that our center is a part of, as a means for efficient storage and transfer of data between systems.\n",
13
+ " "
14
+ ]
15
+ },
16
+ {
17
+ "cell_type": "markdown",
18
+ "id": "1",
19
+ "metadata": {},
20
+ "source": [
21
+ "### Setup"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "id": "2",
28
+ "metadata": {},
29
+ "outputs": [],
30
+ "source": [
31
+ "%pip install pandas gen3"
32
+ ]
33
+ },
34
+ {
35
+ "cell_type": "markdown",
36
+ "id": "3",
37
+ "metadata": {},
38
+ "source": [
39
+ "We need some helper files to demonstrate this, so pull them in from Huggingface."
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": null,
45
+ "id": "4",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "!git clone https://huggingface.co/spaces/uc-ctds/arpa-h-demo-test\n",
50
+ "!cd arpa-h-demo-test/serialized_file_creation_demo"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "5",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Imports and Initial Loading"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": null,
64
+ "id": "6",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from utils import *\n",
69
+ "import os\n",
70
+ "from pathlib import Path\n",
71
+ "import pandas as pd"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "id": "7",
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "# read in the minimal Gen3 data model scaffold\n",
82
+ "scaffold_file = \"./gen3_dm_scaffold.json\"\n",
83
+ "scaffold = read_schema(scaffold_file)"
84
+ ]
85
+ },
86
+ {
87
+ "cell_type": "markdown",
88
+ "id": "8",
89
+ "metadata": {},
90
+ "source": [
91
+ "We are demonstrating the ability to use this against an AI-generated model, but not directly inferencing to get the data model. Instead we're using a Sythnetic Data Contribution (a sample of what a data contributor would provide AND the expected simplified data model). We use these to train and test the AI model. For simplicity, we're using the model here."
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "id": "9",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "# Find the simplified data model in a Synthetic Data Contribution directory\n",
102
+ "sdm_dir = \"./submitted_genotyping_array.mass_cytometry_image.actionable_mutation\"\n",
103
+ "sdm_file = next(\n",
104
+ " (f for f in os.listdir(sdm_dir) if f.endswith(\"_jsonschema_dd.json\")), None\n",
105
+ ")\n",
106
+ "sdm_path = os.path.join(sdm_dir, sdm_file)\n",
107
+ "print(sdm_path)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": null,
113
+ "id": "10",
114
+ "metadata": {},
115
+ "outputs": [],
116
+ "source": [
117
+ "sdm = read_schema(schema=sdm_path)"
118
+ ]
119
+ },
120
+ {
121
+ "cell_type": "markdown",
122
+ "id": "11",
123
+ "metadata": {},
124
+ "source": [
125
+ "### Creation of Serialized File"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "markdown",
130
+ "id": "12",
131
+ "metadata": {},
132
+ "source": [
133
+ "As of writing, PFB requires a Gen3-style data model, so the next steps are to ensure we can go from the simplified AI model output to a Gen3 data model. Note that in the future we may allow alternative, non-Gen3 models to create such PFBs."
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "id": "13",
140
+ "metadata": {},
141
+ "outputs": [],
142
+ "source": [
143
+ "## Create a Gen3 data model from the simplified data model\n",
144
+ "\n",
145
+ "gdm = sdm_to_gen3(sdm) # convert simplified data model nodes into the Gen3-style nodes\n",
146
+ "gdm = merge_scaffold_into_gdm(\n",
147
+ " gdm, scaffold\n",
148
+ ") # merge the scaffold into the Gen3-style data model\n",
149
+ "gdm = fix_project(gdm) # ensure project links to program and has req'd props\n",
150
+ "gdm = add_gen3_required_properties(\n",
151
+ " gdm\n",
152
+ ") # add required Gen3 properties to the project node\n",
153
+ "gdm = add_yaml_suffix_to_nodes(gdm) # ensure all nodes have .yaml suffix"
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "id": "14",
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "## Write the Gen3-style data model to a JSON file\n",
164
+ "sdm_name = Path(\n",
165
+ " sdm_path\n",
166
+ ").stem # get the stem (basename without extension) of the sdm file\n",
167
+ "out_file = os.path.join(sdm_dir, f\"Gen3_{sdm_name}_pfb.json\")\n",
168
+ "write_schema(gdm, out_file) # write the schema to a file"
169
+ ]
170
+ },
171
+ {
172
+ "cell_type": "markdown",
173
+ "id": "15",
174
+ "metadata": {},
175
+ "source": [
176
+ "Now we have the data model in proper format, we can serialize it into a PFB."
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": null,
182
+ "id": "16",
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "# Convert the Gen3-style data model to PFB format schema\n",
187
+ "pfb_schema = os.path.join(sdm_dir, Path(out_file).stem + \".avro\")\n",
188
+ "!pfb from -o $pfb_schema dict $out_file"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "markdown",
193
+ "id": "17",
194
+ "metadata": {},
195
+ "source": [
196
+ "### PFB Utilities"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "markdown",
201
+ "id": "18",
202
+ "metadata": {},
203
+ "source": [
204
+ "Now we can demonstrate creation of a PFB when you have content for it (in this case in the form of TSV metadata). The above is a PFB which contains only the data model."
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": null,
210
+ "id": "19",
211
+ "metadata": {},
212
+ "outputs": [],
213
+ "source": [
214
+ "# Get a list of TSV files in the sdm_dir\n",
215
+ "tsv_files = [f for f in os.listdir(sdm_dir) if f.endswith(\".tsv\")]\n",
216
+ "tsv_files"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "id": "20",
223
+ "metadata": {},
224
+ "outputs": [],
225
+ "source": [
226
+ "# calculate tsv file size and md5sum for each tsv_files\n",
227
+ "for tsv_file in tsv_files:\n",
228
+ " tsv_path = os.path.join(sdm_dir, tsv_file)\n",
229
+ " file_size = os.path.getsize(tsv_path)\n",
230
+ " # get the md5sum of the TSV file using md5 bash command\n",
231
+ " md5sum = get_md5sum(tsv_path)\n",
232
+ " tsv_metadata = {\n",
233
+ " \"submitter_id\": \"actionable_mutation_metadata.tsv\",\n",
234
+ " \"file_format\": \"TSV\",\n",
235
+ " \"file_name\": \"actionable_mutation_metadata.tsv\",\n",
236
+ " \"file_size\": file_size,\n",
237
+ " \"md5sum\": md5sum,\n",
238
+ " }\n",
239
+ " os.makedirs(\n",
240
+ " os.path.join(sdm_dir, \"tsv_metadata\"), exist_ok=True\n",
241
+ " ) # create the tsv_metadata directory if it doesn't exist\n",
242
+ " tsv_metadata_stem = Path(tsv_file).stem\n",
243
+ " if tsv_metadata_stem.endswith(\"_metadata\"):\n",
244
+ " tsv_metadata_stem = tsv_metadata_stem.replace(\"_metadata\", \".json\")\n",
245
+ " elif tsv_metadata_stem.endswith(\"_file_manifest\"):\n",
246
+ " tsv_metadata_stem = tsv_metadata_stem.replace(\"_file_manifest\", \".json\")\n",
247
+ " tsv_metadata_file = os.path.join(sdm_dir, \"tsv_metadata\", tsv_metadata_stem)\n",
248
+ " with open(tsv_metadata_file, \"w\") as f:\n",
249
+ " json.dump(tsv_metadata, f, indent=4)\n",
250
+ " print(f\"\\tTSV metadata written to {tsv_metadata_file}\")"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": null,
256
+ "id": "21",
257
+ "metadata": {},
258
+ "outputs": [],
259
+ "source": [
260
+ "%ls -l $sdm_dir/tsv_metadata"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": null,
266
+ "id": "22",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "tsv_metadata"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "id": "23",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "pfb_data = os.path.join(sdm_dir, Path(out_file).stem + \"_data.avro\")\n",
281
+ "!pfb from -o $pfb_data json -s $pfb_schema --program DEV --project test $sdm_dir/tsv_metadata\n",
282
+ "if Path(pfb_data).exists():\n",
283
+ " print(f\"PFB containing TSV files written to:\\n{pfb_data}.\")"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "markdown",
288
+ "id": "24",
289
+ "metadata": {},
290
+ "source": [
291
+ "PFB contains a utility to convert from the serialized format to more readable and workable files, including TSVs. Here we demonstrate that utility:"
292
+ ]
293
+ },
294
+ {
295
+ "cell_type": "code",
296
+ "execution_count": null,
297
+ "id": "25",
298
+ "metadata": {},
299
+ "outputs": [],
300
+ "source": [
301
+ "!gen3 pfb to -i $pfb_data tsv # convert the PFB file to TSV format"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": null,
307
+ "id": "26",
308
+ "metadata": {},
309
+ "outputs": [],
310
+ "source": [
311
+ "!gen3 pfb show -i $pfb_data # show the contents of the PFB file"
312
+ ]
313
+ },
314
+ {
315
+ "cell_type": "code",
316
+ "execution_count": null,
317
+ "id": "27",
318
+ "metadata": {},
319
+ "outputs": [],
320
+ "source": [
321
+ "!gen3 pfb show -i $pfb_data schema | jq # show the schema of the PFB file"
322
+ ]
323
+ },
324
+ {
325
+ "cell_type": "markdown",
326
+ "id": "28",
327
+ "metadata": {},
328
+ "source": [
329
+ "Now we've gone all the way from a dump of data contribution files, to a simple structured data model, to a serialized PFB, and back to usable files!"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "markdown",
334
+ "id": "29",
335
+ "metadata": {},
336
+ "source": []
337
+ }
338
+ ],
339
+ "metadata": {
340
+ "kernelspec": {
341
+ "display_name": "Python 3",
342
+ "language": "python",
343
+ "name": "python3"
344
+ },
345
+ "language_info": {
346
+ "codemirror_mode": {
347
+ "name": "ipython",
348
+ "version": 3
349
+ },
350
+ "file_extension": ".py",
351
+ "mimetype": "text/x-python",
352
+ "name": "python",
353
+ "nbconvert_exporter": "python",
354
+ "pygments_lexer": "ipython3",
355
+ "version": "3.9.6"
356
+ }
357
+ },
358
+ "nbformat": 4,
359
+ "nbformat_minor": 5
360
+ }
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/SDM_0__submitted_genotyping_array.mass_cytometry_image.actionable_mutation__jsonschema_dd.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
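The contents of this JSON file are not rendered on this page. Based on how `read_schema` and `convert_sdm_node_to_gen3` in `serialized_file_creation_demo/utils.py` consume it, a simplified data model roughly takes the shape sketched below; the node and property names here are hypothetical, not the actual file contents.

```python
# Hypothetical illustration of a simplified data model ("nodes" list, each node with
# name, description, links, required, and properties carrying name/type/description).
example_sdm = {
    "nodes": [
        {
            "name": "case",
            "description": "A research subject",
            "links": ["project"],
            "required": ["case.id", "project.id"],
            "properties": [
                {"name": "case.id", "type": "string", "description": "Case identifier"},
                {"name": "project.id", "type": "string", "description": "Link to the parent project"},
                {"name": "age_at_enrollment", "type": "integer", "description": ""},
            ],
        }
    ]
}
```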
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/actionable_mutation_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/aliquot_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/analyte_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/case_metadata.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_assay_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/mass_cytometry_image_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array.mass_cytometry_image.actionable_mutation_paths.json ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/submitted_genotyping_array.mass_cytometry_image.actionable_mutation/submitted_genotyping_array_file_manifest.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ Invalid username or password.
serialized_file_creation_demo/utils.py ADDED
@@ -0,0 +1,566 @@
1
+ ## This script will take a simplified data model and convert it to a Gen3-style data model
2
+
3
+ import json
4
+ import os
5
+ import sys
6
+ import shutil
7
+ import copy
8
+ import random
9
+ import datetime
10
+ from pathlib import Path
11
+ import pandas as pd
12
+ import hashlib
13
+
14
+ ###################################################################################################################################################
15
+ ### PSEUDO CODE
16
+ # 1. Read the simplified data model from a JSON file.
17
+ # 2. Create a new dictionary to hold the Gen3-style data model.
18
+ # 3. Add a generic program, CMC nodes, _terms, _settings, _definitions dict (reference https://github.com/uc-cdis/pypfb/blob/master/examples/minimal-pfb/minimal_file.json to see what's required)
19
+ # 4. For each node in the simplified data model:
20
+ # - Convert the node to the Gen3 format.
21
+ # - Add it to the new dictionary.
22
+ # 5. Write the new dictionary to a JSON file.
23
+ ###################################################################################################################################################
24
+
25
+
26
+ ######## SCRIPTS
27
+
28
+
29
+ def read_schema(schema):
30
+ """Reads in a schema.json file for a simplified / Gen3 data dictionary and returns a Python dictionary."""
31
+ with open(schema) as json_file:
32
+ try:
33
+ dm = json.load(json_file)
34
+ except json.JSONDecodeError:
35
+ print(f"Error reading schema file: {schema}")
36
+ return {}
37
+ return dm
38
+
39
+
40
+ # # read in the simplified data model
41
+ # sdm_file = "./Documents/Notes/AI/AI_data_curation/SDC/sdc_v3_nmax1_nmin1_pmax75_pmin25_limit20_dmax1000_20250423/SDM_0_20250423_tsvs/actionable_mutation/SDM_0__actionable_mutation__jsonschema_dd.json"
42
+ # sdm = read_schema(schema=sdm_file)
43
+
44
+
45
+ def write_schema(data, out_file):
46
+ with open(out_file, "w") as f:
47
+ json.dump(data, f, indent=4)
48
+ print(f"\tData model written to {out_file}")
49
+
50
+
51
+ def create_gen3_dm_scaffold():
52
+ """Creates a minimal Gen3 data model scaffold with the required nodes and properties.
53
+ Returns a dictionary representing the scaffold.
54
+ '_definitions',
55
+ '_settings',
56
+ '_terms',
57
+ 'core_metadata_collection',
58
+ 'data_release',
59
+ 'metaschema',
60
+ 'program',
61
+ 'root']
62
+ """
63
+ scaffold = {}
64
+ ## Add the _terms, _definitions, _settings using the minimal file in pypfb
65
+ mdfile = "minimal_file.json"
66
+ md = read_schema(mdfile)
67
+ gdm = copy.deepcopy(
68
+ {n: md[n] for n in md if n in ["_definitions", "_settings", "_terms"]}
69
+ )
70
+ ########### read in MIDRC data model to create node templates for program, project, and CMC
71
+ ## Add program/project/CMC node
72
+ midrc_dm = "./Documents/Notes/AI/AI_data_curation/input_schemas/gen3_schemas/data.midrc.org_schema.json"
73
+ mdm = read_schema(midrc_dm)
74
+ keep_nodes = [
75
+ "data_release",
76
+ "metaschema",
77
+ "program",
78
+ "project",
79
+ "core_metadata_collection",
80
+ "root",
81
+ ]
82
+ gdm = gdm | copy.deepcopy(
83
+ {n: mdm[n] for n in mdm if n in keep_nodes}
84
+ ) # start with the program node
85
+ gdm["data_release"]["namespace"] = "https://gen3.org"
86
+ gdm["core_metadata_collection"]["namespace"] = "https://gen3.org"
87
+ del gdm["core_metadata_collection"]["properties"]["case_ids"]
88
+ gdm["metaschema"]["properties"]["links"][
89
+ "title"
90
+ ] = "Define a link to other GDC entities" # remove the title from the links property
91
+ # write the minimal dict scaffold to a file:
92
+ out_dir = "./Gen3/dd/synthetic_data_for_AI"
93
+ out_file = os.path.join(out_dir, "gen3_dm_scaffold.json")
94
+ write_schema(gdm, out_file)
95
+ # read in the minimal Gen3 data model scaffold
96
+ scaffold_file = "./Gen3/dd/synthetic_data_for_AI/gen3_dm_scaffold.json"
97
+ scaffold = read_schema(scaffold_file)
98
+ return scaffold
99
+
100
+
101
+ def convert_sdm_node_to_gen3(node, sdm):
102
+ """Convert a simplified data model node to a Gen3-style data model node dict with these keys:
103
+ ['$schema',
104
+ 'additionalProperties',
105
+ 'category',
106
+ 'description',
107
+ 'id',
108
+ 'links',
109
+ 'namespace',
110
+ 'nodeTerms',
111
+ 'program',
112
+ 'project',
113
+ 'properties',
114
+ 'required',
115
+ 'submittable',
116
+ 'systemProperties',
117
+ 'title',
118
+ 'type',
119
+ 'uniqueKeys',
120
+ 'validators']
121
+ """
122
+ node_def = [n for n in sdm["nodes"] if n["name"] == node][
123
+ 0
124
+ ] # get the node definition from the simplified data model
125
+ gen3_node = {}
126
+ gen3_node["$schema"] = "http://json-schema.org/draft-04/schema#"
127
+ gen3_node["id"] = node_def.get("name", node)
128
+ gen3_node["title"] = node_def.get(
129
+ "name", node
130
+ ) # TODO: original title is lost; use the node name as the title
131
+ gen3_node["type"] = "object" # all object
132
+ gen3_node["nodeTerms"] = None
133
+ gen3_node["namespace"] = "https://gen3.org"
134
+ gen3_node["category"] = node_def.get(
135
+ "category", "default"
136
+ ) # # TODO: original category lost, get category from a Gen3 model
137
+ gen3_node["program"] = "*"
138
+ gen3_node["project"] = "*"
139
+ gen3_node["description"] = node_def.get("description", "")
140
+ gen3_node["additionalProperties"] = False
141
+ gen3_node["submittable"] = True
142
+ gen3_node["validators"] = None
143
+ gen3_node["systemProperties"] = [
144
+ "id",
145
+ "project_id",
146
+ "state",
147
+ "created_datetime",
148
+ "updated_datetime",
149
+ ] # TODO: if it's a file node, should add "file_state", "error_type"
150
+ gen3_node["links"] = []
151
+ for link in node_def.get("links", []):
152
+ # a link is required if any required property refers to it (e.g. "project.id")
+ required = any(r.startswith(link) for r in node_def["required"])
156
+ name = link  # the original if/else branches were identical; use the link target as the name
166
+ gen3_node["links"].append(
167
+ {
168
+ "backref": node_def.get("name", node), # TODO: backref is lost
169
+ "label": "child_of", # TODO: original label is lost; use 'child_of' as a default
170
+ "multiplicity": "many_to_many", # TODO: original multiplicity lost; get multiplicity from a Gen3 model
171
+ "name": name, # TODO: original name is lost; use the link target as the name
172
+ "required": required,
173
+ "target_type": link,
174
+ }
175
+ )
176
+ gen3_node["required"] = node_def.get("required", [])
177
+ gen3_node["uniqueKeys"] = [
178
+ ["id"],
179
+ ["project_id", "submitter_id"],
180
+ ] # TODO: original uniqueKeys lost; same for all nodes
181
+ gen3_node["properties"] = {}
182
+ # Add properties to the Gen3 node
183
+ for prop_def in node_def.get("properties", []):
184
+ prop = prop_def["name"]
185
+ if prop.endswith(
186
+ ".id"
187
+ ): # it's a link property, so add all the things for links
188
+ prop = prop[:-3] # remove the '.id' suffix
189
+ gen3_node["properties"][prop] = {}
190
+ gen3_node["properties"][prop]["description"] = prop_def.get(
191
+ "description", ""
192
+ )
193
+ gen3_node["properties"][prop]["anyOf"] = [
194
+ {
195
+ "items": {
196
+ "additionalProperties": True,
197
+ "maxItems": 1,
198
+ "minItems": 1,
199
+ "properties": {
200
+ "id": {
201
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
202
+ "term": {
203
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
204
+ "termDef": {
205
+ "cde_id": "C54100",
206
+ "cde_version": None,
207
+ "source": "NCIt",
208
+ "term": "Universally Unique Identifier",
209
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
210
+ },
211
+ },
212
+ "type": "string",
213
+ },
214
+ "submitter_id": {"type": "string"},
215
+ },
216
+ "type": "object",
217
+ },
218
+ "type": "array",
219
+ },
220
+ {
221
+ "additionalProperties": True,
222
+ "properties": {
223
+ "id": {
224
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
225
+ "term": {
226
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
227
+ "termDef": {
228
+ "cde_id": "C54100",
229
+ "cde_version": None,
230
+ "source": "NCIt",
231
+ "term": "Universally Unique Identifier",
232
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
233
+ },
234
+ },
235
+ "type": "string",
236
+ },
237
+ "submitter_id": {"type": "string"},
238
+ },
239
+ "type": "object",
240
+ },
241
+ ]
242
+ else:
243
+ gen3_node["properties"][prop] = {}
244
+ gen3_node["properties"][prop]["description"] = prop_def.get(
245
+ "description", ""
246
+ )
247
+ if "type" in prop_def and prop_def["type"] == "enum":
248
+ prop_type = "string" # TODO: if the type is enum, we need to specify the enum values, which is lost in simplified data models
249
+ elif "type" in prop_def and prop_def["type"] == "array":
250
+ prop_type = "array" # TODO: if the type is array, we need to specify items type, which is lost in simplified data models
251
+ gen3_node["properties"][prop]["items"] = {
252
+ "type": "string"
253
+ } # default to string if not specified
254
+ else:
255
+ prop_type = prop_def.get(
256
+ "type", "string"
257
+ ) # default to string if not specified
258
+ gen3_node["properties"][prop]["type"] = prop_type
259
+
260
+ return gen3_node
261
+
262
+
263
+ # convert_sdm_node_to_gen3('case',sdm)
264
+
265
+
266
+ def sdm_to_gen3(sdm):
267
+ """Converts nodes in a simplified data model into the Gen3-style data model nodes.
268
+ Returns the Gen3-ified data model as a dictionary.
269
+ """
270
+ gdm = {}
271
+ sdm_nodes = [
272
+ n["name"] for n in sdm["nodes"]
273
+ ] # get the list of node names from the simplified data model
274
+ for node in sdm_nodes:
275
+ gen3_node = convert_sdm_node_to_gen3(node, sdm)
276
+ gdm[node] = gen3_node
277
+ return gdm
278
+
279
+
280
+ # gdm = sdm_to_gen3(sdm) # convert simplified data model nodes into the Gen3-style nodes
281
+
282
+
283
+ # merge the scaffold with the Gen3-style data model
284
+ def merge_scaffold_into_gdm(gdm, scaffold):
285
+ """Merges the scaffold into the Gen3-style data model.
286
+ The scaffold contains the _terms, _definitions, _settings, program, project, and CMC nodes.
287
+ """
288
+ # Add the scaffold nodes to the Gen3-style data model
289
+ for node in scaffold:
290
+ if node not in gdm:
291
+ gdm[node] = scaffold[node]
292
+ return gdm
293
+
294
+
295
+ # gdm = merge_scaffold_into_gdm(gdm, scaffold) # merge the scaffold into the Gen3-style data model
296
+
297
+
298
+ def fix_project(gdm):
299
+ """Ensures that the project node links to the program node in the Gen3-style data model.
300
+ Adds a 'program' property to the project node if it doesn't already exist.
301
+ Adds required project node properties if they are missing.
302
+ """
303
+ if "project" in gdm and "program" in gdm:
304
+ # add link to programs
305
+ if "links" in gdm["project"] and not any(
306
+ link["target_type"] == "program" for link in gdm["project"]["links"]
307
+ ):
308
+ gdm["project"]["links"].append(
309
+ {
310
+ "backref": "project",
311
+ "label": "member_of",
312
+ "multiplicity": "many_to_one",
313
+ "name": "program",
314
+ "required": True,
315
+ "target_type": "program",
316
+ }
317
+ )
318
+ # add link name to properties
319
+ if (
320
+ "properties" in gdm["project"]
321
+ and "program" not in gdm["project"]["properties"]
322
+ ):
323
+ gdm["project"]["properties"]["program"] = {
324
+ "anyOf": [
325
+ {
326
+ "items": {
327
+ "additionalProperties": True,
328
+ "maxItems": 1,
329
+ "minItems": 1,
330
+ "properties": {
331
+ "id": {
332
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
333
+ "term": {
334
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
335
+ "termDef": {
336
+ "cde_id": "C54100",
337
+ "cde_version": None,
338
+ "source": "NCIt",
339
+ "term": "Universally Unique Identifier",
340
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
341
+ },
342
+ },
343
+ "type": "string",
344
+ },
345
+ "submitter_id": {"type": "string"},
346
+ },
347
+ "type": "object",
348
+ },
349
+ "type": "array",
350
+ },
351
+ {
352
+ "additionalProperties": True,
353
+ "properties": {
354
+ "id": {
355
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
356
+ "term": {
357
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
358
+ "termDef": {
359
+ "cde_id": "C54100",
360
+ "cde_version": None,
361
+ "source": "NCIt",
362
+ "term": "Universally Unique Identifier",
363
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
364
+ },
365
+ },
366
+ "type": "string",
367
+ },
368
+ "submitter_id": {"type": "string"},
369
+ },
370
+ "type": "object",
371
+ },
372
+ ],
373
+ "description": "The program to which the project belongs.\n",
374
+ }
375
+ # add link name to required properties (and any other Gen3-required project props)
376
+ if "required" in gdm["project"] and "program" not in gdm["project"]["required"]:
377
+ gdm["project"]["required"].append("program")
378
+
379
+ # Add required properties to project if missing
380
+ required_props = ["code", "name", "dbgap_accession_number"]
381
+ id_prop = {
382
+ "description": "UUID for the project.",
383
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
384
+ "systemAlias": "node_id",
385
+ "term": {
386
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
387
+ "termDef": {
388
+ "cde_id": "C54100",
389
+ "cde_version": None,
390
+ "source": "NCIt",
391
+ "term": "Universally Unique Identifier",
392
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
393
+ },
394
+ },
395
+ "type": "string",
396
+ }
397
+ name_prop = {
398
+ "description": "Display name/brief description for the project.",
399
+ "type": "string",
400
+ }
401
+ dbgap_accession_number_prop = {
402
+ "description": "The dbgap accession number provided for the project.",
403
+ "type": "string",
404
+ }
405
+ for prop in required_props:
406
+ if prop not in gdm["project"]["properties"]:
407
+ if prop == "id":
408
+ gdm["project"]["properties"][prop] = id_prop
409
+ elif prop == "name":
410
+ gdm["project"]["properties"][prop] = name_prop
411
+ elif prop == "dbgap_accession_number":
412
+ gdm["project"]["properties"][prop] = dbgap_accession_number_prop
413
+ if prop not in gdm["project"]["required"]:
414
+ gdm["project"]["required"].append(prop)
415
+ return gdm
416
+
417
+
418
+ # gdm = fix_project(gdm)
419
+
420
+
421
+ ## Add required Gen3 properties (submitter_id, id, code, name etc.) to the Gen3-style data model nodes
422
+ def add_gen3_required_properties(gdm):
423
+ """Adds the required Gen3 properties to the project node in the Gen3-style data model.
424
+ Ensures that the project node has: code, name, dbgap_accession_number, program.
425
+ Ensures other nodes have the required/system properties: id, submitter_id, created_datetime, updated_datetime, state, project_id,
426
+ Adds "submitter_id" and "type" to "required"
427
+ if "file_name", "object_id", "data_format", "data_category", "data_type", "md5sum" etc. are present, they need to add file_properties to properties/required as well.
428
+ """
429
+ # Define the required props that could be missing from simplified data models
430
+
431
+ required_props = {
432
+ "created_datetime": {
433
+ "oneOf": [{"format": "date-time", "type": "string"}, {"type": "null"}],
434
+ "term": {
435
+ "description": "A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n"
436
+ },
437
+ },
438
+ "id": {
439
+ "pattern": "^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$",
440
+ "systemAlias": "node_id",
441
+ "term": {
442
+ "description": "A 128-bit identifier. Depending on the mechanism used to generate it, it is either guaranteed to be different from all other UUIDs/GUIDs generated until 3400 AD or extremely likely to be different. Its relatively small size lends itself well to sorting, ordering, and hashing of all sorts, storing in databases, simple allocation, and ease of programming in general.\n",
443
+ "termDef": {
444
+ "cde_id": "C54100",
445
+ "cde_version": None,
446
+ "source": "NCIt",
447
+ "term": "Universally Unique Identifier",
448
+ "term_url": "https://ncit.nci.nih.gov/ncitbrowser/ConceptReport.jsp?dictionary=NCI_Thesaurus&version=16.02d&ns=NCI_Thesaurus&code=C54100",
449
+ },
450
+ },
451
+ "type": "string",
452
+ },
453
+ "project_id": {
454
+ "term": {
455
+ "description": "Unique ID for any specific defined piece of work that is undertaken or attempted to meet a single requirement.\n"
456
+ },
457
+ "type": "string",
458
+ },
459
+ "state": {
460
+ "default": "validated",
461
+ "downloadable": [
462
+ "uploaded",
463
+ "md5summed",
464
+ "validating",
465
+ "validated",
466
+ "error",
467
+ "invalid",
468
+ "released",
469
+ ],
470
+ "oneOf": [
471
+ {
472
+ "enum": [
473
+ "uploading",
474
+ "uploaded",
475
+ "md5summing",
476
+ "md5summed",
477
+ "validating",
478
+ "error",
479
+ "invalid",
480
+ "suppressed",
481
+ "redacted",
482
+ "live",
483
+ ]
484
+ },
485
+ {"enum": ["validated", "submitted", "released"]},
486
+ ],
487
+ "public": ["live"],
488
+ "term": {"description": "The current state of the object.\n"},
489
+ },
490
+ "submitter_id": {
491
+ "description": "A human-readable, unique identifier for a record in the metadata database. It can be used in place of the UUID for identifying or recalling a record (e.g., in data queries or uploads/exports).",
492
+ "type": "string",
493
+ },
494
+ "type": {
495
+ "description": 'The node_id of the node in the data model; the name of the node used in queries and API requests (e.g., "aligned_reads_file" for the "Aligned Reads File" node).',
496
+ "type": "string",
497
+ },
498
+ "updated_datetime": {
499
+ "oneOf": [{"format": "date-time", "type": "string"}, {"type": "null"}],
500
+ "term": {
501
+ "description": "A combination of date and time of day in the form [-]CCYY-MM-DDThh:mm:ss[Z|(+|-)hh:mm]\n"
502
+ },
503
+ },
504
+ }
505
+ for node in [
506
+ n
507
+ for n in gdm
508
+ if n
509
+ not in [
510
+ "_definitions",
511
+ "_settings",
512
+ "_terms",
513
+ "program",
514
+ "project",
515
+ "core_metadata_collection",
516
+ "data_release",
517
+ "metaschema",
518
+ "root",
519
+ ]
520
+ ]:
521
+ for req in list(required_props.keys()):
522
+ if req not in gdm[node]["properties"]:
523
+ gdm[node]["properties"][req] = required_props[req]
524
+ # add submitter_id and type to required_properties
525
+ if "submitter_id" not in gdm[node]["required"]:
526
+ gdm[node]["required"].append("submitter_id")
527
+ if "type" not in gdm[node]["required"]:
528
+ gdm[node]["required"].append("type")
529
+ if f"{node}.id" in gdm[node]["required"]:
530
+ gdm[node]["required"].remove(f"{node}.id")
531
+ # replace the link names with .id with the target_node name
532
+ link_targets = [link["target_type"] for link in gdm[node]["links"]]
533
+ for link in link_targets:
534
+ if f"{link}.id" in gdm[node]["required"]:
535
+ gdm[node]["required"].remove(f"{link}.id")
536
+ if link not in gdm[node]["required"]:
537
+ gdm[node]["required"].append(link)
538
+ return gdm
539
+
540
+
541
+ # gdm = add_gen3_required_properties(gdm)  # add required Gen3 properties to every data node
542
+
543
+
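+ # Illustrative sketch only (hypothetical one-node model, not shipped with the demo):
+ # gdm = {
+ #     "sample": {
+ #         "properties": {"sample.id": {"type": "string"}},
+ #         "required": ["sample.id", "case.id"],
+ #         "links": [{"target_type": "case"}],
+ #     }
+ # }
+ # gdm = add_gen3_required_properties(gdm)
+ # # "sample.id" and "case.id" are dropped from "required"; "submitter_id", "type",
+ # # and the link name "case" are appended, and the Gen3 system properties
+ # # (id, state, project_id, created/updated_datetime, ...) are merged into "properties".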
544
+ def add_yaml_suffix_to_nodes(schema):
545
+ """To ensure that the schema is compatible with Gen3's PFB format:
546
+ Adds a .yaml suffix to all nodes in the schema that do not already have it.
547
+ """
548
+ schema = {
549
+ f"{node}.yaml": schema[node] for node in schema if not node.endswith(".yaml")
550
+ }
551
+ return schema
552
+
553
+
554
+ # gdm = add_yaml_suffix_to_nodes(gdm) # ensure all nodes have .yaml suffix
555
+
556
+
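+ # For example (hypothetical node names), {"case": {...}, "sample.yaml": {...}}
+ # becomes {"case.yaml": {...}, "sample.yaml": {...}}.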
557
+ def get_md5sum(filename):
558
+ """Return the MD5 hash of a file."""
559
+ hash_md5 = hashlib.md5()
560
+ with open(filename, "rb") as f:
561
+ for chunk in iter(lambda: f.read(4096), b""):
562
+ hash_md5.update(chunk)
563
+ return hash_md5.hexdigest()
564
+
565
+
566
+ # get_md5sum(tsv_path)
tsvs/alias_metadata.tsv ADDED
@@ -0,0 +1 @@
+ alias.id participant.id
tsvs/center_metadata.tsv ADDED
@@ -0,0 +1 @@
+ center.id investigator_name name category code collaborators dbgap_accession_number publisher released study_design_allocation title verification_date project.id
tsvs/diagnosis_metadata.tsv ADDED
@@ -0,0 +1 @@
+ diagnosis.id ibd_affection_status age_at_diagnosis age_at_diagnosis_gt89 participant.id visit.id
tsvs/exposure_metadata.tsv ADDED
@@ -0,0 +1 @@
+ exposure.id nocigar_day_unknown smoking smoking_stop visit.id
tsvs/participant_metadata.tsv ADDED
@@ -0,0 +1 @@
+ participant.id consent_codes consented_for_data_sharing consortium_id_of_affected_spouse initials mothers_consortium_id center.id
tsvs/summary_file_file_manifest.tsv ADDED
@@ -0,0 +1 @@
+ summary_file.id data_format file_size center.id
tsvs/visit_metadata.tsv ADDED
@@ -0,0 +1 @@
+ visit.id age_at_visit bmi days_to_follow_up ever_transferred harmonized_visit_number health_insurance review_yr visit_date visit_number visit_type weight participant.id
utils.py ADDED
@@ -0,0 +1,485 @@
+ import os
2
+ import csv
3
+ from string import Template
4
+ import networkx as nx
5
+ import matplotlib.pyplot as plt
6
+ from io import BytesIO
7
+ from PIL import Image
8
+
9
+ MAX_INPUT_TOKEN_LENGTH = 1024
10
+ TEMPERATURE = 0.1
11
+ MAX_NEW_TOKENS = 8192
12
+
13
+
14
+ def remove_chars_loop(text, chars_to_remove):
15
+ for char in chars_to_remove:
16
+ text = text.replace(char, "")
17
+ return text
18
+
19
+
20
+ def get_prompt_with_files_uploaded(filepaths: list[str] | None = None) -> str:
21
+ if not filepaths:
22
+ return "No files uploaded yet."
23
+
24
+ prompt_begin = """
25
+ You are a data structuring expert tasked with analyzing data files (CSV, TXT, TSV, XML) to identify their schema and
26
+ generate output in the Gen3 Data Dictionary format (JSON). Review the data files for column names, data types,
27
+ and relationships, and if a data dictionary is provided, ensure strict alignment with its metadata.
28
+ Column names may have embedded information to infer the type and/or units from.
29
+
30
+ Follow these steps:
31
+ - Examine each data file to define the schema
32
+ - Cross-reference with the data dictionary, if available, to match all column definitions and metadata exactly
33
+ - Generate an output schema that mirrors the provided data structure WITHOUT adding any new entities or attributes
34
+ - Limit your output to the smallest amount of JSON possible that captures the necessary information. DO NOT BE VERBOSE
35
+
36
+ The output must include nodes, properties of those nodes, descriptions of those properties, and links to other nodes.
37
+ The output must be formatted as JSON ONLY; do not include additional text, and please be concise. Limit your output to only what is
+ necessary (nodes, properties, descriptions, relationships / links).
39
+ """
40
+
41
+ file_template = Template(
42
+ """
43
+ File name: `$file_name`
44
+ File contents:
45
+
46
+ ```
47
+ $file_contents```
48
+ """
49
+ )
50
+
51
+ prompt_end = """
52
+ Please generate the Gen3 Data Dictionary in JSON format:
53
+ """
54
+
55
+ # Start prompt
56
+ prompt = prompt_begin
57
+
58
+ for path in filepaths:
59
+ file_name = os.path.basename(path)
60
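+ # Only the header row (column names) is added to the prompt; tab-delimited
+ # input is assumed, since the demo's sample files are TSVs.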
+ with open(path, "r", encoding="utf-8") as f:
61
+ reader = csv.DictReader(f, delimiter="\t")
62
+ file_contents = "\t".join(reader.fieldnames)
63
+ prompt += file_template.substitute(
64
+ file_name=file_name, file_contents=file_contents
65
+ )
66
+
67
+ prompt += prompt_end
68
+
69
+ print(f"prompt: {prompt}")
70
+
71
+ return prompt
72
+
73
+
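+ # Illustrative usage sketch, using one of the sample TSVs added in this commit:
+ # prompt = get_prompt_with_files_uploaded(["tsvs/participant_metadata.tsv"])
+ # The returned string embeds each file's name and header row between the
+ # instruction preamble and the closing request for the Gen3 JSON dictionary.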
74
+ def create_graph_image_from_json(json_response):
75
+ adj_dict = {}
76
+
77
+ if isinstance(json_response, dict) and "nodes" in json_response:
78
+ for node in json_response.get("nodes"):
79
+ adj_dict[node["name"]] = node["links"]
80
+
81
+ G = nx.from_dict_of_lists(adj_dict)
82
+
83
+ fig, ax = plt.subplots()
84
+ nx.draw_networkx(G, with_labels=True, node_color="lightblue", ax=ax)
85
+ buf = BytesIO()
86
+ fig.savefig(buf, format="png")
87
+ plt.close(fig)
88
+ buf.seek(0)
89
+ pil_img = Image.open(buf)
90
+
91
+ return pil_img
92
+
93
+
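+ # Expected input shape (the parsed form of get_example_ai_model_output below); a sketch:
+ # create_graph_image_from_json({"nodes": [
+ #     {"name": "project", "links": []},
+ #     {"name": "center", "links": ["project"]},
+ # ]})
+ # returns a PIL.Image of the node/link graph drawn with networkx and matplotlib.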
94
+ def create_summary_tables(json_response):
95
+
96
+ node_descriptions = {}
97
+ node_property_descriptions = {}
98
+
99
+ for node in json_response["nodes"]:
100
+ node_descriptions[node["name"]] = node["description"]
101
+
102
+ properties_dict = {}
103
+ for prop in node["properties"]:
104
+ properties_dict[prop["name"]] = prop["description"]
105
+
106
+ node_property_descriptions[node["name"]] = properties_dict
107
+
108
+ return node_descriptions, node_property_descriptions
109
+
110
+
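+ # For the parsed example output below this returns, roughly,
+ # ({"project": "Any specifically defined piece of work...", "center": "...", ...},
+ #  {"project": {"code": "Unique identifier for the project.", ...}, ...}):
+ # one node-to-description map and one node-to-{property: description} map.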
111
+ def get_example_ai_model_output():
112
+ return """
113
+ {
114
+ "nodes": [
115
+ {
116
+ "name": "project",
117
+ "description": "Any specifically defined piece of work that is undertaken or attempted to meet a single requirement. (NCIt C47885)",
118
+ "links": [],
119
+ "required": [
120
+ "code",
121
+ "dbgap_accession_number",
122
+ "name",
123
+ "project.id"
124
+ ],
125
+ "properties": [
126
+ {
127
+ "name": "category",
128
+ "description": "The nature of the investigation or investigational use for which clinical study information is being submitted.",
129
+ "type": "enum"
130
+ },
131
+ {
132
+ "name": "code",
133
+ "description": "Unique identifier for the project.",
134
+ "type": "string"
135
+ },
136
+ {
137
+ "name": "collaborators",
138
+ "description": "Other organizations (if any) providing support. Support may include funding, design, implementation, data analysis or reporting. The responsible party is responsible for confirming all collaborators before listing them.",
139
+ "type": "string"
140
+ },
141
+ {
142
+ "name": "dbgap_accession_number",
143
+ "description": "The dbgap accession number provided for the project.",
144
+ "type": "string"
145
+ },
146
+ {
147
+ "name": "investigator_name",
148
+ "description": "Name of the principal investigator for the project.",
149
+ "type": "string"
150
+ },
151
+ {
152
+ "name": "name",
153
+ "description": "Display name/brief description for the project.",
154
+ "type": "string"
155
+ },
156
+ {
157
+ "name": "publisher",
158
+ "description": "An entity responsible for making the resource available. Examples of a Publisher include a person, an organization, or a service. Typically, the name of a Publisher should be used to indicate the entity.",
159
+ "type": "string"
160
+ },
161
+ {
162
+ "name": "released",
163
+ "description": "To release a project is to tell the GDC to include all submitted entities in the next GDC index.",
164
+ "type": "boolean"
165
+ },
166
+ {
167
+ "name": "study_design_allocation",
168
+ "description": "The method by which participants are assigned to arms in a clinical trial.",
169
+ "type": "enum"
170
+ },
171
+ {
172
+ "name": "title",
173
+ "description": "The title of the clinical study, corresponding to the title of the protocol.",
174
+ "type": "string"
175
+ },
176
+ {
177
+ "name": "project.id",
178
+ "description": "A unique identifier for records in this 'project' table.",
179
+ "type": "string"
180
+ },
181
+ {
182
+ "name": "verification_date",
183
+ "description": "The date on which the responsible party last verified the clinical study information in the entire ClinicalTrials.gov record for the clinical study, even if no additional or updated information is being submitted.",
184
+ "type": "string"
185
+ }
186
+ ]
187
+ },
188
+ {
189
+ "name": "center",
190
+ "description": "Genetic Research Center (GRC) or other clinical center at which research participants are recruited.",
191
+ "links": [
192
+ "project"
193
+ ],
194
+ "required": [
195
+ "name",
196
+ "project.id",
197
+ "center.id"
198
+ ],
199
+ "properties": [
200
+ {
201
+ "name": "name",
202
+ "description": "Name of center at which participants were recruited and/or at which data were collected.",
203
+ "type": "string"
204
+ },
205
+ {
206
+ "name": "project.id",
207
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'center' table.",
208
+ "type": "string"
209
+ },
210
+ {
211
+ "name": "center.id",
212
+ "description": "A unique identifier for records in this 'center' table.",
213
+ "type": "string"
214
+ }
215
+ ]
216
+ },
217
+ {
218
+ "name": "participant",
219
+ "description": "The collection of all data related to a specific subject in the context of a specific project.",
220
+ "links": [
221
+ "project",
222
+ "center"
223
+ ],
224
+ "required": [
225
+ "participant.id",
226
+ "project.id"
227
+ ],
228
+ "properties": [
229
+ {
230
+ "name": "initials",
231
+ "description": "The participant's initials.",
232
+ "type": "string"
233
+ },
234
+ {
235
+ "name": "project.id",
236
+ "description": "Unique identifiers for records in the 'project' table that relate via this foreign key to records in this 'participant' table.",
237
+ "type": "string"
238
+ },
239
+ {
240
+ "name": "participant.id",
241
+ "description": "A unique identifier for records in this 'participant' table.",
242
+ "type": "string"
243
+ },
244
+ {
245
+ "name": "consent_codes",
246
+ "description": "",
247
+ "type": "array"
248
+ },
249
+ {
250
+ "name": "consented_for_data_sharing",
251
+ "description": "The participant has consented to share their data.",
252
+ "type": "boolean"
253
+ },
254
+ {
255
+ "name": "consortium_id_of_affected_spouse",
256
+ "description": "TBD",
257
+ "type": "integer"
258
+ },
259
+ {
260
+ "name": "mothers_consortium_id",
261
+ "description": "TBD",
262
+ "type": "integer"
263
+ },
264
+ {
265
+ "name": "center.id",
266
+ "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this 'participant' table.",
267
+ "type": "string"
268
+ }
269
+ ]
270
+ },
271
+ {
272
+ "name": "summary_file",
273
+ "description": "A summary of the data file, including the number of rows, columns, and data types.",
274
+ "links": [
275
+ "center"
276
+ ],
277
+ "required": [
278
+ "data_format",
279
+ "file_size",
280
+ "center.id",
281
+ "summary_file.id"
282
+ ],
283
+ "properties": [
284
+ {
285
+ "name": "data_format",
286
+ "description": "Format of the data files.",
287
+ "type": "enum"
288
+ },
289
+ {
290
+ "name": "file_size",
291
+ "description": "The size of the data file (object) in bytes.",
292
+ "type": "integer"
293
+ },
294
+ {
295
+ "name": "center.id",
296
+ "description": "Unique identifiers for records in the 'center' table that relate via this foreign key to records in this'summary_file' table.",
297
+ "type": "string"
298
+ },
299
+ {
300
+ "name": "summary_file.id",
301
+ "description": "A unique identifier for records in this'summary_file' table.",
302
+ "type": "string"
303
+ }
304
+ ]
305
+ },
306
+ {
307
+ "name": "visit",
308
+ "description": "A visit by a patient or study participant to a medical professional. A clinical encounter that encompasses planned and unplanned trial interventions, procedures and assessments that may be performed on a participant. A visit has a start and an end, each described with a rule. The process by which information about the health status of an individual is obtained before and after a study has officially closed; an activity that continues something that has already begun or that repeats something that has already been done.",
309
+ "links": [
310
+ "participant"
311
+ ],
312
+ "required": [
313
+ "visit.id",
314
+ "participant.id"
315
+ ],
316
+ "properties": [
317
+ {
318
+ "name": "age_at_visit",
319
+ "description": "The study participant's age, in years, at the visit. If the age is greater than 89 years, use the age_at_visit_gt89 property instead.",
320
+ "type": "number"
321
+ },
322
+ {
323
+ "name": "bmi",
324
+ "description": "The body mass divided by the square of the body height expressed in units of kg/m^2.",
325
+ "type": "number"
326
+ },
327
+ {
328
+ "name": "days_to_follow_up",
329
+ "description": "Number of days between the date used for index and the date the patient was seen or contacted at follow-up.",
330
+ "type": "integer"
331
+ },
332
+ {
333
+ "name": "ever_transferred",
334
+ "description": "Participant ever transferred sites (changed ids)",
335
+ "type": "enum"
336
+ },
337
+ {
338
+ "name": "harmonized_visit_number",
339
+ "description": "The derived harmonized visit number for the studies MACS and WIHS.",
340
+ "type": "integer"
341
+ },
342
+ {
343
+ "name": "health_insurance",
344
+ "description": "Currently have any health insurance",
345
+ "type": "boolean"
346
+ },
347
+ {
348
+ "name": "review_yr",
349
+ "description": "Year in which the participant's visit was reviewed",
350
+ "type": "integer"
351
+ },
352
+ {
353
+ "name": "visit_date",
354
+ "description": "Year of the visit.",
355
+ "type": "integer"
356
+ },
357
+ {
358
+ "name": "visit_number",
359
+ "description": "Visit number",
360
+ "type": "integer"
361
+ },
362
+ {
363
+ "name": "visit_type",
364
+ "description": "Define if the visit is a follow-up or the baseline visit.",
365
+ "type": "enum"
366
+ },
367
+ {
368
+ "name": "weight",
369
+ "description": "The weight of the participant measured in grams.",
370
+ "type": "number"
371
+ },
372
+ {
373
+ "name": "participant.id",
374
+ "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'visit' table.",
375
+ "type": "string"
376
+ },
377
+ {
378
+ "name": "visit.id",
379
+ "description": "A unique identifier for records in this 'visit' table.",
380
+ "type": "string"
381
+ }
382
+ ]
383
+ },
384
+ {
385
+ "name": "alias",
386
+ "description": "An alias for the subject.",
387
+ "links": [
388
+ "participant"
389
+ ],
390
+ "required": [
391
+ "participant.id",
392
+ "alias.id"
393
+ ],
394
+ "properties": [
395
+ {
396
+ "name": "participant.id",
397
+ "description": "Unique identifiers for records in the 'participant' table that relate via this foreign key to records in this 'alias' table.",
398
+ "type": "string"
399
+ },
400
+ {
401
+ "name": "alias.id",
402
+ "description": "A unique identifier for records in this 'alias' table.",
403
+ "type": "string"
404
+ }
405
+ ]
406
+ },
407
+ {
408
+ "name": "diagnosis",
409
+ "description": "Data from the investigation, analysis and recognition of the presence and nature of disease, condition, or injury from expressed signs and symptoms; also, the scientific determination of any kind; the concise results of such an investigation.",
410
+ "links": [
411
+ "visit"
412
+ ],
413
+ "required": [
414
+ "visit.id",
415
+ "diagnosis.id"
416
+ ],
417
+ "properties": [
418
+ {
419
+ "name": "age_at_diagnosis",
420
+ "description": "The age of the patient at the time of diagnosis.",
421
+ "type": "number"
422
+ },
423
+ {
424
+ "name": "age_at_diagnosis_gt89",
425
+ "description": "Indicates if the age at diagnosis is greater than 89 years.",
426
+ "type": "enum"
427
+ },
428
+ {
429
+ "name": "ibd_affection_status",
430
+ "description": "The IBD Affection Status of the patient.",
431
+ "type": "enum"
432
+ },
433
+ {
434
+ "name": "visit.id",
435
+ "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'diagnosis' table.",
436
+ "type": "string"
437
+ },
438
+ {
439
+ "name": "diagnosis.id",
440
+ "description": "A unique identifier for records in this 'diagnosis' table.",
441
+ "type": "string"
442
+ }
443
+ ]
444
+ },
445
+ {
446
+ "name": "exposure",
447
+ "description": "Data related to exposure information.",
448
+ "links": [
449
+ "visit"
450
+ ],
451
+ "required": [
452
+ "visit.id",
453
+ "exposure.id"
454
+ ],
455
+ "properties": [
456
+ {
457
+ "name": "nocigar_day_unknown",
458
+ "description": "Unknown",
459
+ "type": "enum"
460
+ },
461
+ {
462
+ "name": "smoking",
463
+ "description": "Smoking",
464
+ "type": "enum"
465
+ },
466
+ {
467
+ "name": "smoking_stop",
468
+ "description": "Smoking stop",
469
+ "type": "enum"
470
+ },
471
+ {
472
+ "name": "visit.id",
473
+ "description": "Unique identifiers for records in the 'visit' table that relate via this foreign key to records in this 'exposure' table.",
474
+ "type": "string"
475
+ },
476
+ {
477
+ "name": "exposure.id",
478
+ "description": "A unique identifier for records in this 'exposure' table.",
479
+ "type": "string"
480
+ }
481
+ ]
482
+ }
483
+ ]
484
+ }
485
+ """