jamesjohnson763's picture
Upload 3 files
b31a24a
raw
history blame
13 kB
import pandas_profiling as pp
import pandas as pd
import tensorflow as tf
from datasets import load_dataset
from tensorflow.python.framework import tensor_shape
# --- Module-level dataset loading and exploration (runs at import time) ---
# Each load_dataset call requests the "train" split of the named dataset.
# LOINC code set
datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
# SNOMED CT code set
datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
# eCQM code set
dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")
# Tokenize the LOINC "Description" column in batches with the bert-base-cased tokenizer.
# `tokenizer` is a module-level name; get_embeddings looks it up as a global at call time.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)
# Print the first tokenized row as a sanity check.
JSONOBJ2=dataset[0]
print(JSONOBJ2)
# Keep only LOINC rows whose Description starts with "Allergy".
sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
# The result of len() is discarded here (it only displays a value in a notebook/REPL).
len(sw)
# Print the filtered subset and the three loaded datasets.
print(sw)
print(datasetLOINC)
print(datasetSNOMED)
print(dataseteCQM)
# play with some dataset tools before the show:
#print(start_with_ar["Description"])
#---
#Main Stage - Begin!
#---
import os
import json
import numpy as np
import gradio as gr
# Hugging Face access token read from the environment; os.environ.get returns None
# when the variable is unset. Used as the default `token` of profile_dataset.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Terminology names offered by the CheckboxGroup, Radio and Dropdown inputs of the demo.
CHOICES = ["SNOMED", "LOINC", "CQM"]
# Sample nested JSON document (items with batters and toppings). It is only referenced
# from a commented-out line in fn, so it is currently unused at runtime.
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv"):
    """Profile a dataset with pandas-profiling and publish the report as a static Space.

    Args:
        dataset: A ``datasets.Dataset`` (converted with ``to_pandas()``) or an
            already-built ``pandas.DataFrame``.
        username: Hub namespace that will own the Space.
        token: Hub access token passed to ``create_repo`` / ``upload_file``.
        dataset_name: Name used in the report title and README. Only its last
            ``/``-separated component is used for the Space name.

    Returns:
        A message string containing the URL returned by ``create_repo``.

    Side effects:
        Writes ``./index.html`` and ``README.md`` in the current working
        directory and uploads both to the Space.
    """
    # NOTE(review): create_repo and upload_file are not imported anywhere in the
    # visible file; they presumably come from huggingface_hub -- confirm and add
    # the import before enabling this function.

    # The previous pd.read_csv(dataset.Description) could not work: a Dataset is
    # not a CSV path and does not expose columns as attributes.
    df = dataset if isinstance(dataset, pd.DataFrame) else dataset.to_pandas()

    # Wide tables get the cheaper "minimal" report.
    profile = pp.ProfileReport(
        df,
        title=f"{dataset_name} Report",
        minimal=len(df.columns) > 15,
    )

    # dataset_name may already be namespaced ("owner/name"); prefixing it with
    # username again would give an invalid "owner/owner/name" repo id.
    space_name = dataset_name.rsplit("/", 1)[-1]
    repo_id = f"{username}/{space_name}"

    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False)

    profile.to_file("./index.html")
    upload_file(path_or_fileobj="./index.html", path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Explicit UTF-8: the front matter contains a non-ASCII emoji, which fails to
    # encode under some platform default encodings.
    with open("README.md", "w+", encoding="utf-8") as f:
        f.write(readme)
    upload_file(path_or_fileobj="./README.md", path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
#def lowercase_title(example):
# return {"Description": example[title].lower()}
# demonstrate map function of dataset
#JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
#JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
def concatenate_text(examples):
    """Merge the code, description and clinical-focus fields into one "text" value.

    Args:
        examples: Mapping with string values under "Code", "Description" and
            "Purpose: Clinical Focus".

    Returns:
        A one-key dict ``{"text": ...}`` with the three fields joined by " \\n ".
    """
    separator = " \n "
    parts = (
        examples["Code"],
        examples["Description"],
        examples["Purpose: Clinical Focus"],
    )
    return {"text": separator.join(parts)}
def cls_pooling(model_output):
    """Return position 0 along the second axis of ``model_output.last_hidden_state``.

    Presumably this is the [CLS] token vector of each sequence (going by the
    function name) -- confirm against the model in use.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
def get_embeddings(text_list):
    """Tokenize ``text_list``, run the model on it and return the pooled output.

    ``tokenizer`` and ``model`` are resolved as module globals at call time, so
    both must be bound at module scope before this function is called.

    Args:
        text_list: Text (or list of texts) passed straight to ``tokenizer``.

    Returns:
        Whatever ``cls_pooling`` extracts from the model output.
    """
    batch = tokenizer(text_list, padding=True, truncation=True, return_tensors="tf")
    # Unpack the tokenizer output into a plain dict of keyword arguments.
    model_inputs = dict(batch.items())
    outputs = model(**model_inputs)
    return cls_pooling(outputs)
_EMBEDDING_CKPT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Checkpoint currently bound to the module-level tokenizer/model (None = not loaded).
_loaded_embedding_ckpt = None


def _load_embedding_model(model_ckpt=_EMBEDDING_CKPT):
    """Bind the embedding tokenizer/model to the module globals get_embeddings reads."""
    # get_embeddings looks up `tokenizer` and `model` as module globals; assigning
    # them as locals of fn (as before) left `model` undefined for it.
    global tokenizer, model, _loaded_embedding_ckpt
    if _loaded_embedding_ckpt == model_ckpt:
        return  # already loaded; avoid re-loading the checkpoint on every request
    from transformers import AutoTokenizer, TFAutoModel

    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
    model = TFAutoModel.from_pretrained(model_ckpt, from_pt=True)
    _loaded_embedding_ckpt = model_ckpt


def _score_ratios(num, slider1, slider2):
    """Return the positive/negative/neutral shares of the three numeric inputs."""
    # A cleared Number box may arrive as None; treat it as 0.
    num = 0 if num is None else num
    total = num + slider1 + slider2
    if not total:
        # Avoid ZeroDivisionError when the inputs cancel out (e.g. negative num).
        return {"positive": 0.0, "negative": 0.0, "neutral": 0.0}
    return {
        "positive": num / total,
        "negative": slider1 / total,
        "neutral": slider2 / total,
    }


def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
        video, audio1, audio2, file, df1, df2,):
    """Gradio callback: filter the terminology datasets and build the 13 demo outputs.

    Args:
        text1, text2: Textbox values; ``text1`` is echoed when ``single_checkbox``
            is truthy, otherwise ``text2``.
        num, slider1, slider2: Numbers turned into the Label ratios.
        single_checkbox: Selects which textbox value is echoed.
        checkboxes: Selected terminology names, joined into the text output.
        radio: Terminology name embedded in the HTML button.
        im1: Image array, returned flipped vertically (fallback: sample image path).
        video: Returned unchanged (fallback: sample video path).
        audio1: ``(rate, samples)`` pair, samples returned flipped
            (fallback: sample audio path).
        df1, df2: Returned unchanged.
        dropdown, im2, im3, im4, audio2, file: Accepted but not used.

    Returns:
        A 13-tuple in the order of the Interface outputs: text, label, audio,
        image, video, two highlighted-text lists, JSON records, HTML, file path,
        dataframe, random 4x4 integer array, timeseries.
    """
    # NOTE(review): the three filters below use fixed prefixes ("Allergy",
    # "Hospital", "Telephone"); text1/text2 are not used as search terms.
    start_with_searchTermLOINC = datasetLOINC.filter(lambda example: example["Description"].startswith('Allergy'))

    # Drop every column that is not in the keep-list. A plain difference is used:
    # symmetric_difference would also return keep-columns missing from the dataset
    # and make remove_columns raise.
    columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
    columns_to_remove = [
        column
        for column in start_with_searchTermLOINC.column_names
        if column not in columns_to_keep
    ]
    start_with_searchTermLOINC = start_with_searchTermLOINC.remove_columns(columns_to_remove)

    # Switch to pandas output so slicing yields a DataFrame.
    start_with_searchTermLOINC.set_format("pandas")
    df = start_with_searchTermLOINC[:]

    # One row per "Purpose: Clinical Focus" element, then back to a Dataset.
    df4 = df.explode("Purpose: Clinical Focus", ignore_index=True)
    from datasets import Dataset
    clinical_dataset = Dataset.from_pandas(df4)

    # Keep only rows whose description has more than 15 whitespace-separated words.
    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
    clinical_dataset = clinical_dataset.map(concatenate_text)

    _load_embedding_model()
    # TODO: the embeddings are not consumed yet; the FAISS nearest-neighbour lookup
    # (add_faiss_index / get_nearest_examples) is still to be wired up.
    embeddings_dataset = clinical_dataset.map(
        lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})

    # SNOMED and CQM ---------------
    start_with_searchTermSNOMED = datasetSNOMED.filter(lambda example: example["Description"].startswith('Hospital'))
    start_with_searchTermCQM = dataseteCQM.filter(lambda example: example["Description"].startswith('Telephone'))

    print(start_with_searchTermLOINC )
    print(start_with_searchTermSNOMED )
    print(start_with_searchTermCQM)

    # Build the JSON output from the DataFrame. Dataset.to_json(path_or_buf="None")
    # wrote a file literally named "None" and returned a byte count, not records.
    loinc_records = json.loads(df.to_json(orient="records"))

    here = os.path.dirname(__file__)

    return (
        (text1 if single_checkbox else text2)
        + ", selected:"
        + ", ".join(checkboxes),  # Text
        _score_ratios(num, slider1, slider2),  # Label
        (audio1[0], np.flipud(audio1[1]))
        if audio1 is not None else os.path.join(here, "files/cantina.wav"),  # Audio
        np.flipud(im1)
        if im1 is not None else os.path.join(here, "files/cheetah1.jpg"),  # Image
        video
        if video is not None else os.path.join(here, "files/world.mp4"),  # Video
        [
            ("The", "art"),
            ("quick brown", "adj"),
            ("fox", "nn"),
            ("jumped", "vrb"),
            ("testing testing testing", None),
            ("over", "prp"),
            ("the", "art"),
            ("testing", None),
            ("lazy", "adj"),
            ("dogs", "nn"),
            (".", "punc"),
        ] + [(f"test {x}", f"test {x}") for x in range(10)],  # HighlightedText
        [
            ("The testing testing testing", None),
            ("over", 0.6),
            ("the", 0.2),
            ("testing", None),
            ("lazy", -0.1),
            ("dogs", 0.4),
            (".", 0),
        ] + [("test", x / 10) for x in range(-10, 10)],  # HighlightedText
        loinc_records,  # JSON
        "<button style='background-color: red'>Click Me: " + radio + "</button>",  # HTML
        os.path.join(here, "files/titanic.csv"),  # File
        df1,  # Dataframe
        np.random.randint(0, 10, (4, 4)),  # Dataframe
        df2,  # Timeseries
    )
# Kitchen-sink Gradio interface around fn: 19 inputs and 13 outputs, listed in the
# same order as fn's parameters and return tuple.
demo = gr.Interface(
    fn,
    inputs=[
        gr.Textbox(value="Allergy", label="Textbox"),
        gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
        gr.Number(label="Number", value=42),
        gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
        gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
        gr.Checkbox(label="Check for NER Match on Submit"),
        gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
        gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
        gr.Dropdown(label="Dropdown", choices=CHOICES),
        gr.Image(label="Image"),
        gr.Image(label="Image w/ Cropper", tool="select"),
        gr.Image(label="Sketchpad", source="canvas"),
        gr.Image(label="Webcam", source="webcam"),
        gr.Video(label="Video"),
        gr.Audio(label="Audio"),
        gr.Audio(label="Microphone", source="microphone"),
        gr.File(label="File"),
        gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
        gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
    ],
    outputs=[
        gr.Textbox(label="Textbox"),
        gr.Label(label="Label"),
        gr.Audio(label="Audio"),
        gr.Image(label="Image"),
        gr.Video(label="Video"),
        gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
        gr.HighlightedText(label="HighlightedText", show_legend=True),
        gr.JSON(label="JSON"),
        gr.HTML(label="HTML"),
        gr.File(label="File"),
        gr.Dataframe(label="Dataframe"),
        gr.Dataframe(label="Numpy"),
        gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
    ],
    # The same example row is repeated three times.
    examples=[
        [
            "Allergy",
            "Admission",
            10,
            12,
            4,
            True,
            ["SNOMED", "LOINC", "CQM"],
            "SNOMED",
            # Must be one of CHOICES; the previous value "bar" was not a valid option.
            "LOINC",
            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
            os.path.join(os.path.dirname(__file__), "files/cheetah1.jpg"),
            os.path.join(os.path.dirname(__file__), "files/world.mp4"),
            os.path.join(os.path.dirname(__file__), "files/cantina.wav"),
            os.path.join(os.path.dirname(__file__), "files/cantina.wav"),
            os.path.join(os.path.dirname(__file__), "files/titanic.csv"),
            [[1, 2, 3], [3, 4, 5]],
            os.path.join(os.path.dirname(__file__), "files/time.csv"),
        ]
    ]
    * 3,
    theme="default",
    # The title was stored as mojibake (UTF-8 emoji bytes decoded with the wrong
    # code page); restored to the intended emoji.
    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩‍⚕️🩺⚕️🙋",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
    # live=True,
)

# Launch the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch(debug=True)