awacke1 committed on
Commit
9811acc
·
1 Parent(s): 8f665dd

Upload 3 files

Browse files
Files changed (3) hide show
  1. README.txt +13 -0
  2. app.py +327 -0
  3. requirements.txt +7 -0
README.txt ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: 🧬CTMap - Clinical Terminology AutoMap AI
3
+ emoji: ⚗️🧠🔬🧬
4
+ colorFrom: yellow
5
+ colorTo: green
6
+ sdk: gradio
7
+ sdk_version: 3.5
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas_profiling as pp
2
+ import pandas as pd
3
+ import tensorflow as tf
4
+
5
+ from datasets import load_dataset
6
+ from tensorflow.python.framework import tensor_shape
7
+
8
# Load the three clinical-terminology code sets from the Hugging Face Hub.
# NOTE: this runs at import time, so importing the module needs network
# access (or a warm local datasets cache).
# LOINC
datasetLOINC = load_dataset("awacke1/LOINC-CodeSet-Value-Description.csv", split="train")
# SNOMED
datasetSNOMED = load_dataset("awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv", split="train")
# eCQM
dataseteCQM = load_dataset("awacke1/eCQM-Code-Value-Semantic-Set.csv", split="train")

# Tokenize every LOINC description with a pretrained tokenizer and show the
# first tokenized record as a smoke test.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
dataset = datasetLOINC.map(lambda examples: tokenizer(examples["Description"]), batched=True)
JSONOBJ2 = dataset[0]
print(JSONOBJ2)

# Example of prefix filtering: LOINC rows whose description starts with "Allergy".
# (The former bare `len(sw)` expression was a notebook leftover with no effect
# in a script, so it has been dropped.)
sw = datasetLOINC.filter(lambda example: example["Description"].startswith("Allergy"))
print(sw)

# Print a summary of each loaded dataset to the application log.
print(datasetLOINC)
print(datasetSNOMED)
print(dataseteCQM)
28
+
29
+ # play with some dataset tools before the show:
30
+
31
+ #print(start_with_ar["Description"])
32
+
33
+ #---
34
+ #Main Stage - Begin!
35
+ #---
36
+
37
+ import os
38
+ import json
39
+ import numpy as np
40
+ import gradio as gr
41
+
42
# Hugging Face access token taken from the environment; None when unset.
HF_TOKEN = os.getenv("HF_TOKEN")

# Terminology systems offered by the checkbox group, radio and dropdown widgets.
CHOICES = ["SNOMED", "LOINC", "CQM"]

# Sample nested JSON document. In the visible source it is only referenced
# from commented-out code (a placeholder payload for the JSON output).
JSONOBJ = """{"items":{"item":[{"id": "0001","type": null,"is_good": false,"ppu": 0.55,"batters":{"batter":[{ "id": "1001", "type": "Regular" },{ "id": "1002", "type": "Chocolate" },{ "id": "1003", "type": "Blueberry" },{ "id": "1004", "type": "Devil's Food" }]},"topping":[{ "id": "5001", "type": "None" },{ "id": "5002", "type": "Glazed" },{ "id": "5005", "type": "Sugar" },{ "id": "5007", "type": "Powdered Sugar" },{ "id": "5006", "type": "Chocolate with Sprinkles" },{ "id": "5003", "type": "Chocolate" },{ "id": "5004", "type": "Maple" }]}]}}"""
45
+
46
+
47
def profile_dataset(dataset=datasetSNOMED, username="awacke1", token=HF_TOKEN, dataset_name="awacke1/SNOMED-CT-Code-Value-Semantic-Set.csv"):
    """Profile a dataset and publish the HTML report as a static Space.

    Args:
        dataset: A ``datasets.Dataset`` to profile.
        username: Hub namespace that will own the report Space.
        token: Hub access token used for creating the Space and uploading.
        dataset_name: Dataset identifier; used in the report title and,
            with any ``namespace/`` prefix stripped, as the Space name.

    Returns:
        A message containing the URL returned by ``create_repo``.
    """
    # The module never imports these helpers at top level, so bring them into
    # scope here; huggingface-hub is already listed in requirements.txt.
    from huggingface_hub import create_repo, upload_file

    # A Dataset has no ``Description`` attribute to feed to ``pd.read_csv``;
    # convert the in-memory table to a DataFrame instead.
    df = dataset.to_pandas()

    # Wide tables get the cheaper "minimal" report.
    use_minimal = len(df.columns) > 15
    profile = pp.ProfileReport(df, title=f"{dataset_name} Report", minimal=use_minimal)

    # Hub repo ids take the form "namespace/name". ``dataset_name`` may already
    # carry a namespace, so keep only its last path segment.
    space_name = dataset_name.rsplit("/", 1)[-1]
    repo_id = f"{username}/{space_name}"

    # exist_ok lets the report be regenerated into an already-created Space.
    repo_url = create_repo(repo_id, repo_type="space", token=token, space_sdk="static", private=False, exist_ok=True)

    profile.to_file("./index.html")
    upload_file(path_or_fileobj="./index.html", path_in_repo="index.html", repo_id=repo_id, repo_type="space", token=token)

    readme = f"---\ntitle: {dataset_name}\nemoji: ✨\ncolorFrom: green\ncolorTo: red\nsdk: static\npinned: false\ntags:\n- dataset-report\n---"
    # Explicit UTF-8: the front matter contains an emoji, which a
    # platform-default encoding may be unable to encode.
    with open("README.md", "w", encoding="utf-8") as f:
        f.write(readme)
    upload_file(path_or_fileobj="./README.md", path_in_repo="README.md", repo_id=repo_id, repo_type="space", token=token)

    return f"Your dataset report will be ready at {repo_url}"
64
+
65
+ #def lowercase_title(example):
66
+ # return {"Description": example[title].lower()}
67
+
68
+ # demonstrate map function of dataset
69
+ #JSONOBJ_MAP=datasetLOINC.map(lowercase_title)
70
+ #JSONOBJ_MAP=datasetLOINC.filter(lambda example: example["Description"].startswith("Mental health"))
71
+
72
+
73
+
74
+
75
def concatenate_text(examples):
    """Combine the code, description and clinical focus of a row into one text.

    Args:
        examples: Mapping with the keys ``"Code"``, ``"Description"`` and
            ``"Purpose: Clinical Focus"``.

    Returns:
        ``{"text": ...}`` where the three fields are joined with ``" \\n "``.
        ``None`` fields become empty strings and non-string values (for
        example numeric codes) are converted with ``str`` instead of raising
        ``TypeError``.

    Raises:
        KeyError: If one of the three keys is missing.
    """
    field_names = ("Code", "Description", "Purpose: Clinical Focus")
    parts = []
    for field_name in field_names:
        value = examples[field_name]
        parts.append("" if value is None else str(value))
    return {"text": " \n ".join(parts)}
83
+
84
def cls_pooling(model_output):
    """Return index 0 along the second axis of ``model_output.last_hidden_state``.

    Presumably the hidden state is laid out as (batch, sequence, hidden), in
    which case this picks the first-token ([CLS]) vector of every sequence.
    """
    hidden_states = model_output.last_hidden_state
    first_position = hidden_states[:, 0]
    return first_position
86
+
87
# Checkpoint of the sentence-embedding model used by get_embeddings().
_EMBEDDING_CKPT = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
# Process-wide cache so the tokenizer/model pair is loaded once instead of on
# every request.
_EMBEDDING_CACHE = {}


def _load_embedding_model():
    """Return ``(tokenizer, model)`` for ``_EMBEDDING_CKPT``, loading them once."""
    if "model" not in _EMBEDDING_CACHE:
        from transformers import AutoTokenizer, TFAutoModel

        # Load both before caching so a failed load never leaves a half-filled cache.
        embed_tokenizer = AutoTokenizer.from_pretrained(_EMBEDDING_CKPT)
        embed_model = TFAutoModel.from_pretrained(_EMBEDDING_CKPT, from_pt=True)
        _EMBEDDING_CACHE["tokenizer"] = embed_tokenizer
        _EMBEDDING_CACHE["model"] = embed_model
    return _EMBEDDING_CACHE["tokenizer"], _EMBEDDING_CACHE["model"]


def get_embeddings(text_list):
    """Embed a string (or list of strings) with the sentence-embedding model.

    The tokenizer and model are obtained from ``_load_embedding_model`` rather
    than from module globals: no global ``model`` is ever defined in this
    file, and the global ``tokenizer`` belongs to a different checkpoint.

    Returns:
        The result of ``cls_pooling`` applied to the model output.
    """
    embed_tokenizer, embed_model = _load_embedding_model()
    encoded_input = embed_tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    model_output = embed_model(**dict(encoded_input))
    return cls_pooling(model_output)


def _filter_by_prefix(dataset, prefix):
    """Return the rows of *dataset* whose ``"Description"`` starts with *prefix*."""
    return dataset.filter(lambda example: example["Description"].startswith(prefix))


def _embed_clinical_text(frame):
    """Build a dataset with an ``"embeddings"`` column from matched rows.

    Keeps rows whose description has more than 15 whitespace-separated words,
    concatenates their text fields and embeds the result.
    """
    from datasets import Dataset

    exploded = frame.explode("Purpose: Clinical Focus", ignore_index=True)
    clinical_dataset = Dataset.from_pandas(exploded)
    clinical_dataset = clinical_dataset.map(lambda x: {"c_length": len(x["Description"].split())})
    clinical_dataset = clinical_dataset.filter(lambda x: x["c_length"] > 15)
    clinical_dataset = clinical_dataset.map(concatenate_text)
    return clinical_dataset.map(
        lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]})


def fn( text1, text2, num, slider1, slider2, single_checkbox, checkboxes, radio, dropdown, im1, im2, im3, im4,
    video, audio1, audio2, file, df1, df2,):
    """Gradio callback: look up terminology rows and fill every demo output.

    Args:
        text1: Search term; LOINC rows whose description starts with it are
            returned in the JSON output. A blank value falls back to
            ``"Allergy"`` so an empty box does not match the whole code set.
        text2: Secondary text, echoed when ``single_checkbox`` is false.
        num, slider1, slider2: Numbers turned into the label confidences.
        single_checkbox: Selects ``text1`` (true) or ``text2`` (false) for the
            text output.
        checkboxes: Selected terminology names, joined into the text output.
        radio: Preferred terminology, shown inside the HTML button.
        dropdown, im2, im3, im4, audio2, file: Accepted but unused.
        im1, video, audio1: Media echoed back (image and audio are flipped);
            bundled sample files are used when they are ``None``.
        df1, df2: Passed through to the dataframe and timeseries outputs.

    Returns:
        A 13-item tuple, one item per output component of the interface.
    """
    search_prefix = (text1 or "").strip() or "Allergy"

    # LOINC: filter by the user's search term (previously hard-coded).
    start_with_searchTermLOINC = _filter_by_prefix(datasetLOINC, search_prefix)

    columns_to_keep = ["Value Set Name", "Code", "Description", "Purpose: Clinical Focus", "Code System OID"]
    # Remove only columns that actually exist; a symmetric difference would
    # also try to remove "keep" columns missing from the dataset.
    columns_to_remove = [
        column for column in start_with_searchTermLOINC.column_names
        if column not in columns_to_keep
    ]
    start_with_searchTermLOINC = start_with_searchTermLOINC.remove_columns(columns_to_remove)
    start_with_searchTermLOINC.set_format("pandas")
    df = start_with_searchTermLOINC[:]

    # Embeddings for a planned nearest-neighbour (FAISS) lookup.
    # TODO: add a FAISS index over "embeddings" and use it for the results;
    # the embeddings are computed but not consumed yet.
    required_columns = {"Code", "Description", "Purpose: Clinical Focus"}
    if required_columns.issubset(df.columns):
        embeddings_dataset = _embed_clinical_text(df)

    # SNOMED and CQM: fixed demo prefixes; the matches are only logged.
    # NOTE(review): presumably these should follow the search term too - confirm.
    start_with_searchTermSNOMED = _filter_by_prefix(datasetSNOMED, "Hospital")
    start_with_searchTermCQM = _filter_by_prefix(dataseteCQM, "Telephone")

    print(start_with_searchTermLOINC )
    print(start_with_searchTermSNOMED )
    print(start_with_searchTermCQM)

    # Label confidences; guard against a zero total to avoid ZeroDivisionError.
    total = num + slider1 + slider2
    if total == 0:
        confidences = {"positive": 0.0, "negative": 0.0, "neutral": 0.0}
    else:
        confidences = {
            "positive": num / total,
            "negative": slider1 / total,
            "neutral": slider2 / total,
        }

    # Serialize the matched rows from the DataFrame. Dataset.to_json with
    # path_or_buf="None" wrote a file literally named "None" and returned a
    # byte count instead of JSON.
    loinc_records = json.loads(df.to_json(orient="records"))

    base_dir = os.path.dirname(__file__)

    return (
        (text1 if single_checkbox else text2)
        + ", selected:"
        + ", ".join(checkboxes),  # Text
        confidences,  # Label
        (audio1[0], np.flipud(audio1[1]))
        if audio1 is not None else os.path.join(base_dir, "files/cantina.wav"),  # Audio
        np.flipud(im1)
        if im1 is not None else os.path.join(base_dir, "files/cheetah1.jpg"),  # Image
        video
        if video is not None else os.path.join(base_dir, "files/world.mp4"),  # Video
        [
            ("The", "art"),
            ("quick brown", "adj"),
            ("fox", "nn"),
            ("jumped", "vrb"),
            ("testing testing testing", None),
            ("over", "prp"),
            ("the", "art"),
            ("testing", None),
            ("lazy", "adj"),
            ("dogs", "nn"),
            (".", "punc"),
        ] + [(f"test {x}", f"test {x}") for x in range(10)],  # HighlightedText
        [
            ("The testing testing testing", None),
            ("over", 0.6),
            ("the", 0.2),
            ("testing", None),
            ("lazy", -0.1),
            ("dogs", 0.4),
            (".", 0),
        ] + [(f"test", x / 10) for x in range(-10, 10)],  # HighlightedText
        loinc_records,  # JSON
        f"<button style='background-color: red'>Click Me: {radio}</button>",  # HTML
        os.path.join(base_dir, "files/titanic.csv"),  # File
        df1,  # Dataframe
        np.random.randint(0, 10, (4, 4)),  # Dataframe
        df2,  # Timeseries
    )
253
+
254
+
255
+
256
def _example_asset(relative_path):
    """Return *relative_path* resolved against this file's directory."""
    return os.path.join(os.path.dirname(__file__), relative_path)


# Input widgets, in the same order as the parameters of ``fn``.
_demo_inputs = [
    gr.Textbox(value="Allergy", label="Textbox"),
    gr.Textbox(lines=3, value="Bathing", placeholder="Type here..", label="Textbox 2"),
    gr.Number(label="Number", value=42),
    gr.Slider(10, 20, value=15, label="Slider: 10 - 20"),
    gr.Slider(maximum=20, step=0.04, label="Slider: step @ 0.04"),
    gr.Checkbox(label="Check for NER Match on Submit"),
    gr.CheckboxGroup(label="Clinical Terminology to Check", choices=CHOICES, value=CHOICES[0:2]),
    gr.Radio(label="Preferred Terminology Output", choices=CHOICES, value=CHOICES[2]),
    gr.Dropdown(label="Dropdown", choices=CHOICES),
    gr.Image(label="Image"),
    gr.Image(label="Image w/ Cropper", tool="select"),
    gr.Image(label="Sketchpad", source="canvas"),
    gr.Image(label="Webcam", source="webcam"),
    gr.Video(label="Video"),
    gr.Audio(label="Audio"),
    gr.Audio(label="Microphone", source="microphone"),
    gr.File(label="File"),
    gr.Dataframe(label="Filters", headers=["Name", "Age", "Gender"]),
    gr.Timeseries(x="time", y=["price", "value"], colors=["pink", "purple"]),
]

# Output widgets, in the same order as the tuple returned by ``fn``.
_demo_outputs = [
    gr.Textbox(label="Textbox"),
    gr.Label(label="Label"),
    gr.Audio(label="Audio"),
    gr.Image(label="Image"),
    gr.Video(label="Video"),
    gr.HighlightedText(label="HighlightedText", color_map={"punc": "pink", "test 0": "blue"}),
    gr.HighlightedText(label="HighlightedText", show_legend=True),
    gr.JSON(label="JSON"),
    gr.HTML(label="HTML"),
    gr.File(label="File"),
    gr.Dataframe(label="Dataframe"),
    gr.Dataframe(label="Numpy"),
    gr.Timeseries(x="time", y=["price", "value"], label="Timeseries"),
]

# One example row (one value per input widget); it is listed three times.
_demo_example_row = [
    "Allergy",
    "Admission",
    10,
    12,
    4,
    True,
    ["SNOMED", "LOINC", "CQM"],
    "SNOMED",
    "bar",
    _example_asset("files/cheetah1.jpg"),
    _example_asset("files/cheetah1.jpg"),
    _example_asset("files/cheetah1.jpg"),
    _example_asset("files/cheetah1.jpg"),
    _example_asset("files/world.mp4"),
    _example_asset("files/cantina.wav"),
    _example_asset("files/cantina.wav"),
    _example_asset("files/titanic.csv"),
    [[1, 2, 3], [3, 4, 5]],
    _example_asset("files/time.csv"),
]

demo = gr.Interface(
    fn,
    inputs=_demo_inputs,
    outputs=_demo_outputs,
    examples=[_demo_example_row] * 3,
    theme="default",
    title="⚗️🧠🔬🧬 Clinical Terminology Auto Mapper AI 👩‍⚕️🩺⚕️🙋",
    cache_examples=False,
    description="Clinical Terminology Auto Mapper AI",
    article="Learn more at [Yggdrasil](https://github.com/AaronCWacker/Yggdrasil)",
    # live=True,
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch(debug=True)
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ datasets
2
+ transformers
3
+ pandas-profiling
4
+ huggingface-hub
5
+ gradio
6
+ Tensorflow
7
+ torch