updated GARD_Search and Classify_Pipeline
- opened
- +63 -98
@@ -6,7 +6,7 @@ from typing import List, Dict, Union, Optional, Set, Tuple
6 |
7 |
## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
8 |
9 |
# Written by William Kariampuzha @ NIH/NCATS.
10 |
# The transformer-based pipeline code has its own copyright notice under the Apache License.
11 |
# The code was compiled into a single python file to make adding additional features and importing into other modules easy.
12 |
# Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
@@ -91,7 +91,7 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
91 |
if pmid[0].isdigit():
92 |
93 |
94 |
#Construct sets for filtering (right before adding abstract to pmid_abs)
95 |
# The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
96 |
#if filtering is 'lenient' or default
97 |
if filtering !='none' or filtering !='strict':
@@ -220,70 +220,39 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
220 |
221 |
return pmid_abs, (found, relevant)
222 |
223 |
## Section:
224 |
import os
225 |
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
226 |
from tensorflow.keras.preprocessing.sequence import pad_sequences
227 |
from tensorflow.keras.preprocessing.text import tokenizer_from_json
228 |
import tensorflow as tf
229 |
import numpy as np
230 |
import spacy
231 |
import json
232 |
233 |
class Classify_Pipeline:
234 |
def __init__(self,
235 |
236 |
237 |
238 |
239 |
240 |
241 |
self.classify_tokenizer = tokenizer_from_json(json.load(f))
242 |
#OLD Code - used pickle which is unsafe
243 |
#with open(model+'/tokenizer.pickle', 'rb') as handle:
244 |
# import pickle
245 |
# self.classify_tokenizer = pickle.load(handle)
246 |
# Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
247 |
# but can also be run on any other tf.keras model
248 |
249 |
# load the model
250 |
self.classify_model = tf.keras.models.load_model(model_name)
251 |
# for preprocessing
252 |
from nltk.corpus import stopwords
253 |
self.STOPWORDS = set(stopwords.words('english'))
254 |
# Modes
255 |
self.max_length = 300
256 |
self.trunc_type = 'post'
257 |
self.padding_type = 'post'
258 |
259 |
def __str__(self) -> str:
260 |
return "Instantiation: epi_classify = Classify_Pipeline(
261 |
262 |
def __call__(self, abstract:str) -> Tuple[float,bool]:
263 |
return self.getTextPredictions(abstract)
264 |
265 |
def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
266 |
if len(abstract)>5:
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
prob = y_pred1[0][1]
282 |
if y_pred == 1:
283 |
isEpi = True
284 |
285 |
isEpi = False
286 |
287 |
return prob, isEpi
288 |
289 |
return 0.0, False
@@ -292,36 +261,6 @@ class Classify_Pipeline:
292 |
abstract = PMID_getAb(pmid)
293 |
prob, isEpi = self.getTextPredictions(abstract)
294 |
return abstract, prob, isEpi
295 |
296 |
# Standardize the abstract by replacing all named entities with their entity label.
297 |
# Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
298 |
# expects the spaCy model en_core_web_lg as input
299 |
def standardizeAbstract(self, abstract:str) -> str:
300 |
doc = self.nlp(abstract)
301 |
newAbstract = abstract
302 |
for e in reversed(doc.ents):
303 |
304 |
start = e.start_char
305 |
end = start + len(e.text)
306 |
newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
307 |
return newAbstract
308 |
309 |
# Same as above but replaces biomedical named entities from scispaCy models
310 |
# Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
311 |
def standardizeSciTerms(self, abstract:str) -> str:
    """Replace named entities in ``abstract`` with their entity labels.

    Two passes are made: entities found by ``self.nlpSci`` are replaced
    first, then ``self.nlpSci2`` is run on the already-rewritten text and
    its entities are replaced as well.
    """
    def _swap_entities_for_labels(text, parsed):
        # Walk the entities back-to-front so that the character offsets of
        # the entities still to be processed stay valid after each splice.
        for entity in reversed(parsed.ents):
            begin = entity.start_char
            finish = begin + len(entity.text)
            text = text[:begin] + entity.label_ + text[finish:]
        return text

    first_pass = _swap_entities_for_labels(abstract, self.nlpSci(abstract))
    return _swap_entities_for_labels(first_pass, self.nlpSci2(first_pass))
325 |
326 |
## Section: GARD SEARCH
327 |
# can identify rare diseases in text using the GARD dictionary from neo4j
@@ -331,11 +270,18 @@ class GARD_Search:
331 |
def __init__(self):
332 |
import json, codecs
333 |
#These are opened locally so that garbage collection removes them from memory
334 |
335 |
336 |
from nltk.corpus import stopwords
337 |
STOPWORDS = set(stopwords.words('english'))
338 |
339 |
#keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
340 |
GARD_dict = {}
341 |
#Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
@@ -356,6 +302,7 @@ class GARD_Search:
356 |
GARD_dict[s] = entry['gard_id']
357 |
max_length = max(max_length,len(s.split()))
358 |
359 |
self.GARD_dict = GARD_dict
360 |
self.max_length = max_length
361 |
@@ -444,6 +391,12 @@ class GARD_Search:
444 |
445 |
return [searchterm]
446 |
447 |
## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
448 |
from nltk import tokenize as nltk_tokenize
449 |
from dataclasses import dataclass
@@ -455,6 +408,7 @@ import re
455 |
from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
456 |
from unidecode import unidecode
457 |
from collections import OrderedDict
458 |
import pandas as pd
459 |
from more_itertools import pairwise
460 |
@@ -785,7 +739,7 @@ class NER_Pipeline:
785 |
786 |
# Unattached function -- not a method
787 |
# move this to the NER_pipeline as a method??
788 |
#This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
789 |
def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
790 |
ordered_labels = []
791 |
label_order = ['DIS','ABRV','EPI','STAT','LOC','DATE','SEX','ETHN']
@@ -800,7 +754,7 @@ def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
800 |
## This section combines all of the previous code into pipelines so that usage of these models and search functions are easy to implement in apps.
801 |
802 |
# Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
803 |
# results = search_term_extraction(search_term, maxResults, filering,
804 |
#Returns a Pandas dataframe
805 |
def search_term_classification(search_term:Union[int,str], maxResults:int,
806 |
filtering:str, rd_identify:GARD_Search, #for abstract search & filtering
@@ -855,6 +809,11 @@ def API_search_classification(search_term:Union[int,str], maxResults:int,
855 |
856 |
return results
857 |
858 |
def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """Classify ``text`` and package the result as a JSON-ready dictionary.

    ``epi_classify`` is called on the text and must return a
    ``(probability, isEpi)`` pair. The probability is stringified; the flag
    is passed through as returned.
    """
    probability, is_epi = epi_classify(text)
    response = {'ABSTRACT': text}
    response['EPI_PROB'] = str(probability)
    response['IsEpi'] = is_epi
    return response
@@ -900,7 +859,7 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
900 |
print(len(results),'abstracts classified as epidemiological.')
901 |
return results.sort_values('EPI_PROB', ascending=False)
902 |
903 |
#Returns a Pandas dataframe
904 |
def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
905 |
epi_ner:NER_Pipeline, #for biobert extraction
906 |
GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
@@ -1021,7 +980,7 @@ def API_text_extraction(text:str, #Text to be extracted
1021 |
1022 |
json_output = ['ABSTRACT']+ordered_labels
1023 |
1024 |
1025 |
#Do the extraction
1026 |
if extract_diseases:
1027 |
extraction = epi_ner(text, GARD_Search)
@@ -1031,15 +990,17 @@ def API_text_extraction(text:str, #Text to be extracted
1031 |
if extraction:
1032 |
#Re-order the dictionary into desired JSON output
1033 |
extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1034 |
1035 |
1036 |
1037 |
1038 |
def API_text_classification_extraction(text:str, #Text to be extracted
1039 |
epi_ner:NER_Pipeline, #for biobert extraction
1040 |
GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
1041 |
epi_classify:Classify_Pipeline) -> Dict[str,str]:
1042 |
1043 |
#Format of Output
1044 |
ordered_labels = order_labels(epi_ner.labels)
1045 |
if extract_diseases:
@@ -1061,7 +1022,11 @@ def API_text_classification_extraction(text:str, #Text to be extracted
1061 |
1062 |
#Re-order the dictionary into desired JSON output
1063 |
output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1064 |
1065 |
1066 |
## Section: Deprecated Functions
1067 |
import requests
@@ -1148,4 +1113,4 @@ def search_EBI_API(searchterm_list:Union[List[str],str], maxResults:int) -> Dict
1148 |
pmids_abs[pmid] = titles[0]+' '+abstracts[0]
1149 |
1150 |
1151 |
return pmids_abs
6 |
7 |
## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
8 |
9 |
# Written by William Kariampuzha @ NIH/NCATS.
10 |
# The transformer-based pipeline code has its own copyright notice under the Apache License.
11 |
# The code was compiled into a single python file to make adding additional features and importing into other modules easy.
12 |
# Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
91 |
if pmid[0].isdigit():
92 |
93 |
94 |
#Construct sets for filtering (right before adding abstract to pmid_abs)
95 |
# The purpose of this is to do a second check of the abstracts, filters out any abstracts unrelated to the search terms
96 |
#if filtering is 'lenient' or default
97 |
if filtering !='none' or filtering !='strict':
220 |
221 |
return pmid_abs, (found, relevant)
222 |
223 |
## Section: Transformer based Epi Classification Model (EpiClassify4GARD)
224 |
225 |
# Imports
226 |
from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig
227 |
class Classify_Pipeline:
228 |
def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiClassify4GARD"):
    """Load the configuration, tokenizer and sequence-classification model.

    Args:
        name_or_path_to_model_folder: Model name or folder path handed to
            the ``from_pretrained`` loaders.
    """
    model_source = name_or_path_to_model_folder
    cfg = BertConfig.from_pretrained(model_source)
    self.config = cfg
    # The tokenizer is loaded from the name/path recorded inside the config
    # (not from the argument) and capped at the model's maximum number of
    # position embeddings.
    self.tokenizer = BertTokenizer.from_pretrained(cfg._name_or_path, model_max_length=cfg.max_position_embeddings)
    self.model = AutoModelForSequenceClassification.from_pretrained(model_source,config=cfg)
233 |
234 |
#Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
235 |
def __str__(self) -> str:
236 |
return "Instantiation: epi_classify = Classify_Pipeline(name_or_path_to_model_folder)" +"\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
237 |
238 |
def __call__(self, abstract:str) -> Tuple[float,bool]:
239 |
return self.getTextPredictions(abstract)
240 |
241 |
def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
    """Classify one text (title + abstract) as epidemiological or not.

    Args:
        abstract: Text to classify.

    Returns:
        ``(prob, isEpi)``: ``prob`` is the softmax probability of class
        index 1 (the "True" class) and ``isEpi`` is True when that class has
        the largest logit. Missing, empty or very short input (5 characters
        or fewer) returns ``(0.0, False)`` without calling the model.

    Raises:
        ValueError: If the tokenized input is longer than
            ``self.config.max_position_embeddings``.
    """
    # Nothing meaningful to classify (also guards against None / "").
    if not abstract or len(abstract) <= 5:
        return 0.0, False

    max_len = self.config.max_position_embeddings
    # Pad/truncate to the model's maximum input length; tensors are PyTorch ('pt').
    encoding = self.tokenizer(text=abstract, max_length=max_len, padding="max_length", truncation=True, return_tensors='pt')

    # len(encoding) would count the fields of the encoding rather than the
    # tokens, so measure the token dimension of input_ids explicitly.
    n_tokens = encoding["input_ids"].shape[-1]
    if n_tokens > max_len:
        raise ValueError(f"Token Embeddings of size {n_tokens} exceed length for maximum model embedding input {max_len}.")

    # Logits have one row per input and one column per class: [[False_class, True_class]]
    output = self.model(**encoding)
    logits = output.logits

    # True = 1, False = 0
    isEpi = bool(logits.argmax(dim=-1)[0].item())
    # Softmax turns the logits into [[prob_is_False, prob_is_True]];
    # only the probability of the True class is returned.
    prob_tensor = logits.softmax(dim=-1)
    prob = prob_tensor[0][1].item()
    return prob, isEpi
261 |
abstract = PMID_getAb(pmid)
262 |
prob, isEpi = self.getTextPredictions(abstract)
263 |
return abstract, prob, isEpi
264 |
265 |
## Section: GARD SEARCH
266 |
# can identify rare diseases in text using the GARD dictionary from neo4j
270 |
def __init__(self):
271 |
import json, codecs
272 |
#These are opened locally so that garbage collection removes them from memory
273 |
274 |
with'gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
275 |
diseases = json.load(f)
276 |
277 |
r = requests.get('')
278 |
diseases = json.loads(r.content)
279 |
280 |
from nltk.corpus import stopwords
281 |
STOPWORDS = set(stopwords.words('english'))
282 |
283 |
#This should be a list of all GARD IDs for purposes like random choice for testing
284 |
GARD_id_list = [entry['gard_id'] for entry in diseases]
285 |
#keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
286 |
GARD_dict = {}
287 |
#Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
302 |
GARD_dict[s] = entry['gard_id']
303 |
max_length = max(max_length,len(s.split()))
304 |
305 |
self.GARD_id_list = GARD_id_list
306 |
self.GARD_dict = GARD_dict
307 |
self.max_length = max_length
308 |
391 |
392 |
return [searchterm]
393 |
394 |
# Return a random GARD_ID Search Term list
395 |
def random_disease(self) -> List[str]:
    """Return the ``autosearch`` result for a randomly chosen GARD ID.

    The ID is drawn with ``random.choice`` from ``self.GARD_id_list``.
    """
    from random import choice
    return self.autosearch(choice(self.GARD_id_list))
399 |
400 |
## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
401 |
from nltk import tokenize as nltk_tokenize
402 |
from dataclasses import dataclass
408 |
from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
409 |
from unidecode import unidecode
410 |
from collections import OrderedDict
411 |
import json
412 |
import pandas as pd
413 |
from more_itertools import pairwise
414 |
739 |
740 |
# Unattached function -- not a method
741 |
# move this to the NER_pipeline as a method??
742 |
# This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
743 |
def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
744 |
ordered_labels = []
745 |
label_order = ['DIS','ABRV','EPI','STAT','LOC','DATE','SEX','ETHN']
754 |
## This section combines all of the previous code into pipelines so that usage of these models and search functions are easy to implement in apps.
755 |
756 |
# Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
757 |
# results = search_term_classification(search_term, maxResults, filtering, GARD_Search, Classify_Pipeline)
758 |
#Returns a Pandas dataframe
759 |
def search_term_classification(search_term:Union[int,str], maxResults:int,
760 |
filtering:str, rd_identify:GARD_Search, #for abstract search & filtering
809 |
810 |
return results
811 |
812 |
def API_PMID_classification(pmid:Union[int,str], epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """Classify the text that ``PMID_getAb`` returns for ``pmid``.

    Returns a dictionary with the PubMed ID as given, the retrieved text,
    the stringified probability and the epidemiology flag produced by
    ``epi_classify``.
    """
    abstract_text = PMID_getAb(pmid)
    probability, is_epi = epi_classify(abstract_text)
    response = {'PMID': pmid}
    response['ABSTRACT'] = abstract_text
    response['EPI_PROB'] = str(probability)
    response['IsEpi'] = is_epi
    return response
816 |
817 |
def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """Run the classifier on ``text`` and return a JSON-ready dictionary.

    ``epi_classify`` is called with the text and must yield a
    ``(probability, isEpi)`` pair. The probability is converted to a string;
    the flag is stored as returned.
    """
    probability, is_epi = epi_classify(text)
    field_names = ('ABSTRACT', 'EPI_PROB', 'IsEpi')
    field_values = (text, str(probability), is_epi)
    return dict(zip(field_names, field_values))
859 |
print(len(results),'abstracts classified as epidemiological.')
860 |
return results.sort_values('EPI_PROB', ascending=False)
861 |
862 |
#Returns a Pandas dataframe
863 |
def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
864 |
epi_ner:NER_Pipeline, #for biobert extraction
865 |
GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
980 |
981 |
json_output = ['ABSTRACT']+ordered_labels
982 |
983 |
extraction = dict()
984 |
#Do the extraction
985 |
if extract_diseases:
986 |
extraction = epi_ner(text, GARD_Search)
990 |
if extraction:
991 |
#Re-order the dictionary into desired JSON output
992 |
extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
993 |
994 |
#This may return JSONs of different length than above
995 |
extraction = OrderedDict([(term, []) for term in json_output])
996 |
997 |
return extraction
998 |
999 |
def API_text_classification_extraction(text:str, #Text to be extracted
1000 |
epi_ner:NER_Pipeline, #for biobert extraction
1001 |
GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
1002 |
epi_classify:Classify_Pipeline) -> Dict[str,str]:
1003 |
1004 |
#Format of Output
1005 |
ordered_labels = order_labels(epi_ner.labels)
1006 |
if extract_diseases:
1022 |
1023 |
#Re-order the dictionary into desired JSON output
1024 |
output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
1025 |
1026 |
#This may return JSONs of different length than above
1027 |
output = OrderedDict([(term, []) for term in json_output])
1028 |
1029 |
return output
1030 |
1031 |
## Section: Deprecated Functions
1032 |
import requests
1113 |
pmids_abs[pmid] = titles[0]+' '+abstracts[0]
1114 |
1115 |
1116 |
return pmids_abs