wzkariampuzha committed on
Commit
223e572
·
1 Parent(s): d5db2e6

Update epi_pipeline.py

Browse files
Files changed (1) hide show
  1. epi_pipeline.py +130 -27
epi_pipeline.py CHANGED
@@ -1,30 +1,21 @@
1
  from typing import List, Dict, Union, Optional, Set, Tuple
2
 
3
  # coding=utf-8
4
- # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
5
- # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
6
- #
7
- # Licensed under the Apache License, Version 2.0 (the "License");
8
- # you may not use this file except in compliance with the License.
9
- # You may obtain a copy of the License at
10
- #
11
- # http://www.apache.org/licenses/LICENSE-2.0
12
- #
13
- # Unless required by applicable law or agreed to in writing, software
14
- # distributed under the License is distributed on an "AS IS" BASIS,
15
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
- # See the License for the specific language governing permissions and
17
- # limitations under the License.
18
- # ALSO See NCATS LICENSE
19
 
20
- # Written by William Kariampuzha at NIH/NCATS. Adapted from code written by Jennifer John, et al. above
21
 
 
 
 
22
  # Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
23
- # the `Any` type is used in place of the specific class variable, not necessarily to mean that any object type can go there...
24
 
25
  ## Section: GATHER ABSTRACTS FROM APIs
26
  import requests
27
  import xml.etree.ElementTree as ET
 
 
28
  from nltk.corpus import stopwords
29
  STOPWORDS = set(stopwords.words('english'))
30
  from nltk import tokenize as nltk_tokenize
@@ -464,6 +455,22 @@ import pandas as pd
464
  from more_itertools import pairwise
465
 
466
  # Subsection: Processing the abstracts into the correct data format
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
467
  @dataclass
468
  class NERInput:
469
  """
@@ -546,7 +553,10 @@ class NerDataset(Dataset):
546
  def abstract2NERinputs(self, abstract:str) -> List[NERInput]:
547
  guid_index = 0
548
  sentences = self.str2sents(abstract)
549
- ner_inputs = [NERInput(str(guid), nltk_tokenize.word_tokenize(sent), ["O" for i in range(len(nltk_tokenize.word_tokenize(sent)))]) for guid, sent in enumerate(sentences)]
 
 
 
550
  return ner_inputs
551
 
552
  def convert_NERinputs_to_features(self,
@@ -662,6 +672,7 @@ class NerDataset(Dataset):
662
  class NER_Pipeline:
663
  def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiExtract4GARD-v2"):
664
  self.bert_tokenizer = BertTokenizer.from_pretrained(name_or_path_to_model_folder)
 
665
  #model = AutoModelForTokenClassification.from_pretrained(name_or_path_to_model_folder)
666
  self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
667
  self.labels = {re.sub(".-","",label) for label in self.config.label2id.keys() if label != "O"}
@@ -670,6 +681,7 @@ class NER_Pipeline:
670
  def __str__(self):
671
  return "Instantiation: pipe = NER_Pipeline(name_or_path_to_model_folder)"+"\n Calling: output_dict = pipe(text)"
672
 
 
673
  def __call__(self, text:str, rd_identify:Union[GARD_Search,None] = None):
674
  output_dict = {label:[] for label in self.labels}
675
 
@@ -767,6 +779,8 @@ class NER_Pipeline:
767
  return bi, tag
768
 
769
 
 
 
770
  #This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
771
  def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
772
  ordered_labels = []
@@ -778,9 +792,72 @@ def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
778
  ordered_labels.append(entity)
779
  return ordered_labels
780
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
  # Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
782
  # It then extracts Epidemiologic Information[Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic] for each abstract
783
- # results = search_term_extraction(search_term, maxResults, filering, NER_pipeline, labels, extract_diseases, GARD_dict, max_length, classify_model_vars)
784
  #Returns a Pandas dataframe
785
  def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
786
  epi_ner:NER_Pipeline, #for biobert extraction
@@ -882,11 +959,11 @@ def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:s
882
 
883
  return results.sort_values('PROB_OF_EPI', ascending=False), sankey_data, disease_gardID
884
 
885
- #Identical to search_term_extraction, except it returns a JSON object instead of a df
886
  def API_search_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
887
  epi_ner:NER_Pipeline, #for biobert extraction
888
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
889
- epi_classify:Classify_Pipeline) -> json: #for classification
890
 
891
  #Format of Output
892
  ordered_labels = order_labels(epi_ner.labels)
@@ -926,20 +1003,19 @@ def API_search_extraction(search_term:Union[int,str], maxResults:int, filtering:
926
  entry['EPI_PROB'] = str(entry['EPI_PROB'])
927
 
928
  return results
929
- #return json.dumps(results)
930
 
931
- #Identical to search_term_extraction, except it returns a JSON object instead of a df
932
  def API_text_extraction(text:str, #Text to be extracted
933
  epi_ner:NER_Pipeline, #for biobert extraction
934
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
935
- ) -> json: #for classification
936
-
937
  #Format of Output
938
  ordered_labels = order_labels(epi_ner.labels)
939
  if extract_diseases:
940
  json_output = ['ABSTRACT','IDS','DIS']+ordered_labels
941
  else:
942
- json_output = ['ABSTRACT',]+ordered_labels
943
 
944
  results = {'entries':[]}
945
  #Do the extraction
@@ -954,7 +1030,34 @@ def API_text_extraction(text:str, #Text to be extracted
954
  results['entries'].append(extraction)
955
 
956
  return results
957
- #return json.dumps(results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
958
 
959
  ## Section: Deprecated Functions
960
  import requests
 
1
  from typing import List, Dict, Union, Optional, Set, Tuple
2
 
3
  # coding=utf-8
4
+ ## PUBLIC DOMAIN NOTICE
5
+ ## National Center for Advancing Translational Sciences
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
+ ## This software/database is a "United States Government Work" under the terms of the United States Copyright Act. It was written as part of the author's official duties as United States Government employee and thus cannot be copyrighted. This software is freely available to the public for use. The National Center for Advancing Translational Science (NCATS) and the U.S. Government have not placed any restriction on its use or reproduction. Although all reasonable efforts have been taken to ensure the accuracy and reliability of the software and data, the NCATS and the U.S. Government do not and cannot warrant the performance or results that may be obtained by using this software or data. The NCATS and the U.S. Government disclaim all warranties, express or implied, including warranties of performance, merchantability or fitness for any particular purpose. Please cite the authors in any work or product based on this material.
8
 
9
+ # Written by William Kariampuzha @ NIH/NCATS. Adapted from code written by Jennifer John, et al.
10
+ # The transformer-based pipeline code has its own copyright notice under the Apache License.
11
+ # The code was compiled into a single python file to make adding additional features and importing into other modules easy.
12
  # Each section has its own import statements to facilitate clean code reuse, except for typing which applies to all.
 
13
 
14
  ## Section: GATHER ABSTRACTS FROM APIs
15
  import requests
16
  import xml.etree.ElementTree as ET
17
+ import nltk
18
+ nltk.data.path.extend(["/home/user/app/nltk_data","./nltk_data"])
19
  from nltk.corpus import stopwords
20
  STOPWORDS = set(stopwords.words('english'))
21
  from nltk import tokenize as nltk_tokenize
 
455
  from more_itertools import pairwise
456
 
457
  # Subsection: Processing the abstracts into the correct data format
458
+
459
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
460
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
461
+ #
462
+ # Licensed under the Apache License, Version 2.0 (the "License");
463
+ # you may not use this file except in compliance with the License.
464
+ # You may obtain a copy of the License at
465
+ #
466
+ # http://www.apache.org/licenses/LICENSE-2.0
467
+ #
468
+ # Unless required by applicable law or agreed to in writing, software
469
+ # distributed under the License is distributed on an "AS IS" BASIS,
470
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
471
+ # See the License for the specific language governing permissions and
472
+ # limitations under the License.
473
+
474
  @dataclass
475
  class NERInput:
476
  """
 
553
  def abstract2NERinputs(self, abstract:str) -> List[NERInput]:
554
  guid_index = 0
555
  sentences = self.str2sents(abstract)
556
+ ner_inputs = [NERInput(str(guid),
557
+ nltk_tokenize.word_tokenize(sent),
558
+ ["O" for i in range(len(nltk_tokenize.word_tokenize(sent)))])
559
+ for guid, sent in enumerate(sentences)]
560
  return ner_inputs
561
 
562
  def convert_NERinputs_to_features(self,
 
672
  class NER_Pipeline:
673
  def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiExtract4GARD-v2"):
674
  self.bert_tokenizer = BertTokenizer.from_pretrained(name_or_path_to_model_folder)
675
+ #no need for model variable because trainer wraps model and has more functions
676
  #model = AutoModelForTokenClassification.from_pretrained(name_or_path_to_model_folder)
677
  self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
678
  self.labels = {re.sub(".-","",label) for label in self.config.label2id.keys() if label != "O"}
 
681
  def __str__(self):
682
  return "Instantiation: pipe = NER_Pipeline(name_or_path_to_model_folder)"+"\n Calling: output_dict = pipe(text)"
683
 
684
+ #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
685
  def __call__(self, text:str, rd_identify:Union[GARD_Search,None] = None):
686
  output_dict = {label:[] for label in self.labels}
687
 
 
779
  return bi, tag
780
 
781
 
782
+ # Unattached function -- not a method
783
+ # move this to the NER_pipeline as a method??
784
  #This ensures that there is a standardized ordering of df columns while ensuring dynamics with multiple models. This is used by search_term_extraction.
785
  def order_labels(entity_classes:Union[Set[str],List[str]]) -> List[str]:
786
  ordered_labels = []
 
792
  ordered_labels.append(entity)
793
  return ordered_labels
794
 
795
+ ## SECTION: PIPELINES
796
+ ## This section combines all of the previous code into pipelines so that these models and search functions are easy to use in apps.
797
+
798
+ # Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
799
+ # results = search_term_classification(search_term, maxResults, filtering, rd_identify, epi_classify)
800
+ #Returns a Pandas dataframe
801
def search_term_classification(search_term:Union[int,str], maxResults:int,
                               filtering:str, rd_identify:GARD_Search, #for abstract search & filtering
                               epi_classify:Classify_Pipeline) -> pd.DataFrame: #for classification
    """
    Search for abstracts and classify each one as epidemiological or not.

    Args:
        search_term: term handed to ``rd_identify.autosearch``.
        maxResults: forwarded to ``search_getAbs``.
        filtering: forwarded to ``search_getAbs``.
        rd_identify: object whose ``autosearch`` expands the search term into the
            list of terms actually searched.
        epi_classify: callable returning ``(epi_prob, isEpi)`` for an abstract.

    Returns:
        A DataFrame with columns PMID, ABSTRACT, EPI_PROB and IsEpi, sorted by
        EPI_PROB in descending order. It is empty (columns only) when the search
        yields no abstracts.
    """
    columns = ['PMID', 'ABSTRACT', 'EPI_PROB', 'IsEpi']

    # Expand the search term (per the original note: pulls up all synonyms when
    # the term maps to something in the GARD dictionary).
    search_term_list = rd_identify.autosearch(search_term)

    # Gather title+abstracts into a dictionary {pmid: abstract}
    pmid_abs = search_getAbs(search_term_list, maxResults, filtering)

    # Collect plain dict rows and build the frame once. DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    rows = []
    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = epi_classify(abstract)
        rows.append({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})

    results = pd.DataFrame(rows, columns=columns)
    return results.sort_values('EPI_PROB', ascending=False)
820
+
821
+ #Identical to search_term_classification, except it returns a JSON-compatible dictionary instead of a df
822
def API_search_classification(search_term:Union[int,str], maxResults:int,
                              filtering:str, GARD_Search:GARD_Search, #for abstract search & filtering
                              epi_classify:Classify_Pipeline) -> Dict[str, List[dict]]: #for classification
    """
    Search for abstracts, classify each one, and return a JSON-compatible dictionary.

    Args:
        search_term: term handed to ``GARD_Search.autosearch``.
        maxResults: forwarded to ``search_getAbs``.
        filtering: forwarded to ``search_getAbs``.
        GARD_Search: object whose ``autosearch`` expands the search term into the
            list of terms actually searched.
        epi_classify: callable returning ``(epi_prob, isEpi)`` for an abstract.

    Returns:
        ``{'entries': [...]}`` where each entry has the keys PMID, ABSTRACT,
        EPI_PROB (as a string) and IsEpi. Entries are ordered by EPI_PROB,
        highest first.
    """
    # Diagnostics go through logging instead of an unconditional print, so API
    # callers do not get debug text on stdout.
    import logging
    logging.getLogger(__name__).debug(
        'Inside `API_search_classification`. this is `search_term`: %s %s',
        search_term, type(search_term))

    # Expand the search term (per the original note: pulls up all synonyms when
    # the term maps to something in the GARD dictionary).
    search_term_list = GARD_Search.autosearch(search_term)

    # Gather title+abstracts into a dictionary {pmid: abstract}
    pmid_abs = search_getAbs(search_term_list, maxResults, filtering)

    entries = []
    for pmid, abstract in pmid_abs.items():
        epi_prob, isEpi = epi_classify(abstract)
        entries.append({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})

    # Sort while EPI_PROB is still numeric; sorting the strings would order
    # lexicographically.
    entries.sort(key=lambda entry: entry['EPI_PROB'], reverse=True)

    # The probabilities are stringified for the JSON response.
    # NOTE(review): the original comment says the float is not JSON serializable;
    # presumably the classifier returns a non-native float type -- confirm.
    for entry in entries:
        entry['EPI_PROB'] = str(entry['EPI_PROB'])

    return {'entries': entries}
853
+
854
def API_text_classification(text:str,epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """
    Classify a single text and return a JSON-compatible dictionary.

    Args:
        text: the text to classify; echoed back under 'ABSTRACT'.
        epi_classify: callable returning ``(epi_prob, isEpi)`` for the text.

    Returns:
        A dict with keys ABSTRACT, EPI_PROB (the probability as a string) and IsEpi.
    """
    probability, is_epi = epi_classify(text)
    response = {'ABSTRACT':text}
    response['EPI_PROB'] = str(probability)
    response['IsEpi'] = is_epi
    return response
857
+
858
  # Given a search term and max results to return, this will acquire PubMed IDs and Title+Abstracts and Classify them as epidemiological.
859
  # It then extracts Epidemiologic Information[Disease GARD ID, Disease Name, Location, Epidemiologic Identifier, Epidemiologic Statistic] for each abstract
860
+ # results = search_term_extraction(search_term, maxResults, filtering, NER_pipeline, extract_diseases, GARD_Search, Classify_Pipeline)
861
  #Returns a Pandas dataframe
862
  def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
863
  epi_ner:NER_Pipeline, #for biobert extraction
 
959
 
960
  return results.sort_values('PROB_OF_EPI', ascending=False), sankey_data, disease_gardID
961
 
962
+ #Identical to search_term_extraction, except it returns a JSON-compatible dictionary instead of a df
963
  def API_search_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
964
  epi_ner:NER_Pipeline, #for biobert extraction
965
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
966
+ epi_classify:Classify_Pipeline) -> Dict[str,str]: #for classification
967
 
968
  #Format of Output
969
  ordered_labels = order_labels(epi_ner.labels)
 
1003
  entry['EPI_PROB'] = str(entry['EPI_PROB'])
1004
 
1005
  return results
 
1006
 
1007
+ #Like API_search_extraction, but runs the extraction on a single supplied text instead of on search results; returns a JSON-compatible dictionary
1008
  def API_text_extraction(text:str, #Text to be extracted
1009
  epi_ner:NER_Pipeline, #for biobert extraction
1010
  GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
1011
+ ) -> Dict[str,str]:
1012
+
1013
  #Format of Output
1014
  ordered_labels = order_labels(epi_ner.labels)
1015
  if extract_diseases:
1016
  json_output = ['ABSTRACT','IDS','DIS']+ordered_labels
1017
  else:
1018
+ json_output = ['ABSTRACT']+ordered_labels
1019
 
1020
  results = {'entries':[]}
1021
  #Do the extraction
 
1030
  results['entries'].append(extraction)
1031
 
1032
  return results
1033
+
1034
def API_text_classification_extraction(text:str, #Text to be extracted
                                       epi_ner:NER_Pipeline, #for biobert extraction
                                       GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                       epi_classify:Classify_Pipeline) -> Dict[str,str]:
    """
    Run extraction and classification on one text and return an ordered dictionary.

    Args:
        text: the text to process.
        epi_ner: NER pipeline; called as ``epi_ner(text, GARD_Search)`` when
            ``extract_diseases`` is true, otherwise as ``epi_ner(text)``.
        GARD_Search: passed to ``epi_ner`` only when ``extract_diseases`` is true.
        extract_diseases: whether to include 'IDS' and 'DIS' in the output layout.
        epi_classify: callable returning ``(epi_prob, isEpi)`` for the text.

    Returns:
        An OrderedDict holding the extraction plus EPI_PROB (as a string) and IsEpi,
        with keys in the order ABSTRACT, IsEpi, EPI_PROB, [IDS, DIS,] then the
        labels from ``order_labels(epi_ner.labels)``; keys missing from the
        extraction are left out. Returns None when the extraction is empty.
    """
    ordered_labels = order_labels(epi_ner.labels)

    # Pick the output layout and run the matching extraction in one branch.
    if extract_diseases:
        json_output = ['ABSTRACT','IsEpi','EPI_PROB','IDS','DIS']+ordered_labels
        extraction = epi_ner(text, GARD_Search)
    else:
        json_output = ['ABSTRACT','IsEpi','EPI_PROB']+ordered_labels
        extraction = epi_ner(text)

    # NOTE(review): assumes the original re-ordering and return were nested under
    # `if extraction:` (so an empty extraction fell through and returned None);
    # confirm against the real file's indentation.
    if not extraction:
        return None

    # Classification runs only after a non-empty extraction, which may save time
    # when there is nothing to report.
    epi_prob, isEpi = epi_classify(text)
    extraction.update({'EPI_PROB':str(epi_prob),'IsEpi':isEpi})

    # Re-order the dictionary into the desired JSON output
    ordered_pairs = [(term, extraction[term]) for term in json_output if term in extraction.keys()]
    return OrderedDict(ordered_pairs)
1061
 
1062
  ## Section: Deprecated Functions
1063
  import requests