Spaces:
Build error
Build error
File size: 5,782 Bytes
273b73b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 09:20:38 2020
@author: luol2
"""
import argparse
from nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
from dic_ner import dic_ont
from tagging_text import bioTag
import os
import time
import json
import sys
def file_tag_hybrid(infile,outfile,biotag_dic,nn_model,para_set):
    """Tag every document of a PubTator-style file and write the result.

    The input is read in full and split into documents on blank lines.
    Inside a document, a line containing ``|t|`` provides the document id
    and the title, and a line containing ``|a|`` provides the document id
    and the abstract.  A document from which no id could be read is skipped.
    For each remaining document the title and abstract lines are written
    back, followed by one tab-separated line per annotation
    (id, start, end, mention, ``Phenotype``, concept id).

    Args:
        infile: path of the UTF-8 encoded input file.
        outfile: path of the output file; it is overwritten.
        biotag_dic: dictionary object handed unchanged to ``bioTag``.
        nn_model: model object handed unchanged to ``bioTag``.
        para_set: mapping providing ``onlyLongest``, ``abbrRecog`` and
            ``ML_Threshold``, which are forwarded to ``bioTag``.
    """
    # Read the input before touching the output so that an unreadable input
    # does not leave a freshly truncated output file behind.
    with open(infile, 'r', encoding='utf-8') as fin:
        all_context = fin.read().strip().split('\n\n')
    total = len(all_context)  # str.split always returns at least one item

    with open(outfile, 'w', encoding='utf8') as fout:
        for doc_index, doc in enumerate(all_context, start=1):
            print("Processing:{0}%".format(round(doc_index * 100 / total)), end="\r")

            pmid = ''
            title = ''
            abstract = ''
            for line in doc.split('\n'):
                seg_t = line.split('|t|')
                seg_a = line.split('|a|')
                if len(seg_t) >= 2:
                    pmid = seg_t[0]
                    title = seg_t[1]
                elif len(seg_a) >= 2:
                    pmid = seg_a[0]
                    abstract = seg_a[1]

            if pmid == '':
                # Neither a title nor an abstract line was found.
                continue

            fout.write(pmid + "|t|" + title + "\n")
            fout.write(pmid + "|a|" + abstract + "\n")

            # Annotation offsets index into this concatenated string.
            intext = title + ' ' + abstract
            tag_result = bioTag(intext, biotag_dic, nn_model,
                                onlyLongest=para_set['onlyLongest'],
                                abbrRecog=para_set['abbrRecog'],
                                Threshold=para_set['ML_Threshold'])
            for ele in tag_result:
                # ele[0]/ele[1] are the start/end offsets and ele[2] the
                # concept id; they are joined as strings here, so bioTag is
                # expected to return them as str.  ele[3] is not written.
                start, last, concept_id = ele[0], ele[1], ele[2]
                mention = intext[int(start):int(last)]
                fout.write('\t'.join((pmid, start, last, mention, 'Phenotype', concept_id)) + '\n')

            # NOTE(review): the source had lost its indentation; the blank
            # separator line is assumed to be written once per tagged
            # document (inside the `pmid` branch) - confirm.
            fout.write('\n')
def path_tag_hybrid(inpath,outpath,biotag_dic,nn_model,para_set):
    """Tag every regular file in ``inpath`` and write one result per file.

    Each input file is read as UTF-8 text (trailing whitespace removed) and
    passed to ``bioTag``.  The annotations are written to a file of the same
    name in ``outpath``, one tab-separated line per annotation: the four
    leading fields returned by ``bioTag`` followed by the mention text cut
    from the input with the first two fields as offsets.

    Args:
        inpath: directory holding the input files; a trailing path
            separator is optional.
        outpath: directory receiving the output files; it is created when
            missing.  A trailing path separator is optional.
        biotag_dic: dictionary object handed unchanged to ``bioTag``.
        nn_model: model object handed unchanged to ``bioTag``.
        para_set: mapping providing ``onlyLongest``, ``abbrRecog`` and
            ``ML_Threshold``, which are forwarded to ``bioTag``.
    """
    # List the directory once and keep regular files only; sub-directories
    # cannot be opened as text and would abort the whole run.
    filenames = [name for name in os.listdir(inpath)
                 if os.path.isfile(os.path.join(inpath, name))]
    total = len(filenames)

    if outpath:
        os.makedirs(outpath, exist_ok=True)

    for file_index, filename in enumerate(filenames, start=1):
        print("Processing:{0}%".format(round(file_index * 100 / total)), end="\r")

        # os.path.join instead of '+' so the paths are correct whether or
        # not the directory arguments end with a separator.
        with open(os.path.join(inpath, filename), 'r', encoding='utf-8') as fin:
            intext = fin.read().rstrip()

        tag_result = bioTag(intext, biotag_dic, nn_model,
                            onlyLongest=para_set['onlyLongest'],
                            abbrRecog=para_set['abbrRecog'],
                            Threshold=para_set['ML_Threshold'])

        # The output is opened only after tagging succeeded, so a failure in
        # bioTag does not leave an empty result file for this input.
        with open(os.path.join(outpath, filename), 'w', encoding='utf-8') as fout:
            for ele in tag_result:
                mention = intext[int(ele[0]):int(ele[1])]
                fout.write('\t'.join([ele[0], ele[1], ele[2], ele[3], mention]) + '\n')
def phecr_tag(infile,para_set,outfile):
    """Load the dictionary and the selected model, then tag ``infile``.

    ``para_set['model_type']`` selects the model (``'cnn'`` or
    ``'bioformer'``) together with its vocabulary and weight files, all of
    which are addressed by paths relative to the current working directory.
    A directory input is processed with ``path_tag_hybrid``, a single file
    with ``file_tag_hybrid``.

    Args:
        infile: input file or directory.
        para_set: mapping providing ``model_type`` plus the tagging options
            consumed by the two tagging functions.
        outfile: output file (file input) or output directory (directory
            input).

    Raises:
        SystemExit: with status 1 when ``model_type`` is not supported.
    """
    ontfiles={'dic_file':'../dict_new/noabb_lemma.dic',
              'word_hpo_file':'../dict_new/word_id_map.json',
              'hpo_word_file':'../dict_new/id_word_map.json'}

    model_type = para_set['model_type']
    if model_type == 'cnn':
        vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
                    'charfile':'../vocab/char.vocab',
                    'labelfile':'../dict_new/lable.vocab',
                    'posfile':'../vocab/pos.vocab'}
        modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
    elif model_type == 'bioformer':
        vocabfiles={'labelfile':'../dict_new/lable.vocab',
                    'config_path':'../vocab/bioformer-cased-v1.0/bert_config.json',
                    'checkpoint_path':'../vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                    'vocab_path':'../vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile='../models/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
    else:
        print('Model type is wrong, please select cnn or bioformer.')
        # Non-zero status: a bare sys.exit() would report success.
        sys.exit(1)

    # Reject an unusable input before the dictionary and model are loaded,
    # instead of returning silently afterwards.
    input_is_dir = os.path.isdir(infile)
    if not input_is_dir and not os.path.isfile(infile):
        print('Input is neither a file nor a directory:', infile)
        return

    biotag_dic=dic_ont(ontfiles)

    # model_type was validated above, so exactly these two classes remain.
    model_class = bioTag_CNN if model_type == 'cnn' else bioTag_Bioformer
    nn_model = model_class(vocabfiles)
    nn_model.load_model(modelfile)

    if input_is_dir:
        print("Input a directory:",infile,'\n....tagging begin....')
        tagger = path_tag_hybrid
    else:
        print("Input a file:", infile, '\n....tagging begin....')
        tagger = file_tag_hybrid

    start_time=time.time()
    tagger(infile,outfile,biotag_dic,nn_model,para_set)
    print('tagging done, processing time:',time.time()-start_time)
if __name__=="__main__":
    # Command-line entry point: parse the input/output locations and run the
    # tagger with the fixed parameter set below.
    parser = argparse.ArgumentParser(description='Tagging free text with PhenoTagger, python PhenoTagger_tagging.py -i infile/inpath -o outfile/outpath')
    parser.add_argument('--infile', '-i', help="the input file or path",default='../example/ex2/')
    parser.add_argument('--outfile', '-o', help="the output file or path",default='../example/ex2_output/')
    args = parser.parse_args()

    # A directory input produces one output file per input file, so the
    # output location has to be a directory.  exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    if os.path.isdir(args.infile):
        os.makedirs(args.outfile, exist_ok=True)

    para_set={
        'model_type':'bioformer', # cnn or bioformer
        'onlyLongest':True, # False: return overlapping concepts, True: only the longest
        'abbrRecog':True, # False: don't identify abbreviations, True: identify abbreviations
        'ML_Threshold':0.95, # the threshold of the deep learning model
    }

    phecr_tag(args.infile,para_set,args.outfile)
|