Spaces:
Build error
Build error
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 09:20:38 2020
@author: luol2
"""
import argparse | |
from nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer | |
from dic_ner import dic_ont | |
from tagging_text import bioTag | |
import os | |
import time | |
import json | |
import sys | |
def _parse_pubtator_doc(doc):
    """Extract ``(pmid, title, abstract)`` from one blank-line-delimited record.

    A record is a group of lines in which ``<pmid>|t|<title>`` carries the
    title and ``<pmid>|a|<abstract>`` carries the abstract. Lines with neither
    marker are ignored. Fields that are absent come back as empty strings.
    """
    pmid = title = abstract = ''
    for line in doc.split('\n'):
        t_pos = line.find('|t|')
        a_pos = line.find('|a|')
        # The marker that appears first is the real field separator; any later
        # occurrence is part of the text and must be kept, not split on.
        if t_pos != -1 and (a_pos == -1 or t_pos < a_pos):
            pmid, title = line[:t_pos], line[t_pos + 3:]
        elif a_pos != -1:
            pmid, abstract = line[:a_pos], line[a_pos + 3:]
    return pmid, title, abstract


def file_tag_hybrid(infile, outfile, biotag_dic, nn_model, para_set):
    """Tag every record of a PubTator-style file and write the results.

    ``infile`` is read as UTF-8 and split into records on blank lines. For each
    record that yields a non-empty pmid, the output receives the ``|t|`` line,
    the ``|a|`` line, one tab-separated annotation line per tagged mention
    (pmid, start, end, mention text, the fixed type ``Phenotype``, concept id)
    and a terminating blank line. Records without a pmid are skipped.

    Args:
        infile: path of the input file.
        outfile: path of the output file (overwritten).
        biotag_dic: dictionary object forwarded unchanged to ``bioTag``.
        nn_model: model object forwarded unchanged to ``bioTag``.
        para_set: mapping providing ``'onlyLongest'``, ``'abbrRecog'`` and
            ``'ML_Threshold'``, forwarded to ``bioTag`` as keyword arguments.

    Returns:
        None.
    """
    # Read the whole input before opening the output, so that a missing input
    # does not leave an empty output file behind and the handle is always closed.
    with open(infile, 'r', encoding='utf-8') as fin:
        all_context = fin.read().strip().split('\n\n')
    total = len(all_context)  # always >= 1: str.split never returns an empty list

    with open(outfile, 'w', encoding='utf8') as fout:
        for doc_no, doc in enumerate(all_context, start=1):
            print("Processing:{0}%".format(round(doc_no * 100 / total)), end="\r")

            pmid, title, abstract = _parse_pubtator_doc(doc)
            if pmid == '':
                continue

            fout.write(pmid + "|t|" + title + "\n")
            fout.write(pmid + "|a|" + abstract + "\n")

            # Offsets reported by bioTag index into title and abstract joined
            # by a single space.
            intext = title + ' ' + abstract
            tag_result = bioTag(
                intext,
                biotag_dic,
                nn_model,
                onlyLongest=para_set['onlyLongest'],
                abbrRecog=para_set['abbrRecog'],
                Threshold=para_set['ML_Threshold'],
            )
            for ele in tag_result:
                start, last, concept_id = ele[0], ele[1], ele[2]
                mention = intext[int(start):int(last)]
                # ele[3] (the score) is intentionally not part of this output.
                fout.write('\t'.join((pmid, start, last, mention, 'Phenotype', concept_id)) + '\n')
            fout.write('\n')
def path_tag_hybrid(inpath, outpath, biotag_dic, nn_model, para_set):
    """Tag every regular file in ``inpath`` and write one result file each.

    Each input file is read as UTF-8 (trailing whitespace stripped) and passed
    to ``bioTag``. The result is written to a file of the same name inside
    ``outpath``, one line per tagged element: the element's first four fields
    followed by the text span ``intext[int(ele[0]):int(ele[1])]``, all
    tab-separated.

    Args:
        inpath: directory containing the input files. Sub-directories and
            other non-regular entries are skipped.
        outpath: directory receiving the output files; created if missing.
        biotag_dic: dictionary object forwarded unchanged to ``bioTag``.
        nn_model: model object forwarded unchanged to ``bioTag``.
        para_set: mapping providing ``'onlyLongest'``, ``'abbrRecog'`` and
            ``'ML_Threshold'``, forwarded to ``bioTag`` as keyword arguments.

    Returns:
        None.
    """
    # List the directory once; skip anything open() could not read as a file.
    filenames = [
        name for name in os.listdir(inpath)
        if os.path.isfile(os.path.join(inpath, name))
    ]
    total = len(filenames)

    # An empty outpath means "current directory", which needs no creation.
    if outpath:
        os.makedirs(outpath, exist_ok=True)

    for file_no, filename in enumerate(filenames, start=1):
        print("Processing:{0}%".format(round(file_no * 100 / total)), end="\r")

        # os.path.join keeps the paths correct whether or not the directory
        # arguments end with a separator.
        with open(os.path.join(inpath, filename), 'r', encoding='utf-8') as fin:
            intext = fin.read().rstrip()

        tag_result = bioTag(
            intext,
            biotag_dic,
            nn_model,
            onlyLongest=para_set['onlyLongest'],
            abbrRecog=para_set['abbrRecog'],
            Threshold=para_set['ML_Threshold'],
        )

        with open(os.path.join(outpath, filename), 'w', encoding='utf-8') as fout:
            for ele in tag_result:
                mention = intext[int(ele[0]):int(ele[1])]
                fout.write('\t'.join([ele[0], ele[1], ele[2], ele[3], mention]) + '\n')
def phecr_tag(infile, para_set, outfile):
    """Load the dictionary and the selected model, then tag ``infile``.

    If ``infile`` is a directory it is processed with ``path_tag_hybrid``; if
    it is a regular file, with ``file_tag_hybrid``. All resource paths are
    relative (``../dict_new``, ``../vocab``, ``../models``), so they resolve
    against the current working directory.

    Args:
        infile: input file or input directory.
        para_set: mapping with ``'model_type'`` (``'cnn'`` or ``'bioformer'``)
            plus the tagging options consumed by the two tagging functions.
        outfile: output file (file input) or output directory (directory input).

    Returns:
        None.

    Raises:
        SystemExit: if ``para_set['model_type']`` is not a supported value.
            The message is written to stderr and the exit status is non-zero.
    """
    ontfiles = {
        'dic_file': '../dict_new/noabb_lemma.dic',
        'word_hpo_file': '../dict_new/word_id_map.json',
        'hpo_word_file': '../dict_new/id_word_map.json',
    }

    model_type = para_set['model_type']
    if model_type == 'cnn':
        vocabfiles = {
            'w2vfile': '../vocab/bio_embedding_intrinsic.d200',
            'charfile': '../vocab/char.vocab',
            'labelfile': '../dict_new/lable.vocab',
            'posfile': '../vocab/pos.vocab',
        }
        modelfile = '../models/cnn_p5n5_b128_95_hponew1.h5'
    elif model_type == 'bioformer':
        vocabfiles = {
            'labelfile': '../dict_new/lable.vocab',
            'config_path': '../vocab/bioformer-cased-v1.0/bert_config.json',
            'checkpoint_path': '../vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
            'vocab_path': '../vocab/bioformer-cased-v1.0/vocab.txt',
        }
        modelfile = '../models/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
    else:
        # A string argument makes sys.exit report on stderr with status 1,
        # so a misconfiguration is no longer reported as a successful run.
        sys.exit('Model type is wrong, please select cnn or bioformer.')

    # Check the input before the dictionary and model load, so a bad path is
    # reported immediately instead of being silently ignored after the load.
    input_is_dir = os.path.isdir(infile)
    if not input_is_dir and not os.path.isfile(infile):
        print('Input is neither a file nor a directory:', infile)
        return

    biotag_dic = dic_ont(ontfiles)
    if model_type == 'cnn':
        nn_model = bioTag_CNN(vocabfiles)
    else:
        nn_model = bioTag_Bioformer(vocabfiles)
    nn_model.load_model(modelfile)

    if input_is_dir:
        print("Input a directory:", infile, '\n....tagging begin....')
        tagger = path_tag_hybrid
    else:
        print("Input a file:", infile, '\n....tagging begin....')
        tagger = file_tag_hybrid

    start_time = time.time()
    tagger(infile, outfile, biotag_dic, nn_model, para_set)
    print('tagging done, processing time:', time.time() - start_time)
if __name__ == "__main__":
    # Command-line entry point: read the input/output locations and run the tagger.
    cli = argparse.ArgumentParser(
        description='Tagging free text with PhenoTagger, python PhenoTagger_tagging.py -i infile/inpath -o outfile/outpath'
    )
    cli.add_argument('--infile', '-i', default='../example/ex2/', help="the input file or path")
    cli.add_argument('--outfile', '-o', default='../example/ex2_output/', help="the output file or path")
    options = cli.parse_args()

    source = options.infile
    target = options.outfile

    # For directory input the output is a directory too; create it when absent.
    if os.path.isdir(source) and not os.path.exists(target):
        os.makedirs(target)

    para_set = dict(
        model_type='bioformer',  # cnn or bioformer
        onlyLongest=True,        # False: return overlapping concepts, True: only the longest
        abbrRecog=True,          # False: don't identify abbreviations, True: identify them
        ML_Threshold=0.95,       # threshold of the deep learning model
    )

    phecr_tag(source, para_set, target)