Commit 645d04b
lingbionlp committed
Parent(s): acdbc7f

Upload 10 files
Files changed:
- src/ml_ner.py +8 -17
- src/nn_model.py +1 -13
- src/tagging_text.py +2 -2
src/ml_ner.py CHANGED

@@ -8,7 +8,7 @@ Created on Fri Jun 12 16:41:54 2020
 import io
 import time
 import numpy as np
-
+
 def ml_intext(infile):
     fin=open(infile,'r',encoding='utf-8')
     alltexts=fin.read().strip().split('\n\n')
@@ -462,7 +462,7 @@ def combine_strategy(test_decode_temp, T=0.8):
     return fout.getvalue()
 
 
-def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
+def model_predict(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
     if nn_model.model_type=='cnn':
         #startTime=time.time()
         test_set,test_label = ml_intext_fn(ml_input)
@@ -482,7 +482,6 @@ def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
             input_test.append(test_x[3])
         # print('ml-model-represent:',time.time()-startTime)
         # startTime=time.time()
-        K.set_session(session)
         test_pre = nn_model.model.predict(input_test)
         # print('ml-model-predict:',time.time()-startTime)
 
@@ -492,10 +491,6 @@ def model_predict(session,ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
         test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
         #print('ml-model-represent:',time.time()-startTime)
         #startTime=time.time()
-        #K.set_session(session)
-        #with session.as_default():
-            #with session.graph.as_default():
-                #print('......session')
         test_pre = nn_model.model.predict(test_x)
         #print('ml-model-modedpred:',time.time()-startTime)
         # startTime=time.time()
@@ -527,19 +522,15 @@ def model_predict_old(ml_input,nn_model,ml_input_txt,ml_input_index,Threshold):
 
         if nn_model.fea_dict['pos'] == 1:
             input_test.append(test_x[3])
-
-
-        with nn_model.session.graph.as_default():
-            test_pre = nn_model.model.predict(input_test,batch_size=256)
+
+        test_pre = nn_model.model.predict(input_test,batch_size=256)
 
     elif nn_model.model_type=='bert' or nn_model.model_type=='bioformer':
 
         test_set,test_label = ml_intext_fn(ml_input)
         test_x,test_y=nn_model.rep.load_data(test_set,test_label,word_max_len=nn_model.maxlen)
-
-
-        with nn_model.session.graph.as_default():
-            test_pre = nn_model.model.predict(test_x,batch_size=128)
+
+        test_pre = nn_model.model.predict(test_x,batch_size=128)
 
         test_score=output_result(test_pre, nn_model.rep.label_2_index,Top_N=3)
         #print('test_score:',test_score)
@@ -562,7 +553,7 @@ def output_txt(ml_input_txt):
 
     return fout.getvalue()
 
-def ml_tagging(session,ssplit_token,ml_model,Threshold):
+def ml_tagging(ssplit_token,ml_model,Threshold):
     # startTime=time.time()
     ml_input, ml_input_txt,ml_input_index=build_ngram_testset_filted(ssplit_token)
     # print('ml-ngrambuild:',time.time()-startTime)
@@ -570,7 +561,7 @@ def ml_tagging(session,ssplit_token,ml_model,Threshold):
     #print(ml_input)
     # startTime=time.time()
     if len(ml_input_index)>0:
-        ml_pre_tsv=model_predict(session,ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
+        ml_pre_tsv=model_predict(ml_input,ml_model,ml_input_txt,ml_input_index,Threshold)
     else:
         ml_pre_tsv=output_txt(ml_input_txt)
     # print('ml-modelpred:',time.time()-startTime)
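The ml_ner.py changes all follow one pattern: drop the session parameter and call nn_model.model.predict(...) directly instead of pinning the call to a stored TF1 session/graph (K.set_session(session), with nn_model.session.graph.as_default():). Below is a minimal, self-contained sketch of the two styles under TensorFlow 2; the toy build_model helper and the random input are illustrative stand-ins, not code from this repository.

import numpy as np
import tensorflow as tf

def build_model():
    # Toy stand-in for the repo's CNN/BERT tagger; any compiled Keras model works here.
    inp = tf.keras.layers.Input(shape=(16,))
    out = tf.keras.layers.Dense(3, activation="softmax")(inp)
    return tf.keras.Model(inp, out)

model = build_model()
x = np.random.rand(4, 16).astype("float32")

# TF1-era pattern removed by this commit: thread a session through the call and
# enter its graph before predicting, e.g.
#   K.set_session(session)
#   with nn_model.session.graph.as_default():
#       test_pre = nn_model.model.predict(x, batch_size=256)

# Pattern used after this commit: call predict directly, no session bookkeeping.
test_pre = model.predict(x, batch_size=256)
print(test_pre.shape)  # (4, 3)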
src/nn_model.py CHANGED

@@ -8,12 +8,10 @@ Created on Thu Mar 26 09:04:13 2020
 import time
 import sys
 import numpy as np
-import tensorflow as tf
 import keras
 from src.nn_represent import CNN_RepresentationLayer,BERT_RepresentationLayer
 from keras.layers import *
 from keras.models import Model
-from keras import backend as K
 from keras_bert import load_trained_model_from_checkpoint
 
 
@@ -37,7 +35,7 @@ class bioTag_CNN():
         self.charfile=model_files['charfile']
         self.labelfile=model_files['labelfile']
         self.posfile=model_files['posfile']
-
+
         vocab={'char':self.charfile,'label':self.labelfile,'pos':self.posfile}
         print('loading w2v model.....')
         self.rep = CNN_RepresentationLayer(self.w2vfile,vocab_file=vocab, frequency=400000)
@@ -94,8 +92,6 @@ class bioTag_CNN():
         self.model = Model(inputs=all_fea, outputs=output)
     def load_model(self,model_file):
         self.model.load_weights(model_file)
-        self.session = K.get_session()
-        print(self.session)
         #self.model.summary()
         print('load cnn model done!')
 
@@ -107,7 +103,6 @@ class bioTag_BERT():
         checkpoint_path = model_files['checkpoint_path']
         vocab_path = model_files['vocab_path']
         self.label_file=model_files['labelfile']
-        self.session = tf.Session()
 
         self.rep = BERT_RepresentationLayer( vocab_path, self.label_file)
 
@@ -124,8 +119,6 @@ class bioTag_BERT():
 
     def load_model(self,model_file):
         self.model.load_weights(model_file)
-        self.session = K.get_session()
-        print(self.session)
         #self.model.summary()
 
 class bioTag_Bioformer():
@@ -152,11 +145,6 @@ class bioTag_Bioformer():
 
     def load_model(self,model_file):
         self.model.load_weights(model_file)
-        #self.model._make_predict_function()
-        #session = K.get_session()
-        #print(session)
         #self.model.summary()
-        session=''
-        return session
         print('load bioformer model done!')
 
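In nn_model.py the same cleanup removes the tensorflow/Keras-backend imports and the self.session bookkeeping, so each tagger's load_model reduces to load_weights plus a log line. A small sketch of that post-commit shape, assuming TensorFlow 2's bundled Keras; TinyTagger, its toy architecture, and the weight file name are hypothetical stand-ins for bioTag_CNN / bioTag_BERT / bioTag_Bioformer.

import tensorflow as tf

class TinyTagger:
    def __init__(self):
        # Hypothetical toy architecture; the real classes build CNN/BERT/Bioformer graphs.
        inp = tf.keras.layers.Input(shape=(32,))
        out = tf.keras.layers.Dense(5, activation="softmax")(inp)
        self.model = tf.keras.Model(inp, out)

    def load_model(self, model_file):
        # Post-commit shape: just load the weights; no tf.Session()/K.get_session() capture.
        self.model.load_weights(model_file)
        # self.model.summary()
        print("load model done!")

tagger = TinyTagger()
tagger.model.save_weights("tiny_tagger.weights.h5")  # write a matching file for the demo
tagger.load_model("tiny_tagger.weights.h5")

The weight file must match the architecture built in __init__, which is why the repository constructs the network there and only loads weights in load_model.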
src/tagging_text.py CHANGED

@@ -18,7 +18,7 @@ import time
 import json
 
 #hybrid method
-def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
+def bioTag(text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
 
     # startTime=time.time()
     ssplit_token=ssplit_token_pos_lemma(text)
@@ -31,7 +31,7 @@ def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
     # print('dict ner:',time.time()-startTime)
 
     # startTime=time.time()
-    ml_tsv=ml_tagging(session,ssplit_token,ml_model,Threshold)
+    ml_tsv=ml_tagging(ssplit_token,ml_model,Threshold)
     #print('ml_tsv:\n',ml_tsv)
     # print('ml ner:',time.time()-startTime)
 
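Taken together, the three files drop the session argument from the whole bioTag -> ml_tagging -> model_predict chain, so callers pass only the text, the dictionary and the loaded model. A runnable sketch of that calling pattern is below; the toy model, random feature matrix, and the names ToyModelHolder/threshold are illustrative assumptions, while in the repository the features come from build_ngram_testset_filted and the model object from nn_model.py.

import numpy as np
import tensorflow as tf

class ToyModelHolder:
    # Stand-in for the loaded nn_model/ml_model object referenced in the diff.
    def __init__(self):
        inp = tf.keras.layers.Input(shape=(8,))
        out = tf.keras.layers.Dense(2, activation="softmax")(inp)
        self.model = tf.keras.Model(inp, out)

def model_predict(ml_input, nn_model, threshold):
    # Post-commit style: no session parameter, no graph context manager.
    scores = nn_model.model.predict(ml_input, batch_size=128)
    return (scores.max(axis=1) >= threshold).tolist()

def ml_tagging(ssplit_token, ml_model, threshold):
    # Toy feature matrix standing in for the n-gram test set built from the tokens.
    ml_input = np.random.rand(len(ssplit_token), 8).astype("float32")
    return model_predict(ml_input, ml_model, threshold)

holder = ToyModelHolder()
print(ml_tagging(["short", "stature"], holder, threshold=0.4))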