lingbionlp committed on
Commit
3d2a798
·
1 Parent(s): f9f538e

Update src/tagging_text.py

Browse files
Files changed (1) hide show
  1. src/tagging_text.py +102 -102
src/tagging_text.py CHANGED
@@ -1,102 +1,102 @@
1
- # -*- coding: utf-8 -*-
2
- """
3
- Created on Mon Aug 24 16:21:23 2020
4
-
5
- @author: luol2
6
- """
7
-
8
- import argparse
9
- from ssplit_tokenzier import ssplit_token_pos_lemma
10
- from ml_ner import ml_tagging,ml_tagging_allngram
11
- from combine_result import combine_ml_dict
12
- from restore_index import restore_index_nest_fn
13
- from dic_ner import dic_ont
14
- from post_processing import combine_overlap
15
- from abbre_resolution import postprocess_abbr
16
- import os
17
- import time
18
- import json
19
-
20
- #hybrid method
21
def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
    """Tag ``text`` with the hybrid (dictionary + machine-learning) pipeline.

    The text is run through ``ssplit_token_pos_lemma``; the result is matched
    with ``biotag_dic.matching`` and tagged with ``ml_tagging``. Both outputs
    are merged by ``combine_ml_dict`` and handed, together with the original
    text, to ``restore_index_nest_fn``.

    Args:
        session: Forwarded unchanged to ``ml_tagging``.
        text: Input text to tag.
        biotag_dic: Object exposing a ``matching(tokens)`` method.
        ml_model: Forwarded unchanged to ``ml_tagging``.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.
        Threshold: Forwarded unchanged to ``ml_tagging``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)

    # Run both recognisers on the same tokenised input, then merge them.
    dictionary_hits = biotag_dic.matching(tokens)
    model_hits = ml_tagging(session, tokens, ml_model, Threshold)
    merged_hits = combine_ml_dict(dictionary_hits, model_hits)

    entities = restore_index_nest_fn(text, merged_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
56
-
57
- # only machine learning-based method
58
def bioTag_ml(text,ml_model,onlyLongest=False,abbrRecog=False, Threshold=0.95):
    """Tag ``text`` using only the machine-learning tagger.

    The text is run through ``ssplit_token_pos_lemma``, tagged with
    ``ml_tagging_allngram`` and handed, together with the original text, to
    ``restore_index_nest_fn``.

    Args:
        text: Input text to tag.
        ml_model: Forwarded unchanged to ``ml_tagging_allngram``.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.
        Threshold: Forwarded unchanged to ``ml_tagging_allngram``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)
    model_hits = ml_tagging_allngram(tokens, ml_model, Threshold)
    entities = restore_index_nest_fn(text, model_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
79
-
80
- # only dict method
81
def bioTag_dic(text,biotag_dic,onlyLongest=False, abbrRecog=False):
    """Tag ``text`` using only the dictionary matcher.

    The text is run through ``ssplit_token_pos_lemma``, matched with
    ``biotag_dic.matching`` and handed, together with the original text, to
    ``restore_index_nest_fn``.

    Args:
        text: Input text to tag.
        biotag_dic: Object exposing a ``matching(tokens)`` method.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)
    dictionary_hits = biotag_dic.matching(tokens)
    entities = restore_index_nest_fn(text, dictionary_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
102
-
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Aug 24 16:21:23 2020
4
+
5
+ @author: luol2
6
+ """
7
+
8
+ import argparse
9
+ from src.ssplit_tokenzier import ssplit_token_pos_lemma
10
+ from src.ml_ner import ml_tagging,ml_tagging_allngram
11
+ from src.combine_result import combine_ml_dict
12
+ from src.restore_index import restore_index_nest_fn
13
+ from src.dic_ner import dic_ont
14
+ from src.post_processing import combine_overlap
15
+ from src.abbre_resolution import postprocess_abbr
16
+ import os
17
+ import time
18
+ import json
19
+
20
+ #hybrid method
21
def bioTag(session,text,biotag_dic,ml_model,onlyLongest=False, abbrRecog=False, Threshold=0.95):
    """Tag ``text`` with the hybrid (dictionary + machine-learning) pipeline.

    The text is run through ``ssplit_token_pos_lemma``; the result is matched
    with ``biotag_dic.matching`` and tagged with ``ml_tagging``. Both outputs
    are merged by ``combine_ml_dict`` and handed, together with the original
    text, to ``restore_index_nest_fn``.

    Args:
        session: Forwarded unchanged to ``ml_tagging``.
        text: Input text to tag.
        biotag_dic: Object exposing a ``matching(tokens)`` method.
        ml_model: Forwarded unchanged to ``ml_tagging``.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.
        Threshold: Forwarded unchanged to ``ml_tagging``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)

    # Run both recognisers on the same tokenised input, then merge them.
    dictionary_hits = biotag_dic.matching(tokens)
    model_hits = ml_tagging(session, tokens, ml_model, Threshold)
    merged_hits = combine_ml_dict(dictionary_hits, model_hits)

    entities = restore_index_nest_fn(text, merged_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
56
+
57
+ # only machine learning-based method
58
def bioTag_ml(text,ml_model,onlyLongest=False,abbrRecog=False, Threshold=0.95):
    """Tag ``text`` using only the machine-learning tagger.

    The text is run through ``ssplit_token_pos_lemma``, tagged with
    ``ml_tagging_allngram`` and handed, together with the original text, to
    ``restore_index_nest_fn``.

    Args:
        text: Input text to tag.
        ml_model: Forwarded unchanged to ``ml_tagging_allngram``.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.
        Threshold: Forwarded unchanged to ``ml_tagging_allngram``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)
    model_hits = ml_tagging_allngram(tokens, ml_model, Threshold)
    entities = restore_index_nest_fn(text, model_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
79
+
80
+ # only dict method
81
def bioTag_dic(text,biotag_dic,onlyLongest=False, abbrRecog=False):
    """Tag ``text`` using only the dictionary matcher.

    The text is run through ``ssplit_token_pos_lemma``, matched with
    ``biotag_dic.matching`` and handed, together with the original text, to
    ``restore_index_nest_fn``.

    Args:
        text: Input text to tag.
        biotag_dic: Object exposing a ``matching(tokens)`` method.
        onlyLongest: When equal to ``True``, the result is additionally
            passed through ``combine_overlap``.
        abbrRecog: When equal to ``True``, the result is additionally passed
            through ``postprocess_abbr`` along with ``text``.

    Returns:
        The output of ``restore_index_nest_fn`` after the optional
        post-processing steps.
    """
    tokens = ssplit_token_pos_lemma(text)
    dictionary_hits = biotag_dic.matching(tokens)
    entities = restore_index_nest_fn(text, dictionary_hits)

    # Explicit comparison with True is kept on purpose: truthy values that
    # are not equal to True must not enable the optional steps.
    if onlyLongest == True:  # noqa: E712
        entities = combine_overlap(entities)
    if abbrRecog == True:  # noqa: E712
        entities = postprocess_abbr(entities, text)

    return entities
102
+