Build error
lingbionlp committed
Commit 4dc59ae · 1 Parent(s): 82ee352
Upload app.py
app.py CHANGED
@@ -1,157 +1,256 @@
 # -*- coding: utf-8 -*-
 """
-Created on
+Created on Mon Nov 21 16:21:25 2022

 @author: luol2
 """

-
-
 import streamlit as st
-import
-from src.nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
+from src.nn_model import bioTag_CNN,bioTag_Bioformer
 from src.dic_ner import dic_ont
 from src.tagging_text import bioTag
 import os
-import time
 import json
-import
-import nltk
-nltk.download('punkt')
-nltk.download('averaged_perceptron_tagger')
-nltk.download('wordnet')
+from pandas import DataFrame

 st.set_page_config(
     page_title="PhenoTagger",
-    page_icon="
-
-    initial_sidebar_state="expanded",
+    page_icon="🎈",
+    layout="wide",
     menu_items={
-        'Get Help': 'https://www.
-        '
-        'About': "# This is a header. This is an *extremely* cool app!"
+        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
+        'About': "PhenoTagger v1.1"
     }
 )
-st.title('PhenoTagger Demo')


+# def _max_width_():
+#     max_width_str = f"max-width: 2400px;"
+#     st.markdown(
+#         f"""
+#         <style>
+#         .reportview-container .main .block-container{{
+#             {max_width_str}
+#         }}
+#         </style>
+#         """,
+#         unsafe_allow_html=True,
+#     )


-
+# _max_width_()

+# c30, c31, c32 = st.columns([2.5, 1, 3])

-# with
-
-
-
+# with c30:
+#     # st.image("logo.png", width=400)
+st.title("👨⚕️ PhenoTagger Demo")

-
+with st.expander("ℹ️ - About this app", expanded=True):
+
+    st.write(
+        """
+-   This app is an easy-to-use interface, built in Streamlit, for the [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library!
+-   PhenoTagger is a hybrid method that combines dictionary- and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
+-   Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
+        """
+    )
+
+st.markdown("")
+
+st.markdown("")
+st.markdown("## 📌 Paste document")
 with st.form(key="my_form"):

-
-
-
+    ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 4, 0.07])
+    with c1:
+        ModelType = st.radio(
+            "Choose your model",
+            ["Bioformer(Default)", "CNN"],
+            help="Bioformer is more precise, CNN is more efficient",
+        )
+
+        if ModelType == "Bioformer(Default)":
+            # kw_model = KeyBERT(model=roberta)
+
+            @st.cache(allow_output_mutation=True)
+            def load_model():
+                ontfiles={'dic_file':'./dict_new/noabb_lemma.dic',
+                          'word_hpo_file':'./dict_new/word_id_map.json',
+                          'hpo_word_file':'./dict_new/id_word_map.json'}
+
+                vocabfiles={'labelfile':'./dict_new/lable.vocab',
+                            'config_path':'./vocab/bioformer-cased-v1.0/bert_config.json',
+                            'checkpoint_path':'./vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
+                            'vocab_path':'./vocab/bioformer-cased-v1.0/vocab.txt'}
+                modelfile='./vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
+
+                biotag_dic=dic_ont(ontfiles)
+
+                nn_model=bioTag_Bioformer(vocabfiles)
+                nn_model.load_model(modelfile)
+                return nn_model,biotag_dic
+
+            nn_model,biotag_dic = load_model()
+
+        else:
+            @st.cache(allow_output_mutation=True)
+            def load_model():
+                ontfiles={'dic_file':'./dict_new/noabb_lemma.dic',
                           'word_hpo_file':'./dict_new/word_id_map.json',
                           'hpo_word_file':'./dict_new/id_word_map.json'}

-
-
-
-
-
-
+                vocabfiles={'w2vfile':'./vocab/bio_embedding_intrinsic.d200',
+                            'charfile':'./vocab/char.vocab',
+                            'labelfile':'./dict_new/lable.vocab',
+                            'posfile':'./vocab/pos.vocab'}
+                modelfile='./models/cnn_p5n5_b128_95_hponew1.h5'

-
-
-
-
-
-
-
-
-    # sys.exit()
+                biotag_dic=dic_ont(ontfiles)
+
+                nn_model=bioTag_CNN(vocabfiles)
+                nn_model.load_model(modelfile)
+
+                return nn_model,biotag_dic
+
+            nn_model,biotag_dic = load_model()

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    sbform.form_submit_button("Setting")
-
-    st.write('parameters:', para_overlap,para_abbr,para_threshold)
-    nn_model,biotag_dic,test_tag,session=load_model()
-
-
-    input_text = st.text_area(
-        "Paste your text below (max 500 words)",
-        height=510,
-    )
-
-
-
-
-
-
-
-        + str(res)
-        + " words."
-        + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
-    )
-
-    submit_button = st.form_submit_button(label="✨ Get me the data!")
-
-    if para_overlap=='True':
-        para_overlap=True
-    else:
-        para_overlap=False
-    if para_abbr=='True':
-        para_abbr=True
-    else:
-        para_abbr=False
-    para_set={
-        # 'model_type':para_model, # cnn or bioformer
-        'onlyLongest':para_overlap, # False: return overlapping concepts, True: only the longest
-        'abbrRecog':para_abbr, # False: don't identify abbreviations, True: identify abbreviations
-        'ML_Threshold':para_threshold, # the threshold of the deep learning model
-        }
+        para_overlap = st.checkbox(
+            "Overlap concept",
+            value=True,
+            help="Tick this box to identify overlapping concepts",
+        )
+        para_abbr = st.checkbox(
+            "Abbreviations",
+            value=True,
+            help="Tick this box to identify abbreviations",
+        )
+
+        para_threshold = st.slider(
+            "Threshold",
+            min_value=0.5,
+            max_value=0.95,
+            value=0.95,
+            step=0.05,
+            help="Return the predictions whose score is over the threshold.",
+        )
+
+    with c2:
+        doc = st.text_area(
+            "Paste your text below",
+            height=400,
+        )
+
+        # MAX_WORDS = 500
+        # import re
+        # res = len(re.findall(r"\w+", doc))
+        # if res > MAX_WORDS:
+        #     st.warning(
+        #         "⚠️ Your text contains "
+        #         + str(res)
+        #         + " words."
+        #         + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
+        #     )
+
+        # doc = doc[:MAX_WORDS]
+
+        submit_button = st.form_submit_button(label="✨ Submit!")


 if not submit_button:
     st.stop()

-
-
+para_set={
+    # 'model_type':para_model, # cnn or bioformer
+    'onlyLongest':para_overlap, # False: return overlapping concepts, True: only the longest
+    'abbrRecog':para_abbr, # False: don't identify abbreviations, True: identify abbreviations
+    'ML_Threshold':para_threshold, # the threshold of the deep learning model
+    }
+st.markdown("")
+st.markdown("## 💡 Tagging results:")
+with st.spinner('Wait for tagging...'):
+    tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
+
+st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse over an entity to display its HPO id.</font>', unsafe_allow_html=True)
 # print('dic...........:',biotag_dic.keys())
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+# st.write('parameters:', para_overlap,para_abbr,para_threshold)
+
+html_results=''
+text_results=doc+'\n'
+entity_end=0
+hpoid_count={}
+if len(tag_result)>0:
+    for ele in tag_result:
+        entity_start=int(ele[0])
+        html_results+=doc[entity_end:entity_start]
+        entity_end=int(ele[1])
+        entity_id=ele[2]
+        entity_score=ele[3]
+        text_results+=ele[0]+'\t'+ele[1]+'\t'+doc[entity_start:entity_end]+'\t'+ele[2]+'\t'+format(float(ele[3]),'.2f')+'\n'
+        if entity_id not in hpoid_count.keys():
+            hpoid_count[entity_id]=1
+        else:
+            hpoid_count[entity_id]+=1
+
+        html_results+='<font style="background-color: rgb(255, 204, 0);" title="'+entity_id+'">'+doc[entity_start:entity_end]+'</font>'
+    html_results+=doc[entity_end:]
+
+else:
+    html_results=doc
+
+st.markdown('<table border="1"><tr><td>'+html_results+'</td></tr></table>', unsafe_allow_html=True)
+
+
+# table
+data_entity=[]
+for ele in hpoid_count.keys():
+    temp=[ele,biotag_dic.hpo_word[ele][0],hpoid_count[ele]] # hpoid, term name, count
+    data_entity.append(temp)
+
+
+st.markdown("")
+st.markdown("")
+# st.markdown("## Table output:")
+
+# cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2])
+
+# with c1:
+#     CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)")
+# with c2:
+#     CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)")
+# with c3:
+#     CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)")
+
+# st.header("")
+
+df = (
+    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
+    .sort_values(by="Frequency", ascending=False)
+    .reset_index(drop=True)
+)
+
+df.index += 1
+
+c1, c2, c3 = st.columns([1, 4, 1])
+
+# format_dictionary = {
+#     "Relevancy": "{:.1%}",
+# }
+
+# df = df.format(format_dictionary)
+
+with c2:
+    st.table(df)
+
+c1, c2, c3 = st.columns([1, 1, 1])
+with c2:
+    st.download_button('Download annotations', text_results)
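For readers skimming the diff, below is a minimal, self-contained sketch (not part of the commit) of the result-processing step this version adds, with a mocked tag_result standing in for a real bioTag(...) call. The document text, offsets, HPO ids, and scores are illustrative assumptions only; the app itself also resolves term names via biotag_dic.hpo_word and renders through Streamlit.

# Hypothetical sketch of app.py's post-tagging logic with mocked data.
# Assumes, as the loop in app.py implies, that bioTag returns one
# [start, end, hpo_id, score] list per entity, offsets as strings.
from pandas import DataFrame

doc = "The patient presented with short stature and seizures."
tag_result = [                            # mocked; real values come from bioTag(...)
    ["27", "40", "HP:0004322", "0.97"],   # "short stature"
    ["45", "53", "HP:0001250", "0.95"],   # "seizures"
]

text_results = doc + "\n"
hpoid_count = {}
for start, end, hpo_id, score in tag_result:
    s, e = int(start), int(end)
    # One tab-separated annotation line per entity, as in the download file.
    text_results += f"{start}\t{end}\t{doc[s:e]}\t{hpo_id}\t{float(score):.2f}\n"
    hpoid_count[hpo_id] = hpoid_count.get(hpo_id, 0) + 1

# Frequency table, mirroring the DataFrame built at the end of app.py
# (minus the term-name column, which needs biotag_dic).
df = (
    DataFrame([[k, v] for k, v in hpoid_count.items()],
              columns=["HPO_id", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

print(text_results)
print(df)

Assuming the Space's dependencies are installed, the app itself is launched with streamlit run app.py.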