# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 16:21:25 2022

@author: luol2
"""

import streamlit as st
from src.nn_model import bioTag_CNN, bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
from pandas import DataFrame
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
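
# The resources above are used by the tagging pipeline: 'punkt' for
# tokenization, and (presumably) the POS tagger and WordNet for the
# lemmatized dictionary lookup.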
st.set_page_config(
    page_title="PhenoTagger",
    page_icon="🎈",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoTagger v1.1"
    }
)
# def _max_width_():
#     max_width_str = f"max-width: 2400px;"
#     st.markdown(
#         f"""
#         <style>
#         .reportview-container .main .block-container{{
#             {max_width_str}
#         }}
#         </style>
#         """,
#         unsafe_allow_html=True,
#     )
# _max_width_()

# c30, c31, c32 = st.columns([2.5, 1, 3])
# with c30:
#     # st.image("logo.png", width=400)
st.title("👨‍⚕️ PhenoTagger Demo")
with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        """
- This app is an easy-to-use interface built in Streamlit for the [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library.
- PhenoTagger is a hybrid method that combines dictionary- and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
        """
    )
st.markdown("") | |
st.markdown("") | |
st.markdown("## 📌 Paste document ") | |
with st.form(key="my_form"):
    _, c1, _, c2, _ = st.columns([0.07, 1, 0.07, 4, 0.07])

    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["Bioformer(Default)", "CNN"],
            help="Bioformer is more precise, CNN is more efficient",
        )

        if ModelType == "Bioformer(Default)":
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'labelfile': './dict_new/lable.vocab',
                              'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
                              'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                              'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt'}

                modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

                biotag_dic = dic_ont(ontfiles)
                nn_model = bioTag_Bioformer(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()

        else:
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'w2vfile': './vocab/bio_embedding_intrinsic.d200',
                              'charfile': './vocab/char.vocab',
                              'labelfile': './dict_new/lable.vocab',
                              'posfile': './vocab/pos.vocab'}

                modelfile = './models/cnn_p5n5_b128_95_hponew1.h5'

                biotag_dic = dic_ont(ontfiles)
                nn_model = bioTag_CNN(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()
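
        # Note: the model above is rebuilt on every Streamlit rerun. On recent
        # Streamlit versions it could be loaded once and reused, e.g. (a sketch,
        # assuming Streamlit >= 1.18 where st.cache_resource is available):
        #
        #     @st.cache_resource
        #     def load_model():
        #         ...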
        para_overlap = st.checkbox(
            "Overlapping concepts",
            value=True,
            help="Tick this box to identify overlapping concepts",
        )
        para_abbr = st.checkbox(
            "Abbreviations",
            value=True,
            help="Tick this box to identify abbreviations",
        )
        para_threshold = st.slider(
            "Threshold",
            min_value=0.5,
            max_value=0.95,
            value=0.95,
            step=0.05,
            help="Return only the predictions whose score is above the threshold.",
        )
    with c2:
        doc = st.text_area(
            "Paste your text below",
            height=400,
        )

        # MAX_WORDS = 500
        # import re
        # res = len(re.findall(r"\w+", doc))
        # if res > MAX_WORDS:
        #     st.warning(
        #         "⚠️ Your text contains "
        #         + str(res)
        #         + " words."
        #         + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
        #     )
        #     doc = doc[:MAX_WORDS]

        submit_button = st.form_submit_button(label="✨ Submit!")

if not submit_button:
    st.stop()
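
# st.stop() above halts the script on the initial page load, so everything
# below only runs once the form has been submitted.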
para_set = {
    # 'model_type': para_model,  # cnn or bioformer
    'onlyLongest': not para_overlap,  # the checkbox asks for overlaps, so invert for onlyLongest
    'abbrRecog': para_abbr,  # False: don't identify abbreviations; True: identify them
    'ML_Threshold': para_threshold,  # score threshold of the deep learning model
}
st.markdown("") | |
st.markdown("## 💡 Tagging results:") | |
with st.spinner('Wait for tagging...'): | |
tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold']) | |
st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse over the entity to display the HPO id.</font>', unsafe_allow_html=True) | |
# print('dic...........:',biotag_dic.keys()) | |
# st.write('parameters:', para_overlap,para_abbr,para_threshold) | |
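
# Each element of tag_result appears to be [start, end, HPO id, score], with
# character offsets (as strings) into doc; the stitching below assumes the
# entities are sorted by start offset and non-overlapping.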
html_results = ''
text_results = doc + '\n'
entity_end = 0
hpoid_count = {}

if len(tag_result) > 0:
    for ele in tag_result:
        entity_start = int(ele[0])
        html_results += doc[entity_end:entity_start]
        entity_end = int(ele[1])
        entity_id = ele[2]
        entity_score = ele[3]
        text_results += ele[0] + '\t' + ele[1] + '\t' + doc[entity_start:entity_end] + '\t' + ele[2] + '\t' + format(float(ele[3]), '.2f') + '\n'
        if entity_id not in hpoid_count:
            hpoid_count[entity_id] = 1
        else:
            hpoid_count[entity_id] += 1
        html_results += '<font style="background-color: rgb(255, 204, 0);" title="' + entity_id + '">' + doc[entity_start:entity_end] + '</font>'
    html_results += doc[entity_end:]
else:
    html_results = doc

st.markdown('<table border="1"><tr><td>' + html_results + '</td></tr></table>', unsafe_allow_html=True)
# table
data_entity = []
for ele in hpoid_count:
    temp = [ele, biotag_dic.hpo_word[ele][0], hpoid_count[ele]]  # HPO id, term name, count
    data_entity.append(temp)
st.markdown("") | |
st.markdown("") | |
# st.markdown("## Table output:") | |
# cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2]) | |
# with c1: | |
# CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)") | |
# with c2: | |
# CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)") | |
# with c3: | |
# CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)") | |
# st.header("") | |
df = (
    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
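# Start row numbering at 1 in the displayed table.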
df.index += 1
c1, c2, c3 = st.columns([1, 4, 1])
with c2:
    st.table(df)

c1, c2, c3 = st.columns([1, 1, 1])
with c2:
    st.download_button('Download annotations', text_results)
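
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py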