# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 09:54:41 2022
@author: luol2
"""
import streamlit as st
import argparse
from src.nn_model import bioTag_CNN, bioTag_BERT, bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
import os
import time
import json
import sys
import re
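# Fetch the NLTK resources used for tokenization, POS tagging and lemmatization (no-op if already downloaded)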
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
st.set_page_config(
    page_title="PhenoTagger",
    page_icon=":shark:",
    # layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# This is a header. This is an *extremely* cool app!"
    }
)
st.title('PhenoTagger Demo')

# with st.spinner('Model is being loaded..'):
#     print('load model done!')
with st.form(key="my_form"):

    def load_model():
        """Load the HPO dictionary files and the Bioformer tagging model."""
        ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                    'word_hpo_file': './dict_new/word_id_map.json',
                    'hpo_word_file': './dict_new/id_word_map.json'}

        # if para_set['model_type']=='cnn':
        #     vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
        #                 'charfile':'../vocab/char.vocab',
        #                 'labelfile':'../dict_new/lable.vocab',
        #                 'posfile':'../vocab/pos.vocab'}
        #     modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
        # elif para_set['model_type']=='bioformer':
        vocabfiles = {'labelfile': './dict_new/lable.vocab',
                      'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
                      'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                      'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
        # else:
        #     print('Model type is wrong, please select cnn or bioformer.')
        #     sys.exit()

        biotag_dic = dic_ont(ontfiles)

        # if para_set['model_type']=='cnn':
        #     nn_model=bioTag_CNN(vocabfiles)
        #     nn_model.load_model(modelfile)
        # elif para_set['model_type']=='bioformer':
        nn_model = bioTag_Bioformer(vocabfiles)
        session = nn_model.load_model(modelfile)
        test_tag = '1232'
        return nn_model, biotag_dic, test_tag, session
    # Hyper-parameter controls in the sidebar
    st.sidebar.header("Hyperparameter Settings")
    sbform = st.sidebar.form("Hyper-parameters")
    # para_model=sbform.selectbox('Model', ['cnn', 'bioformer'])
    para_overlap = sbform.selectbox('Return overlapping concepts', ['True', 'False'])
    para_abbr = sbform.selectbox('Identify abbreviations', ['True', 'False'])
    para_threshold = sbform.slider('Threshold:', min_value=0.5, max_value=0.95, value=0.95, step=0.05)
    sbform.form_submit_button("Apply settings")
    st.write('parameters:', para_overlap, para_abbr, para_threshold)
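    # Load the ontology dictionary and the Bioformer tagging model (plus its session handle) for this run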
    nn_model, biotag_dic, test_tag, session = load_model()

    input_text = st.text_area(
        "Paste your text below (max 500 words)",
        height=510,
    )
    MAX_WORDS = 500
    word_spans = list(re.finditer(r"\w+", input_text))
    if len(word_spans) > MAX_WORDS:
        st.warning(
            f"⚠️ Your text contains {len(word_spans)} words. "
            f"Only the first {MAX_WORDS} words will be reviewed. Stay tuned as increased allowance is coming! 😊"
        )
        # Truncate the input at the end of the 500th word
        input_text = input_text[:word_spans[MAX_WORDS - 1].end()]

    submit_button = st.form_submit_button(label="✨ Get me the data!")
    # The select boxes return strings; convert them to booleans
    para_overlap = (para_overlap == 'True')
    para_abbr = (para_abbr == 'True')

    para_set = {
        # 'model_type': para_model,        # cnn or bioformer
        'onlyLongest': not para_overlap,   # the UI asks for overlapping concepts; bioTag expects the inverse (True: keep only the longest span)
        'abbrRecog': para_abbr,            # False: don't identify abbreviations, True: identify abbreviations
        'ML_Threshold': para_threshold,    # threshold of the deep-learning model
    }
    if not submit_button:
        st.stop()

    st.markdown("**Results:**")
    # Debug output to the server console
    # print('dic...........:', biotag_dic.keys())
    print('test_tag:', test_tag)
    print('input_text:', input_text)

    tag_result = bioTag(session, input_text, biotag_dic, nn_model,
                        onlyLongest=para_set['onlyLongest'],
                        abbrRecog=para_set['abbrRecog'],
                        Threshold=para_set['ML_Threshold'])
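    # Each tag_result entry holds [start offset, end offset, HPO id, score]; the offsets arrive as strings, hence the int() casts below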
    for ele in tag_result:
        start = ele[0]
        last = ele[1]
        mention = input_text[int(ele[0]):int(ele[1])]
        ent_type = 'Phenotype'  # entity type (not included in the output line)
        hpo_id = ele[2]
        score = ele[3]
        # One tab-separated line per detected mention: start, end, mention text, HPO id, score
        output = start + "\t" + last + "\t" + mention + "\t" + hpo_id + "\t" + score + "\n"
        st.info(output)
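
# To run the demo locally (assuming this file is saved as app.py and the dict_new/ and vocab/ resources are in place): streamlit run app.py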