# -*- coding: utf-8 -*-
"""
Created on Tue Nov 22 09:54:41 2022
@author: luol2
"""
import streamlit as st
import argparse
from src.nn_model import bioTag_CNN, bioTag_BERT, bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
import os
import time
import json
import sys
import re
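# Fetch the NLTK resources used for tokenization, POS tagging and lemmatization (no-op if already downloaded)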
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
st.set_page_config(
    page_title="PhenoTagger",
    page_icon=":shark:",
    # layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://www.extremelycoolapp.com/help',
        'Report a bug': "https://www.extremelycoolapp.com/bug",
        'About': "# This is a header. This is an *extremely* cool app!"
    }
)
st.title('PhenoTagger Demo')

# with st.spinner('Model is being loaded..'):
#     print('load model done!')
with st.form(key="my_form"):

    def load_model():
        """Load the HPO dictionary files and the Bioformer tagging model."""
        ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                    'word_hpo_file': './dict_new/word_id_map.json',
                    'hpo_word_file': './dict_new/id_word_map.json'}

        # if para_set['model_type']=='cnn':
        #     vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
        #                 'charfile':'../vocab/char.vocab',
        #                 'labelfile':'../dict_new/lable.vocab',
        #                 'posfile':'../vocab/pos.vocab'}
        #     modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
        # elif para_set['model_type']=='bioformer':
        vocabfiles = {'labelfile': './dict_new/lable.vocab',
                      'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
                      'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                      'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt'}
        modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
        # else:
        #     print('Model type is wrong, please select cnn or bioformer.')
        #     sys.exit()

        biotag_dic = dic_ont(ontfiles)

        # if para_set['model_type']=='cnn':
        #     nn_model=bioTag_CNN(vocabfiles)
        #     nn_model.load_model(modelfile)
        # elif para_set['model_type']=='bioformer':
        nn_model = bioTag_Bioformer(vocabfiles)
        session = nn_model.load_model(modelfile)
        test_tag = '1232'
        return nn_model, biotag_dic, test_tag, session
    # Hyper-parameter controls in the sidebar
    st.sidebar.header("Hyperparameter Settings")
    sbform = st.sidebar.form("Hyper-parameters")
    # para_model=sbform.selectbox('Model', ['cnn', 'bioformer'])
    para_overlap = sbform.selectbox('Return overlapping concepts', ['True', 'False'])
    para_abbr = sbform.selectbox('Identify abbreviations', ['True', 'False'])
    para_threshold = sbform.slider('Threshold:', min_value=0.5, max_value=0.95, value=0.95, step=0.05)
    sbform.form_submit_button("Apply settings")
    st.write('parameters:', para_overlap, para_abbr, para_threshold)
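    # Load the ontology dictionary and the Bioformer tagging model (plus its session handle) for this run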
    nn_model, biotag_dic, test_tag, session = load_model()

    input_text = st.text_area(
        "Paste your text below (max 500 words)",
        height=510,
    )
    MAX_WORDS = 500
    word_spans = list(re.finditer(r"\w+", input_text))
    if len(word_spans) > MAX_WORDS:
        st.warning(
            f"⚠️ Your text contains {len(word_spans)} words. "
            f"Only the first {MAX_WORDS} words will be reviewed. Stay tuned as increased allowance is coming! 😊"
        )
        # Truncate the input at the end of the 500th word
        input_text = input_text[:word_spans[MAX_WORDS - 1].end()]

    submit_button = st.form_submit_button(label="✨ Get me the data!")
    # The select boxes return strings; convert them to booleans
    para_overlap = (para_overlap == 'True')
    para_abbr = (para_abbr == 'True')

    para_set = {
        # 'model_type': para_model,        # cnn or bioformer
        'onlyLongest': not para_overlap,   # the UI asks for overlapping concepts; bioTag expects the inverse (True: keep only the longest span)
        'abbrRecog': para_abbr,            # False: don't identify abbreviations, True: identify abbreviations
        'ML_Threshold': para_threshold,    # threshold of the deep-learning model
    }
    if not submit_button:
        st.stop()

    st.markdown("**Results:**")
    # Debug output to the server console
    # print('dic...........:', biotag_dic.keys())
    print('test_tag:', test_tag)
    print('input_text:', input_text)

    tag_result = bioTag(session, input_text, biotag_dic, nn_model,
                        onlyLongest=para_set['onlyLongest'],
                        abbrRecog=para_set['abbrRecog'],
                        Threshold=para_set['ML_Threshold'])
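    # Each tag_result entry holds [start offset, end offset, HPO id, score]; the offsets arrive as strings, hence the int() casts below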
    for ele in tag_result:
        start = ele[0]
        last = ele[1]
        mention = input_text[int(ele[0]):int(ele[1])]
        ent_type = 'Phenotype'  # entity type (not included in the output line)
        hpo_id = ele[2]
        score = ele[3]
        # One tab-separated line per detected mention: start, end, mention text, HPO id, score
        output = start + "\t" + last + "\t" + mention + "\t" + hpo_id + "\t" + score + "\n"
        st.info(output)
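
# To run the demo locally (assuming this file is saved as app.py and the dict_new/ and vocab/ resources are in place): streamlit run app.py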