# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 16:21:25 2022

@author: luol2
"""

import streamlit as st
from src.nn_model import bioTag_CNN, bioTag_Bioformer
from src.dic_ner import dic_ont
from src.tagging_text import bioTag
from pandas import DataFrame
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
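
# The resources above are used by the tagging pipeline: 'punkt' for
# tokenization, and (presumably) the POS tagger and WordNet for the
# lemmatized dictionary lookup.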
st.set_page_config(
    page_title="PhenoTagger",
    page_icon="🎈",
    layout="wide",
    menu_items={
        'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
        'About': "PhenoTagger v1.1"
    }
)
# def _max_width_():
#     max_width_str = f"max-width: 2400px;"
#     st.markdown(
#         f"""
#         <style>
#         .reportview-container .main .block-container{{
#             {max_width_str}
#         }}
#         </style>
#         """,
#         unsafe_allow_html=True,
#     )
# _max_width_()

# c30, c31, c32 = st.columns([2.5, 1, 3])
# with c30:
#     # st.image("logo.png", width=400)
st.title("👨‍⚕️ PhenoTagger Demo")
with st.expander("ℹ️ - About this app", expanded=True):
    st.write(
        """
- This app is an easy-to-use interface built in Streamlit for the [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library.
- PhenoTagger is a hybrid method that combines dictionary- and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
- Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
        """
    )
st.markdown("") | |
st.markdown("") | |
st.markdown("## 📌 Paste document ") | |
with st.form(key="my_form"):
    _, c1, _, c2, _ = st.columns([0.07, 1, 0.07, 4, 0.07])

    with c1:
        ModelType = st.radio(
            "Choose your model",
            ["Bioformer(Default)", "CNN"],
            help="Bioformer is more precise, CNN is more efficient",
        )

        if ModelType == "Bioformer(Default)":
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'labelfile': './dict_new/lable.vocab',
                              'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
                              'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
                              'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt'}

                modelfile = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

                biotag_dic = dic_ont(ontfiles)
                nn_model = bioTag_Bioformer(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()

        else:
            def load_model():
                ontfiles = {'dic_file': './dict_new/noabb_lemma.dic',
                            'word_hpo_file': './dict_new/word_id_map.json',
                            'hpo_word_file': './dict_new/id_word_map.json'}

                vocabfiles = {'w2vfile': './vocab/bio_embedding_intrinsic.d200',
                              'charfile': './vocab/char.vocab',
                              'labelfile': './dict_new/lable.vocab',
                              'posfile': './vocab/pos.vocab'}

                modelfile = './models/cnn_p5n5_b128_95_hponew1.h5'

                biotag_dic = dic_ont(ontfiles)
                nn_model = bioTag_CNN(vocabfiles)
                nn_model.load_model(modelfile)
                return nn_model, biotag_dic

            nn_model, biotag_dic = load_model()
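
        # Note: the model above is rebuilt on every Streamlit rerun. On recent
        # Streamlit versions it could be loaded once and reused, e.g. (a sketch,
        # assuming Streamlit >= 1.18 where st.cache_resource is available):
        #
        #     @st.cache_resource
        #     def load_model():
        #         ...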
        para_overlap = st.checkbox(
            "Overlapping concepts",
            value=True,
            help="Tick this box to identify overlapping concepts",
        )
        para_abbr = st.checkbox(
            "Abbreviations",
            value=True,
            help="Tick this box to identify abbreviations",
        )
        para_threshold = st.slider(
            "Threshold",
            min_value=0.5,
            max_value=0.95,
            value=0.95,
            step=0.05,
            help="Return only the predictions whose score is above the threshold.",
        )
    with c2:
        doc = st.text_area(
            "Paste your text below",
            height=400,
        )

        # MAX_WORDS = 500
        # import re
        # res = len(re.findall(r"\w+", doc))
        # if res > MAX_WORDS:
        #     st.warning(
        #         "⚠️ Your text contains "
        #         + str(res)
        #         + " words."
        #         + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
        #     )
        #     doc = doc[:MAX_WORDS]

        submit_button = st.form_submit_button(label="✨ Submit!")

if not submit_button:
    st.stop()
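
# st.stop() above halts the script on the initial page load, so everything
# below only runs once the form has been submitted.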
para_set = {
    # 'model_type': para_model,  # cnn or bioformer
    'onlyLongest': not para_overlap,  # the checkbox asks for overlaps, so invert for onlyLongest
    'abbrRecog': para_abbr,  # False: don't identify abbreviations; True: identify them
    'ML_Threshold': para_threshold,  # score threshold of the deep learning model
}
st.markdown("") | |
st.markdown("## 💡 Tagging results:") | |
with st.spinner('Wait for tagging...'): | |
tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold']) | |
st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse over the entity to display the HPO id.</font>', unsafe_allow_html=True) | |
# print('dic...........:',biotag_dic.keys()) | |
# st.write('parameters:', para_overlap,para_abbr,para_threshold) | |
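
# Each element of tag_result appears to be [start, end, HPO id, score], with
# character offsets (as strings) into doc; the stitching below assumes the
# entities are sorted by start offset and non-overlapping.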
html_results = ''
text_results = doc + '\n'
entity_end = 0
hpoid_count = {}

if len(tag_result) > 0:
    for ele in tag_result:
        entity_start = int(ele[0])
        html_results += doc[entity_end:entity_start]
        entity_end = int(ele[1])
        entity_id = ele[2]
        entity_score = ele[3]
        text_results += ele[0] + '\t' + ele[1] + '\t' + doc[entity_start:entity_end] + '\t' + ele[2] + '\t' + format(float(ele[3]), '.2f') + '\n'
        if entity_id not in hpoid_count:
            hpoid_count[entity_id] = 1
        else:
            hpoid_count[entity_id] += 1
        html_results += '<font style="background-color: rgb(255, 204, 0);" title="' + entity_id + '">' + doc[entity_start:entity_end] + '</font>'
    html_results += doc[entity_end:]
else:
    html_results = doc

st.markdown('<table border="1"><tr><td>' + html_results + '</td></tr></table>', unsafe_allow_html=True)
# table
data_entity = []
for ele in hpoid_count:
    temp = [ele, biotag_dic.hpo_word[ele][0], hpoid_count[ele]]  # HPO id, term name, count
    data_entity.append(temp)
st.markdown("") | |
st.markdown("") | |
# st.markdown("## Table output:") | |
# cs, c1, c2, c3, cLast = st.columns([2, 1.5, 1.5, 1.5, 2]) | |
# with c1: | |
# CSVButton2 = download_button(keywords, "Data.csv", "📥 Download (.csv)") | |
# with c2: | |
# CSVButton2 = download_button(keywords, "Data.txt", "📥 Download (.txt)") | |
# with c3: | |
# CSVButton2 = download_button(keywords, "Data.json", "📥 Download (.json)") | |
# st.header("") | |
df = (
    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
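# Start row numbering at 1 in the displayed table.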
df.index += 1
c1, c2, c3 = st.columns([1, 4, 1])
with c2:
    st.table(df)

c1, c2, c3 = st.columns([1, 1, 1])
with c2:
    st.download_button('Download annotations', text_results)
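
# To launch the app locally (assuming this file is saved as app.py):
#   streamlit run app.py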