# Hugging Face Spaces file-viewer header (page residue, commented out so the module parses):
# Spaces: Running | File size: 8,600 Bytes | 94675c0
import nltk
nltk.data.path.append("/home/user/app/nltk_data")
#nltk.download('stopwords')
#nltk.download('punkt')
import classify_abs
import extract_abs
import pandas as pd
#pd.set_option('display.max_colwidth', None)
import streamlit as st
st.set_page_config(layout="wide")
import spacy
import tensorflow as tf
import pickle
import re
import plotly.graph_objects as go
#### LOGO ####
# Render the NCATS and GARD logos as raw <img> tags. unsafe_allow_html=True is
# passed so that st.markdown emits the HTML rather than showing it as text.
st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/raw/main/ncats.svg" alt="National Center for Advancing Translational Sciences Logo">''',unsafe_allow_html=True)
# Empty markdown element placed between the two logos.
st.markdown("")
st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/resolve/main/Logo_GARD_fullres.png" alt="NIH Genetic and Rare Diseases Information Center Logo" width=400>''',unsafe_allow_html=True)
# Disabled logo variants that point at the EpiPipeline4GARD Space (width=800 / PNG logo).
#st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4GARD/raw/main/ncats.svg" alt="National Center for Advancing Translational Sciences Logo" width=800>''',unsafe_allow_html=True)
#st.markdown("")
#st.markdown('''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4GARD/resolve/main/Logo_GARD_fullres.png" alt="NIH Genetic and Rare Diseases Information Center Logo" width=800>''',unsafe_allow_html=True)
#st.markdown("![National Center for Advancing Translational Sciences (NCATS) Logo](https://huggingface.co/spaces/ncats/EpiPipeline4GARD/resolve/main/NCATS_logo.png)")
#### TITLE ####
st.title("Epidemiological Information Extraction Pipeline for Rare Diseases")
#st.subheader("National Center for Advancing Translational Sciences (NIH/NCATS)")
#### CHANGE SIDEBAR WIDTH ###
# Inject CSS targeting the sidebar container: 250px wide when expanded; when
# collapsed it keeps that width and is shifted left by 350px.
st.markdown(
"""
<style>
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
width: 250px;
}
[data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
width: 250px;
margin-left: -350px;
}
</style>
""",
unsafe_allow_html=True,
)
#### DESCRIPTION ####
st.markdown("This application was built by the [National Center for Advancing Translational Sciences (NCATS)](https://ncats.nih.gov/) to automatically search and extract rare disease epidemiology information from PubMed abstracts.")
#### SIDEBAR WIDGETS ####
# max_results: maximum number of PubMed IDs to retrieve BEFORE filtering
# (at least 1, no upper bound, default 50).
max_results = st.sidebar.number_input("Maximum number of articles to find in PubMed", min_value=1, max_value=None, value=50)
# filtering: the selected radio label lower-cased, i.e. 'strict', 'lenient' or 'none'.
filtering = st.sidebar.radio("What type of filtering would you like?",('Strict', 'Lenient', 'None')).lower()
# extract_diseases: checkbox state (unchecked by default); forwarded to
# extract_abs.streamlit_extraction further down the script.
extract_diseases = st.sidebar.checkbox("Extract Rare Diseases", value=False)
#### MODEL LOADING ####
@st.experimental_singleton(show_spinner=False)
def load_models_experimental():
    """Initialise everything the pipeline needs and return it as one tuple.

    Calls, in order, ``classify_abs.init_classify_model()``,
    ``extract_abs.init_NER_pipeline()`` and
    ``extract_abs.load_GARD_diseases()``.

    Returns:
        A 5-tuple ``(classify_model_vars, NER_pipeline, entity_classes,
        GARD_dict, max_length)`` assembled from those three calls.
    """
    classifier_state = classify_abs.init_classify_model()
    ner_pipeline, ner_labels = extract_abs.init_NER_pipeline()
    gard_lookup, gard_max_length = extract_abs.load_GARD_diseases()
    return (
        classifier_state,
        ner_pipeline,
        ner_labels,
        gard_lookup,
        gard_max_length,
    )
#### DOWNLOAD FUNCTION ####
@st.cache
def convert_df(df):
    """Return ``df`` serialised as CSV text, encoded to UTF-8 bytes.

    The function is wrapped in ``st.cache`` so the conversion is not
    recomputed on every script rerun.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
#### SANKEY FUNCTION ####
#@st.cache(allow_output_mutation=True)
@st.experimental_singleton()
def epi_sankey(sankey_data, disease_or_gard_id):
    """Build the Sankey figure that summarises how abstracts were filtered.

    Args:
        sankey_data: Iterable unpacked into three numbers
            ``(found, relevant, epidemiologic)`` — presumably article
            counts at each filtering stage; confirm against the caller.
        disease_or_gard_id: String appended to the figure title.

    Returns:
        A ``plotly.graph_objects.Figure`` holding a single Sankey trace.
    """
    n_found, n_relevant, n_epi = sankey_data

    # Node positions in `label` are what `source`/`target` below refer to:
    # 0 gathered, 1 irrelevant, 2 relevant, 3 epidemiologic, 4 not epidemiologic.
    node_spec = {
        "pad": 15,
        "thickness": 20,
        "line": {"color": "white", "width": 0.5},
        "label": [
            "PubMed IDs Gathered",
            "Irrelevant Abstracts",
            "Relevant Abstracts Gathered",
            "Epidemiologic Abstracts",
            "Not Epidemiologic",
        ],
        "color": "purple",
    }
    link_spec = {
        "source": [0, 0, 2, 2],
        "target": [2, 1, 3, 4],
        "value": [
            n_relevant,
            n_found - n_relevant,
            n_epi,
            n_relevant - n_epi,
        ],
    }

    diagram = go.Figure(data=[go.Sankey(node=node_spec, link=link_spec)])
    diagram.update_layout(
        hovermode='x',
        title="Search for the Epidemiology of " + disease_or_gard_id,
        font={"size": 10, "color": 'black'},
    )
    return diagram
#### BEGIN APP ####
def _gard_page_url(disease, gard_id):
    """Return the NIH GARD disease-page URL for a disease name and GARD ID.

    Only the optional ``GARD:`` prefix and the zero padding at the start of
    the ID are removed (``GARD:0010200`` -> ``10200``). Zeros inside the
    number are significant and are kept; stripping every ``0`` would link
    to a different disease. Whitespace runs in ``disease`` are collapsed to
    single hyphens to form the trailing URL segment.
    """
    numeric_id = re.sub(r'^(?:GARD:)?0*', '', str(gard_id))
    slug = '-'.join(disease.split())
    return 'https://rarediseases.info.nih.gov/diseases/' + numeric_id + '/' + slug


# Load every model/dependency the pipeline needs while showing a spinner.
with st.spinner('Loading Epidemiology Models and Dependencies...'):
    classify_model_vars, NER_pipeline, entity_classes, GARD_dict, max_length = load_models_experimental()
loaded = st.success('All Models and Dependencies Loaded!')

disease_or_gard_id = st.text_input("Input a rare disease term or NIH GARD ID.")
# Clear the success banner again once the input box has been added.
loaded.empty()

st.markdown("Examples of rare diseases include [**Fellman syndrome**](https://rarediseases.info.nih.gov/diseases/1/gracile-syndrome), [**Classic Homocystinuria**](https://rarediseases.info.nih.gov/diseases/6667/classic-homocystinuria), [**7383**](https://rarediseases.info.nih.gov/diseases/7383/phenylketonuria), and [**GARD:0009941**](https://rarediseases.info.nih.gov/diseases/9941/fshmd1a). A full list of rare diseases tracked by the NIH Genetic and Rare Diseases Information Center (GARD) can be found [here](https://rarediseases.info.nih.gov/diseases/browse-by-first-letter).")

# Nothing below runs until the user has typed a query.
if disease_or_gard_id:
    df, sankey_data, name_gardID = extract_abs.streamlit_extraction(
        disease_or_gard_id, max_results, filtering,
        NER_pipeline, entity_classes,
        extract_diseases, GARD_dict, max_length,
        classify_model_vars)

    # Only render results when the extraction returned something.
    if sankey_data:
        # NOTE(review): a `df.replace(to_replace='None', value="None")` call
        # used to sit here. Its result was discarded and it mapped a value to
        # itself, so it had no effect and was dropped. If 'None' cells should
        # be blanked, assign the result of df.replace(...) back to df.
        st.dataframe(df, height=200)
        csv = convert_df(df)
        disease, gardID = name_gardID

        # If the user input has no digit in it (a weak proxy for "is not a
        # GARD ID"), keep the user's own wording as the disease term.
        if not re.search(r'\d', disease_or_gard_id):
            disease = disease_or_gard_id

        st.download_button(
            label="Download epidemiology results for "+disease+" as CSV",
            data=csv,
            file_name=disease+'.csv',
            mime='text/csv',
        )
        st.markdown('See the NIH GARD page for ['+disease+']('+_gard_page_url(disease, gardID)+')')

        fig = epi_sankey(sankey_data, disease)
        st.plotly_chart(fig, use_container_width=True)

        # Column legend: the 'IDS' column is only present in some result
        # frames, in which case the legend also describes DIS and IDS.
        # The triple-quoted text is kept flush-left so the string passed to
        # st.markdown is unchanged.
        if 'IDS' in df.columns:
            st.markdown('''COLUMNS: \\
- PROB_OF_EPI: Probability that the paper is an epidemiologic study based on its abstract. \\
- IsEpi: If it is an epidemiologic study (If PROB_OF_EPI >0.5) \\
- DIS: Rare disease terms or synonyms identified in the abstract from the GARD Dictionary
- IDS: GARD IDs identified in the abstract from the GARD Dictionary \\
- EPI: Epidemiology Types are the metrics used to estimate disease burden such as "incidence", "prevalence rate", or "occurrence"
- STAT: Epidemiology Rates describe how many people are afflicted by a disease.
- DATE: The dates when the epidemiologic studies were conducted
- LOC: Where the epidemiologic studies were conducted.
- SEX: The biological sexes mentioned in the abstract. Useful for diseases that disproportionately affect one sex over the other or may provide context to composition of the study population
- ETHN: Ethnicities, races, and nationalities of those represented in the epidemiologic study.
''')
        else:
            st.subheader("Categories of Results")
            st.markdown(" - **PROB_OF_EPI**: Probability that the paper is an epidemiologic study based on its abstract. \n - **IsEpi**: If it is an epidemiologic study (If PROB_OF_EPI >0.5) \n - **EPI**: Epidemiology Types are the metrics used to estimate disease burden such as 'incidence', 'prevalence rate', or 'occurrence' \n - **STAT**: Epidemiology Rates describe how many people are afflicted by a disease. \n - **DATE**: The dates when the epidemiologic studies were conducted \n - **LOC**: Where the epidemiologic studies were conducted. \n - **SEX**: The biological sexes mentioned in the abstract. Useful for diseases that disproportionately affect one sex over the other or may provide context to composition of the study population \n - **ETHN**: Ethnicities, races, and nationalities of those represented in the epidemiologic study.")