Spaces:
Running
Running
File size: 8,036 Bytes
5dc2016 b5634db fc731db 7ce5b82 ddff90b 28995ec 98ac1da 28995ec 275028e 3279179 4a37eb1 ddff90b 9f09f8c 68a2646 98ac1da 68a2646 a4dfd52 6792ef6 9f09f8c 221e51c fc731db e7caceb 6d2e57c e7caceb 6d2e57c e7caceb 4a37eb1 e7caceb 4a37eb1 e7caceb 6d2e57c e7caceb 6d2e57c 9f09f8c c9b72ab fb5e624 9f09f8c f2852e3 38efeba 847adc5 a224dfa 0416a61 f2852e3 ddff90b 9f09f8c cde5ff7 b102419 fc731db 9f09f8c 31ca6c1 7780086 091df08 fc731db 9f09f8c 27bf394 0d9531e fc731db 0d9531e b28ab8e 062e24e f435314 e1cbd0e b28ab8e f435314 b28ab8e 062e24e fc731db b424a32 0d9531e 9f09f8c 8f768aa 6e2f665 a8b6710 9f09f8c fb31138 9f09f8c a8b6710 fb31138 7ead1f4 6d2e57c 8e409e1 fc731db 8e409e1 0b17811 265205e 0b17811 8e33224 8e409e1 062e24e fc731db 0b17811 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 |
# NLTK setup: the download calls further down are disabled, and several
# spellings of one nltk_data directory are registered on NLTK's search path
# instead.
import nltk
nltk.data.path.append("/home/user/app/nltk_data/")
nltk.data.path.append("/home/user/app/nltk_data")
# NOTE(review): the next two entries have no leading slash, so they are
# relative paths -- presumably resolved against the working directory;
# confirm they are intended and not a typo of the absolute paths above.
nltk.data.path.append("home/user/app/nltk_data")
nltk.data.path.append("home/user/app/nltk_data/")
#nltk.download('stopwords')
#nltk.download('punkt')
# Project-local modules. This file uses classify_abs.init_classify_model and
# extract_abs.init_NER_pipeline / load_GARD_diseases / streamlit_extraction.
import classify_abs
import extract_abs
import pandas as pd
#pd.set_option('display.max_colwidth', None)
import streamlit as st
# The wide layout is configured immediately after importing Streamlit, ahead
# of every other st.* call made in this file.
st.set_page_config(layout="wide")
# NOTE(review): spacy, tensorflow and pickle are not referenced in the visible
# part of this file -- presumably needed by the project modules or imported
# for their side effects; confirm before removing.
import spacy
import tensorflow as tf
import pickle
import plotly.graph_objects as go
#### PAGE CONTENT CONSTANTS ####
# Raw HTML for the two logos; rendered with unsafe_allow_html so the <img>
# tags are passed through instead of being escaped.
NCATS_LOGO_HTML = '<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/raw/main/ncats.svg" alt="National Center for Advancing Translational Sciences Logo">'
GARD_LOGO_HTML = '<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/resolve/main/Logo_GARD_fullres.png" alt="NIH Genetic and Rare Diseases Information Center Logo" width=400>'

# CSS override that fixes the sidebar width for both its expanded and
# collapsed states.
SIDEBAR_WIDTH_CSS = """
<style>
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
width: 250px;
}
[data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
width: 250px;
margin-left: -350px;
}
</style>
"""

APP_DESCRIPTION = "This application was built by the [National Center for Advancing Translational Sciences (NCATS)](https://ncats.nih.gov/) to automatically search and extract rare disease epidemiology information from PubMed abstracts."

#### LOGO ####
st.markdown(NCATS_LOGO_HTML, unsafe_allow_html=True)
# Empty markdown element placed between the two logos.
st.markdown("")
st.markdown(GARD_LOGO_HTML, unsafe_allow_html=True)

#### TITLE ####
st.title("Epidemiological Information Extraction Pipeline for Rare Diseases")

#### CHANGE SIDEBAR WIDTH ###
st.markdown(SIDEBAR_WIDTH_CSS, unsafe_allow_html=True)

#### DESCRIPTION ####
st.markdown(APP_DESCRIPTION)

#### SIDEBAR WIDGETS ####
# max_results is the maximum number of PubMed IDs to retrieve BEFORE filtering.
max_results = st.sidebar.number_input(
    "Maximum number of articles to find in PubMed",
    min_value=1,
    max_value=None,
    value=50,
)
# The radio returns the capitalised label; the rest of the script works with
# the lower-cased form ('strict', 'lenient' or 'none').
filter_choice = st.sidebar.radio(
    "What type of filtering would you like?",
    ('Strict', 'Lenient', 'None'),
)
filtering = filter_choice.lower()
extract_diseases = st.sidebar.checkbox("Extract Rare Diseases", value=False)
#### MODEL LOADING ####
@st.experimental_singleton(show_spinner=False)
def load_models_experimental():
    """Initialise the classifier, the NER pipeline and the GARD disease data.

    Wrapped in ``st.experimental_singleton`` with Streamlit's own spinner
    switched off (``show_spinner=False``).

    Returns:
        A 5-tuple, in this order:
        the value returned by ``classify_abs.init_classify_model()``,
        the two values returned by ``extract_abs.init_NER_pipeline()``,
        the two values returned by ``extract_abs.load_GARD_diseases()``.
    """
    classifier_state = classify_abs.init_classify_model()
    ner, ner_labels = extract_abs.init_NER_pipeline()
    gard_lookup, gard_max_len = extract_abs.load_GARD_diseases()
    return (classifier_state, ner, ner_labels, gard_lookup, gard_max_len)
#### DOWNLOAD FUNCTION ####
@st.cache
def convert_df(df):
    """Return ``df`` rendered as CSV text and encoded to UTF-8 bytes.

    The ``st.cache`` decorator is there so the conversion is not recomputed
    on every rerun of the script.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
#### SANKEY FUNCTION ####
@st.experimental_singleton()
def epi_sankey(sankey_data, disease_or_gard_id):
    """Build the Sankey diagram that summarises one search.

    Args:
        sankey_data: Three values unpacked as (gathered, relevant,
            epidemiologic); they are used arithmetically as link sizes.
        disease_or_gard_id: Search term appended to the figure title.

    Returns:
        A ``plotly.graph_objects.Figure`` containing a single Sankey trace.
    """
    n_gathered, n_relevant, n_epi = sankey_data

    # Node indices used by the links below:
    # 0 gathered, 1 irrelevant, 2 relevant, 3 epidemiologic, 4 not epidemiologic.
    node_spec = {
        "pad": 15,
        "thickness": 20,
        "line": {"color": "white", "width": 0.5},
        "label": [
            "PubMed IDs Gathered",
            "Irrelevant Abstracts",
            "Relevant Abstracts Gathered",
            "Epidemiologic Abstracts",
            "Not Epidemiologic",
        ],
        "color": "purple",
    }

    # Each position across source/target/value describes one flow:
    # 0->2 relevant, 0->1 the remainder of gathered,
    # 2->3 epidemiologic, 2->4 the remainder of relevant.
    link_spec = {
        "source": [0, 0, 2, 2],
        "target": [2, 1, 3, 4],
        "value": [
            n_relevant,
            n_gathered - n_relevant,
            n_epi,
            n_relevant - n_epi,
        ],
    }

    sankey_trace = go.Sankey(node=node_spec, link=link_spec)
    fig = go.Figure(data=[sankey_trace])
    fig.update_layout(
        hovermode='x',
        title="Search for the Epidemiology of " + disease_or_gard_id,
        font={"size": 10, "color": 'black'},
    )
    return fig
#### BEGIN APP ####
# Legend text for the result columns as (column name, description) pairs in
# display order. Keeping a single copy means the layout with disease columns
# and the layout without them can no longer drift apart in wording or format.
_EPI_COLUMN_HELP = [
    ("PROB_OF_EPI", "Probability that the paper is an epidemiologic study based on its abstract."),
    ("IsEpi", "If it is an epidemiologic study (If PROB_OF_EPI >0.5)"),
    ("EPI", "Epidemiology Types are the metrics used to estimate disease burden such as 'incidence', 'prevalence rate', or 'occurrence'"),
    ("STAT", "Epidemiology Rates describe how many people are afflicted by a disease."),
    ("DATE", "The dates when the epidemiologic studies were conducted"),
    ("LOC", "Where the epidemiologic studies were conducted."),
    ("SEX", "The biological sexes mentioned in the abstract. Useful for diseases that disproportionately affect one sex over the other or may provide context to composition of the study population"),
    ("ETHN", "Ethnicities, races, and nationalities of those represented in the epidemiologic study."),
]
# Extra entries listed only when the results contain an 'IDS' column.
_DISEASE_COLUMN_HELP = [
    ("DIS", "Rare disease terms or synonyms identified in the abstract from the GARD Dictionary"),
    ("IDS", "GARD IDs identified in the abstract from the GARD Dictionary"),
]


def _column_legend(columns):
    """Return the Markdown bullet list describing the result columns.

    Args:
        columns: Container of column names (e.g. ``df.columns``). When it
            contains ``'IDS'`` the DIS and IDS entries are inserted right
            after IsEpi.

    Returns:
        One Markdown string with a `` - **NAME**: description`` bullet per
        column, items separated by `` \\n``.
    """
    entries = list(_EPI_COLUMN_HELP)
    if 'IDS' in columns:
        entries[2:2] = _DISEASE_COLUMN_HELP
    return " \n".join(f" - **{name}**: {text}" for name, text in entries)


# load_models_experimental disables Streamlit's built-in spinner, so an
# explicit one is shown here while the models load.
with st.spinner('Loading Epidemiology Models and Dependencies...'):
    classify_model_vars, NER_pipeline, entity_classes, GARD_dict, max_length = load_models_experimental()
loaded = st.success('All Models and Dependencies Loaded!')
disease_or_gard_id = st.text_input("Input a rare disease term or NIH GARD ID.")
# Clear the success banner again so it does not stay on the page.
loaded.empty()
st.markdown("Examples of rare diseases include [**Fellman syndrome**](https://rarediseases.info.nih.gov/diseases/1/gracile-syndrome), [**Classic Homocystinuria**](https://rarediseases.info.nih.gov/diseases/6667/classic-homocystinuria), [**7383**](https://rarediseases.info.nih.gov/diseases/7383/phenylketonuria), and [**GARD:0009941**](https://rarediseases.info.nih.gov/diseases/9941/fshmd1a). A full list of rare diseases tracked by the NIH Genetic and Rare Diseases Information Center (GARD) can be found [here](https://rarediseases.info.nih.gov/diseases/browse-by-first-letter).")

# Nothing below runs until the user has typed a search term.
if disease_or_gard_id:
    df, sankey_data = extract_abs.streamlit_extraction(
        disease_or_gard_id, max_results, filtering,
        NER_pipeline, entity_classes,
        extract_diseases, GARD_dict, max_length,
        classify_model_vars,
    )
    st.dataframe(df, height=200)

    csv = convert_df(df)
    st.download_button(
        label="Download epidemiology results for " + disease_or_gard_id + " as CSV",
        data=csv,
        file_name=disease_or_gard_id + '.csv',
        mime='text/csv',
    )

    # One legend for both layouts: same heading, same bullet format, with the
    # DIS/IDS entries added only when those columns are present.
    st.subheader("Categories of Results")
    st.markdown(_column_legend(df.columns))

    # The Sankey summary is drawn regardless of which columns are present.
    fig = epi_sankey(sankey_data, disease_or_gard_id)
    st.plotly_chart(fig, use_container_width=True)