Spaces:
Running
Running
File size: 8,434 Bytes
5dc2016 62586c5 fc731db 8ed723b 28995ec 98ac1da 28995ec 275028e 8ed723b 4a37eb1 ddff90b 9f09f8c 68a2646 98ac1da 68a2646 a4dfd52 6792ef6 9f09f8c 221e51c fc731db e7caceb 6d2e57c e7caceb 6d2e57c e7caceb 4a37eb1 e7caceb 4a37eb1 e7caceb 6d2e57c e7caceb 6d2e57c 9f09f8c c9b72ab fb5e624 9f09f8c f2852e3 38efeba 847adc5 a224dfa 0416a61 f2852e3 ddff90b 9f09f8c cde5ff7 b102419 8ed723b fc731db 9f09f8c 7780086 091df08 fc731db 9f09f8c 27bf394 04706b7 fc731db 0d9531e b28ab8e 062e24e f435314 e1cbd0e b28ab8e f435314 04706b7 b28ab8e 062e24e fc731db b424a32 0d9531e 9f09f8c 8f768aa 8ed723b a8b6710 9f09f8c fb31138 9f09f8c a8b6710 fb31138 7ead1f4 6d2e57c 8ed723b 801aede 2b7ba7c 5fe6306 2b7ba7c |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import re

import nltk
nltk.data.path.append("/home/user/app/nltk_data")
#nltk.download('stopwords')
#nltk.download('punkt')
from epi_pipeline import (
    streamlit_extraction,
    NER_Pipeline,
    GARD_Search,
    Classify_Pipeline
)
import pandas as pd
#pd.set_option('display.max_colwidth', None)
import streamlit as st
st.set_page_config(layout="wide")
#import spacy
#import tensorflow as tf
#import pickle
import plotly.graph_objects as go
#### LOGO ####
# Raw <img> tags are used (with unsafe_allow_html) so that alt text and, for
# the GARD logo, an explicit width can be set on the images.
NCATS_LOGO_HTML = '''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/raw/main/ncats.svg" alt="National Center for Advancing Translational Sciences Logo">'''
GARD_LOGO_HTML = '''<img src="https://huggingface.co/spaces/ncats/EpiPipeline4RD/resolve/main/Logo_GARD_fullres.png" alt="NIH Genetic and Rare Diseases Information Center Logo" width=400>'''

st.markdown(NCATS_LOGO_HTML, unsafe_allow_html=True)
st.markdown("")  # empty markdown element between the two logos
st.markdown(GARD_LOGO_HTML, unsafe_allow_html=True)

#### TITLE ####
st.title("Epidemiological Information Extraction Pipeline for Rare Diseases")

#### CHANGE SIDEBAR WIDTH ###
# CSS override: fixes the sidebar container at 250px wide in both the expanded
# and collapsed states, and applies a negative left margin when collapsed.
SIDEBAR_WIDTH_CSS = """
<style>
[data-testid="stSidebar"][aria-expanded="true"] > div:first-child {
width: 250px;
}
[data-testid="stSidebar"][aria-expanded="false"] > div:first-child {
width: 250px;
margin-left: -350px;
}
</style>
"""
st.markdown(SIDEBAR_WIDTH_CSS, unsafe_allow_html=True)

#### DESCRIPTION ####
st.markdown("This application was built by the [National Center for Advancing Translational Sciences (NCATS)](https://ncats.nih.gov/) to automatically search and extract rare disease epidemiology information from PubMed abstracts.")

#### SIDEBAR WIDGETS ####
sidebar = st.sidebar
# max_results: maximum number of PubMed IDs to retrieve BEFORE filtering.
max_results = sidebar.number_input(
    "Maximum number of articles to find in PubMed",
    min_value=1,
    max_value=None,
    value=50,
)
# The selected radio label is lower-cased before being stored in `filtering`.
filter_choice = sidebar.radio(
    "What type of filtering would you like?",
    ('Strict', 'Lenient', 'None'),
)
filtering = filter_choice.lower()
extract_diseases = sidebar.checkbox("Extract Rare Diseases", value=False)
#### MODEL LOADING ####
@st.experimental_singleton(show_spinner=False)
def load_models_experimental():
    """Construct the three pipeline objects the app works with.

    The function is wrapped in ``st.experimental_singleton`` with the
    built-in spinner disabled (the caller shows its own spinner).

    Returns:
        A 3-tuple ``(epi_classify, epi_extract, rd_identify)`` containing a
        ``Classify_Pipeline``, an ``NER_Pipeline`` and a ``GARD_Search``
        instance, created in that order.
    """
    # Tuple elements are evaluated left to right, so construction order is
    # classifier, then NER pipeline, then GARD search.
    return Classify_Pipeline(), NER_Pipeline(), GARD_Search()
#### DOWNLOAD FUNCTION ####
@st.cache
def convert_df(df):
    """Return *df* rendered as CSV text, encoded to UTF-8 bytes.

    The conversion is wrapped in ``st.cache`` to avoid recomputing it on
    every script rerun.

    Args:
        df: Object exposing ``to_csv()`` (a pandas DataFrame in this app).

    Returns:
        The CSV output of ``df.to_csv()`` as ``bytes``.
    """
    csv_text = df.to_csv()
    return csv_text.encode('utf-8')
#### SANKEY FUNCTION ####
#@st.cache(allow_output_mutation=True)
@st.experimental_singleton()
def epi_sankey(sankey_data, disease_or_gard_id):
    """Build a Sankey figure summarising how abstracts were filtered.

    Args:
        sankey_data: Iterable of three counts, unpacked as
            ``(found, relevant, epidemiologic)``.
        disease_or_gard_id: String appended to the figure title.

    Returns:
        A ``plotly.graph_objects.Figure`` holding a single Sankey trace.
    """
    found, relevant, epidemiologic = sankey_data

    # Node indices: 0 = gathered, 1 = irrelevant, 2 = relevant,
    # 3 = epidemiologic, 4 = not epidemiologic.
    node_spec = {
        "pad": 15,
        "thickness": 20,
        "line": {"color": "white", "width": 0.5},
        "label": ["PubMed IDs Gathered", "Irrelevant Abstracts","Relevant Abstracts Gathered","Epidemiologic Abstracts","Not Epidemiologic"],
        "color": "purple",
    }

    # Each link i flows from source[i] to target[i] with weight value[i]:
    # gathered -> relevant / irrelevant, relevant -> epidemiologic / not.
    link_spec = {
        "source": [0, 0, 2, 2],
        "target": [2, 1, 3, 4],
        "value": [
            relevant,
            found - relevant,
            epidemiologic,
            relevant - epidemiologic,
        ],
    }

    sankey_trace = go.Sankey(node=node_spec, link=link_spec)
    fig = go.Figure(data=[sankey_trace])
    fig.update_layout(
        hovermode='x',
        title="Search for the Epidemiology of " + disease_or_gard_id,
        font={"size": 10, "color": 'black'},
    )
    return fig
#### BEGIN APP ####
# Load the pipeline objects while a spinner is displayed.
with st.spinner('Loading Epidemiology Models and Dependencies...'):
    epi_classify, epi_extract, rd_identify = load_models_experimental()
loaded = st.success('All Models and Dependencies Loaded!')
disease_or_gard_id = st.text_input("Input a rare disease term or NIH GARD ID.")
# Clear the success banner once the input box has been created.
loaded.empty()
st.markdown("Examples of rare diseases include [**Fellman syndrome**](https://rarediseases.info.nih.gov/diseases/1/gracile-syndrome), [**Classic Homocystinuria**](https://rarediseases.info.nih.gov/diseases/6667/classic-homocystinuria), [**7383**](https://rarediseases.info.nih.gov/diseases/7383/phenylketonuria), and [**GARD:0009941**](https://rarediseases.info.nih.gov/diseases/9941/fshmd1a). A full list of rare diseases tracked by the NIH Genetic and Rare Diseases Information Center (GARD) can be found [here](https://rarediseases.info.nih.gov/diseases/browse-by-first-letter).")

if disease_or_gard_id:
    # Pass the objects returned by load_models_experimental(). The previous
    # call referenced an undefined name (`epi_ner`) and handed over the
    # GARD_Search class itself instead of the loaded `rd_identify` instance.
    df, sankey_data, name_gardID = streamlit_extraction(
        disease_or_gard_id, max_results, filtering,
        epi_extract, rd_identify, extract_diseases, epi_classify,
    )

    # Only render results when the extraction returned something.
    if sankey_data:
        # NOTE(review): a former `df.replace(to_replace='None', value="None")`
        # call was removed here; it discarded its result and replaced a value
        # with itself, so it had no effect on what is displayed.
        st.dataframe(df, height=200)
        csv = convert_df(df)
        disease, gardID = name_gardID

        # If the user input has no digit in it (a weak proxy for "is not a
        # GARD ID"), keep the user's own wording as the disease term.
        if not re.search(r'\d', disease_or_gard_id):
            disease = disease_or_gard_id

        st.download_button(
            label="Download epidemiology results for "+disease+" as CSV",
            data=csv,
            file_name=disease+'.csv',
            mime='text/csv',
        )

        # Strip only the optional "GARD:" prefix and the leading zeros.
        # Removing every '0' (as before) corrupted IDs with interior or
        # trailing zeros, e.g. "GARD:0010203" became "123".
        gard_number = re.sub(r'^(?:GARD:)?0*', '', gardID)
        disease_slug = '-'.join(disease.split())
        st.markdown('See the NIH GARD page for ['+disease+'](https://rarediseases.info.nih.gov/diseases/'+gard_number+'/'+disease_slug+')')

        fig = epi_sankey(sankey_data, disease)
        st.plotly_chart(fig, use_container_width=True)

        # The legend shown depends on whether the result table contains an
        # 'IDS' column.
        if 'IDS' in df.columns:
            st.markdown('''COLUMNS: \\
- PROB_OF_EPI: Probability that the paper is an epidemiologic study based on its abstract. \\
- IsEpi: If it is an epidemiologic study (If PROB_OF_EPI >0.5) \\
- DIS: Rare disease terms or synonyms identified in the abstract from the GARD Dictionary
- IDS: GARD IDs identified in the abstract from the GARD Dictionary \\
- EPI: Epidemiology Types are the metrics used to estimate disease burden such as "incidence", "prevalence rate", or "occurrence"
- STAT: Epidemiology Rates describe how many people are afflicted by a disease.
- DATE: The dates when the epidemiologic studies were conducted
- LOC: Where the epidemiologic studies were conducted.
- SEX: The biological sexes mentioned in the abstract. Useful for diseases that disproportionately affect one sex over the other or may provide context to composition of the study population
- ETHN: Ethnicities, races, and nationalities of those represented in the epidemiologic study.
''')
        else:
            st.subheader("Categories of Results")
            st.markdown(" - **PROB_OF_EPI**: Probability that the paper is an epidemiologic study based on its abstract. \n - **IsEpi**: If it is an epidemiologic study (If PROB_OF_EPI >0.5) \n - **EPI**: Epidemiology Types are the metrics used to estimate disease burden such as 'incidence', 'prevalence rate', or 'occurrence' \n - **STAT**: Epidemiology Rates describe how many people are afflicted by a disease. \n - **DATE**: The dates when the epidemiologic studies were conducted \n - **LOC**: Where the epidemiologic studies were conducted. \n - **SEX**: The biological sexes mentioned in the abstract. Useful for diseases that disproportionately affect one sex over the other or may provide context to composition of the study population \n - **ETHN**: Ethnicities, races, and nationalities of those represented in the epidemiologic study.")
#st.dataframe(data=None, width=None, height=None) |