Spaces:
Running
Running
File size: 5,293 Bytes
27f6851 df07b29 27f6851 903fca3 27f6851 72230c3 27f6851 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 |
import streamlit as st
import pandas as pd
import numpy as np
import time
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter1d
from zipfile import ZipFile
# Pin NumPy's global RNG so any random draws are repeatable between runs.
np.random.seed(2024)

# Headerless, tab-separated table loaded into a single column named
# "selection"; its values populate the picker further down.
uids = pd.read_csv(
    "uniprot_ids.tsv.gz",
    sep="\t",
    header=None,
    names=["selection"],
)
# del_sub_merge = pd.read_csv("del_sub_data.csv.gz")

# Archive from which "<uid>.csv" members are read on demand; it is opened
# once here and intentionally left open for the rest of the script.
zf = ZipFile("ALL_hum_proteins_ESM1b_del_sub.zip")

# NOTE(review): not referenced by any other visible code.
width = 600
def plot_interactive_scatter(uid: str):
    """Load one entry's scores and build its highlighted scatter trace.

    Reads the member ``<uid>.csv`` from the module-level archive ``zf`` and
    plots ``-log10(aPLLR)`` on x against ``avg_LLR`` on y, using the ``site``
    column as hover text.

    Args:
        uid: Stem of the CSV member to read from the archive.

    Returns:
        A ``(trace, data)`` tuple: the orange marker trace and the DataFrame
        it was built from.

    Raises:
        KeyError: If the archive has no member named ``<uid>.csv``.
    """
    # pandas does not close file objects it is handed, so release the archive
    # member explicitly as soon as it has been parsed.
    with zf.open(f"{uid}.csv") as member:
        user_data = pd.read_csv(member)

    # Create scatter plot for user-specified data
    user_trace = go.Scatter(
        x=-np.log10(user_data.aPLLR),
        y=user_data.avg_LLR,
        mode='markers',
        name=f"{uid}<br>Data",
        text=user_data.site,
        hoverinfo='text',
        marker=dict(color='orange'))
    return user_trace, user_data
def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str, mutation: str,
                          hline1: float, hline2: float):
    """Build a per-position line figure for one score column.

    Args:
        uid_data: Table containing a ``site`` column and the column named by
            ``score``.
        uid: Identifier shown in the figure title.
        score: Name of the column to plot. ``"aPLLR"`` is drawn as
            ``-log10(value)``; any other column is drawn unchanged.
        mutation: Label used in the figure title and the y-axis title.
        hline1: y-value of the first dashed horizontal reference line.
        hline2: y-value of the second dashed horizontal reference line.

    Returns:
        A Plotly figure (height 300) holding the line trace and both
        reference lines.
    """
    plot_data = -np.log10(uid_data[score]) if score == "aPLLR" else uid_data[score]

    # Hover label is "<site>: <value rounded to 3 decimals>".
    hover_text = [f"{x}: {np.round(y, 3)}" for x, y in zip(uid_data.site, plot_data)]

    line_trace = go.Scatter(
        # x is the 1-based row position, not the ``site`` value itself.
        x=np.arange(1, len(uid_data) + 1),
        y=plot_data,
        mode='lines',
        text=hover_text,
        hoverinfo='text',
        marker=dict(color='orange')
    )
    line_fig = go.Figure(data=[line_trace])
    line_fig.update_layout(
        title=f"{uid} {mutation} Scores by Position",
        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
        yaxis=dict(showgrid=False, zeroline=False, showline=False),
        height=300,
        hoverlabel=dict(  # Set hover label font size
            font=dict(size=16)  # Specify the font size of the hover text
        )
    )

    # xref='paper' with x0=0, x1=1 stretches each reference line across the
    # full plot width regardless of the data range.
    for hline in [hline1, hline2]:
        line_fig.add_shape(
            type='line',
            x0=0, x1=1, y0=hline, y1=hline,
            xref='paper', yref='y',
            line=dict(color='Black', dash='dash'),
        )
    return line_fig
# Unlabelled picker over every value in the "selection" column; the entry at
# position 11409 is the one shown on first load.
selection = st.selectbox(label="", options=uids.selection, index=11409)

# Only the text before the first comma is used as the identifier downstream.
selection_uid = selection.partition(",")[0]
# Background layer: the genome-wide sample, drawn in grey on the same axes
# as the per-entry trace (x = -log10(aPLLR), y = avg_LLR).
base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")

base_trace = go.Scatter(
    x=-np.log10(base_data["aPLLR"]),
    y=base_data["avg_LLR"],
    mode="markers",
    name="Sample of<br>Genome-Wide<br>Data",
    # Hover is switched off for the background points.
    hoverinfo="none",
    marker={"color": "grey"},
)
# Trace and underlying table for the currently selected entry.
ut, ud = plot_interactive_scatter(selection_uid)

# Background sample first, selected entry second.
fig = go.Figure(data=[base_trace, ut])

# Titles, a bordered legend, larger hover text, and a y-axis with no
# grid, axis line or zero line.
fig.update_layout(
    title="Deletion v Substitution Effects",
    xaxis_title="Deletion Score",
    yaxis_title="Substitution Score",
    yaxis={"showgrid": False, "showline": False, "zeroline": False},
    legend={
        "font": {"size": 15},
        "bordercolor": "grey",
        "borderwidth": 1,
    },
    hoverlabel={"font": {"size": 16}},
)
fig.update_yaxes(showgrid=False)
# Cutoffs are hard-coded constants (per the original notes: percentiles,
# kept as literals to avoid reading the entire dataset into memory).

# Deletion cutoffs: vertical lines at fixed x, spanning the full plot height
# (yref="paper", y from 0 to 1).
del_bot, del_top = 0.16500809479645437, -0.7801050825906862
for del_cutoff in (del_bot, del_top):
    fig.add_shape(
        type="line",
        xref="x",
        yref="paper",
        x0=del_cutoff,
        x1=del_cutoff,
        y0=0,
        y1=1,
        line={"color": "Black", "width": 2},
    )

# Substitution cutoffs: horizontal lines at fixed y, spanning the full plot
# width (xref="paper", x from 0 to 1).
sub_bot, sub_top = -12.004105263157896, -4.871947368421053
for sub_cutoff in (sub_bot, sub_top):
    fig.add_shape(
        type="line",
        xref="paper",
        yref="y",
        x0=0,
        x1=1,
        y0=sub_cutoff,
        y1=sub_cutoff,
        line={"color": "Black", "width": 2},
    )
# Two arrow-less text labels, added in this order: the green one at
# (2.5, -18) and the red one at (-1.5, 0.5).
for label_x, label_y, label_text, label_color in (
    (2.5, -18, r"D<sup>+</sup>S<sup>—</sup>", "green"),
    (-1.5, 0.5, r"D<sup>—</sup>S<sup>+</sup>", "red"),
):
    fig.add_annotation(
        x=label_x,
        y=label_y,
        text=label_text,
        font={"color": label_color, "size": 24},
        showarrow=False,
    )
# Per-position figures for each score; each gets the matching pair of
# cutoffs as its dashed reference lines.
lt_apllr = plot_interactive_line(
    uid_data=ud,
    uid=selection_uid,
    score="aPLLR",
    mutation="Deletion",
    hline1=del_bot,
    hline2=del_top,
)
lt_llr = plot_interactive_line(
    uid_data=ud,
    uid=selection_uid,
    score="avg_LLR",
    mutation="Substitution",
    hline1=sub_bot,
    hline2=sub_top,
)

# Show the scatter plot
st.plotly_chart(fig)

# The two line figures are rendered only while the checkbox is ticked.
# NOTE(review): the extent of this `if` body is inferred from the checkbox
# label (both line charts inside) — confirm against the original file.
show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")
if show_line_plots:
    for line_figure in (lt_apllr, lt_llr):
        st.plotly_chart(line_figure)
# Offer the selected entry's three score columns as a CSV download.
# to_csv() is called with its defaults, so the freshly reset 0..n-1 index is
# written out as the first column.
export_table = ud.reset_index(drop=True)[["site", "aPLLR", "avg_LLR"]]
st.download_button(
    label=f"Download {selection_uid} data as CSV",
    data=export_table.to_csv(),
    file_name=f"{selection_uid}_del_sub.csv",
    mime="text/csv",
)

# Static usage notes rendered beneath the download button.
readme_md = """
**README**:
- Deletion scores are *visualized* on the -log10 scale.
- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_proteins_ESM1b_del_sub.zip) (or go to files tab).
- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
"""
st.markdown(readme_md)
|