Spaces:

ntranoslab
/

diff-tol

Sleeping

File size: 5,293 Bytes

import streamlit as st
import pandas as pd
import numpy as np
import time
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter1d
from zipfile import ZipFile

# Seed NumPy's global RNG. Nothing in the visible code draws random numbers,
# so this is presumably kept for reproducibility elsewhere -- TODO confirm.
np.random.seed(2024)

# Options for the protein select box: a headerless, tab-separated file loaded
# into a single column named "selection".
uids = pd.read_csv("uniprot_ids.tsv.gz", names=["selection"], header=None, sep="\t")
# del_sub_merge = pd.read_csv("del_sub_data.csv.gz")
# Archive kept open at module level; plot_interactive_scatter reads the
# "<uid>.csv" member for the selected protein from it.
zf = ZipFile("ALL_hum_proteins_ESM1b_del_sub.zip")

# NOTE(review): `width` is not referenced anywhere in the visible code.
width=600

def plot_interactive_scatter(uid: str):
    """Build the highlighted scatter trace for a single protein.

    Reads ``{uid}.csv`` from the module-level ``zf`` archive and plots
    ``-log10(aPLLR)`` (x) against ``avg_LLR`` (y), using the ``site`` column
    as the hover text of each point.

    Args:
        uid: Name (without the ``.csv`` suffix) of the archive member to load.

    Returns:
        A ``(trace, data)`` tuple: the orange marker ``go.Scatter`` trace and
        the ``DataFrame`` it was built from.

    Raises:
        KeyError: If the archive has no ``{uid}.csv`` member.
    """
    # Close the archive member as soon as it has been parsed instead of
    # leaving the open handle for the garbage collector to clean up.
    with zf.open(f"{uid}.csv") as member:
        user_data = pd.read_csv(member)

    # Create scatter plot for user-specified data
    user_trace = go.Scatter(
        x=-np.log10(user_data.aPLLR),
        y=user_data.avg_LLR,
        mode='markers',
        name=f"{uid}<br>Data",
        text=user_data.site,
        hoverinfo='text',
        marker=dict(color='orange'),
    )

    return user_trace, user_data

def plot_interactive_line(uid_data: pd.DataFrame, uid: str, score: str, mutation: str,
                          hline1: float, hline2: float):
    """Build a per-position line figure for one score column of a protein.

    Args:
        uid_data: Per-site table for the protein; must contain a ``site``
            column and the column named by ``score``.
        uid: Protein identifier, used only in the figure title.
        score: Column of ``uid_data`` to plot. ``"aPLLR"`` is plotted as
            ``-log10(value)``; any other column is plotted unchanged.
        mutation: Human-readable mutation type used in the title and the
            y-axis label (e.g. ``"Deletion"``).
        hline1: y value of the first dashed horizontal reference line.
        hline2: y value of the second dashed horizontal reference line.

    Returns:
        A 300-pixel-high ``go.Figure`` with one orange line trace (x is the
        1-based row position) and the two reference lines.
    """
    # aPLLR is shown on the -log10 scale; every other score is plotted as-is.
    plot_data = -np.log10(uid_data[score]) if score == "aPLLR" else uid_data[score]
    # Hover shows "<site>: <value rounded to 3 decimals>" for each position.
    hover_text = [f"{x}: {np.round(y, 3)}" for x, y in zip(uid_data["site"], plot_data)]

    line_trace = go.Scatter(
        x=np.arange(1, len(uid_data)+1),
        y=plot_data,
        mode='lines',
        text=hover_text,
        hoverinfo='text',
        marker=dict(color='orange')
    )
    line_fig = go.Figure(data=[line_trace])
    line_fig.update_layout(
        title=f"{uid} {mutation} Scores by Position",
        yaxis_title=f'{mutation} Score<br>(More Negative = More Damaging)',
        yaxis=dict(showgrid=False, zeroline=False, showline=False),
        height=300,
        hoverlabel=dict(  # Set hover label font size
            font=dict(size=16)  # Specify the font size of the hover text
        )
    )
    # Dashed reference lines spanning the full plot width (x in paper
    # coordinates, y in data coordinates).
    for hline in [hline1, hline2]:
        line_fig.add_shape(
            type='line',
            x0=0, x1=1, y0=hline, y1=hline,
            xref='paper', yref='y',
            line=dict(color='Black', dash='dash'),
        )
    return line_fig
                           
# Protein picker. Streamlit's documentation says a widget label should never
# be empty (accessibility), so a real label is supplied and hidden instead of
# passing "". `index` is the option preselected on first load.
selection = st.selectbox(
    "Protein",
    uids.selection,
    index=11409,
    label_visibility="collapsed",
)
# Only the text before the first comma of the chosen option is kept; it is
# later used as the name of the per-protein CSV inside the archive.
selection_uid = selection.split(",")[0]
                               
# Grey background cloud: a sample of genome-wide scores loaded from disk.
base_data = pd.read_csv("rand_samp_gw_del_sub.csv.gz")
base_trace = go.Scatter(
    name='Sample of<br>Genome-Wide<br>Data',
    mode='markers',
    marker={'color': 'grey'},
    hoverinfo='none',  # no hover information for the background points
    x=-np.log10(base_data.aPLLR),
    y=base_data.avg_LLR,
)

# Trace and table for the protein chosen in the select box.
ut, ud = plot_interactive_scatter(selection_uid)

# Background trace first, selected protein second.
fig = go.Figure([base_trace, ut])

# Layout pieces are named up front so the update_layout call reads flat.
_plain_y_axis = {'showgrid': False, 'showline': False, 'zeroline': False}
_legend_box = {
    'font': {'size': 15},  # legend text size
    'bordercolor': "grey",
    'borderwidth': 1,
}
_hover_label = {'font': {'size': 16}}  # hover text size

fig.update_layout(
    title='Deletion v Substitution Effects',
    xaxis_title='Deletion Score',
    yaxis_title='Substitution Score',
    yaxis=_plain_y_axis,
    legend=_legend_box,
    hoverlabel=_hover_label,
)
fig.update_yaxes(showgrid=False)

# Percentile cutoffs are hard-coded so the entire dataset never has to be
# read into memory just to recompute them.
del_bot, del_top = 0.16500809479645437, -0.7801050825906862
sub_bot, sub_top = -12.004105263157896, -4.871947368421053

# Vertical lines at the deletion cutoffs: x in data units, y spans the plot.
for del_cutoff in (del_bot, del_top):
    fig.add_shape(
        type='line',
        xref='x', x0=del_cutoff, x1=del_cutoff,
        yref='paper', y0=0, y1=1,
        line={'color': 'Black', 'width': 2},
    )

# Horizontal lines at the substitution cutoffs: y in data units, x spans the plot.
for sub_cutoff in (sub_bot, sub_top):
    fig.add_shape(
        type='line',
        xref='paper', x0=0, x1=1,
        yref='y', y0=sub_cutoff, y1=sub_cutoff,
        line={'color': 'Black', 'width': 2},
    )

# Two coloured text labels placed at fixed x/y positions, without arrows.
for _x, _y, _text, _color in (
    (2.5, -18, r"D<sup>+</sup>S<sup>—</sup>", "green"),
    (-1.5, 0.5, r"D<sup>—</sup>S<sup>+</sup>", "red"),
):
    fig.add_annotation(
        text=_text,
        x=_x,
        y=_y,
        showarrow=False,
        font={'color': _color, 'size': 24},
    )

# Show the scatter plot
st.plotly_chart(fig)

show_line_plots = st.checkbox("Show Deletion and Substitution Effects Alone")

if show_line_plots:
    # Build the per-position figures only on demand: Streamlit re-executes the
    # script on every interaction, so constructing them unconditionally is
    # wasted work whenever the box is unticked.
    lt_apllr = plot_interactive_line(ud, selection_uid, "aPLLR", "Deletion", del_bot, del_top)
    lt_llr = plot_interactive_line(ud, selection_uid, "avg_LLR", "Substitution", sub_bot, sub_top)
    st.plotly_chart(lt_apllr)
    st.plotly_chart(lt_llr)

# Offer the selected protein's raw (untransformed) columns as a CSV download.
# to_csv() is called with its defaults, so the reset 0-based index is written
# as the first column of the file.
st.download_button(
    label=f"Download {selection_uid} data as CSV",
    data=ud.reset_index(drop=True)[["site", "aPLLR", "avg_LLR"]].to_csv(),
    file_name = f"{selection_uid}_del_sub.csv",
    mime='text/csv'
)



# Static help text rendered as Markdown at the bottom of the page.
# NOTE(review): the first link targets the goldmangrant/diff-tol space while
# the other two point at ntranoslab resources -- confirm it is still correct.
st.markdown("""
**README**:
- Deletion scores are *visualized* on the -log10 scale. 
- The genome-wide dataset can be downloaded by clicking [here](https://huggingface.co/spaces/goldmangrant/diff-tol/blob/main/ALL_hum_proteins_ESM1b_del_sub.zip) (or go to files tab).
- Non-aggregated substitution effects can be downloaded or browsed [here](https://huggingface.co/spaces/ntranoslab/esm_variants).
- Additional supplementary data from the paper can be downloaded [here](https://github.com/ntranoslab/diff-tol).
""")