Spaces:
Running
Running
import gradio as gr | |
import numpy as np | |
import pandas as pd | |
import plotly.express as px | |
from collections import Counter | |
# Palette of CSS named colors. The plotting helpers below hand these out
# positionally (first highlighted group gets the first color, and so on), so
# the order of this list determines which color a group receives.
#
# Names that were commented out of the original list and are therefore NOT
# part of the palette:
#   darkgray, darkgrey, slategray, slategrey, lightslategray, lightslategrey,
#   lightgray, lightgrey, gray, grey, dimgray, dimgrey, darkslategray,
#   darkslategrey, aliceblue, black, beige, antiquewhite, bisque,
#   blanchedalmond
# NOTE(review): presumably excluded because they are hard to tell apart from
# the "lightgrey" used for unselected points -- confirm.
css_colors = [
    "darkmagenta", "darkolivegreen", "darkorange", "darkorchid", "darkred",
    "darksalmon", "darkseagreen", "darkslateblue", "darkturquoise", "darkviolet",
    "deeppink", "deepskyblue", "dodgerblue", "firebrick", "coral",
    "magenta", "maroon", "mediumaquamarine", "mediumblue", "mediumorchid",
    "mediumpurple", "mediumseagreen", "mediumslateblue", "mediumspringgreen", "mediumturquoise",
    "mediumvioletred", "midnightblue", "mintcream", "mistyrose", "moccasin",
    "navajowhite", "navy", "oldlace", "olive", "olivedrab",
    "orange", "orangered", "orchid",
    "aqua", "aquamarine", "azure", "blue", "blueviolet",
    "brown", "burlywood", "cadetblue", "chartreuse", "chocolate",
    "cornflowerblue", "cornsilk", "crimson", "cyan", "darkblue",
    "darkcyan", "darkgoldenrod", "darkgreen", "darkkhaki",
    "floralwhite", "forestgreen", "fuchsia", "gainsboro", "ghostwhite",
    "gold", "goldenrod", "green", "greenyellow", "honeydew",
    "hotpink", "indianred", "indigo", "ivory", "khaki",
    "lavender", "lavenderblush", "lawngreen", "lemonchiffon", "lightblue",
    "lightcoral", "lightcyan", "lightgoldenrodyellow", "lightgreen", "lightpink",
    "lightsalmon", "lightseagreen", "lightskyblue", "lightsteelblue", "lightyellow",
    "lime", "limegreen", "linen",
    "palegoldenrod", "palegreen", "paleturquoise", "palevioletred", "papayawhip",
    "peachpuff", "peru", "pink", "plum", "powderblue",
    "purple", "red", "rosybrown", "royalblue", "rebeccapurple",
    "saddlebrown", "salmon", "sandybrown", "seagreen", "seashell",
    "sienna", "silver", "skyblue", "slateblue", "snow",
    "springgreen", "steelblue", "tan", "teal", "thistle",
    "tomato", "turquoise", "violet", "wheat", "white",
    "whitesmoke", "yellow", "yellowgreen",
]
# Read data
# data/inventory.txt lists one path per line; each path is a tab-separated
# table. All tables are stacked row-wise into the module-level `data` frame.
frames = []
with open("data/inventory.txt", "r") as fin:
    for line in fin:
        path = line.strip()
        if not path:
            # A blank (or whitespace-only) line would otherwise be handed to
            # pd.read_csv("") and abort start-up.
            continue
        frames.append(pd.read_csv(path, sep="\t"))
data = pd.concat(frames)

# Cell types offered in the dropdown: every distinct "Celltype" value whose
# name contains neither "CCI" nor "BTO", in sorted order.
unique_celltypes = sorted(
    c for c in data["Celltype"].unique() if "CCI" not in c and "BTO" not in c
)

# SAFE score tables (tab-separated despite the .csv extension). "Label" is
# renamed to "Celltype" so the tables can be joined with each other and with
# per-cell-type counts; "Score" is renamed to a table-specific column name.
max_safe_scores = pd.read_csv("data/max_safe_scores.csv", sep="\t").rename(
    columns={"Score": "Max SAFE Score", "Label": "Celltype"}
)
mean_safe_scores = pd.read_csv("data/mean_safe_scores.csv", sep="\t").rename(
    columns={"Score": "Mean SAFE Score", "Label": "Celltype"}
)
neighborhood_enrichment = pd.read_csv(
    "data/safe_neighborhoods_enriched.csv", sep="\t"
).rename(columns={"Label": "Celltype"})

# One row per cell type present in all three tables (merge defaults to an
# inner join on "Celltype").
safe_scores = max_safe_scores.merge(mean_safe_scores, on="Celltype")
safe_scores = safe_scores.merge(neighborhood_enrichment, on="Celltype")
print(safe_scores)
# Helper functions | |
def plot_protein_emb(protein):
    """Plot the embedding with every context of one protein highlighted.

    Rows of the module-level ``data`` table whose ``Name`` equals *protein*
    (compared case-insensitively, ignoring surrounding whitespace) are drawn
    as larger star markers colored by their ``Celltype``; every other row is
    drawn as a small light-grey circle.

    Args:
        protein: Protein name entered by the user. ``None`` or an empty
            string highlights nothing.

    Returns:
        A ``(fig, protein_context_df)`` tuple: the Plotly figure, and a
        DataFrame holding the ``Name``, ``Celltype``, ``x`` and ``y`` columns
        of the highlighted rows.
    """
    hover_keys = {"Name": True, "Celltype": True, "x": False, "y": False, "Selected": False}
    query = (protein or "").strip().lower()

    p_data = data.copy()
    is_match = p_data["Name"].str.lower() == query
    # Matching rows carry their cell type as the group label; the rest share
    # a single "Not Selected" group.
    p_data["Selected"] = p_data["Celltype"].where(is_match, "Not Selected")
    p_data["Size"] = [1 if label == "Not Selected" else 10 for label in p_data["Selected"].tolist()]
    symbol_map = {s: "circle" if s == 1 else "star" for s in p_data["Size"].unique()}

    # Colors are assigned by position in the order groups first appear. The
    # index wraps around so that a protein present in more contexts than there
    # are palette entries still gets a palette color for every context
    # (zip() would silently stop assigning colors at the end of the palette).
    p_celltypes = p_data["Selected"].unique()
    color_map = {
        label: css_colors[idx % len(css_colors)]
        for idx, label in enumerate(p_celltypes)
        if label != "Not Selected"
    }
    color_map["Not Selected"] = "lightgrey"

    fig = px.scatter(
        p_data,
        x="x",
        y="y",
        color="Selected",
        color_discrete_map=color_map,
        symbol="Size",
        symbol_map=symbol_map,
        size="Size",
        opacity=0.8,
        hover_data=hover_keys,
    )
    # Both backgrounds must go in ONE update: update_layout's second
    # positional parameter is `overwrite`, so passing two dicts positionally
    # drops the second one (paper_bgcolor was never applied).
    fig.update_layout(plot_bgcolor="rgba(0, 0, 0, 0)", paper_bgcolor="rgba(0, 0, 0, 0)")
    fig.update_xaxes(title_text="", showticklabels=False)
    fig.update_yaxes(title_text="", showticklabels=False)
    fig.update_layout(showlegend=False)
    fig.update_traces(marker=dict(line=dict(width=0)))

    protein_context_df = p_data[p_data["Selected"] != "Not Selected"][["Name", "Celltype", "x", "y"]]
    return fig, protein_context_df
def get_protein_counts(df):
    """Summarize how many rows of *df* fall into each cell type.

    Args:
        df: DataFrame with a ``Celltype`` column.

    Returns:
        DataFrame with one row per cell type, ordered by ``Celltype``:
        the row count in ``Activated Proteins`` followed by the columns of
        the module-level ``safe_scores`` table. The join is pandas' default
        inner merge, so cell types missing from ``safe_scores`` are dropped.
        The result is also printed to stdout.
    """
    tally = Counter(df["Celltype"].tolist())
    celltypes = list(tally.keys())
    n_proteins = list(tally.values())
    summary = pd.DataFrame({"Celltype": celltypes, "Activated Proteins": n_proteins})
    summary = summary.sort_values(by="Celltype").merge(safe_scores, on="Celltype")
    print(summary)
    return summary
def plot_celltype_emb(celltype):
    """Plot the embedding with the chosen cell types highlighted.

    If ``"All"`` is among the selected values, every row of the module-level
    ``data`` table is plotted, colored by ``Celltype``. Otherwise rows whose
    ``Celltype`` matches one of the selected names (case-insensitively) keep
    their own color and all other rows are drawn in light grey.

    Args:
        celltype: List of selected cell type names (may contain ``"All"``).
            ``None`` is treated as an empty selection.

    Returns:
        A ``(fig, activated_proteins_df)`` tuple: the Plotly figure, and the
        per-cell-type summary produced by ``get_protein_counts`` for the
        plotted (``"All"``) or highlighted rows.
    """
    hover_keys = {"Name": True, "Celltype": True, "x": False, "y": False}
    selection = list(celltype or [])

    if "All" in selection:
        fig = px.scatter(data, x="x", y="y", color="Celltype", opacity=0.4, hover_data=hover_keys)
        activated_proteins_df = get_protein_counts(data)
    else:
        hover_keys["Selected"] = False
        c_data = data.copy()

        # Palette color per requested name, assigned in selection order and
        # wrapping around when more names are selected than colors exist.
        wanted = [name.lower() for name in selection]
        palette = {name: css_colors[idx % len(css_colors)] for idx, name in enumerate(wanted)}

        # Lower-case BOTH sides: the selection is lower-cased, so comparing it
        # against raw "Celltype" values would never match a mixed-case name.
        is_selected = c_data["Celltype"].str.lower().isin(palette.keys())
        c_data["Selected"] = c_data["Celltype"].where(is_selected, "Not Selected")

        # Key the color map by the labels that actually appear in the
        # "Selected" column (the original-case names from the data).
        color_map = {
            name: palette[name.lower()]
            for name in c_data.loc[is_selected, "Celltype"].unique()
        }
        color_map["Not Selected"] = "lightgrey"

        fig = px.scatter(
            c_data,
            x="x",
            y="y",
            color="Selected",
            color_discrete_map=color_map,
            opacity=0.8,
            hover_data=hover_keys,
        )
        activated_proteins_df = get_protein_counts(c_data[is_selected])

    # Both backgrounds must go in ONE update: update_layout's second
    # positional parameter is `overwrite`, so passing two dicts positionally
    # drops the second one (paper_bgcolor was never applied).
    fig.update_layout(plot_bgcolor="rgba(0, 0, 0, 0)", paper_bgcolor="rgba(0, 0, 0, 0)")
    fig.update_xaxes(title_text="", showticklabels=False)
    fig.update_yaxes(title_text="", showticklabels=False)
    fig.update_layout(showlegend=False)
    return fig, activated_proteins_df
# Create gradio interface
# NOTE(review): the nesting below was reconstructed from a copy of this file
# whose indentation had been stripped -- confirm against the deployed layout.
with gr.Blocks() as demo:
    # Page title and introduction.
    gr.Markdown("<center><h1>Contextualizing Protein Representations with PINNACLE</h1></center>")
    gr.Markdown(
        "Protein interaction networks are a critical component to study the function and therapeutic potential of proteins. "
        "However, accurately modeling protein interactions across diverse biological contexts, such as tissues and cell types, "
        "remains a significant challenge for existing algorithms. Here, we introduce <b>PINNACLE</b>, a flexible geometric deep learning approach "
        "that trains on contextualized protein interaction networks to generate context-aware protein representations. Leveraging a "
        "multi-organ single cell transcriptomic atlas of humans, <b>PINNACLE provides 394,760 protein representations split across 156 cell-type "
        "contexts from 24 tissues and organs</b>. Our contextualized protein representations, infused with cellular and tissue organization, "
        "can easily be adapted for diverse downstream tasks."
    )
    gr.Markdown(
        " For more information, please check out our manuscript and documentation (links provided at the bottom of the page)!"
    )

    with gr.Tabs():
        # Tab 1: look up one protein across all of its contexts.
        with gr.TabItem("Protein"):
            with gr.Column():
                gr.Markdown(
                    "<center><h3>Select protein of interest to examine across biological contexts</h3></center>"
                )
                protein = gr.Textbox(
                    info="Enter a protein name (in HGNC symbol)",
                    lines=1,
                    value="TNF",
                    label="Protein",
                )
                protein_submit_btn = gr.Button("Submit")
                gr.Markdown("<center><h3>Contextualized protein representations</h3></center>")
                protein_plot = gr.Plot()
                with gr.Accordion(label="Protein Contexts", open=False):
                    protein_context_df = gr.Dataframe(
                        headers=["Protein", "Celltype", "x", "y"],
                        overflow_row_behaviour="paginate",
                    )

        # Tab 2: highlight one or more cell types ("All" is preselected).
        with gr.TabItem("Cell Type"):
            with gr.Column():
                gr.Markdown(
                    "<center><h3>Select biological context by specifying cell type of interest</h3></center>"
                )
                celltype = gr.Dropdown(
                    ["All", *unique_celltypes],
                    info="Please select from the following cell types.",
                    value=["All"],
                    multiselect=True,
                    label="Cell Type",
                )
                celltype_submit_btn = gr.Button("Submit")
                gr.Markdown("<center><h3>Contextualized protein representations</h3></center>")
                celltype_plot = gr.Plot()
                with gr.Accordion(label="Cell Type Context", open=False):
                    activated_proteins_df = gr.Dataframe(
                        headers=["Celltype", "Activated Proteins"],
                        overflow_row_behaviour="paginate",
                    )

    # Footer links.
    gr.Markdown(
        "<p style='text-align: center'><a href='https://github.com/mims-harvard/PINNACLE'>Github Repo</a>"
        "| <a href='https://zitniklab.hms.harvard.edu/projects/PINNACLE/'>Documentation</a> "
        "| <a href='https://www.nature.com/articles/s41592-024-02341-3/'>Publication</a></p>"
    )

    # Wire each Submit button to its plotting helper; each helper returns
    # (figure, table) in the order of the `outputs` list.
    protein_submit_btn.click(
        plot_protein_emb,
        inputs=[protein],
        outputs=[protein_plot, protein_context_df],
    )
    celltype_submit_btn.click(
        plot_celltype_emb,
        inputs=[celltype],
        outputs=[celltype_plot, activated_proteins_df],
    )

# Launch
if __name__ == "__main__":
    demo.launch()