File size: 5,635 Bytes
fc64043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07d6846
fc64043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
07d6846
1c5f9da
fc64043
 
 
 
07d6846
1c5f9da
fc64043
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import datasets
import streamlit as st
import numpy as np
import pandas as pd
import altair as alt

# Wide page layout.
st.set_page_config(layout='wide')

# Gene ids can be passed as URL query arguments (gene_id_1, gene_id_2) so that
# it is possible to link directly to this page for a given pair of genes.
# When an argument is absent, fall back to a default gene id.
query_params = st.query_params
gene_id_1 = (
    query_params["gene_id_1"]
    if "gene_id_1" in query_params
    else "TGME49_231630")
gene_id_2 = (
    query_params["gene_id_2"]
    if "gene_id_2" in query_params
    else "TGME49_230210")

    


# Page header: what ToxoCEN is, how to cite it, where the code and data live,
# and a short instruction for the gene id inputs.
page_intro = """
# ToxoCEN Expression Scatter
**ToxoCEN** is a co-expression network for *Toxoplasma gondii* built on 719 RNA-seq runs across 39 studies.
A pair of genes are said to be co-expressed when their expression is correlated across different conditions and
is often a marker for genes to be involved in similar processes. 

To Cite:
CS Arnold, Y Wang, VB Carruthers, MJ O'Meara
ToxoCEN: A Co-Expression Network for Toxoplasma gondii

* Code available at https://github.com/maomlab/CalCEN/tree/master/vignettes/ToxoCEN
* Full network and dataset: https://huggingface.co/datasets/maomlab/ToxoCEN

## Plot scatter plot expression for a pair of genes across studies.
Put in the ``TGME49_######`` gene_id for two genes.
"""
st.markdown(page_intro)

@st.cache_data
def _load_toxocen_table(name, data_file):
    """Load one file of the maomlab/ToxoCEN dataset as a pandas DataFrame.

    Streamlit re-executes this script on every widget interaction; caching
    the result means the dataset is loaded and converted only once per
    distinct (name, data_file) pair instead of on every rerun.

    Args:
        name: key used for the file in ``data_files``; the loaded table is
            read back from the dataset under the same key.
        data_file: path of the file inside the dataset repository.

    Returns:
        The loaded table converted with ``to_pandas()``.
    """
    dataset = datasets.load_dataset(
        path = "maomlab/ToxoCEN",
        data_files = {name: data_file})
    return dataset[name].to_pandas()


# Per-gene annotations (the gene_id, TGME49_id, gene_name and description
# columns are used further down).
TGME49_transcript_annotations = _load_toxocen_table(
    "TGME49_transcript_annotations",
    "TGME49_transcript_annotations.tsv")

# Per-run metadata; merged with the plotted data on "run_accession".
estimated_expression_meta = _load_toxocen_table(
    "estimated_expression_meta",
    "data/estimated_expression_meta.tsv")

# Expression table; its columns are used as run accessions further down.
estimated_expression = _load_toxocen_table(
    "estimated_expression",
    "data/estimated_expression.parquet")

#DEBUG
print(f"estimated_expression shape: {estimated_expression.shape}")

# Layout: two gene id inputs, a third column (used later for the download
# button) and a trailing padding column that is never written to.
col1, col2, col3, padding = st.columns(spec = [0.2, 0.2, 0.2, 0.4])

# Gene ids have the form 'TGME49_######', which is 13 characters long, so the
# input limit must be at least 13 or a full id cannot be typed into the box.
# A little slack is left on top of that.
GENE_ID_MAX_CHARS = 20

with col1:
    # .strip() so an id pasted with surrounding whitespace still matches the
    # annotation table.
    gene_id_1 = st.text_input(
        label = "Gene ID 1",
        value = f"{gene_id_1}",
        max_chars = GENE_ID_MAX_CHARS,
        help = "TGME49 Gene ID e.g. TGME49_231630").strip()

with col2:
    gene_id_2 = st.text_input(
        label = "Gene ID 2",
        value = f"{gene_id_2}",
        max_chars = GENE_ID_MAX_CHARS,
        help = "TGME49 Gene ID e.g. TGME49_230210").strip()

def _lookup_gene_annotation(gene_id, label):
    """Look up the annotation of ``gene_id`` in TGME49_transcript_annotations.

    Args:
        gene_id: gene id entered by the user, matched against the "gene_id"
            column of the annotation table.
        label: name of the input field, used in the error message.

    Returns:
        A ``(TGME49_id, gene_name, description)`` tuple taken from the first
        matching row, or ``None`` when no row matches. In the latter case an
        error is shown on the page.
    """
    # Filter the annotation table once and reuse the result for all fields.
    matches = TGME49_transcript_annotations.loc[
        TGME49_transcript_annotations["gene_id"] == gene_id]
    if matches.empty:
        st.error(f"Unable to locate TGME49_id for {label}: {gene_id}, it should be of the form 'TGME49_######'")
        return None
    return (
        matches["TGME49_id"].values[0],
        matches["gene_name"].values[0],
        matches["description"].values[0])


# check the user input
# Both ids are checked before stopping so that the user sees every problem at
# once rather than one per rerun.
annotation_1 = _lookup_gene_annotation(gene_id_1, "Gene ID 1")
annotation_2 = _lookup_gene_annotation(gene_id_2, "Gene ID 2")
if annotation_1 is None or annotation_2 is None:
    # Nothing further down can be computed without both genes; end this run
    # here instead of failing later on undefined names / empty selections.
    st.stop()

TGME49_id_1, gene_name_1, description_1 = annotation_1
TGME49_id_2, gene_name_2, description_2 = annotation_2

# Select the expression row of each gene.
# NOTE(review): the boolean masks are built from the annotation table but
# applied to the expression table, so this presumably relies on both tables
# sharing the same row index/order -- confirm.
is_gene_1 = TGME49_transcript_annotations["gene_id"] == gene_id_1
is_gene_2 = TGME49_transcript_annotations["gene_id"] == gene_id_2
expression_1 = estimated_expression.loc[is_gene_1].to_numpy()[0]
expression_2 = estimated_expression.loc[is_gene_2].to_numpy()[0]

# One row per column of the expression table: raw and log10(x + 1) expression
# of both genes, with the gene ids repeated on every row. The expression
# table's column names are stored as "run_accession" and used as the join key
# for the per-run metadata.
chart_data = pd.DataFrame({
    "gene_id_1": gene_id_1,
    "gene_id_2": gene_id_2,
    "expression_1": expression_1,
    "expression_2": expression_2,
    "log_expression_1": np.log10(expression_1 + 1),
    "log_expression_2": np.log10(expression_2 + 1),
    "run_accession": estimated_expression.columns,
}).merge(
    right = estimated_expression_meta,
    on = "run_accession")

# Download button for the plotted data, placed in the third column.
with col3:
    st.text('') # help alignment with input box
    st.download_button(
        label = "Download data as TSV",
        # Tab-separated, UTF-8 encoded dump of chart_data (includes the
        # DataFrame index as the first column).
        data = chart_data.to_csv(sep ='\t').encode('utf-8'),
        file_name = f"ToxoCEN_expression_{gene_id_1}_vs_{gene_id_2}.tsv",
        # The payload is tab-separated, so use the TSV media type rather
        # than text/csv.
        mime = "text/tab-separated-values")


# Optional "Gene Name" bullet for each gene: emitted only when the annotation
# has a name, otherwise an empty line takes its place in the markdown.
if gene_name_1 is not None:
    gene_name_bullet_1 = '* *Gene Name*:' + gene_name_1
else:
    gene_name_bullet_1 = ''

if gene_name_2 is not None:
    gene_name_bullet_2 = '* *Gene Name*:' + gene_name_2
else:
    gene_name_bullet_2 = ''

# Summary of both genes with links to the ToxoDB gene record and to the
# ToxoCEN-TopHits page for the same gene id.
st.markdown(f"""
#### Gene 1:
* *Gene ID*: [{gene_id_1}](https://toxodb.org/toxo/app/record/gene/{gene_id_1})
{gene_name_bullet_1}
* *Description*: {description_1}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/ToxoCEN-TopHits?gene_id={gene_id_1})*

#### Gene 2:
* *Gene ID*: [{gene_id_2}](https://toxodb.org/toxo/app/record/gene/{gene_id_2})
{gene_name_bullet_2}
* *Description*: {description_2}
* *Top [Co-Expressed Partners](https://huggingface.co/spaces/maomlab/ToxoCEN-TopHits?gene_id={gene_id_2})*
""")

# Scatter plot of log10(x + 1) expression of gene 1 (x) against gene 2 (y),
# one circle per row of chart_data, coloured by study, with the run and study
# accessions shown in the tooltip.
x_axis = alt.X("log_expression_1", title=f"Log10[{gene_id_1}+1] Expression")
y_axis = alt.Y("log_expression_2", title=f"Log10[{gene_id_2}+1] Expression")
study_color = alt.Color("study_accession", title="Study Accession")
tooltip_fields = ["run_accession", "study_accession"]

scatter_marks = alt.Chart(chart_data, width = 750, height = 750).mark_circle()
chart = scatter_marks.encode(
    x=x_axis,
    y=y_axis,
    color=study_color,
    tooltip=tooltip_fields)

st.altair_chart(chart)