File size: 7,066 Bytes
7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
library(ggplot2)
# Load the pre-annotated variant table. Annotation used to be recomputed from
# scratch; the annotated file is now read directly.
ALL <- read.csv("figs/ALL.csv", row.names = 1, na.strings = c(".", "NA"))
# Drop variants whose data_source is 'glazer'. `%in%` never returns NA, so rows
# with a missing data_source are kept unchanged; filtering with `!=` would
# instead replace each of them with a row made entirely of NA.
ALL <- ALL[!(ALL$data_source %in% "glazer"), ]
# Benign variants used for comparison (e.g. conservation), restricted to the
# genes that are still present in ALL.
benign <- read.csv("figs/benign.csv", row.names = 1, na.strings = c(".", "NA"))
benign <- benign[benign$uniprotID %in% ALL$uniprotID, ]
# Fig. 2c: number of GoF (score == 1) versus LoF (score == -1) variants per
# gene, with a hand-picked set of genes labelled and highlighted.

# as.character() guards against uniprotID having been read as a factor; a
# factor would index genes.dic by integer level code instead of by name.
gene.ids <- as.character(ALL$uniprotID)
gene.df <- data.frame(uniprotID = unique(gene.ids), stringsAsFactors = FALSE)

# Count variants per gene in one pass. `%in%` treats a missing score as
# "no match", so a single NA score no longer turns a gene's count into NA.
gene.idx <- match(gene.ids, gene.df$uniprotID)
gene.df$GoF <- tabulate(gene.idx[ALL$score %in% 1], nbins = nrow(gene.df))
gene.df$LoF <- tabulate(gene.idx[ALL$score %in% -1], nbins = nrow(gene.df))

# UniProt accession -> gene symbol for the genes that get a text label.
genes.dic <- c("Q09428"="ABCC8", "P15056"="BRAF", "O00555"="CACNA1A",
               "P21802"="FGFR2", "Q14654"="KCNJ11", "P07949"="RET",
               "Q99250"="SCN2A", "Q14524"="SCN5A", "P04637"="TP53")
# Indexing a named vector by name yields NA for accessions not in the
# dictionary, which leaves those genes unlabelled.
gene.df$label <- unname(genes.dic[gene.df$uniprotID])
# Labelled genes are coloured as 'Selected'; all others stay NA.
gene.df$transfer.learning <- ifelse(is.na(gene.df$label), NA, "Selected")

fig.2c <- ggplot(gene.df,
                 aes(x = GoF, y = LoF, col = transfer.learning, label = label)) +
  geom_point() +
  ggrepel::geom_text_repel() +
  theme_bw() +
  scale_x_continuous(trans = ggallin::pseudolog10_trans,
                     breaks = c(5, 10, 20, 30, 40, 50, 75, 100)) +
  scale_y_continuous(trans = ggallin::pseudolog10_trans,
                     breaks = c(5, 10, 20, 40, 60, 80, 100, 200, 400))
# Pass the plot explicitly instead of relying on ggplot2's last_plot().
ggsave("figs/fig.2c.pdf", plot = fig.2c, height = 3.5, width = 5)


library(ggsignif)
library(patchwork)

# Readable names for the one-letter codes stored in ALL$secondary_struc.
# NOTE(review): these look like DSSP codes -- confirm against the annotation.
code.dict <- c("H"="Alpha helix (4-12)", "B"="Isolated beta-bridge residue",
               "E"="Beta Sheet", "G"="3-10 helix", "I"="Pi helix", "T"="Turn",
               "S"="Bend", " "="none")

# Build the dodged GOF/LOF bar chart of secondary-structure usage for one set
# of variants.
#
# dat:   data frame with `secondary_struc` and `LABEL` columns; LABEL must
#        contain both "GOF" and "LOF".
# title: plot title (centred).
#
# Each secondary-structure class is tested (binomial test, FDR-adjusted) for a
# GOF share that differs from the overall GOF share in `dat`; classes with
# q <= 0.05 get a starred bracket above their taller bar.
# Returns a ggplot object.
make.sse.plot <- function(dat, title) {
  sse.df <- as.data.frame.matrix(table(dat$secondary_struc, dat$LABEL))
  stopifnot(all(c("GOF", "LOF") %in% colnames(sse.df)))

  n.gof <- sse.df[["GOF"]]
  n.lof <- sse.df[["LOF"]]
  n.all <- n.gof + n.lof
  # Null hypothesis: every class has the same GOF share as the whole set.
  p.gof <- sum(n.gof) / sum(n.all)
  sse.df$p.value <- vapply(
    seq_len(nrow(sse.df)),
    function(i) binom.test(n.gof[i], n.all[i], p = p.gof)$p.value,
    numeric(1)
  )
  sse.df$q.value <- p.adjust(sse.df$p.value, method = "fdr")
  sse.df$sec_struc <- unname(code.dict[rownames(sse.df)])

  # Long format: first the GOF rows, then the LOF rows, one per class.
  to.plot <- rbind(sse.df, sse.df)
  to.plot$n_mutation <- c(n.gof, n.lof)
  to.plot$frac_mutation <- c(n.gof / sum(n.gof), n.lof / sum(n.lof))
  to.plot$label <- rep(c("GOF", "LOF"), each = nrow(sse.df))
  # Break multi-word class names over several lines on the x axis.
  to.plot$sec_struc <- gsub(" ", "\n", to.plot$sec_struc)

  # One bracket per significant class, spanning the two dodged bars and
  # sitting just above the taller of them.
  is.sig <- !is.na(to.plot$sec_struc) & !is.na(to.plot$q.value) &
    to.plot$q.value <= 0.05
  anno <- to.plot[is.sig, ]
  # Position of each class on the discrete x axis (sorted unique labels).
  bar.pos <- match(anno$sec_struc, levels(factor(to.plot$sec_struc)))
  anno$x <- bar.pos - 0.2
  anno$xend <- bar.pos + 0.2
  anno$y <- anno$frac_mutation + 0.025
  # Keep, per class, the row with the larger y (GOF wins ties).
  anno <- anno[order(anno$x, -anno$y), ]
  anno <- anno[!duplicated(anno$x), ]

  fig <- ggplot(to.plot, aes(x = sec_struc, y = frac_mutation, fill = label)) +
    geom_bar(stat = "identity", position = position_dodge())
  # Skip the bracket layer entirely when nothing is significant; building it
  # from an empty data frame used to abort the script.
  if (nrow(anno) > 0) {
    # Pad each "*" with a different number of spaces so every annotation
    # string is distinct.
    pad <- strrep(" ", seq_len(nrow(anno)) - 1)
    anno$annotation <- paste0(pad, "*", pad)
    fig <- fig +
      geom_signif(stat = "identity",
                  data = anno,
                  aes(x = x,
                      xend = xend,
                      y = y, yend = y,
                      annotation = annotation))
  }
  fig +
    ylim(0, 0.8) +
    xlab("secondary structures") +
    theme_bw() +
    ggtitle(title) +
    ggeasy::easy_center_title()
}

# Genes that have at least one variant from a "Heyne" data source form the
# channel-gene group.
ion.genes <- unique(ALL$uniprotID[grepl("Heyne", ALL$data_source)])
is.ion <- ALL$uniprotID %in% ion.genes

# p[[1]]: non-channel genes, p[[2]]: channel genes, p[[3]]: all genes.
p <- list(
  make.sse.plot(ALL[!is.ion, ], "Other Genes"),
  make.sse.plot(ALL[is.ion, ], "Na+/Ca2+ Channel Genes"),
  make.sse.plot(ALL, "All Genes")
)
# Channel genes on top, other genes below.
p1 <- p[[2]] + p[[1]] + plot_layout(ncol = 1)

# Fig. 2a: per-feature density curves for the LABEL groups of ALL and benign,
# each panel annotated with the GOF-vs-LOF Mann-Whitney p-value from ALL.

# Stack one feature column (plus LABEL) of ALL and benign into a single data
# frame so both tables can be drawn in the same panel.
stack.feature <- function(feature) {
  rbind(ALL[, c(feature, "LABEL")], benign[, c(feature, "LABEL")])
}

# Text layer reporting the two-sided Wilcoxon rank-sum (Mann-Whitney) p-value
# for GOF versus LOF values of `feature` in ALL.
#
# feature: name of a numeric column of ALL.
# npc.x:   horizontal anchor passed to geom_text_npc ("middle", "right", ...).
mw.annotation <- function(feature, npc.x = "middle") {
  values <- ALL[[feature]]
  mw <- wilcox.test(values[ALL$LABEL == "GOF"], values[ALL$LABEL == "LOF"])
  caption <- data.frame(
    x = npc.x, y = "top",
    label = paste0("Mann-Whitney test G/LoF p=",
                   signif(mw$p.value, digits = 2))
  )
  ggpp::geom_text_npc(data = caption,
                      aes(npcx = x, npcy = y, label = label),
                      col = "black")
}

p2 <- ggplot(stack.feature("rsa"), aes(x = rsa, col = LABEL)) +
  geom_density() +
  theme_bw() +
  mw.annotation("rsa")

p3 <- ggplot(stack.feature("pLDDT"), aes(x = pLDDT, col = LABEL)) +
  geom_density() +
  theme_bw() +
  mw.annotation("pLDDT")

# FoldXddG is drawn on a pseudo-log x axis; its caption is anchored right.
p4 <- ggplot(stack.feature("FoldXddG"), aes(x = FoldXddG, col = LABEL)) +
  geom_density() +
  theme_bw() +
  mw.annotation("FoldXddG", npc.x = "right") +
  scale_x_continuous(trans = ggallin::pseudolog10_trans)

p5 <- ggplot(stack.feature("conservation.entropy"),
             aes(x = conservation.entropy, col = LABEL)) +
  geom_density() +
  theme_bw() +
  mw.annotation("conservation.entropy")

# 2 x 2 layout: pLDDT and FoldXddG on top, rsa and conservation below.
p <- (p3 + p4) / (p2 + p5)
ggsave(plot = p, filename = "figs/fig.2a.pdf", height = 5, width = 12)
# p1 is the stacked secondary-structure figure assembled earlier in the script.
ggsave(plot = p1, filename = "figs/fig.2b.pdf", height = 5, width = 6)