PreMode / analysis /fig.2.R
gzhong's picture
Upload folder using huggingface_hub
7718235 verified
library(ggplot2)
# moved annotated file here, previously we annotate from scratch.
ALL <- read.csv('figs/ALL.csv', row.names = 1, na.strings = c(".", "NA"))
ALL <- ALL[ALL$data_source != 'glazer',]
# compare conservation with benign
benign <- read.csv('figs/benign.csv', row.names = 1, na.strings = c(".", "NA"))
benign <- benign[benign$uniprotID %in% ALL$uniprotID,]
# plot number of G/LoF across genes
gene.df <- data.frame(uniprotID=unique(ALL$uniprotID),
GoF=NA, LoF=NA)
for (i in 1:dim(gene.df)[1]) {
gene.df$GoF[i] <- sum(ALL$score[ALL$uniprotID==gene.df$uniprotID[i]]==1)
gene.df$LoF[i] <- sum(ALL$score[ALL$uniprotID==gene.df$uniprotID[i]]==-1)
}
gene.df$label <- NA
genes.dic <- c("Q09428"="ABCC8", "P15056"="BRAF", "O00555"="CACNA1A", "P21802"="FGFR2",
"Q14654"="KCNJ11", "P07949"="RET", "Q99250"="SCN2A", "Q14524"="SCN5A", "P04637"="TP53")
gene.df$label[gene.df$uniprotID %in% names(genes.dic)] <- genes.dic[gene.df$uniprotID[gene.df$uniprotID %in% names(genes.dic)]]
gene.df$transfer.learning <- NA
gene.df$transfer.learning[!is.na(gene.df$label)] <- 'Selected'
ggplot(gene.df, aes(x=GoF, y=LoF, col=transfer.learning, label=label)) +
geom_point() + ggrepel::geom_text_repel() + theme_bw() +
scale_x_continuous(trans = ggallin::pseudolog10_trans, breaks = c(5, 10, 20, 30, 40, 50, 75, 100)) +
scale_y_continuous(trans = ggallin::pseudolog10_trans, breaks = c(5, 10, 20, 40, 60, 80, 100, 200, 400))
ggsave('figs/fig.2c.pdf', height = 3.5, width = 5)
p <- list()
ion.genes <- unique(ALL$uniprotID[grepl("Heyne", ALL$data_source)])
for (j in c(0, 1, 2)) {
if (j==0) {
sse <- table(ALL$secondary_struc[!ALL$uniprotID %in% ion.genes], ALL$LABEL[!ALL$uniprotID %in% ion.genes])
} else if (j==1) {
sse <- table(ALL$secondary_struc[ALL$uniprotID %in% ion.genes], ALL$LABEL[ALL$uniprotID %in% ion.genes])
} else {
sse <- table(ALL$secondary_struc, ALL$LABEL)
}
sse.df <- matrix(NA, nrow = dim(sse)[1], ncol = dim(sse)[2])
colnames(sse.df) <- colnames(sse)
rownames(sse.df) <- rownames(sse)
for (i in 1:dim(sse)[2]) {
sse.df[,i] <- sse[,i]
}
sse.df <- as.data.frame(sse.df)
for (i in 1:dim(sse.df)[1]) {
res <- binom.test(sse.df[i,1], sse.df[i,1]+sse.df[i,2], p=sum(sse.df[,1])/sum(sse.df[,1]+sse.df[,2]))
sse.df$p.value[i] <- res$p.value
}
sse.df$q.value <- p.adjust(sse.df$p.value, method = "fdr")
code.dict <- c("H"="Alpha helix (4-12)", "B"="Isolated beta-bridge residue",
"E"="Beta Sheet", "G"="3-10 helix", "I"="Pi helix", "T"="Turn",
"S"="Bend", " "="none")
sse.df$sec_struc <- code.dict[rownames(sse.df)]
to.plot <- rbind(sse.df, sse.df)
to.plot$n_mutation <- c(sse.df$GOF, sse.df$LOF)
to.plot$frac_mutation <- c(sse.df$GOF/sum(sse.df$GOF), sse.df$LOF/sum(sse.df$LOF))
to.plot$label <- c(rep("GOF", dim(sse.df)[1]), rep("LOF", dim(sse.df)[1]))
to.plot$sec_struc <- gsub(" ", "\n", to.plot$sec_struc)
anno <- to.plot
anno$sec_struc[anno$q.value > 0.05] <- NA
anno$frac_mutation[anno$q.value > 0.05] <- NA
anno <- anno[!is.na(anno$sec_struc),]
anno$x <- as.numeric(as.factor(to.plot$sec_struc))[match(anno$sec_struc, to.plot$sec_struc)] - 0.2
anno$xend <- as.numeric(as.factor(to.plot$sec_struc))[match(anno$sec_struc, to.plot$sec_struc)] + 0.2
anno$y <- anno$frac_mutation + 0.025
anno <- anno[order(anno$x),]
to.keep <- c()
for (i in 1:(dim(anno)[1]/2)) {
to.keep <- c(to.keep, c(i*2-1, i*2)[which.max(anno$y[c(i*2-1, i*2)])])
}
anno <- anno[to.keep,]
anno$annotation <- NA
for (k in 1:dim(anno)[1]) {
anno$annotation[k] <- paste(c(rep(" ", k-1), "*", rep(" ", k-1)), collapse = "")
}
library(ggplot2)
library(ggsignif)
p1 <- ggplot(to.plot, aes(x=sec_struc, y=frac_mutation, fill=label)) +
geom_bar(stat='identity', position=position_dodge()) +
geom_signif(stat="identity",
data=anno,
aes(x=x,
xend=xend,
y=y, yend=y,
annotation=annotation)) + ylim(0, 0.8) +
xlab('secondary structures') +
# scale_x_discrete(guide = guide_axis(n.dodge=2)) +
theme_bw()
if (j==0) {
p1 <- p1 + ggtitle('Other Genes') + ggeasy::easy_center_title()
# ggsave('02.01.sse.pdf', p1, height = 3, width = 6)
} else {
p1 <- p1 + ggtitle('Na+/Ca2+ Channel Genes') + ggeasy::easy_center_title()
# ggsave('02.01.sse.Heyne.pdf', p1, height = 3, width = 6)
}
p[[j+1]] <- p1
}
library(patchwork)
p1 <- p[[2]]+p[[1]]+plot_layout(ncol = 1)
wil.stat <- wilcox.test(ALL$rsa[ALL$LABEL=="GOF"], ALL$rsa[ALL$LABEL=="LOF"])
p2 <- ggplot(rbind(ALL[,c("rsa", "LABEL")], benign[,c("rsa", "LABEL")]), aes(x=rsa, col=LABEL)) + geom_density() +
theme_bw() + ggpp::geom_text_npc(data=data.frame(x="middle", y="top",
label=paste0("Mann-Whitney test G/LoF p=", signif(wil.stat$p.value, digits = 2))),
aes(npcx=x, npcy=y, label=label),
col='black')
# ggsave('02.01.rsa.pdf', p, height = 4, width = 6)
wil.stat <- wilcox.test(ALL$pLDDT[ALL$LABEL=="GOF"], ALL$pLDDT[ALL$LABEL=="LOF"])
p3 <- ggplot(rbind(ALL[,c("pLDDT", "LABEL")], benign[,c("pLDDT", "LABEL")]), aes(x=pLDDT, col=LABEL)) + geom_density() +
theme_bw() + ggpp::geom_text_npc(data=data.frame(x="middle", y="top",
label=paste0("Mann-Whitney test G/LoF p=", signif(wil.stat$p.value, digits = 2))),
aes(npcx=x, npcy=y, label=label),
col='black')
wil.stat <- wilcox.test(ALL$FoldXddG[ALL$LABEL=="GOF"], ALL$FoldXddG[ALL$LABEL=="LOF"])
p4 <- ggplot(rbind(ALL[,c("FoldXddG", "LABEL")],
benign[,c("FoldXddG", "LABEL")]),
aes(x=FoldXddG, col=LABEL)) + geom_density() +
theme_bw() + ggpp::geom_text_npc(data=data.frame(x="right", y="top",
label=paste0("Mann-Whitney test G/LoF p=", signif(wil.stat$p.value, digits = 2))),
aes(npcx=x, npcy=y, label=label),
col='black') +
scale_x_continuous(trans = ggallin::pseudolog10_trans)
wil.stat <- wilcox.test(ALL$conservation.entropy[ALL$LABEL=="GOF"], ALL$conservation.entropy[ALL$LABEL=="LOF"])
p5 <- ggplot(rbind(ALL[,c('conservation.entropy', 'LABEL')], benign[,c('conservation.entropy', 'LABEL')]),
aes(x=conservation.entropy, col=LABEL)) + geom_density() +
theme_bw() + ggpp::geom_text_npc(data=data.frame(x="middle", y="top",
label=paste0("Mann-Whitney test G/LoF p=", signif(wil.stat$p.value, digits = 2))),
aes(npcx=x, npcy=y, label=label),
col='black')
p <- (p3 + p4) / (p2 + p5)
ggsave(plot = p, filename = "figs/fig.2a.pdf", height=5, width=12)
ggsave(plot=p1, filename = "figs/fig.2b.pdf", height = 5, width = 6)