# PreMode / analysis / fig.sup.5.R
# gzhong's picture
# Upload folder using huggingface_hub
# 7718235 verified
# Several questions to ask:
# - Average MSE: PreMode is better than experiment.
# - For points that have different numbers of replicates, which is better?
# - For all points in each experiment, which has the better MSE?
# Load the ground-truth table; its first column becomes the row names.
ground.truth <- read.csv('../data.files/PTEN/assay.1.csv', row.names = 1)
# Set up the metric accumulators: two lists of eight empty (NULL) slots that
# the per-experiment loop appends MSE values to, plus an empty list that will
# hold one annotated data frame per experiment.
all.premode <- vector("list", 8)
all.baseline <- vector("list", 8)
all.replicates <- list()
# For each of the 8 experiments: load the prediction table and the matching
# training table, build one row per variant annotated with reference values
# and observation counts, then accumulate mean squared errors for PreMode and
# for the experimental baseline.
#
# Side effects: fills all.replicates[[r]], appends to all.premode and
# all.baseline, and prints one summary line per experiment.
for (r in 1:8) {
  # The prediction table is read for its aaChg, score and logits columns; the
  # training table for its aaChg and score columns.
  replicate <- read.csv(paste0('PreMode/PTEN/replicate.', r, '.csv'))
  training <- read.csv(paste0('../data.files/PTEN.replicate.rest.',
                              r, '/training.csv'), row.names = 1)
  # ground.truth$aaChg <- paste0('p.', ground.truth$ref, ground.truth$pos.orig, ground.truth$alt)
  replicate$baseline <- NA
  replicate$observations <- NA

  # Keep the first row of every variant (aaChg) as the per-variant record.
  replicate.unique <- replicate[!duplicated(replicate$aaChg), ]
  n.variants <- nrow(replicate.unique)

  # Create the result columns up front instead of relying on recycled
  # element-wise assignment to bring them into existence.
  replicate.unique$base.line.1 <- rep(NA_real_, n.variants)
  replicate.unique$base.line.2 <- rep(NA_real_, n.variants)
  # match() yields NA for a variant that is missing from ground.truth, where
  # an `==` lookup would abort with a zero-length replacement error.
  replicate.unique$ground.truth <-
    ground.truth$score[match(replicate.unique$aaChg, ground.truth$VarID)]
  replicate.unique$other.observations <- rep(NA_integer_, n.variants)

  # seq_len() keeps the loop from running when there are no variants.
  for (i in seq_len(n.variants)) {
    variant <- replicate.unique$aaChg[i]
    # Rows of the training table for this variant.
    baseline <- training[training$aaChg == variant, ]
    # Rows of the prediction table for this variant that carry a score.
    replicate.baseline <- replicate[replicate$aaChg == variant &
                                      !is.na(replicate$score), ]
    replicate.unique$base.line.1[i] <- mean(baseline$score, na.rm = TRUE)
    replicate.unique$base.line.2[i] <- mean(replicate.baseline$score, na.rm = TRUE)
    replicate.unique$observations[i] <- nrow(baseline) + nrow(replicate.baseline)
    replicate.unique$other.observations[i] <- nrow(replicate.baseline)
  }

  # calculate MSE over all variants
  # NOTE(review): the overall MSE is measured against base.line.2, while the
  # per-group MSEs below are measured against ground.truth - confirm that
  # this difference is intended.
  premode.overall <- mean((replicate.unique$base.line.2 -
                             replicate.unique$logits)^2, na.rm = TRUE)
  baseline.overall <- mean((replicate.unique$base.line.1 -
                              replicate.unique$base.line.2)^2, na.rm = TRUE)
  all.replicates[[r]] <- replicate.unique
  all.premode[[1]] <- c(all.premode[[1]], premode.overall)
  all.baseline[[1]] <- c(all.baseline[[1]], baseline.overall)

  # next compare for each group of replicates: variants are grouped by their
  # other.observations count i and stored in slot i + 1.
  # NOTE(review): a count of 0 would map to slot 1 and mix into the overall
  # values - confirm that this cannot occur in the data.
  n.other <- replicate.unique$other.observations
  for (i in min(n.other, na.rm = TRUE):max(n.other, na.rm = TRUE)) {
    in.group <- n.other == i
    premode <- mean((replicate.unique$ground.truth[in.group] -
                       replicate.unique$logits[in.group])^2, na.rm = TRUE)
    baseline <- mean((replicate.unique$base.line.1[in.group] -
                        replicate.unique$ground.truth[in.group])^2, na.rm = TRUE)
    all.premode[[i + 1]] <- c(all.premode[[i + 1]], premode)
    all.baseline[[i + 1]] <- c(all.baseline[[i + 1]], baseline)
  }

  # Report only this experiment's overall values; printing the accumulated
  # vectors would repeat every earlier experiment under the current label.
  print(paste0('replicate ', r, ', PreMode: ', premode.overall,
               ', Baseline: ', baseline.overall))
}
# Variant counts per other.observations value, taken from the first
# experiment's table, with the overall total prepended under the name 'all'.
group.counts <- table(all.replicates[[1]]$other.observations)
npoints <- c(all = sum(group.counts), group.counts)

# One label per group, of the form "<group> : <count>".
group.label <- paste0(names(npoints), " : ", npoints)

# Long-format table of RMSE values: the PreMode rows come first, followed by
# the Experiment rows; within each model the values run group by group, with
# the 8 experiments inside every group.
to.plot <- data.frame(
  RMSE = sqrt(c(unlist(all.premode), unlist(all.baseline))),
  exp = rep(rep(1:8, 8), 2),
  replicate = rep(rep(group.label, each = 8), 2),
  model = rep(c("PreMode", "Experiment"), each = length(npoints) * 8)
)
library(ggplot2)
# for each experiment, check the points that are far away from PreMode prediction
# they should be far away from replicates as well.
library(ggpubr)
# For every experiment, relate (measurement - PreMode prediction) to
# (measurement - ground truth) in two ways: a scatter plot with a linear fit
# (diff.plots) and a violin/box plot of the binned difference (diff.plots.2).
# The difference columns and the bin factor are stored back into
# all.replicates[[r]], and the correlation estimate is printed.
diff.plots <- list()
diff.plots.2 <- list()
bin.levels <- c('Measurement\n< PreMode', 'Measurement\n~ PreMode', 'Measurement\n> PreMode')
# seq_along() skips the loop for an empty list, where 1:length() would visit
# the indices 1 and 0.
for (r in seq_along(all.replicates)) {
  rep.df <- all.replicates[[r]]
  rep.df$Experiment.PreMode.diff <- rep.df$base.line.1 - rep.df$logits
  rep.df$Experiment.Groundtruth.diff <- rep.df$base.line.1 - rep.df$ground.truth

  # Scatter plot coloured by the observations count, with a fitted line and
  # its equation / adjusted R^2 label.
  diff.plots[[r]] <- ggplot(rep.df, aes(x = Experiment.PreMode.diff,
                                        y = Experiment.Groundtruth.diff,
                                        col = observations)) +
    geom_smooth(method = 'lm', formula = y ~ x) +
    stat_regline_equation(
      aes(label = paste(after_stat(eq.label), after_stat(adj.rr.label), sep = "~~~~")),
      formula = y ~ x
    ) +
    geom_point(alpha = 0.3) +
    xlab('Measurement - PreMode') +
    ylab('Measurement - Groundtruth') +
    scale_color_gradientn(colours = c("red", "white", "blue")) +
    ggtitle(paste0("Train on Experiment ", r)) +
    theme_bw() +
    ggeasy::easy_center_title()

  # Bin threshold: half the spread (max - min) of the logits.
  scl <- max(rep.df$logits, na.rm = TRUE) - min(rep.df$logits, na.rm = TRUE)
  # all.replicates[[r]]$Experiment.PreMode.diff.rank <- dplyr::percent_rank(all.replicates[[r]]$Experiment.PreMode.diff)
  # Every row starts in the middle bin; rows whose difference reaches
  # +/- scl/2 are moved to the outer bins. Rows with an NA difference are not
  # selected by either assignment and stay in the middle bin.
  rep.df$Experiment.PreMode.diff.bin <- 'Measurement\n~ PreMode'
  rep.df$Experiment.PreMode.diff.bin[rep.df$Experiment.PreMode.diff >= scl / 2] <- 'Measurement\n> PreMode'
  rep.df$Experiment.PreMode.diff.bin[rep.df$Experiment.PreMode.diff <= -scl / 2] <- 'Measurement\n< PreMode'
  rep.df$Experiment.PreMode.diff.bin <- factor(rep.df$Experiment.PreMode.diff.bin,
                                               levels = bin.levels)

  # Violin plot with an overlaid box plot, one group per bin.
  diff.plots.2[[r]] <- ggplot(rep.df, aes(x = Experiment.PreMode.diff.bin,
                                          y = Experiment.Groundtruth.diff,
                                          col = Experiment.PreMode.diff.bin)) +
    geom_violin() +
    geom_boxplot(width = 0.2) +
    # geom_point(alpha=0.3) +
    ggtitle(paste0("Train on Experiment ", r)) +
    labs(col = 'Variant Groups') +
    xlab('Measurement - PreMode') +
    ylab('Measurement - Groundtruth') +
    theme_bw() +
    ggeasy::easy_center_title()

  # Write the annotated table back so the new columns persist in the list.
  all.replicates[[r]] <- rep.df
  print(cor.test(rep.df$Experiment.PreMode.diff,
                 rep.df$Experiment.Groundtruth.diff)$estimate)
}
library(patchwork)
# Combine the eight panels of each kind into a single figure laid out in four
# columns, then write both figures to PDF.
four.columns <- patchwork::plot_layout(ncol = 4)
p4 <- Reduce(`+`, diff.plots[1:8]) + four.columns
p5 <- Reduce(`+`, diff.plots.2[1:8]) + four.columns
ggsave(filename = 'figs/fig.sup.5a.pdf', plot = p4, width = 20, height = 7.5)
ggsave(filename = 'figs/fig.sup.5b.pdf', plot = p5, width = 20, height = 7.5)