File size: 6,146 Bytes
7718235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Questions this script addresses:
#   1. Averaged over all points, does PreMode have a better MSE than the experiment?
#   2. For points with different numbers of replicates, which of the two is better?
#   3. For all points in each experiment, which of the two has the better MSE?

# First get ground truth
ground.truth <- read.csv(paste0('../data.files/PTEN/assay.1.csv'), row.names = 1)
# Next set up metrics
all.premode <- list(c(), c(), c(), c(), c(), c(), c(), c())
all.baseline <- list(c(), c(), c(), c(), c(), c(), c(), c())
all.replicates <- list()
for (r in 1:8) {
  replicate <- read.csv(paste0('PreMode/PTEN/replicate.', r, '.csv'))
  training <- read.csv(paste0('../data.files/PTEN.replicate.rest.', 
                              r, '/training.csv'), row.names = 1)
  # ground.truth$aaChg <- paste0('p.', ground.truth$ref, ground.truth$pos.orig, ground.truth$alt)
  replicate$baseline <- NA
  replicate$observations <- NA
  replicate.unique <- replicate[!duplicated(replicate$aaChg),]
  for (i in 1:dim(replicate.unique)[1]) {
    baseline <- training[training$aaChg == replicate.unique$aaChg[i],]
    replicate.baseline <- replicate[replicate$aaChg == replicate.unique$aaChg[i] & !is.na(replicate$score),]
    replicate.unique$base.line.1[i] <- mean(baseline$score, na.rm=T)
    replicate.unique$base.line.2[i] <- mean(replicate.baseline$score, na.rm=T)
    replicate.unique$ground.truth[i] <- ground.truth$score[ground.truth$VarID==replicate.unique$aaChg[i]]
    replicate.unique$observations[i] <- dim(baseline)[1] + dim(replicate.baseline)[1]
    replicate.unique$other.observations[i] <- dim(replicate.baseline)[1]
  }
  # calculate MSE
  premode <- mean((replicate.unique$base.line.2 - replicate.unique$logits)^2, na.rm = T)
  baseline <- mean((replicate.unique$base.line.1 - replicate.unique$base.line.2)^2, na.rm = T)
  all.replicates[[r]] <- replicate.unique
  all.premode[[1]] <- c(all.premode[[1]], premode)
  all.baseline[[1]] <- c(all.baseline[[1]], baseline)
  # next compare for each group of replicates
  for (i in min(replicate.unique$other.observations, na.rm = T):max(replicate.unique$other.observations, na.rm = T)) {
    premode <- mean((replicate.unique$ground.truth[replicate.unique$other.observations==i] -
                       replicate.unique$logits[replicate.unique$other.observations==i])^2, na.rm = T)
    baseline <- mean((replicate.unique$base.line.1[replicate.unique$other.observations==i] - 
                        replicate.unique$ground.truth[replicate.unique$other.observations==i])^2, na.rm = T)
    all.premode[[i+1]] <- c(all.premode[[i+1]], premode)
    all.baseline[[i+1]] <- c(all.baseline[[i+1]], baseline)
  }
  print(paste0('replicate ', r, ', PreMode: ', all.premode[[1]], ', Baseline: ', all.baseline[[1]]))
}
npoints <- table(all.replicates[[1]]$other.observations)
npoints <- c(sum(npoints), npoints)
names(npoints)[1] <- 'all'
to.plot <- data.frame(RMSE=sqrt(c(unlist(all.premode), 
                            unlist(all.baseline))), 
                      exp = rep(rep(1:8, 8), 2),
                      replicate=paste0(rep(rep(names(npoints), each=8), 2), " : ",
                                       rep(rep(npoints, each=8), 2)),
                      model=c(rep("PreMode", length(names(npoints))*8), rep("Experiment", length(npoints)*8)))
library(ggplot2)
# for each experiment, check the points that are far away from PreMode prediction
# they should be far away from replicates as well.
library(ggpubr)
diff.plots <- list()
diff.plots.2 <- list()
for (r in 1:length(all.replicates)) {
  all.replicates[[r]]$Experiment.PreMode.diff <- (all.replicates[[r]]$base.line.1 - all.replicates[[r]]$logits)
  all.replicates[[r]]$Experiment.Groundtruth.diff <- (all.replicates[[r]]$base.line.1 - all.replicates[[r]]$ground.truth)
  diff.plots[[r]] <- ggplot(all.replicates[[r]], aes(x=Experiment.PreMode.diff, y=Experiment.Groundtruth.diff, col=observations)) +
    geom_smooth(method='lm', formula= y~x) +
    stat_regline_equation(
      aes(label =  paste(after_stat(eq.label), after_stat(adj.rr.label), sep = "~~~~")),
      formula = y~x
    ) +
    geom_point(alpha=0.3) + xlab('Measurement - PreMode') + ylab('Measurement - Groundtruth') +
    scale_color_gradientn(colours = c("red", "white", "blue")) +
    ggtitle(paste0("Train on Experiment ", r)) +
    theme_bw() + ggeasy::easy_center_title()
  scl <- max(all.replicates[[r]]$logits, na.rm = T) - min(all.replicates[[r]]$logits, na.rm = T)
  # all.replicates[[r]]$Experiment.PreMode.diff.rank <- dplyr::percent_rank(all.replicates[[r]]$Experiment.PreMode.diff)
  all.replicates[[r]]$Experiment.PreMode.diff.bin <- 'Measurement\n~ PreMode'
  all.replicates[[r]]$Experiment.PreMode.diff.bin[all.replicates[[r]]$Experiment.PreMode.diff>=scl/2] <- 'Measurement\n> PreMode'
  all.replicates[[r]]$Experiment.PreMode.diff.bin[all.replicates[[r]]$Experiment.PreMode.diff<=-scl/2] <- 'Measurement\n< PreMode'
  all.replicates[[r]]$Experiment.PreMode.diff.bin <- factor(all.replicates[[r]]$Experiment.PreMode.diff.bin, levels=c('Measurement\n< PreMode', 'Measurement\n~ PreMode', 'Measurement\n> PreMode'))
  diff.plots.2[[r]] <- ggplot(all.replicates[[r]], aes(x=Experiment.PreMode.diff.bin, y=Experiment.Groundtruth.diff, col=Experiment.PreMode.diff.bin)) +
    geom_violin() +
    geom_boxplot(width=0.2) +
    # geom_point(alpha=0.3) + 
    ggtitle(paste0("Train on Experiment ", r)) + labs(col='Variant Groups') + xlab('Measurement - PreMode') + ylab('Measurement - Groundtruth') + 
    theme_bw() + ggeasy::easy_center_title()
  print(cor.test(all.replicates[[r]]$Experiment.PreMode.diff, all.replicates[[r]]$Experiment.Groundtruth.diff)$estimate)
}

library(patchwork)
p4 <- diff.plots[[1]] + diff.plots[[2]] + diff.plots[[3]] + diff.plots[[4]] + 
  diff.plots[[5]] + diff.plots[[6]] + diff.plots[[7]] + diff.plots[[8]] + patchwork::plot_layout(ncol=4)
p5 <- diff.plots.2[[1]] + diff.plots.2[[2]] + diff.plots.2[[3]] + diff.plots.2[[4]] + 
  diff.plots.2[[5]] + diff.plots.2[[6]] + diff.plots.2[[7]] + diff.plots.2[[8]] + patchwork::plot_layout(ncol=4)

ggsave(filename = 'figs/fig.sup.5a.pdf', p4, width = 20, height = 7.5)
ggsave(filename = 'figs/fig.sup.5b.pdf', p5, width = 20, height = 7.5)