# Collect the wild-type sequence of every benchmark gene and write them to a
# single table.
#
# NOTE: despite the `.fasta` extension, the output is a two-column CSV with the
# header `hgnc,seqs`; that format is kept unchanged.
# The stray `|` table separators that surrounded every statement were removed
# because they made the script unparseable.
genes <- c('PTEN', 'NUDT15', 'CCR5', 'CXCR4', 'GCK', 'CYP2C9', 'ASPA', 'SNCA')

mavedb.dir <- '/share/vault/Users/gz2294/Data/DMS/MAVEDB/'

# One sequence per gene: the `wt.orig` value in the first row of testing.csv.
# vapply() guarantees exactly one string per gene (and fails loudly otherwise)
# instead of growing a vector with c() inside a loop. as.character() keeps the
# value a string even if the column was read as a factor.
seqs <- vapply(
  genes,
  function(gene) {
    tmp <- read.csv(paste0(mavedb.dir, gene, '/testing.csv'), row.names = 1)
    as.character(tmp$wt.orig[1])
  },
  character(1),
  USE.NAMES = FALSE
)

# Column names stay `hgnc` and `seqs`, in the same gene order as `genes`.
seq.df <- data.frame(hgnc = genes, seqs)

fasta.file <- 'Hsu.et.al.git/fasta/all.fasta'
# Make sure the output directory exists; silent when it is already there.
dir.create(dirname(fasta.file), recursive = TRUE, showWarnings = FALSE)
write.csv(seq.df, fasta.file, quote = FALSE, row.names = FALSE)
# Export the training data of every gene / fold / score column into its own
# directory `Hsu.et.al.git/data/<gene>.fold.<fold>.score.<s>/`, containing:
#   data.csv  - columns seq, log_fitness, n_mut, mutant
#   wt.fasta  - a `><gene>.score.<s>` header line followed by the first
#               `wt.orig` value of the training table
#
# The stray `|` table separators that surrounded every statement were removed
# because they made the script unparseable.
genes <- c('PTEN', 'NUDT15', 'CCR5', 'CXCR4', 'GCK', 'CYP2C9', 'ASPA', 'SNCA')

for (gene in genes) {
  for (fold in 0:4) {
    training <- read.csv(
      paste0('../data.files/', gene, '/train.seed.', fold, '.csv'),
      row.names = 1
    )

    # Number of columns whose name starts with 'score'; they are addressed
    # below as score.1 ... score.<nscores>.
    nscores <- sum(startsWith(colnames(training), 'score'))

    # Values that do not depend on the score index are computed once per fold.
    wt.seq <- as.character(training$wt.orig[1])
    mutant <- paste0(training$ref, training$pos.orig, training$alt)

    # seq_len() yields an empty sequence when there are no score columns,
    # whereas 1:nscores would iterate over c(1, 0) and fail on `score.1`.
    for (s in seq_len(nscores)) {
      target.dir <- paste0(
        'Hsu.et.al.git/data/', gene, '.fold.', fold, '.score.', s
      )
      # Create missing parent directories too, and stay quiet on re-runs.
      dir.create(target.dir, recursive = TRUE, showWarnings = FALSE)

      dat <- training[, c('sequence.orig', paste0('score.', s))]
      colnames(dat) <- c('seq', 'log_fitness')
      # Every row is written with n_mut = 1; rep() also works for zero rows.
      dat$n_mut <- rep(1, nrow(dat))
      dat$mutant <- mutant

      write.csv(
        dat,
        file.path(target.dir, 'data.csv'),
        row.names = FALSE,
        quote = FALSE
      )

      # Write the FASTA header and sequence directly. This produces the same
      # two lines as the former write.csv() + `sed -i` header rewrite, without
      # depending on an external, platform-specific `sed`.
      writeLines(
        c(paste0('>', gene, '.score.', s), wt.seq),
        file.path(target.dir, 'wt.fasta')
      )
    }
  }
}