correct design naming..
[sgn.git] / R / gs.r
bloba59e36cb64c58cde7794aa9f31e53d6864dea551
1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
4 options(echo = FALSE)
6 library(rrBLUP)
7 library(plyr)
8 library(mail)
9 library(imputation)
10 library(stringr)
11 library(nlme)
14 allArgs <- commandArgs()
16 inFile <- grep("input_files",
17 allArgs,
18 ignore.case = TRUE,
19 perl = TRUE,
20 value = TRUE
23 outFile <- grep("output_files",
24 allArgs,
25 ignore.case = TRUE,
26 perl = TRUE,
27 value = TRUE
30 outFiles <- scan(outFile,
31 what = "character"
34 inFiles <- scan(inFile,
35 what = "character"
39 traitsFile <- grep("traits",
40 inFiles,
41 ignore.case = TRUE,
42 fixed = FALSE,
43 value = TRUE
46 traitFile <- grep("trait_info",
47 inFiles,
48 ignore.case = TRUE,
49 fixed = FALSE,
50 value = TRUE
54 traitInfo <- scan(traitFile,
55 what = "character",
58 traitInfo <- strsplit(traitInfo, "\t");
59 traitId <- traitInfo[[1]]
60 trait <- traitInfo[[2]]
62 datasetInfoFile <- grep("dataset_info",
63 inFiles,
64 ignore.case = TRUE,
65 fixed = FALSE,
66 value = TRUE
68 datasetInfo <- c()
70 if(length(datasetInfoFile) != 0 ) {
71 datasetInfo <- scan(datasetInfoFile,
72 what= "character"
75 datasetInfo <- paste(datasetInfo, collapse = " ")
77 } else {
78 datasetInfo <- c('single population')
82 validationTrait <- paste("validation", trait, sep = "_")
84 validationFile <- grep(validationTrait,
85 outFiles,
86 ignore.case=TRUE,
87 fixed = FALSE,
88 value=TRUE
91 kinshipTrait <- paste("kinship", trait, sep = "_")
93 blupFile <- grep(kinshipTrait,
94 outFiles,
95 ignore.case = TRUE,
96 fixed = FALSE,
97 value = TRUE
100 markerTrait <- paste("marker", trait, sep = "_")
101 markerFile <- grep(markerTrait,
102 outFiles,
103 ignore.case = TRUE,
104 fixed = FALSE,
105 value = TRUE
108 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
109 traitPhenoFile <- grep(traitPhenoFile,
110 outFiles,
111 ignore.case = TRUE,
112 fixed = FALSE,
113 value = TRUE
117 formattedPhenoDataFile <- grep("formatted_phenotype_data",
118 outFiles,
119 ignore.case = TRUE,
120 fixed = FALSE,
121 value = TRUE
124 varianceComponentsFile <- grep("variance_components",
125 outFiles,
126 ignore.case = TRUE,
127 fixed = FALSE,
128 value = TRUE
131 phenoFile <- grep("phenotype_data",
132 inFiles,
133 ignore.case = TRUE,
134 fixed = FALSE,
135 value = TRUE
138 message("phenotype dataset file: ", phenoFile)
139 message("dataset info: ", datasetInfo)
140 message("phenotype dataset file: ", phenoFile)
142 phenoData <- read.table(phenoFile,
143 header = TRUE,
144 row.names = NULL,
145 sep = "\t",
146 na.strings = c("NA", " ", "--", "-", "."),
147 dec = "."
150 phenoTrait <- c()
151 formattedPhenoData <- c()
153 if (datasetInfo == 'combined populations') {
154 dropColumns <- grep(trait,
155 names(phenoData),
156 ignore.case = TRUE,
157 value = TRUE,
158 fixed = FALSE
161 phenoTrait <- phenoData[,!(names(phenoData) %in% dropColumns)]
163 phenoTrait <- as.data.frame(phenoTrait)
164 row.names(phenoTrait) <- phenoTrait[, 1]
165 phenoTrait[, 1] <- NULL
166 colnames(phenoTrait) <- trait
168 } else {
170 dropColumns <- c("uniquename", "stock_name")
171 phenoData <- phenoData[,!(names(phenoData) %in% dropColumns)]
173 phenoTrait <- subset(phenoData,
174 select = c("object_name", "object_id", "design", "block", "replicate", trait)
177 experimentalDesign <- phenoTrait[2, 'design']
178 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
180 if (experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') {
181 message("experimental design: ", experimentalDesign)
183 augData <- subset(phenoData,
184 select = c("object_name", "object_id", "block", trait)
187 colnames(augData)[1] <- "genotypes"
188 colnames(augData)[4] <- "trait"
190 ff <- trait ~ 0 + genotypes
192 model <- lme(ff,
193 data=augData,
194 random = ~1|block,
195 method="REML",
196 na.action = na.omit
199 adjMeans <- data.matrix(fixed.effects(model))
201 colnames(adjMeans) <- trait
203 nn <- gsub('genotypes', '', rownames(adjMeans))
204 rownames(adjMeans) <- nn
205 adjMeans <- round(adjMeans, digits = 2)
207 phenoTrait <- data.frame(adjMeans)
209 } else if (experimentalDesign == 'Alpha') {
210 message("experimental design: ", experimentalDesign)
212 alphaData <- subset(phenoData,
213 select = c("object_name", "object_id", "block", "replicate", trait)
216 colnames(alphaData)[2] <- "genotypes"
217 colnames(alphaData)[5] <- "trait"
219 ff <- trait ~ 0 + genotypes
221 model <- lme(ff,
222 data=alphaData,
223 random = ~1|replicate/block,
224 method="REML",
225 na.action = na.omit
228 adjMeans <- data.matrix(fixed.effects(model))
229 colnames(adjMeans) <- trait
231 nn <- gsub('genotypes', '', rownames(adjMeans))
232 rownames(adjMeans) <- nn
233 adjMeans <- round(adjMeans, digits = 2)
235 phenoTrait <- data.frame(adjMeans)
237 } else {
239 phenoTrait <- subset(phenoData,
240 select = c("object_name", "object_id", trait)
243 if (sum(is.na(phenoTrait)) > 0) {
245 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
246 phenoTrait <- na.omit(phenoTrait)
250 #calculate mean of reps/plots of the same accession and
251 #create new df with the accession means
253 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
254 phenoTrait <- data.frame(phenoTrait)
255 message('phenotyped lines before averaging: ', length(row.names(phenoTrait)))
257 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
258 message('phenotyped lines after averaging: ', length(row.names(phenoTrait)))
260 row.names(phenoTrait) <- phenoTrait[, 1]
261 phenoTrait[, 1] <- NULL
263 #format all-traits population phenotype dataset
264 formattedPhenoData <- phenoData
265 dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
267 formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
268 formattedPhenoData <- ddply(formattedPhenoData,
269 "object_name",
270 colwise(mean)
273 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
274 formattedPhenoData[, 1] <- NULL
276 formattedPhenoData <- round(formattedPhenoData,
277 digits=2
282 genoFile <- grep("genotype_data",
283 inFiles,
284 ignore.case = TRUE,
285 fixed = FALSE,
286 value = TRUE
289 genoData <- read.table(genoFile,
290 header = TRUE,
291 row.names = 1,
292 sep = "\t",
293 na.strings = c("NA", " ", "--", "-"),
294 dec = "."
297 genoData <- data.matrix(genoData[order(row.names(genoData)), ])
300 #impute genotype values for obs with missing values,
301 #based on mean of neighbouring 10 (arbitrary) obs
302 genoDataMissing <-c()
304 if (sum(is.na(genoData)) > 0) {
305 genoDataMissing<- c('yes')
306 message("sum of geno missing values, ", sum(is.na(genoData)) )
307 genoData <- kNNImpute(genoData, 10)
308 genoData <- data.frame(genoData)
310 #extract columns with imputed values
311 genoData <- subset(genoData,
312 select = grep("^x", names(genoData))
315 #remove prefix 'x.' from imputed columns
316 names(genoData) <- sub("x.", "", names(genoData))
318 genoData <- round(genoData, digits = 0)
319 genoData <- data.matrix(genoData)
325 predictionTempFile <- grep("prediction_population",
326 inFiles,
327 ignore.case = TRUE,
328 fixed = FALSE,
329 value = TRUE
332 predictionFile <- c()
334 message('prediction temp genotype file: ', predictionTempFile)
336 if (length(predictionTempFile) !=0 ) {
337 predictionFile <- scan(predictionTempFile,
338 what="character"
342 message('prediction genotype file: ', predictionFile)
344 predictionPopGEBVsFile <- grep("prediction_pop_gebvs",
345 outFiles,
346 ignore.case = TRUE,
347 fixed = FALSE,
348 value = TRUE
351 message("prediction gebv file: ", predictionPopGEBVsFile)
353 predictionData <- c()
355 if (length(predictionFile) !=0 ) {
356 predictionData <- read.table(predictionFile,
357 header = TRUE,
358 row.names = 1,
359 sep = "\t",
360 na.strings = c("NA", " ", "--", "-"),
361 dec = "."
365 #create phenotype and genotype datasets with
366 #common stocks only
367 message('phenotyped lines: ', length(row.names(phenoTrait)))
368 message('genotyped lines: ', length(row.names(genoData)))
370 #extract observation lines with both
371 #phenotype and genotype data only.
372 commonObs <- intersect(row.names(phenoTrait), row.names(genoData))
373 commonObs <- data.frame(commonObs)
374 rownames(commonObs)<-commonObs[, 1]
376 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
377 #include in the genotype dataset only observation lines
378 #with phenotype data
379 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
380 genoDataFiltered <- genoData[(rownames(genoData) %in% rownames(commonObs)), ]
381 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoDataFiltered)))
382 #drop observation lines without genotype data
383 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
385 phenoTrait <- merge(data.frame(phenoTrait), commonObs, by=0, all=FALSE)
386 rownames(phenoTrait) <-phenoTrait[, 1]
387 phenoTrait <- subset(phenoTrait, select=trait)
389 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
391 #a set of only observation lines with genotype data
392 traitPhenoData <- as.data.frame(round(phenoTrait, digits=2))
393 phenoTrait <- data.matrix(phenoTrait)
394 genoDataFiltered <- data.matrix(genoDataFiltered)
397 #impute missing data in prediction data
398 predictionDataMissing <- c()
399 if (length(predictionData) != 0) {
400 #purge markers unique to both populations
401 commonMarkers <- intersect(names(data.frame(genoDataFiltered)), names(predictionData))
402 predictionData <- subset(predictionData, select = commonMarkers)
403 genoDataFiltered <- subset(genoDataFiltered, select= commonMarkers)
405 predictionData <- data.matrix(predictionData)
407 if (sum(is.na(predictionData)) > 0) {
408 predictionDataMissing <- c('yes')
409 message("sum of geno missing values in prediction data: ", sum(is.na(predictionData)) )
410 predictionData <-kNNImpute(predictionData, 10)
411 predictionData <-as.data.frame(predictionData)
413 #extract columns with imputed values
414 predictionData <- subset(predictionData,
415 select = grep("^x", names(predictionData))
418 #remove prefix 'x.' from imputed columns
419 names(predictionData) <- sub("x.", "", names(predictionData))
421 predictionData <- round(predictionData, digits = 0)
422 predictionData <- data.matrix(predictionData)
427 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
428 genoTrCode <- grep("2", genoDataFiltered[1, ], fixed=TRUE, value=TRUE)
429 if(length(genoTrCode) != 0) {
430 genoDataFiltered <- genoDataFiltered - 1
433 if (length(predictionData) != 0 ) {
434 genoSlCode <- grep("2", predictionData[1, ], fixed=TRUE, value=TRUE)
435 if (length(genoSlCode) != 0 ) {
436 predictionData <- predictionData - 1
440 ordered.markerEffects <- c()
441 if ( length(predictionData) == 0 ) {
442 markerEffects <- mixed.solve(y = phenoTrait,
443 Z = genoDataFiltered
446 ordered.markerEffects <- data.matrix(markerEffects$u)
447 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
448 ordered.markerEffects <- round(ordered.markerEffects,
449 digits=5
452 colnames(ordered.markerEffects) <- c("Marker Effects")
455 #correlation between breeding values based on
456 #marker effects and relationship matrix
457 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
460 #additive relationship model
461 #calculate the inner products for
462 #genotypes (realized relationship matrix)
463 genocrsprd <- tcrossprod(genoDataFiltered)
465 #construct an identity matrix for genotypes
466 identityMatrix <- diag(nrow(phenoTrait))
468 iGEBV <- mixed.solve(y = phenoTrait,
469 Z = identityMatrix,
470 K = genocrsprd
473 iGEBVu <- iGEBV$u
475 heritability <- c()
476 if ( is.null(predictionFile) == TRUE ) {
477 heritability <- round((iGEBV$Vu /(iGEBV$Vu + iGEBV$Ve) * 100), digits=3)
478 cat("\n", file=varianceComponentsFile, append=TRUE)
479 cat('Error variance', iGEBV$Ve, file=varianceComponentsFile, sep="\t", append=TRUE)
480 cat("\n", file=varianceComponentsFile, append=TRUE)
481 cat('Additive genetic variance', iGEBV$Vu, file=varianceComponentsFile, sep='\t', append=TRUE)
482 cat("\n", file=varianceComponentsFile, append=TRUE)
483 cat('&#956;', iGEBV$beta,file=varianceComponentsFile, sep='\t', append=TRUE)
484 cat("\n", file=varianceComponentsFile, append=TRUE)
485 cat('Heritability (h, %)', heritability, file=varianceComponentsFile, sep='\t', append=TRUE)
488 iGEBV <- data.matrix(iGEBVu)
490 ordered.iGEBV <- as.data.frame(iGEBV[order(-iGEBV[, 1]), ] )
492 ordered.iGEBV <- round(ordered.iGEBV,
493 digits = 3
496 combinedGebvsFile <- grep('selected_traits_gebv',
497 outFiles,
498 ignore.case = TRUE,
499 fixed = FALSE,
500 value = TRUE
503 allGebvs<-c()
504 if (length(combinedGebvsFile) != 0)
506 fileSize <- file.info(combinedGebvsFile)$size
507 if (fileSize != 0 )
509 combinedGebvs<-read.table(combinedGebvsFile,
510 header = TRUE,
511 row.names = 1,
512 dec = ".",
513 sep = "\t"
516 colnames(ordered.iGEBV) <- c(trait)
518 traitGEBV <- as.data.frame(ordered.iGEBV)
519 allGebvs <- merge(combinedGebvs, traitGEBV,
520 by = 0,
521 all = TRUE
524 rownames(allGebvs) <- allGebvs[,1]
525 allGebvs[,1] <- NULL
529 colnames(ordered.iGEBV) <- c(trait)
531 #cross-validation
532 validationAll <- c()
534 if(is.null(predictionFile) == TRUE) {
535 genoNum <- nrow(phenoTrait)
536 if(genoNum < 20 ) {
537 warning(genoNum, " is too small number of genotypes.")
539 reps <- round_any(genoNum, 10, f = ceiling) %/% 10
541 genotypeGroups <-c()
543 if (genoNum %% 10 == 0) {
544 genotypeGroups <- rep(1:10, reps)
545 } else {
546 genotypeGroups <- rep(1:10, reps) [- (genoNum %% 10) ]
549 set.seed(4567)
550 genotypeGroups <- genotypeGroups[ order (runif(genoNum)) ]
552 for (i in 1:10) {
553 tr <- paste("trPop", i, sep = ".")
554 sl <- paste("slPop", i, sep = ".")
556 trG <- which(genotypeGroups != i)
557 slG <- which(genotypeGroups == i)
559 assign(tr, trG)
560 assign(sl, slG)
562 kblup <- paste("rKblup", i, sep = ".")
564 result <- kinship.BLUP(y = phenoTrait[trG, ],
565 G.train = genoDataFiltered[trG, ],
566 G.pred = genoDataFiltered[slG, ],
567 mixed.method = "REML",
568 K.method = "RR",
571 assign(kblup, result)
573 #calculate cross-validation accuracy
574 valCorData <- merge(phenoTrait[slG, ], result$g.pred, by=0, all=FALSE)
575 rownames(valCorData) <- valCorData[, 1]
576 valCorData[, 1] <- NULL
578 accuracy <- try(cor(valCorData))
579 validation <- paste("validation", i, sep = ".")
581 cvTest <- paste("Validation test", i, sep = " ")
583 if ( class(accuracy) != "try-error")
585 accuracy <- round(accuracy[1,2], digits = 3)
586 accuracy <- data.matrix(accuracy)
588 colnames(accuracy) <- c("correlation")
589 rownames(accuracy) <- cvTest
591 assign(validation, accuracy)
593 if (!is.na(accuracy[1,1])) {
594 validationAll <- rbind(validationAll, accuracy)
599 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
601 if (!is.null(validationAll)) {
602 validationMean <- data.matrix(round(colMeans(validationAll),
603 digits = 2
607 rownames(validationMean) <- c("Average")
609 validationAll <- rbind(validationAll, validationMean)
610 colnames(validationAll) <- c("Correlation")
613 #predict GEBVs for selection population
614 if (length(predictionData) !=0 ) {
615 predictionData <- data.matrix(round(predictionData, digits = 0 ))
618 predictionPopResult <- c()
619 predictionPopGEBVs <- c()
621 if (length(predictionData) != 0) {
622 message("running prediction for selection candidates...marker data", ncol(predictionData), " vs. ", ncol(genoDataFiltered))
624 predictionPopResult <- kinship.BLUP(y = phenoTrait,
625 G.train = genoDataFiltered,
626 G.pred = predictionData,
627 mixed.method = "REML",
628 K.method = "RR"
630 message("running prediction for selection candidates...DONE!!")
632 predictionPopGEBVs <- round(data.matrix(predictionPopResult$g.pred), digits = 2)
633 predictionPopGEBVs <- data.matrix(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
635 colnames(predictionPopGEBVs) <- c(trait)
639 if (!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0) {
640 write.table(predictionPopGEBVs,
641 file = predictionPopGEBVsFile,
642 sep = "\t",
643 col.names = NA,
644 quote = FALSE,
645 append = FALSE
649 if(is.null(validationAll) == FALSE) {
650 write.table(validationAll,
651 file = validationFile,
652 sep = "\t",
653 col.names = NA,
654 quote = FALSE,
655 append = FALSE
659 if (is.null(ordered.markerEffects) == FALSE) {
660 write.table(ordered.markerEffects,
661 file = markerFile,
662 sep = "\t",
663 col.names = NA,
664 quote = FALSE,
665 append = FALSE
669 if (is.null(ordered.iGEBV) == FALSE) {
670 write.table(ordered.iGEBV,
671 file = blupFile,
672 sep = "\t",
673 col.names = NA,
674 quote = FALSE,
675 append = FALSE
679 if (length(combinedGebvsFile) != 0 ) {
680 if(file.info(combinedGebvsFile)$size == 0) {
681 write.table(ordered.iGEBV,
682 file = combinedGebvsFile,
683 sep = "\t",
684 col.names = NA,
685 quote = FALSE,
687 } else {
688 write.table(allGebvs,
689 file = combinedGebvsFile,
690 sep = "\t",
691 quote = FALSE,
692 col.names = NA,
697 if (!is.null(traitPhenoData) & length(traitPhenoFile) != 0) {
698 write.table(traitPhenoData,
699 file = traitPhenoFile,
700 sep = "\t",
701 col.names = NA,
702 quote = FALSE,
706 if (!is.null(formattedPhenoData) & length(formattedPhenoDataFile) != 0) {
707 write.table(formattedPhenoData,
708 file = formattedPhenoDataFile,
709 sep = "\t",
710 col.names = NA,
711 quote = FALSE,
715 if (!is.null(genoDataMissing)) {
716 write.table(genoData,
717 file = genoFile,
718 sep = "\t",
719 col.names = NA,
720 quote = FALSE,
725 if (!is.null(predictionDataMissing)) {
726 write.table(predictionData,
727 file = predictionFile,
728 sep = "\t",
729 col.names = NA,
730 quote = FALSE,
735 #should also send notification to analysis owner
736 to <- c("<iyt2@cornell.edu>")
737 subject <- paste(trait, ' GS analysis done', sep = ':')
738 body <- c("Dear User,\n\n")
739 body <- paste(body, 'The genomic selection analysis for', sep = "")
740 body <- paste(body, trait, sep = " ")
741 body <- paste(body, "is done.\n\nRegards and Thanks.\nSGN", sep = " ")
743 #should use SGN's smtp server eventually
744 sendmail(to, subject, body, password = "rmail")
746 q(save = "no", runLast = FALSE)