Merge branch 'master' into topic/dt_feature_search
[sgn.git] / R / gs.r
blob01c675f20acbea22b42553a29f589b3e4c094ec8
1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
4 options(echo = FALSE)
6 library(rrBLUP)
7 library(plyr)
8 library(mail)
9 library(stringr)
10 library(lme4)
11 library(randomForest)
13 allArgs <- commandArgs()
15 inFile <- grep("input_files",
16 allArgs,
17 ignore.case = TRUE,
18 perl = TRUE,
19 value = TRUE
22 outFile <- grep("output_files",
23 allArgs,
24 ignore.case = TRUE,
25 perl = TRUE,
26 value = TRUE
29 outFiles <- scan(outFile,
30 what = "character"
33 inFiles <- scan(inFile,
34 what = "character"
38 traitsFile <- grep("traits",
39 inFiles,
40 ignore.case = TRUE,
41 fixed = FALSE,
42 value = TRUE
45 traitFile <- grep("trait_info",
46 inFiles,
47 ignore.case = TRUE,
48 fixed = FALSE,
49 value = TRUE
53 traitInfo <- scan(traitFile,
54 what = "character",
57 traitInfo <- strsplit(traitInfo, "\t");
58 traitId <- traitInfo[[1]]
59 trait <- traitInfo[[2]]
61 datasetInfoFile <- grep("dataset_info",
62 inFiles,
63 ignore.case = TRUE,
64 fixed = FALSE,
65 value = TRUE
67 datasetInfo <- c()
69 if (length(datasetInfoFile) != 0 ) {
70 datasetInfo <- scan(datasetInfoFile,
71 what= "character"
74 datasetInfo <- paste(datasetInfo, collapse = " ")
76 } else {
77 datasetInfo <- c('single population')
81 validationTrait <- paste("validation", trait, sep = "_")
83 validationFile <- grep(validationTrait,
84 outFiles,
85 ignore.case=TRUE,
86 fixed = FALSE,
87 value=TRUE
90 kinshipTrait <- paste("kinship", trait, sep = "_")
92 blupFile <- grep(kinshipTrait,
93 outFiles,
94 ignore.case = TRUE,
95 fixed = FALSE,
96 value = TRUE
99 markerTrait <- paste("marker", trait, sep = "_")
100 markerFile <- grep(markerTrait,
101 outFiles,
102 ignore.case = TRUE,
103 fixed = FALSE,
104 value = TRUE
107 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
108 traitPhenoFile <- grep(traitPhenoFile,
109 outFiles,
110 ignore.case = TRUE,
111 fixed = FALSE,
112 value = TRUE
115 varianceComponentsFile <- grep("variance_components",
116 outFiles,
117 ignore.case = TRUE,
118 fixed = FALSE,
119 value = TRUE
122 formattedPhenoFile <- grep("formatted_phenotype_data",
123 inFiles,
124 ignore.case = TRUE,
125 fixed = FALSE,
126 value = TRUE
129 formattedPhenoData <- c()
130 phenoData <- c()
132 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
133 formattedPhenoData <- read.table(formattedPhenoFile,
134 header = TRUE,
135 row.names = 1,
136 sep = "\t",
137 na.strings = c("NA", " ", "--", "-", "."),
138 dec = ".")
140 } else {
141 phenoFile <- grep("\\/phenotype_data",
142 inFiles,
143 ignore.case = TRUE,
144 fixed = FALSE,
145 value = TRUE,
146 perl = TRUE,
149 phenoData <- read.table(phenoFile,
150 header = TRUE,
151 row.names = NULL,
152 sep = "\t",
153 na.strings = c("NA", " ", "--", "-", "."),
154 dec = "."
158 phenoTrait <- c()
160 if (datasetInfo == 'combined populations') {
162 if (!is.null(formattedPhenoData)) {
163 phenoTrait <- subset(formattedPhenoData, select=trait)
164 phenoTrait <- na.omit(phenoTrait)
166 } else {
167 dropColumns <- grep(trait,
168 names(phenoData),
169 ignore.case = TRUE,
170 value = TRUE,
171 fixed = FALSE
174 phenoTrait <- phenoData[,!(names(phenoData) %in% dropColumns)]
176 phenoTrait <- as.data.frame(phenoTrait)
177 row.names(phenoTrait) <- phenoTrait[, 1]
178 phenoTrait[, 1] <- NULL
179 colnames(phenoTrait) <- trait
182 } else {
184 if (!is.null(formattedPhenoData)) {
185 phenoTrait <- subset(formattedPhenoData, select=trait)
186 phenoTrait <- na.omit(phenoTrait)
188 } else {
189 dropColumns <- c("uniquename", "stock_name")
190 phenoData <- phenoData[,!(names(phenoData) %in% dropColumns)]
192 phenoTrait <- subset(phenoData,
193 select = c("object_name", "object_id", "design", "block", "replicate", trait)
196 experimentalDesign <- phenoTrait[2, 'design']
198 if (class(phenoTrait[, trait]) != 'numeric') {
199 phenoTrait[, trait] <- as.numeric(as.character(phenoTrait[, trait]))
202 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
204 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoTrait$block) > 1) {
206 message("GS experimental design: ", experimentalDesign)
208 augData <- subset(phenoTrait,
209 select = c("object_name", "object_id", "block", trait)
212 colnames(augData)[1] <- "genotypes"
213 colnames(augData)[4] <- "trait"
215 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
216 augData,
217 na.action = na.omit
220 if (class(model) != "try-error") {
221 phenoTrait <- data.frame(fixef(model))
223 colnames(phenoTrait) <- trait
225 nn <- gsub('genotypes', '', rownames(phenoTrait))
226 rownames(phenoTrait) <- nn
228 phenoTrait <- round(phenoTrait, digits = 2)
231 } else if (experimentalDesign == 'Alpha') {
233 message("Experimental desgin: ", experimentalDesign)
235 alphaData <- subset(phenoData,
236 select = c("object_name", "object_id","block", "replicate", trait)
239 colnames(alphaData)[1] <- "genotypes"
240 colnames(alphaData)[5] <- "trait"
242 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
243 alphaData,
244 na.action = na.omit
247 if (class(model) != "try-error") {
248 phenoTrait <- data.frame(fixef(model))
250 colnames(phenoTrait) <- trait
252 nn <- gsub('genotypes', '', rownames(phenoTrait))
253 rownames(phenoTrait) <- nn
255 phenoTrait <- round(phenoTrait, digits = 2)
259 } else {
261 phenoTrait <- subset(phenoData,
262 select = c("object_name", "object_id", trait)
265 if (sum(is.na(phenoTrait)) > 0) {
266 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
267 phenoTrait <- na.omit(phenoTrait)
270 #calculate mean of reps/plots of the same accession and
271 #create new df with the accession means
273 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
274 phenoTrait <- data.frame(phenoTrait)
275 message('phenotyped lines before averaging: ', length(row.names(phenoTrait)))
277 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
278 message('phenotyped lines after averaging: ', length(row.names(phenoTrait)))
280 phenoTrait <- subset(phenoTrait, select=c("object_name", trait))
281 row.names(phenoTrait) <- phenoTrait[, 1]
282 phenoTrait[, 1] <- NULL
284 #format all-traits population phenotype dataset
285 ## formattedPhenoData <- phenoData
286 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
288 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
289 ## formattedPhenoData <- ddply(formattedPhenoData,
290 ## "object_name",
291 ## colwise(mean)
292 ## )
294 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
295 ## formattedPhenoData[, 1] <- NULL
297 ## formattedPhenoData <- round(formattedPhenoData,
298 ## digits=3
299 ## )
304 genoFile <- grep("genotype_data",
305 inFiles,
306 ignore.case = TRUE,
307 fixed = FALSE,
308 value = TRUE
311 genoData <- read.table(genoFile,
312 header = TRUE,
313 row.names = 1,
314 sep = "\t",
315 na.strings = c("NA", " ", "--", "-"),
316 dec = "."
319 genoData <- genoData[order(row.names(genoData)), ]
321 #impute genotype values for obs with missing values,
322 #based on mean of neighbouring 10 (arbitrary) obs
323 genoDataMissing <-c()
325 if (sum(is.na(genoData)) > 0) {
326 genoDataMissing<- c('yes')
328 message("sum of geno missing values, ", sum(is.na(genoData)) )
329 genoData <- na.roughfix(genoData)
330 genoData <- data.matrix(genoData)
333 predictionTempFile <- grep("prediction_population",
334 inFiles,
335 ignore.case = TRUE,
336 fixed = FALSE,
337 value = TRUE
340 predictionFile <- c()
342 message('prediction temp genotype file: ', predictionTempFile)
344 if (length(predictionTempFile) !=0 ) {
345 predictionFile <- scan(predictionTempFile,
346 what="character"
350 message('prediction genotype file: ', predictionFile)
352 predictionPopGEBVsFile <- grep("prediction_pop_gebvs",
353 outFiles,
354 ignore.case = TRUE,
355 fixed = FALSE,
356 value = TRUE
359 message("prediction gebv file: ", predictionPopGEBVsFile)
361 predictionData <- c()
363 if (length(predictionFile) !=0 ) {
364 predictionData <- read.table(predictionFile,
365 header = TRUE,
366 row.names = 1,
367 sep = "\t",
368 na.strings = c("NA", " ", "--", "-"),
369 dec = "."
373 #create phenotype and genotype datasets with
374 #common stocks only
375 message('phenotyped lines: ', length(row.names(phenoTrait)))
376 message('genotyped lines: ', length(row.names(genoData)))
378 #extract observation lines with both
379 #phenotype and genotype data only.
380 commonObs <- intersect(row.names(phenoTrait), row.names(genoData))
381 commonObs <- data.frame(commonObs)
382 rownames(commonObs)<-commonObs[, 1]
384 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
385 #include in the genotype dataset only observation lines
386 #with phenotype data
387 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
388 genoDataFiltered <- genoData[(rownames(genoData) %in% rownames(commonObs)), ]
389 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoDataFiltered)))
390 #drop observation lines without genotype data
391 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
393 phenoTrait <- merge(data.frame(phenoTrait), commonObs, by=0, all=FALSE)
394 rownames(phenoTrait) <-phenoTrait[, 1]
395 phenoTrait <- subset(phenoTrait, select=trait)
397 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
398 #a set of only observation lines with genotype data
400 traitPhenoData <- data.frame(round(phenoTrait, digits=2))
401 phenoTrait <- data.matrix(phenoTrait)
402 genoDataFiltered <- data.matrix(genoDataFiltered)
404 #impute missing data in prediction data
405 predictionDataMissing <- c()
406 if (length(predictionData) != 0) {
407 #purge markers unique to both populations
408 commonMarkers <- intersect(names(data.frame(genoDataFiltered)), names(predictionData))
409 predictionData <- subset(predictionData, select = commonMarkers)
410 genoDataFiltered <- subset(genoDataFiltered, select= commonMarkers)
412 predictionData <- data.matrix(predictionData)
414 if (sum(is.na(predictionData)) > 0) {
415 predictionDataMissing <- c('yes')
416 message("sum of geno missing values, ", sum(is.na(predictionData)) )
417 predictionData <- data.matrix(na.roughfix(predictionData))
422 relationshipMatrixFile <- grep("relationship_matrix",
423 outFiles,
424 ignore.case = TRUE,
425 fixed = FALSE,
426 value = TRUE
429 message("relationship matrix file: ", relationshipMatrixFile)
430 #message("relationship matrix file size: ", file.info(relationshipMatrixFile)$size)
431 relationshipMatrix <- c()
432 if (length(relationshipMatrixFile) != 0) {
433 if (file.info(relationshipMatrixFile)$size > 0 ) {
434 relationshipDf <- read.table(relationshipMatrixFile,
435 header = TRUE,
436 row.names = 1,
437 sep = "\t",
438 check.names=FALSE,
439 dec = "."
442 relationshipMatrix <- data.matrix(relationshipDf)
445 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
446 genoTrCode <- grep("2", genoDataFiltered[1, ], fixed=TRUE, value=TRUE)
447 if(length(genoTrCode) != 0) {
448 genoDataFiltered <- genoDataFiltered - 1
451 if (length(predictionData) != 0 ) {
452 genoSlCode <- grep("2", predictionData[1, ], fixed=TRUE, value=TRUE)
453 if (length(genoSlCode) != 0 ) {
454 predictionData <- predictionData - 1
458 ordered.markerEffects <- c()
459 if ( length(predictionData) == 0 ) {
460 markerEffects <- mixed.solve(y = phenoTrait,
461 Z = genoDataFiltered
464 ordered.markerEffects <- data.matrix(markerEffects$u)
465 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
466 ordered.markerEffects <- round(ordered.markerEffects,
467 digits=5
470 colnames(ordered.markerEffects) <- c("Marker Effects")
473 #correlation between breeding values based on
474 #marker effects and relationship matrix
475 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
478 #additive relationship model
479 #calculate the inner products for
480 #genotypes (realized relationship matrix)
481 if (length(relationshipMatrixFile) != 0) {
482 if (file.info(relationshipMatrixFile)$size == 0) {
483 relationshipMatrix <- tcrossprod(data.matrix(genoData))
486 relationshipMatrixFiltered <- relationshipMatrix[(rownames(relationshipMatrix) %in% rownames(commonObs)),]
487 relationshipMatrixFiltered <- relationshipMatrixFiltered[, (colnames(relationshipMatrixFiltered) %in% rownames(commonObs))]
489 #construct an identity matrix for genotypes
490 identityMatrix <- diag(nrow(phenoTrait))
492 relationshipMatrixFiltered <- data.matrix(relationshipMatrixFiltered)
494 iGEBV <- mixed.solve(y = phenoTrait,
495 Z = identityMatrix,
496 K = relationshipMatrixFiltered
499 iGEBVu <- iGEBV$u
501 heritability <- c()
503 if ( is.null(predictionFile) == TRUE ) {
504 heritability <- round((iGEBV$Vu /(iGEBV$Vu + iGEBV$Ve) * 100), digits=3)
505 cat("\n", file=varianceComponentsFile, append=TRUE)
506 cat('Error variance', iGEBV$Ve, file=varianceComponentsFile, sep="\t", append=TRUE)
507 cat("\n", file=varianceComponentsFile, append=TRUE)
508 cat('Additive genetic variance', iGEBV$Vu, file=varianceComponentsFile, sep='\t', append=TRUE)
509 cat("\n", file=varianceComponentsFile, append=TRUE)
510 cat('&#956;', iGEBV$beta,file=varianceComponentsFile, sep='\t', append=TRUE)
511 cat("\n", file=varianceComponentsFile, append=TRUE)
512 cat('Heritability (h, %)', heritability, file=varianceComponentsFile, sep='\t', append=TRUE)
515 iGEBV <- data.matrix(iGEBVu)
517 ordered.iGEBV <- as.data.frame(iGEBV[order(-iGEBV[, 1]), ] )
519 ordered.iGEBV <- round(ordered.iGEBV,
520 digits = 3
523 combinedGebvsFile <- grep('selected_traits_gebv',
524 outFiles,
525 ignore.case = TRUE,
526 fixed = FALSE,
527 value = TRUE
530 allGebvs<-c()
531 if (length(combinedGebvsFile) != 0)
533 fileSize <- file.info(combinedGebvsFile)$size
534 if (fileSize != 0 )
536 combinedGebvs<-read.table(combinedGebvsFile,
537 header = TRUE,
538 row.names = 1,
539 dec = ".",
540 sep = "\t"
543 colnames(ordered.iGEBV) <- c(trait)
545 traitGEBV <- as.data.frame(ordered.iGEBV)
546 allGebvs <- merge(combinedGebvs, traitGEBV,
547 by = 0,
548 all = TRUE
551 rownames(allGebvs) <- allGebvs[,1]
552 allGebvs[,1] <- NULL
556 colnames(ordered.iGEBV) <- c(trait)
558 #cross-validation
559 validationAll <- c()
561 if(is.null(predictionFile)) {
562 genoNum <- nrow(phenoTrait)
563 if(genoNum < 20 ) {
564 warning(genoNum, " is too small number of genotypes.")
566 reps <- round_any(genoNum, 10, f = ceiling) %/% 10
568 genotypeGroups <-c()
570 if (genoNum %% 10 == 0) {
571 genotypeGroups <- rep(1:10, reps)
572 } else {
573 genotypeGroups <- rep(1:10, reps) [- (genoNum %% 10) ]
576 set.seed(4567)
577 genotypeGroups <- genotypeGroups[ order (runif(genoNum)) ]
579 for (i in 1:10) {
580 tr <- paste("trPop", i, sep = ".")
581 sl <- paste("slPop", i, sep = ".")
583 trG <- which(genotypeGroups != i)
584 slG <- which(genotypeGroups == i)
586 assign(tr, trG)
587 assign(sl, slG)
589 kblup <- paste("rKblup", i, sep = ".")
591 result <- kinship.BLUP(y = phenoTrait[trG, ],
592 G.train = genoDataFiltered[trG, ],
593 G.pred = genoDataFiltered[slG, ],
594 mixed.method = "REML",
595 K.method = "RR",
598 assign(kblup, result)
600 #calculate cross-validation accuracy
601 valCorData <- merge(phenoTrait[slG, ], result$g.pred, by=0, all=FALSE)
602 rownames(valCorData) <- valCorData[, 1]
603 valCorData[, 1] <- NULL
605 accuracy <- try(cor(valCorData))
606 validation <- paste("validation", i, sep = ".")
608 cvTest <- paste("Validation test", i, sep = " ")
610 if ( class(accuracy) != "try-error")
612 accuracy <- round(accuracy[1,2], digits = 3)
613 accuracy <- data.matrix(accuracy)
615 colnames(accuracy) <- c("correlation")
616 rownames(accuracy) <- cvTest
618 assign(validation, accuracy)
620 if (!is.na(accuracy[1,1])) {
621 validationAll <- rbind(validationAll, accuracy)
626 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
628 if (!is.null(validationAll)) {
629 validationMean <- data.matrix(round(colMeans(validationAll),
630 digits = 2
634 rownames(validationMean) <- c("Average")
636 validationAll <- rbind(validationAll, validationMean)
637 colnames(validationAll) <- c("Correlation")
640 #predict GEBVs for selection population
641 if (length(predictionData) !=0 ) {
642 predictionData <- data.matrix(round(predictionData, digits = 0 ))
645 predictionPopResult <- c()
646 predictionPopGEBVs <- c()
648 if (length(predictionData) != 0) {
649 message("running prediction for selection candidates...marker data", ncol(predictionData), " vs. ", ncol(genoDataFiltered))
651 predictionPopResult <- kinship.BLUP(y = phenoTrait,
652 G.train = genoDataFiltered,
653 G.pred = predictionData,
654 mixed.method = "REML",
655 K.method = "RR"
657 message("running prediction for selection candidates...DONE!!")
659 predictionPopGEBVs <- round(data.matrix(predictionPopResult$g.pred), digits = 3)
660 predictionPopGEBVs <- data.matrix(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
662 colnames(predictionPopGEBVs) <- c(trait)
666 if (!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0) {
667 write.table(predictionPopGEBVs,
668 file = predictionPopGEBVsFile,
669 sep = "\t",
670 col.names = NA,
671 quote = FALSE,
672 append = FALSE
676 if(!is.null(validationAll)) {
677 write.table(validationAll,
678 file = validationFile,
679 sep = "\t",
680 col.names = NA,
681 quote = FALSE,
682 append = FALSE
686 if (!is.null(ordered.markerEffects)) {
687 write.table(ordered.markerEffects,
688 file = markerFile,
689 sep = "\t",
690 col.names = NA,
691 quote = FALSE,
692 append = FALSE
696 if (!is.null(ordered.iGEBV)) {
697 write.table(ordered.iGEBV,
698 file = blupFile,
699 sep = "\t",
700 col.names = NA,
701 quote = FALSE,
702 append = FALSE
706 if (length(combinedGebvsFile) != 0 ) {
707 if(file.info(combinedGebvsFile)$size == 0) {
708 write.table(ordered.iGEBV,
709 file = combinedGebvsFile,
710 sep = "\t",
711 col.names = NA,
712 quote = FALSE,
714 } else {
715 write.table(allGebvs,
716 file = combinedGebvsFile,
717 sep = "\t",
718 quote = FALSE,
719 col.names = NA,
724 if (!is.null(traitPhenoData) & length(traitPhenoFile) != 0) {
725 write.table(traitPhenoData,
726 file = traitPhenoFile,
727 sep = "\t",
728 col.names = NA,
729 quote = FALSE,
735 if (!is.null(genoDataMissing)) {
736 write.table(genoData,
737 file = genoFile,
738 sep = "\t",
739 col.names = NA,
740 quote = FALSE,
745 if (!is.null(predictionDataMissing)) {
746 write.table(predictionData,
747 file = predictionFile,
748 sep = "\t",
749 col.names = NA,
750 quote = FALSE,
755 if (file.info(relationshipMatrixFile)$size == 0) {
756 write.table(relationshipMatrix,
757 file = relationshipMatrixFile,
758 sep = "\t",
759 col.names = NA,
760 quote = FALSE,
765 if (file.info(formattedPhenoFile)$size == 0 & !is.null(formattedPhenoData) ) {
766 write.table(formattedPhenoData,
767 file = formattedPhenoFile,
768 sep = "\t",
769 col.names = NA,
770 quote = FALSE,
775 #should also send notification to analysis owner
776 to <- c("<iyt2@cornell.edu>")
777 subject <- paste(trait, ' GS analysis done', sep = ':')
778 body <- c("Dear User,\n\n")
779 body <- paste(body, 'The genomic selection analysis for', sep = "")
780 body <- paste(body, trait, sep = " ")
781 body <- paste(body, "is done.\n\nRegards and Thanks.\nSGN", sep = " ")
783 #should use SGN's smtp server eventually
784 sendmail(to, subject, body, password = "rmail")
786 q(save = "no", runLast = FALSE)