Merge pull request #362 from solgenomics/topic/trial_summary_stats
[sgn.git] / R / gs.r
blobd8b8a973d1318e4d130b3739431f49a6a047c8da
1 #SNOPSIS
2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP
4 #AUTHOR
5 # Isaak Y Tecle (iyt2@cornell.edu)
7 options(echo = FALSE)
9 library(rrBLUP)
10 library(plyr)
11 library(stringr)
12 library(lme4)
13 library(randomForest)
14 library(data.table)
17 allArgs <- commandArgs()
19 inFile <- grep("input_files",
20 allArgs,
21 ignore.case = TRUE,
22 perl = TRUE,
23 value = TRUE
26 outFile <- grep("output_files",
27 allArgs,
28 ignore.case = TRUE,
29 perl = TRUE,
30 value = TRUE
33 outFiles <- scan(outFile,
34 what = "character"
37 inFiles <- scan(inFile,
38 what = "character"
42 traitsFile <- grep("traits",
43 inFiles,
44 ignore.case = TRUE,
45 fixed = FALSE,
46 value = TRUE
49 traitFile <- grep("trait_info",
50 inFiles,
51 ignore.case = TRUE,
52 fixed = FALSE,
53 value = TRUE
57 traitInfo <- scan(traitFile,
58 what = "character",
61 traitInfo <- strsplit(traitInfo, "\t");
62 traitId <- traitInfo[[1]]
63 trait <- traitInfo[[2]]
65 datasetInfoFile <- grep("dataset_info",
66 inFiles,
67 ignore.case = TRUE,
68 fixed = FALSE,
69 value = TRUE
71 datasetInfo <- c()
73 if (length(datasetInfoFile) != 0 ) {
74 datasetInfo <- scan(datasetInfoFile,
75 what= "character"
78 datasetInfo <- paste(datasetInfo, collapse = " ")
80 } else {
81 datasetInfo <- c('single population')
85 validationTrait <- paste("validation", trait, sep = "_")
87 validationFile <- grep(validationTrait,
88 outFiles,
89 ignore.case=TRUE,
90 fixed = FALSE,
91 value=TRUE
94 kinshipTrait <- paste("kinship", trait, sep = "_")
96 blupFile <- grep(kinshipTrait,
97 outFiles,
98 ignore.case = TRUE,
99 fixed = FALSE,
100 value = TRUE
103 markerTrait <- paste("marker", trait, sep = "_")
104 markerFile <- grep(markerTrait,
105 outFiles,
106 ignore.case = TRUE,
107 fixed = FALSE,
108 value = TRUE
111 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
112 traitPhenoFile <- grep(traitPhenoFile,
113 outFiles,
114 ignore.case = TRUE,
115 fixed = FALSE,
116 value = TRUE
119 varianceComponentsFile <- grep("variance_components",
120 outFiles,
121 ignore.case = TRUE,
122 fixed = FALSE,
123 value = TRUE
126 formattedPhenoFile <- grep("formatted_phenotype_data",
127 inFiles,
128 ignore.case = TRUE,
129 fixed = FALSE,
130 value = TRUE
133 formattedPhenoData <- c()
134 phenoData <- c()
136 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
137 formattedPhenoData <- as.data.frame(fread(formattedPhenoFile,
138 na.strings = c("NA", " ", "--", "-", ".")
142 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
143 formattedPhenoData[, 1] <- NULL
145 } else {
146 phenoFile <- grep("\\/phenotype_data",
147 inFiles,
148 ignore.case = TRUE,
149 fixed = FALSE,
150 value = TRUE,
151 perl = TRUE,
154 phenoData <- fread(phenoFile,
155 header=TRUE,
156 na.strings = c("NA", " ", "--", "-", "."),
161 phenoData <- as.data.frame(phenoData)
162 phenoTrait <- c()
164 if (datasetInfo == 'combined populations') {
166 if (!is.null(formattedPhenoData)) {
167 phenoTrait <- subset(formattedPhenoData, select=trait)
168 phenoTrait <- na.omit(phenoTrait)
170 } else {
171 dropColumns <- grep(trait,
172 names(phenoData),
173 ignore.case = TRUE,
174 value = TRUE,
175 fixed = FALSE
178 phenoTrait <- phenoData[,!(names(phenoData) %in% dropColumns)]
180 phenoTrait <- as.data.frame(phenoTrait)
181 row.names(phenoTrait) <- phenoTrait[, 1]
182 phenoTrait[, 1] <- NULL
183 colnames(phenoTrait) <- trait
186 } else {
188 if (!is.null(formattedPhenoData)) {
189 phenoTrait <- subset(formattedPhenoData, select=trait)
190 phenoTrait <- na.omit(phenoTrait)
192 } else {
193 dropColumns <- c("uniquename", "stock_name")
194 phenoData <- phenoData[,!(names(phenoData) %in% dropColumns)]
196 phenoTrait <- subset(phenoData,
197 select = c("object_name", "object_id", "design", "block", "replicate", trait)
200 experimentalDesign <- phenoTrait[2, 'design']
202 if (class(phenoTrait[, trait]) != 'numeric') {
203 phenoTrait[, trait] <- as.numeric(as.character(phenoTrait[, trait]))
206 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
208 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoTrait$block) > 1) {
210 message("GS experimental design: ", experimentalDesign)
212 augData <- subset(phenoTrait,
213 select = c("object_name", "object_id", "block", trait)
216 colnames(augData)[1] <- "genotypes"
217 colnames(augData)[4] <- "trait"
219 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
220 augData,
221 na.action = na.omit
224 if (class(model) != "try-error") {
225 phenoTrait <- data.frame(fixef(model))
227 colnames(phenoTrait) <- trait
229 nn <- gsub('genotypes', '', rownames(phenoTrait))
230 rownames(phenoTrait) <- nn
232 phenoTrait <- round(phenoTrait, digits = 2)
235 } else if (experimentalDesign == 'Alpha') {
237 message("Experimental desgin: ", experimentalDesign)
239 alphaData <- subset(phenoData,
240 select = c("object_name", "object_id","block", "replicate", trait)
243 colnames(alphaData)[1] <- "genotypes"
244 colnames(alphaData)[5] <- "trait"
246 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
247 alphaData,
248 na.action = na.omit
251 if (class(model) != "try-error") {
252 phenoTrait <- data.frame(fixef(model))
254 colnames(phenoTrait) <- trait
256 nn <- gsub('genotypes', '', rownames(phenoTrait))
257 rownames(phenoTrait) <- nn
259 phenoTrait <- round(phenoTrait, digits = 2)
263 } else {
265 phenoTrait <- subset(phenoData,
266 select = c("object_name", "object_id", trait)
269 if (sum(is.na(phenoTrait)) > 0) {
270 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
271 phenoTrait <- na.omit(phenoTrait)
274 #calculate mean of reps/plots of the same accession and
275 #create new df with the accession means
277 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
278 phenoTrait <- data.frame(phenoTrait)
279 message('phenotyped lines before averaging: ', length(row.names(phenoTrait)))
281 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
282 message('phenotyped lines after averaging: ', length(row.names(phenoTrait)))
284 phenoTrait <- subset(phenoTrait, select=c("object_name", trait))
285 row.names(phenoTrait) <- phenoTrait[, 1]
286 phenoTrait[, 1] <- NULL
288 #format all-traits population phenotype dataset
289 ## formattedPhenoData <- phenoData
290 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
292 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
293 ## formattedPhenoData <- ddply(formattedPhenoData,
294 ## "object_name",
295 ## colwise(mean)
296 ## )
298 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
299 ## formattedPhenoData[, 1] <- NULL
301 ## formattedPhenoData <- round(formattedPhenoData,
302 ## digits=3
303 ## )
308 genoFile <- grep("genotype_data",
309 inFiles,
310 ignore.case = TRUE,
311 fixed = FALSE,
312 value = TRUE
315 genoData <- fread(genoFile,
316 header=TRUE,
317 na.strings = c("NA", " ", "--", "-"),
320 genoData <- as.data.frame(genoData)
321 rownames(genoData) <- genoData[, 1]
322 genoData[, 1] <- NULL
323 genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
325 predictionTempFile <- grep("prediction_population",
326 inFiles,
327 ignore.case = TRUE,
328 fixed = FALSE,
329 value = TRUE
332 predictionFile <- c()
334 message('prediction temp genotype file: ', predictionTempFile)
336 if (length(predictionTempFile) !=0 ) {
337 predictionFile <- scan(predictionTempFile,
338 what="character"
342 message('prediction genotype file: ', predictionFile)
344 predictionPopGEBVsFile <- grep("prediction_pop_gebvs",
345 outFiles,
346 ignore.case = TRUE,
347 fixed = FALSE,
348 value = TRUE
350 message("prediction gebv file: ", predictionPopGEBVsFile)
352 predictionData <- c()
354 if (length(predictionFile) !=0 ) {
356 predictionData <- fread(predictionFile,
357 na.strings = c("NA", " ", "--", "-"),
360 predictionData <- as.data.frame(predictionData)
361 rownames(predictionData) <- predictionData[, 1]
362 predictionData[, 1] <- NULL
363 predictionData <- predictionData[, colSums(is.na(predictionData)) < nrow(predictionData) * 0.5]
366 #impute genotype values for obs with missing values,
367 #based on mean of neighbouring 10 (arbitrary) obs
368 genoDataMissing <- c()
370 if (sum(is.na(genoData)) > 0) {
371 genoDataMissing<- c('yes')
373 message("sum of geno missing values, ", sum(is.na(genoData)) )
374 genoData <- na.roughfix(genoData)
375 genoData <- data.matrix(genoData)
378 genoData <- genoData[order(row.names(genoData)), ]
380 #create phenotype and genotype datasets with
381 #common stocks only
382 message('phenotyped lines: ', length(row.names(phenoTrait)))
383 message('genotyped lines: ', length(row.names(genoData)))
385 #extract observation lines with both
386 #phenotype and genotype data only.
387 commonObs <- intersect(row.names(phenoTrait), row.names(genoData))
388 commonObs <- data.frame(commonObs)
389 rownames(commonObs)<-commonObs[, 1]
391 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
392 #include in the genotype dataset only observation lines
393 #with phenotype data
394 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
395 genoDataFiltered <- genoData[(rownames(genoData) %in% rownames(commonObs)), ]
396 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoDataFiltered)))
397 #drop observation lines without genotype data
398 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
399 phenoTrait <- merge(data.frame(phenoTrait), commonObs, by=0, all=FALSE)
400 rownames(phenoTrait) <-phenoTrait[, 1]
401 phenoTrait <- subset(phenoTrait, select=trait)
403 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
404 #a set of only observation lines with genotype data
406 traitPhenoData <- data.frame(round(phenoTrait, digits=2))
407 phenoTrait <- data.matrix(phenoTrait)
408 genoDataFiltered <- data.matrix(genoDataFiltered)
410 #impute missing data in prediction data
411 predictionDataMissing <- c()
412 if (length(predictionData) != 0) {
413 #purge markers unique to both populations
414 commonMarkers <- intersect(names(data.frame(genoDataFiltered)), names(predictionData))
415 predictionData <- subset(predictionData, select = commonMarkers)
416 genoDataFiltered <- subset(genoDataFiltered, select= commonMarkers)
418 # predictionData <- data.matrix(predictionData)
420 if (sum(is.na(predictionData)) > 0) {
421 predictionDataMissing <- c('yes')
422 message("sum of geno missing values, ", sum(is.na(predictionData)) )
423 predictionData <- data.matrix(na.roughfix(predictionData))
428 relationshipMatrixFile <- grep("relationship_matrix",
429 outFiles,
430 ignore.case = TRUE,
431 fixed = FALSE,
432 value = TRUE
435 message("relationship matrix file: ", relationshipMatrixFile)
436 #message("relationship matrix file size: ", file.info(relationshipMatrixFile)$size)
437 relationshipMatrix <- c()
438 if (length(relationshipMatrixFile) != 0) {
439 if (file.info(relationshipMatrixFile)$size > 0 ) {
440 relationshipDf <- as.data.frame(fread(relationshipMatrixFile))
442 rownames(relationshipDf) <- relationshipDf[, 1]
443 relationshipDf[, 1] <- NULL
444 relationshipMatrix <- data.matrix(relationshipDf)
448 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
449 genoTrCode <- grep("2", genoDataFiltered[1, ], fixed=TRUE, value=TRUE)
450 if(length(genoTrCode) != 0) {
451 genoDataFiltered <- genoDataFiltered - 1
454 if (length(predictionData) != 0 ) {
455 genoSlCode <- grep("2", predictionData[1, ], fixed=TRUE, value=TRUE)
456 if (length(genoSlCode) != 0 ) {
457 predictionData <- predictionData - 1
461 ordered.markerEffects <- c()
462 if ( length(predictionData) == 0 ) {
463 markerEffects <- mixed.solve(y = phenoTrait,
464 Z = genoDataFiltered
467 ordered.markerEffects <- data.matrix(markerEffects$u)
468 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
469 ordered.markerEffects <- round(ordered.markerEffects,
470 digits=5
473 colnames(ordered.markerEffects) <- c("Marker Effects")
476 #correlation between breeding values based on
477 #marker effects and relationship matrix
478 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
481 #additive relationship model
482 #calculate the inner products for
483 #genotypes (realized relationship matrix)
484 if (length(relationshipMatrixFile) != 0) {
485 if (file.info(relationshipMatrixFile)$size == 0) {
486 relationshipMatrix <- tcrossprod(data.matrix(genoData))
489 relationshipMatrixFiltered <- relationshipMatrix[(rownames(relationshipMatrix) %in% rownames(commonObs)),]
490 relationshipMatrixFiltered <- relationshipMatrixFiltered[, (colnames(relationshipMatrixFiltered) %in% rownames(commonObs))]
492 #construct an identity matrix for genotypes
493 identityMatrix <- diag(nrow(phenoTrait))
495 relationshipMatrixFiltered <- data.matrix(relationshipMatrixFiltered)
497 iGEBV <- mixed.solve(y = phenoTrait,
498 Z = identityMatrix,
499 K = relationshipMatrixFiltered
502 iGEBVu <- iGEBV$u
504 heritability <- c()
506 if ( is.null(predictionFile) == TRUE ) {
507 # heritability <- round((iGEBV$Vu /(iGEBV$Vu + iGEBV$Ve) * 100), digits=3)
508 cat("\n", file=varianceComponentsFile, append=TRUE)
509 cat('Error variance', iGEBV$Ve, file=varianceComponentsFile, sep="\t", append=TRUE)
510 cat("\n", file=varianceComponentsFile, append=TRUE)
511 cat('Additive genetic variance', iGEBV$Vu, file=varianceComponentsFile, sep='\t', append=TRUE)
512 cat("\n", file=varianceComponentsFile, append=TRUE)
513 cat('&#956;', iGEBV$beta,file=varianceComponentsFile, sep='\t', append=TRUE)
514 cat("\n", file=varianceComponentsFile, append=TRUE)
515 # cat('Heritability (h, %)', heritability, file=varianceComponentsFile, sep='\t', append=TRUE)
518 iGEBV <- data.matrix(iGEBVu)
520 ordered.iGEBV <- as.data.frame(iGEBV[order(-iGEBV[, 1]), ] )
522 ordered.iGEBV <- round(ordered.iGEBV,
523 digits = 3
526 combinedGebvsFile <- grep('selected_traits_gebv',
527 outFiles,
528 ignore.case = TRUE,
529 fixed = FALSE,
530 value = TRUE
533 allGebvs<-c()
534 if (length(combinedGebvsFile) != 0)
536 fileSize <- file.info(combinedGebvsFile)$size
537 if (fileSize != 0 )
539 combinedGebvs <- as.data.frame(fread(combinedGebvsFile))
541 rownames(combinedGebvs) <- combinedGebvs[,1]
542 combinedGebvs[,1] <- NULL
544 colnames(ordered.iGEBV) <- c(trait)
546 traitGEBV <- as.data.frame(ordered.iGEBV)
547 allGebvs <- merge(combinedGebvs, traitGEBV,
548 by = 0,
549 all = TRUE
552 rownames(allGebvs) <- allGebvs[,1]
553 allGebvs[,1] <- NULL
557 colnames(ordered.iGEBV) <- c(trait)
559 #cross-validation
560 validationAll <- c()
562 if(is.null(predictionFile)) {
563 genoNum <- nrow(phenoTrait)
564 if(genoNum < 20 ) {
565 warning(genoNum, " is too small number of genotypes.")
567 reps <- round_any(genoNum, 10, f = ceiling) %/% 10
569 genotypeGroups <-c()
571 if (genoNum %% 10 == 0) {
572 genotypeGroups <- rep(1:10, reps)
573 } else {
574 genotypeGroups <- rep(1:10, reps) [- (genoNum %% 10) ]
577 set.seed(4567)
578 genotypeGroups <- genotypeGroups[ order (runif(genoNum)) ]
580 for (i in 1:10) {
581 tr <- paste("trPop", i, sep = ".")
582 sl <- paste("slPop", i, sep = ".")
584 trG <- which(genotypeGroups != i)
585 slG <- which(genotypeGroups == i)
587 assign(tr, trG)
588 assign(sl, slG)
590 kblup <- paste("rKblup", i, sep = ".")
592 result <- kinship.BLUP(y = phenoTrait[trG, ],
593 G.train = genoDataFiltered[trG, ],
594 G.pred = genoDataFiltered[slG, ],
595 mixed.method = "REML",
596 K.method = "RR",
599 assign(kblup, result)
601 #calculate cross-validation accuracy
602 valCorData <- merge(phenoTrait[slG, ], result$g.pred, by=0, all=FALSE)
603 rownames(valCorData) <- valCorData[, 1]
604 valCorData[, 1] <- NULL
606 accuracy <- try(cor(valCorData))
607 validation <- paste("validation", i, sep = ".")
609 cvTest <- paste("Validation test", i, sep = " ")
611 if ( class(accuracy) != "try-error")
613 accuracy <- round(accuracy[1,2], digits = 3)
614 accuracy <- data.matrix(accuracy)
616 colnames(accuracy) <- c("correlation")
617 rownames(accuracy) <- cvTest
619 assign(validation, accuracy)
621 if (!is.na(accuracy[1,1])) {
622 validationAll <- rbind(validationAll, accuracy)
627 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
629 if (!is.null(validationAll)) {
630 validationMean <- data.matrix(round(colMeans(validationAll),
631 digits = 2
635 rownames(validationMean) <- c("Average")
637 validationAll <- rbind(validationAll, validationMean)
638 colnames(validationAll) <- c("Correlation")
641 #predict GEBVs for selection population
642 #if (length(predictionData) !=0 ) {
643 # predictionData <- data.matrix(round(predictionData, digits = 0 ))
646 predictionPopResult <- c()
647 predictionPopGEBVs <- c()
649 if (length(predictionData) != 0) {
650 message("running prediction for selection candidates...marker data", ncol(predictionData), " vs. ", ncol(genoDataFiltered))
652 predictionPopResult <- kinship.BLUP(y = phenoTrait,
653 G.train = genoDataFiltered,
654 G.pred = predictionData,
655 mixed.method = "REML",
656 K.method = "RR"
658 message("running prediction for selection candidates...DONE!!")
660 predictionPopGEBVs <- round(data.matrix(predictionPopResult$g.pred), digits = 3)
661 predictionPopGEBVs <- data.matrix(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
663 colnames(predictionPopGEBVs) <- c(trait)
667 if (!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0) {
668 write.table(predictionPopGEBVs,
669 file = predictionPopGEBVsFile,
670 sep = "\t",
671 col.names = NA,
672 quote = FALSE,
673 append = FALSE
677 if(!is.null(validationAll)) {
678 write.table(validationAll,
679 file = validationFile,
680 sep = "\t",
681 col.names = NA,
682 quote = FALSE,
683 append = FALSE
687 if (!is.null(ordered.markerEffects)) {
688 write.table(ordered.markerEffects,
689 file = markerFile,
690 sep = "\t",
691 col.names = NA,
692 quote = FALSE,
693 append = FALSE
697 if (!is.null(ordered.iGEBV)) {
698 write.table(ordered.iGEBV,
699 file = blupFile,
700 sep = "\t",
701 col.names = NA,
702 quote = FALSE,
703 append = FALSE
707 if (length(combinedGebvsFile) != 0 ) {
708 if(file.info(combinedGebvsFile)$size == 0) {
709 write.table(ordered.iGEBV,
710 file = combinedGebvsFile,
711 sep = "\t",
712 col.names = NA,
713 quote = FALSE,
715 } else {
716 write.table(allGebvs,
717 file = combinedGebvsFile,
718 sep = "\t",
719 quote = FALSE,
720 col.names = NA,
725 if (!is.null(traitPhenoData) & length(traitPhenoFile) != 0) {
726 write.table(traitPhenoData,
727 file = traitPhenoFile,
728 sep = "\t",
729 col.names = NA,
730 quote = FALSE,
736 ## if (!is.null(genoDataMissing)) {
737 ## write.table(genoData,
738 ## file = genoFile,
739 ## sep = "\t",
740 ## col.names = NA,
741 ## quote = FALSE,
742 ## )
744 ## }
746 ## if (!is.null(predictionDataMissing)) {
747 ## write.table(predictionData,
748 ## file = predictionFile,
749 ## sep = "\t",
750 ## col.names = NA,
751 ## quote = FALSE,
752 ## )
753 ## }
756 if (file.info(relationshipMatrixFile)$size == 0) {
757 write.table(relationshipMatrix,
758 file = relationshipMatrixFile,
759 sep = "\t",
760 col.names = NA,
761 quote = FALSE,
766 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
767 write.table(formattedPhenoData,
768 file = formattedPhenoFile,
769 sep = "\t",
770 col.names = NA,
771 quote = FALSE,
776 q(save = "no", runLast = FALSE)