Merge branch 'master' into topic/upload_fieldbook_images
[sgn.git] / R / gs.r
blobe9658a4ff50114249c544789a9026b5fc2479ee4
1 #SNOPSIS
2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
3 #GBLUP method
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
8 options(echo = FALSE)
10 library(rrBLUP)
11 library(plyr)
12 library(stringr)
13 library(lme4)
14 library(randomForest)
15 library(data.table)
18 allArgs <- commandArgs()
20 inputFiles <- scan(grep("input_files", allArgs, ignore.case = TRUE, perl = TRUE, value = TRUE),
21 what = "character")
23 outputFiles <- scan(grep("output_files", allArgs, ignore.case = TRUE,perl = TRUE, value = TRUE),
24 what = "character")
26 traitsFile <- grep("traits", inputFiles, ignore.case = TRUE, value = TRUE)
27 traitFile <- grep("trait_info", inputFiles, ignore.case = TRUE, value = TRUE)
28 traitInfo <- scan(traitFile, what = "character",)
29 traitInfo <- strsplit(traitInfo, "\t");
30 traitId <- traitInfo[[1]]
31 trait <- traitInfo[[2]]
33 datasetInfoFile <- grep("dataset_info", inputFiles, ignore.case = TRUE, value = TRUE)
34 datasetInfo <- c()
36 if (length(datasetInfoFile) != 0 ) {
37 datasetInfo <- scan(datasetInfoFile, what = "character")
38 datasetInfo <- paste(datasetInfo, collapse = " ")
39 } else {
40 datasetInfo <- c('single population')
43 validationTrait <- paste("validation", trait, sep = "_")
44 validationFile <- grep(validationTrait, outputFiles, ignore.case = TRUE, value = TRUE)
46 if (is.null(validationFile)) {
47 stop("Validation output file is missing.")
50 kinshipTrait <- paste("kinship", trait, sep = "_")
51 blupFile <- grep(kinshipTrait, outputFiles, ignore.case = TRUE, value = TRUE)
53 if (is.null(blupFile)) {
54 stop("GEBVs file is missing.")
56 markerTrait <- paste("marker", trait, sep = "_")
57 markerFile <- grep(markerTrait, outputFiles, ignore.case = TRUE, value = TRUE)
59 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
60 traitPhenoFile <- grep(traitPhenoFile, outputFiles,ignore.case = TRUE, value = TRUE)
62 varianceComponentsFile <- grep("variance_components", outputFiles, ignore.case = TRUE, value = TRUE)
64 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, ignore.case = TRUE, value = TRUE)
66 formattedPhenoData <- c()
67 phenoData <- c()
69 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
70 formattedPhenoData <- as.data.frame(fread(formattedPhenoFile,
71 na.strings = c("NA", " ", "--", "-", ".")
74 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
75 formattedPhenoData[, 1] <- NULL
76 } else {
77 phenoFile <- grep("\\/phenotype_data", inputFiles, ignore.case = TRUE, value = TRUE, perl = TRUE)
78 phenoData <- fread(phenoFile, na.strings = c("NA", " ", "--", "-", "."), header = TRUE)
81 phenoData <- as.data.frame(phenoData)
82 phenoTrait <- c()
84 if (datasetInfo == 'combined populations') {
86 if (!is.null(formattedPhenoData)) {
87 phenoTrait <- subset(formattedPhenoData, select = trait)
88 phenoTrait <- na.omit(phenoTrait)
90 } else {
91 dropColumns <- grep(trait, names(phenoData), ignore.case = TRUE, value = TRUE)
92 phenoTrait <- phenoData[, !(names(phenoData) %in% dropColumns)]
94 phenoTrait <- as.data.frame(phenoTrait)
95 row.names(phenoTrait) <- phenoTrait[, 1]
96 phenoTrait[, 1] <- NULL
97 colnames(phenoTrait) <- trait
100 } else {
102 if (!is.null(formattedPhenoData)) {
103 phenoTrait <- subset(formattedPhenoData, select = trait)
104 phenoTrait <- na.omit(phenoTrait)
106 } else {
107 dropColumns <- c("uniquename", "stock_name")
108 phenoData <- phenoData[, !(names(phenoData) %in% dropColumns)]
110 phenoTrait <- subset(phenoData, select = c("object_name", "object_id", "design", "block", "replicate", trait))
112 experimentalDesign <- phenoTrait[2, 'design']
114 if (class(phenoTrait[, trait]) != 'numeric') {
115 phenoTrait[, trait] <- as.numeric(as.character(phenoTrait[, trait]))
118 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
120 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoTrait$block) > 1) {
122 message("GS experimental design: ", experimentalDesign)
124 augData <- subset(phenoTrait, select = c("object_name", "object_id", "block", trait))
126 colnames(augData)[1] <- "genotypes"
127 colnames(augData)[4] <- "trait"
129 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
130 augData,
131 na.action = na.omit))
133 if (class(model) != "try-error") {
134 phenoTrait <- data.frame(fixef(model))
136 colnames(phenoTrait) <- trait
138 nn <- gsub('genotypes', '', rownames(phenoTrait))
139 rownames(phenoTrait) <- nn
141 phenoTrait <- round(phenoTrait, digits = 2)
144 } else if (experimentalDesign == 'Alpha') {
146 message("Experimental desgin: ", experimentalDesign)
148 alphaData <- subset(phenoData,
149 select = c("object_name", "object_id","block", "replicate", trait)
152 colnames(alphaData)[1] <- "genotypes"
153 colnames(alphaData)[5] <- "trait"
155 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
156 alphaData,
157 na.action = na.omit))
159 if (class(model) != "try-error") {
160 phenoTrait <- data.frame(fixef(model))
162 colnames(phenoTrait) <- trait
164 nn <- gsub('genotypes', '', rownames(phenoTrait))
165 rownames(phenoTrait) <- nn
167 phenoTrait <- round(phenoTrait, digits = 2)
171 } else {
173 phenoTrait <- subset(phenoData,
174 select = c("object_name", "object_id", trait))
176 if (sum(is.na(phenoTrait)) > 0) {
177 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
178 phenoTrait <- na.omit(phenoTrait)
181 #calculate mean of reps/plots of the same accession and
182 #create new df with the accession means
184 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
185 phenoTrait <- data.frame(phenoTrait)
186 message('phenotyped lines before averaging: ', length(row.names(phenoTrait)))
188 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
189 message('phenotyped lines after averaging: ', length(row.names(phenoTrait)))
191 phenoTrait <- subset(phenoTrait, select = c("object_name", trait))
192 row.names(phenoTrait) <- phenoTrait[, 1]
193 phenoTrait[, 1] <- NULL
195 #format all-traits population phenotype dataset
196 ## formattedPhenoData <- phenoData
197 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
199 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
200 ## formattedPhenoData <- ddply(formattedPhenoData,
201 ## "object_name",
202 ## colwise(mean)
203 ## )
205 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
206 ## formattedPhenoData[, 1] <- NULL
208 ## formattedPhenoData <- round(formattedPhenoData,
209 ## digits=3
210 ## )
215 genoFile <- grep("genotype_data", inputFiles, ignore.case = TRUE, value = TRUE)
216 genoData <- fread(genoFile, na.strings = c("NA", " ", "--", "-"), header = TRUE)
218 genoData <- as.data.frame(genoData)
219 rownames(genoData) <- genoData[, 1]
220 genoData[, 1] <- NULL
221 genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
223 predictionTempFile <- grep("prediction_population", inputFiles, ignore.case = TRUE, value = TRUE)
224 predictionFile <- c()
226 message('prediction temp genotype file: ', predictionTempFile)
228 if (length(predictionTempFile) !=0 ) {
229 predictionFile <- scan(predictionTempFile, what = "character")
232 message('prediction genotype file: ', predictionFile)
234 predictionPopGEBVsFile <- grep("prediction_pop_gebvs", outputFiles, ignore.case = TRUE, value = TRUE)
235 message("prediction gebv file: ", predictionPopGEBVsFile)
237 predictionData <- c()
239 if (length(predictionFile) !=0 ) {
241 predictionData <- as.data.frame(fread(predictionFile, na.strings = c("NA", " ", "--", "-"),))
243 rownames(predictionData) <- predictionData[, 1]
244 predictionData[, 1] <- NULL
245 predictionData <- predictionData[, colSums(is.na(predictionData)) < nrow(predictionData) * 0.5]
248 #impute genotype values for obs with missing values,
249 #based on mean of neighbouring 10 (arbitrary) obs
250 genoDataMissing <- c()
252 if (sum(is.na(genoData)) > 0) {
253 genoDataMissing<- c('yes')
255 message("sum of geno missing values, ", sum(is.na(genoData)) )
256 genoData <- na.roughfix(genoData)
257 genoData <- data.matrix(genoData)
260 genoData <- genoData[order(row.names(genoData)), ]
262 #create phenotype and genotype datasets with
263 #common stocks only
264 message('phenotyped lines: ', length(row.names(phenoTrait)))
265 message('genotyped lines: ', length(row.names(genoData)))
267 #extract observation lines with both
268 #phenotype and genotype data only.
269 commonObs <- intersect(row.names(phenoTrait), row.names(genoData))
270 commonObs <- data.frame(commonObs)
271 rownames(commonObs)<-commonObs[, 1]
273 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
274 #include in the genotype dataset only observation lines
275 #with phenotype data
276 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
277 genoDataFiltered <- genoData[(rownames(genoData) %in% rownames(commonObs)), ]
278 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoDataFiltered)))
279 #drop observation lines without genotype data
280 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
281 phenoTrait <- merge(data.frame(phenoTrait), commonObs, by=0, all=FALSE)
282 rownames(phenoTrait) <-phenoTrait[, 1]
283 phenoTrait <- subset(phenoTrait, select=trait)
285 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
286 #a set of only observation lines with genotype data
288 traitPhenoData <- data.frame(round(phenoTrait, digits = 2))
289 phenoTrait <- data.matrix(phenoTrait)
290 genoDataFiltered <- data.matrix(genoDataFiltered)
292 #impute missing data in prediction data
293 predictionDataMissing <- c()
294 if (length(predictionData) != 0) {
295 #purge markers unique to both populations
296 commonMarkers <- intersect(names(data.frame(genoDataFiltered)), names(predictionData))
297 predictionData <- subset(predictionData, select = commonMarkers)
298 genoDataFiltered <- subset(genoDataFiltered, select= commonMarkers)
300 # predictionData <- data.matrix(predictionData)
302 if (sum(is.na(predictionData)) > 0) {
303 predictionDataMissing <- c('yes')
304 message("sum of geno missing values, ", sum(is.na(predictionData)) )
305 predictionData <- data.matrix(na.roughfix(predictionData))
310 relationshipMatrixFile <- grep("relationship_matrix", outputFiles, ignore.case = TRUE, value = TRUE)
312 message("relationship matrix file: ", relationshipMatrixFile)
314 relationshipMatrix <- c()
315 if (length(relationshipMatrixFile) != 0) {
316 if (file.info(relationshipMatrixFile)$size > 0 ) {
317 relationshipDf <- as.data.frame(fread(relationshipMatrixFile))
319 rownames(relationshipDf) <- relationshipDf[, 1]
320 relationshipDf[, 1] <- NULL
321 relationshipMatrix <- data.matrix(relationshipDf)
325 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
326 genoTrCode <- grep("2", genoDataFiltered[1, ], value = TRUE)
327 if(length(genoTrCode) != 0) {
328 genoDataFiltered <- genoDataFiltered - 1
331 if (length(predictionData) != 0 ) {
332 genoSlCode <- grep("2", predictionData[1, ], value = TRUE)
333 if (length(genoSlCode) != 0 ) {
334 predictionData <- predictionData - 1
338 ordered.markerEffects <- c()
339 if ( length(predictionData) == 0 ) {
340 markerEffects <- mixed.solve(y = phenoTrait,
341 Z = genoDataFiltered
344 ordered.markerEffects <- data.matrix(markerEffects$u)
345 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
346 ordered.markerEffects <- round(ordered.markerEffects, digits=5)
348 colnames(ordered.markerEffects) <- c("Marker Effects")
352 #additive relationship model
353 #calculate the inner products for
354 #genotypes (realized relationship matrix)
355 if (length(relationshipMatrixFile) != 0) {
356 if (file.info(relationshipMatrixFile)$size == 0) {
357 relationshipMatrix <- tcrossprod(data.matrix(genoData))
360 relationshipMatrixFiltered <- relationshipMatrix[(rownames(relationshipMatrix) %in% rownames(commonObs)),]
361 relationshipMatrixFiltered <- relationshipMatrixFiltered[, (colnames(relationshipMatrixFiltered) %in% rownames(commonObs))]
363 #construct an identity matrix for genotypes
364 identityMatrix <- diag(nrow(phenoTrait))
366 relationshipMatrixFiltered <- data.matrix(relationshipMatrixFiltered)
368 iGEBV <- mixed.solve(y = phenoTrait, Z = identityMatrix, K = relationshipMatrixFiltered)
369 iGEBVu <- iGEBV$u
371 heritability <- c()
373 if ( is.null(predictionFile) == TRUE ) {
374 additiveEffects <- data.frame(iGEBVu)
376 pN <- nrow(phenoTrait)
377 aN <- nrow(additiveEffects)
379 if (pN <= 1 || pN != aN) {
380 stop("phenoTrait and additiveEffects have different lengths: ",
381 pN, " and ", aN, ".")
384 if (TRUE %in% is.na(phenoTrait) || TRUE %in% is.na(additiveEffects)) {
385 stop(" Arguments phenoTrait and additiveEffects have missing values.")
388 phenoVariance <- var(phenoTrait)
389 gebvVariance <- var(additiveEffects)
390 heritability <- round((gebvVariance / phenoVariance), digits = 2)
392 cat("\n", file = varianceComponentsFile, append = FALSE)
393 cat('Error variance', iGEBV$Ve, file = varianceComponentsFile, sep = "\t", append = TRUE)
394 cat("\n", file = varianceComponentsFile, append = TRUE)
395 cat('Additive genetic variance', iGEBV$Vu, file = varianceComponentsFile, sep = '\t', append = TRUE)
396 cat("\n", file = varianceComponentsFile, append = TRUE)
397 cat('Phenotype mean', iGEBV$beta,file = varianceComponentsFile, sep = '\t', append = TRUE)
398 cat("\n", file = varianceComponentsFile, append = TRUE)
399 cat('Heritability (h)', heritability, file = varianceComponentsFile, sep = '\t', append = TRUE)
402 iGEBV <- data.matrix(iGEBVu)
403 ordered.iGEBV <- as.data.frame(iGEBV[order(-iGEBV[, 1]), ])
404 ordered.iGEBV <- round(ordered.iGEBV, digits = 3)
406 combinedGebvsFile <- grep('selected_traits_gebv', outputFiles, ignore.case = TRUE,value = TRUE)
408 allGebvs<-c()
409 if (length(combinedGebvsFile) != 0) {
410 fileSize <- file.info(combinedGebvsFile)$size
411 if (fileSize != 0 ) {
412 combinedGebvs <- as.data.frame(fread(combinedGebvsFile))
414 rownames(combinedGebvs) <- combinedGebvs[,1]
415 combinedGebvs[,1] <- NULL
417 colnames(ordered.iGEBV) <- c(trait)
419 traitGEBV <- as.data.frame(ordered.iGEBV)
420 allGebvs <- merge(combinedGebvs, traitGEBV,
421 by = 0,
422 all = TRUE
425 rownames(allGebvs) <- allGebvs[,1]
426 allGebvs[,1] <- NULL
430 colnames(ordered.iGEBV) <- c(trait)
432 #cross-validation
433 validationAll <- c()
435 if(is.null(predictionFile)) {
436 genoNum <- nrow(phenoTrait)
437 if(genoNum < 20 ) {
438 warning(genoNum, " is too small number of genotypes.")
441 reps <- round_any(genoNum, 10, f = ceiling) %/% 10
443 genotypeGroups <-c()
445 if (genoNum %% 10 == 0) {
446 genotypeGroups <- rep(1:10, reps)
447 } else {
448 genotypeGroups <- rep(1:10, reps) [- (genoNum %% 10) ]
451 set.seed(4567)
452 genotypeGroups <- genotypeGroups[ order (runif(genoNum)) ]
454 for (i in 1:10) {
455 tr <- paste("trPop", i, sep = ".")
456 sl <- paste("slPop", i, sep = ".")
458 trG <- which(genotypeGroups != i)
459 slG <- which(genotypeGroups == i)
461 assign(tr, trG)
462 assign(sl, slG)
464 kblup <- paste("rKblup", i, sep = ".")
466 result <- kinship.BLUP(y = phenoTrait[trG, ],
467 G.train = genoDataFiltered[trG, ],
468 G.pred = genoDataFiltered[slG, ],
469 mixed.method = "REML",
470 K.method = "RR",
473 assign(kblup, result)
475 #calculate cross-validation accuracy
476 valCorData <- merge(phenoTrait[slG, ], result$g.pred, by=0, all=FALSE)
477 rownames(valCorData) <- valCorData[, 1]
478 valCorData[, 1] <- NULL
480 accuracy <- try(cor(valCorData))
481 validation <- paste("validation", i, sep = ".")
483 cvTest <- paste("Validation test", i, sep = " ")
485 if ( class(accuracy) != "try-error")
487 accuracy <- round(accuracy[1,2], digits = 3)
488 accuracy <- data.matrix(accuracy)
490 colnames(accuracy) <- c("correlation")
491 rownames(accuracy) <- cvTest
493 assign(validation, accuracy)
495 if (!is.na(accuracy[1,1])) {
496 validationAll <- rbind(validationAll, accuracy)
501 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
503 if (!is.null(validationAll)) {
504 validationMean <- data.matrix(round(colMeans(validationAll), digits = 2))
506 rownames(validationMean) <- c("Average")
508 validationAll <- rbind(validationAll, validationMean)
509 colnames(validationAll) <- c("Correlation")
513 predictionPopResult <- c()
514 predictionPopGEBVs <- c()
516 if (length(predictionData) != 0) {
517 message("running prediction for selection candidates...marker data", ncol(predictionData), " vs. ", ncol(genoDataFiltered))
519 predictionPopResult <- kinship.BLUP(y = phenoTrait,
520 G.train = genoDataFiltered,
521 G.pred = predictionData,
522 mixed.method = "REML",
523 K.method = "RR"
525 message("running prediction for selection candidates...DONE!!")
527 predictionPopGEBVs <- round(data.matrix(predictionPopResult$g.pred), digits = 3)
528 predictionPopGEBVs <- data.matrix(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
530 colnames(predictionPopGEBVs) <- c(trait)
534 if (!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0) {
535 write.table(predictionPopGEBVs,
536 file = predictionPopGEBVsFile,
537 sep = "\t",
538 col.names = NA,
539 quote = FALSE,
540 append = FALSE
544 if(!is.null(validationAll)) {
545 write.table(validationAll,
546 file = validationFile,
547 sep = "\t",
548 col.names = NA,
549 quote = FALSE,
550 append = FALSE
554 if (!is.null(ordered.markerEffects)) {
555 write.table(ordered.markerEffects,
556 file = markerFile,
557 sep = "\t",
558 col.names = NA,
559 quote = FALSE,
560 append = FALSE
564 if (!is.null(ordered.iGEBV)) {
565 write.table(ordered.iGEBV,
566 file = blupFile,
567 sep = "\t",
568 col.names = NA,
569 quote = FALSE,
570 append = FALSE
574 if (length(combinedGebvsFile) != 0 ) {
575 if(file.info(combinedGebvsFile)$size == 0) {
576 write.table(ordered.iGEBV,
577 file = combinedGebvsFile,
578 sep = "\t",
579 col.names = NA,
580 quote = FALSE,
582 } else {
583 write.table(allGebvs,
584 file = combinedGebvsFile,
585 sep = "\t",
586 quote = FALSE,
587 col.names = NA,
592 if (!is.null(traitPhenoData) & length(traitPhenoFile) != 0) {
593 write.table(traitPhenoData,
594 file = traitPhenoFile,
595 sep = "\t",
596 col.names = NA,
597 quote = FALSE,
603 ## if (!is.null(genoDataMissing)) {
604 ## write.table(genoData,
605 ## file = genoFile,
606 ## sep = "\t",
607 ## col.names = NA,
608 ## quote = FALSE,
609 ## )
611 ## }
613 ## if (!is.null(predictionDataMissing)) {
614 ## write.table(predictionData,
615 ## file = predictionFile,
616 ## sep = "\t",
617 ## col.names = NA,
618 ## quote = FALSE,
619 ## )
620 ## }
623 if (file.info(relationshipMatrixFile)$size == 0) {
624 write.table(relationshipMatrix,
625 file = relationshipMatrixFile,
626 sep = "\t",
627 col.names = NA,
628 quote = FALSE,
633 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
634 write.table(formattedPhenoData,
635 file = formattedPhenoFile,
636 sep = "\t",
637 col.names = NA,
638 quote = FALSE,
642 message("Done.")
644 q(save = "no", runLast = FALSE)