improve condition..
[sgn.git] / R / solGS / gs.r
blob2d21e97a668cbeb2a8180d92c00f5f84adca5662
1 #SNOPSIS
2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
3 #GBLUP method
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
8 options(echo = FALSE)
10 library(methods)
11 library(rrBLUP)
12 library(plyr)
13 library(stringr)
14 #library(lme4)
15 library(randomForest)
16 library(data.table)
17 library(parallel)
18 library(genoDataFilter)
19 library(phenoAnalysis)
20 library(caret)
21 library(dplyr)
24 allArgs <- commandArgs()
26 inputFiles <- scan(grep("input_files", allArgs, ignore.case = TRUE, perl = TRUE, value = TRUE),
27 what = "character")
29 outputFiles <- scan(grep("output_files", allArgs, ignore.case = TRUE,perl = TRUE, value = TRUE),
30 what = "character")
32 traitsFile <- grep("traits", inputFiles, ignore.case = TRUE, value = TRUE)
33 traitFile <- grep("trait_info", inputFiles, ignore.case = TRUE, value = TRUE)
34 traitInfo <- scan(traitFile, what = "character",)
35 traitInfo <- strsplit(traitInfo, "\t");
36 traitId <- traitInfo[[1]]
37 trait <- traitInfo[[2]]
39 datasetInfoFile <- grep("dataset_info", inputFiles, ignore.case = TRUE, value = TRUE)
40 datasetInfo <- c()
42 if (length(datasetInfoFile) != 0 ) {
43 datasetInfo <- scan(datasetInfoFile, what = "character")
44 datasetInfo <- paste(datasetInfo, collapse = " ")
45 } else {
46 datasetInfo <- c('single population')
49 validationTrait <- paste("validation", trait, sep = "_")
50 validationFile <- grep(validationTrait, outputFiles, ignore.case = TRUE, value = TRUE)
52 if (is.null(validationFile)) {
53 stop("Validation output file is missing.")
56 kinshipTrait <- paste("rrblup_gebvs", trait, sep = "_")
57 blupFile <- grep(kinshipTrait, outputFiles, ignore.case = TRUE, value = TRUE)
59 if (is.null(blupFile)) {
60 stop("GEBVs file is missing.")
62 markerTrait <- paste("marker_effects", trait, sep = "_")
63 markerFile <- grep(markerTrait, outputFiles, ignore.case = TRUE, value = TRUE)
65 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
66 traitPhenoFile <- grep(traitPhenoFile, outputFiles,ignore.case = TRUE, value = TRUE)
68 varianceComponentsFile <- grep("variance_components", outputFiles, ignore.case = TRUE, value = TRUE)
69 filteredGenoFile <- grep("filtered_genotype_data", outputFiles, ignore.case = TRUE, value = TRUE)
70 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, ignore.case = TRUE, value = TRUE)
72 formattedPhenoData <- c()
73 phenoData <- c()
75 genoFile <- grep("genotype_data_", inputFiles, ignore.case = TRUE, perl=TRUE, value = TRUE)
76 message('geno file ', genoFile)
78 if (is.null(genoFile)) {
79 stop("genotype data file is missing.")
82 if (file.info(genoFile)$size == 0) {
83 stop("genotype data file is empty.")
86 readFilteredGenoData <- c()
87 filteredGenoData <- c()
88 if (length(filteredGenoFile) != 0 && file.info(filteredGenoFile)$size != 0) {
89 filteredGenoData <- fread(filteredGenoFile, na.strings = c("NA", " ", "--", "-"), header = TRUE)
90 readFilteredGenoData <- 1
91 message('read in filtered geno data')
94 genoData <- c()
95 if (is.null(filteredGenoData)) {
96 genoData <- fread(genoFile, na.strings = c("NA", " ", "--", "-"), header = TRUE)
97 genoData <- unique(genoData, by='V1')
98 message('read in unfiltered geno data')
101 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
102 formattedPhenoData <- as.data.frame(fread(formattedPhenoFile,
103 na.strings = c("NA", " ", "--", "-", ".")
106 } else {
107 phenoFile <- grep("\\/phenotype_data", inputFiles, ignore.case = TRUE, value = TRUE, perl = TRUE)
109 if (is.null(phenoFile)) {
110 stop("phenotype data file is missing.")
113 if (file.info(phenoFile)$size == 0) {
114 stop("phenotype data file is empty.")
117 phenoData <- fread(phenoFile, na.strings = c("NA", " ", "--", "-", "."), header = TRUE)
120 phenoData <- as.data.frame(phenoData)
121 phenoTrait <- c()
123 if (datasetInfo == 'combined populations') {
125 if (!is.null(formattedPhenoData)) {
126 phenoTrait <- subset(formattedPhenoData, select = trait)
127 phenoTrait <- na.omit(phenoTrait)
129 } else {
130 dropColumns <- grep(trait, names(phenoData), ignore.case = TRUE, value = TRUE)
131 phenoTrait <- phenoData[, !(names(phenoData) %in% dropColumns)]
133 phenoTrait <- as.data.frame(phenoTrait)
134 colnames(phenoTrait) <- c('genotypes', trait)
136 } else {
138 if (!is.null(formattedPhenoData)) {
139 phenoTrait <- subset(formattedPhenoData, select = c('V1', trait))
140 phenoTrait <- as.data.frame(phenoTrait)
141 phenoTrait <- na.omit(phenoTrait)
143 colnames(phenoTrait)[1] <- 'genotypes'
145 } else if (length(grep('uploaded', phenoFile)) != 0) {
147 phenoTrait <- averageTrait(phenoData, trait)
149 } else {
151 phenoTrait <- getAdjMeans(phenoData, trait)
156 if (is.null(filteredGenoData)) {
158 #genoDataFilter::filterGenoData
159 genoData <- filterGenoData(genoData, maf=0.01)
160 genoData <- roundAlleleDosage(genoData)
162 genoData <- as.data.frame(genoData)
163 rownames(genoData) <- genoData[, 1]
164 genoData[, 1] <- NULL
165 filteredGenoData <- genoData
167 } else {
168 genoData <- as.data.frame(filteredGenoData)
169 rownames(genoData) <- genoData[, 1]
170 genoData[, 1] <- NULL
173 genoData <- genoData[order(row.names(genoData)), ]
175 predictionTempFile <- grep("prediction_population", inputFiles, ignore.case = TRUE, value = TRUE)
177 predictionFile <- c()
178 filteredPredGenoFile <- c()
179 predictionAllFiles <- c()
181 message('prediction temp genotype file: ', predictionTempFile)
183 if (length(predictionTempFile) !=0 ) {
184 predictionAllFiles <- scan(predictionTempFile, what = "character")
186 predictionFile <- grep("\\/genotype_data", predictionAllFiles, ignore.case = TRUE, perl=TRUE, value = TRUE)
187 message('prediction unfiltered genotype file: ', predictionFile)
189 filteredPredGenoFile <- grep("filtered_genotype_data_", predictionAllFiles, ignore.case = TRUE, perl=TRUE, value = TRUE)
190 message('prediction filtered genotype file: ', predictionFile)
193 predictionPopGEBVsFile <- grep("prediction_pop_gebvs", outputFiles, ignore.case = TRUE, value = TRUE)
195 message("filtered pred geno file: ", filteredPredGenoFile)
196 message("prediction gebv file: ", predictionPopGEBVsFile)
198 predictionData <- c()
199 readFilteredPredGenoData <- c()
200 filteredPredGenoData <- c()
202 if (length(filteredPredGenoFile) != 0 && file.info(filteredPredGenoFile)$size != 0) {
203 predictionData <- fread(filteredPredGenoFile, na.strings = c("NA", " ", "--", "-"),)
204 readFilteredPredGenoData <- 1
206 predictionData <- as.data.frame(predictionData)
207 rownames(predictionData) <- predictionData[, 1]
208 predictionData[, 1] <- NULL
210 message('read in filtered prediction genotype data')
211 } else if (length(predictionFile) != 0) {
213 predictionData <- fread(predictionFile, na.strings = c("NA", " ", "--", "-"),)
214 predictionData <- unique(predictionData, by='V1')
216 predictionData <- filterGenoData(predictionData, maf=0.01)
217 predictionData <- roundAlleleDosage(predictionData)
219 predictionData <- as.data.frame(predictionData)
220 rownames(predictionData) <- predictionData[, 1]
221 predictionData[, 1] <- NULL
222 filteredPredGenoData <- predictionData
226 #impute genotype values for obs with missing values,
227 genoDataMissing <- c()
229 if (sum(is.na(genoData)) > 0) {
230 genoDataMissing<- c('yes')
232 message("sum of geno missing values, ", sum(is.na(genoData)) )
233 genoData <- na.roughfix(genoData)
234 genoData <- data.matrix(genoData)
237 #create phenotype and genotype datasets with
238 #common stocks only
239 message('phenotyped lines: ', length(row.names(phenoTrait)))
240 message('genotyped lines: ', length(row.names(genoData)))
242 #extract observation lines with both
243 #phenotype and genotype data only.
244 commonObs <- intersect(phenoTrait$genotypes, row.names(genoData))
245 commonObs <- data.frame(commonObs)
246 rownames(commonObs) <- commonObs[, 1]
248 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
250 #include in the genotype dataset only phenotyped lines
251 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
252 genoDataFilteredObs <- genoData[(rownames(genoData) %in% rownames(commonObs)), ]
253 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoDataFilteredObs)))
255 #drop phenotyped lines without genotype data
256 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
257 phenoTrait <- phenoTrait[(phenoTrait$genotypes %in% rownames(commonObs)), ]
258 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
260 phenoTraitMarker <- as.data.frame(phenoTrait)
261 rownames(phenoTraitMarker) <- phenoTraitMarker[, 1]
262 phenoTraitMarker[, 1] <- NULL
264 #impute missing data in prediction data
265 predictionDataMissing <- c()
266 if (length(predictionData) != 0) {
267 #purge markers unique to both populations
268 commonMarkers <- intersect(names(data.frame(genoDataFilteredObs)), names(predictionData))
269 predictionData <- subset(predictionData, select = commonMarkers)
270 genoDataFilteredObs <- subset(genoDataFilteredObs, select= commonMarkers)
272 if (sum(is.na(predictionData)) > 0) {
273 predictionDataMissing <- c('yes')
274 message("sum of geno missing values, ", sum(is.na(predictionData)) )
275 predictionData <- data.matrix(na.roughfix(predictionData))
279 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
280 genoTrCode <- grep("2", genoDataFilteredObs[1, ], value = TRUE)
281 if(length(genoTrCode) != 0) {
282 genoData <- genoData - 1
283 genoDataFilteredObs <- genoDataFilteredObs - 1
286 if (length(predictionData) != 0 ) {
287 genoSlCode <- grep("2", predictionData[1, ], value = TRUE)
288 if (length(genoSlCode) != 0 ) {
289 predictionData <- predictionData - 1
293 ordered.markerEffects <- c()
294 ordered.trGEBV <- c()
295 validationAll <- c()
296 combinedGebvsFile <- c()
297 allGebvs <- c()
298 traitPhenoData <- c()
299 relationshipMatrix <- c()
301 #additive relationship model
302 #calculate the inner products for
303 #genotypes (realized relationship matrix)
304 relationshipMatrixFile <- grep("relationship_matrix", outputFiles, ignore.case = TRUE, value = TRUE)
306 message("relationship matrix file: ", relationshipMatrixFile)
308 if (length(relationshipMatrixFile) != 0) {
309 if (file.info(relationshipMatrixFile)$size > 0 ) {
310 relationshipMatrix <- as.data.frame(fread(relationshipMatrixFile))
312 rownames(relationshipMatrix) <- relationshipMatrix[, 1]
313 relationshipMatrix[, 1] <- NULL
314 colnames(relationshipMatrix) <- rownames(relationshipMatrix)
315 relationshipMatrix <- data.matrix(relationshipMatrix)
316 } else {
317 relationshipMatrix <- A.mat(genoData)
318 diag(relationshipMatrix) <- diag(relationshipMatrix) + 1e-6
319 colnames(relationshipMatrix) <- rownames(relationshipMatrix)
323 relationshipMatrixFiltered <- relationshipMatrix[(rownames(relationshipMatrix) %in% rownames(commonObs)), ]
324 relationshipMatrixFiltered <- relationshipMatrixFiltered[, (colnames(relationshipMatrixFiltered) %in% rownames(commonObs))]
325 relationshipMatrix <- data.frame(relationshipMatrix)
327 nCores <- detectCores()
328 message('no cores: ', nCores)
329 if (nCores > 1) {
330 nCores <- (nCores %/% 2)
331 } else {
332 nCores <- 1
335 message('assgined no cores: ', nCores)
337 if (length(predictionData) == 0) {
339 trGEBV <- kin.blup(data = phenoTrait,
340 geno = 'genotypes',
341 pheno = trait,
342 K = relationshipMatrixFiltered,
343 n.core = nCores,
346 trGEBVu <- trGEBV$g
348 phenoTraitMarker <- data.matrix(phenoTraitMarker)
349 genoDataFilteredObs <- data.matrix(genoDataFilteredObs)
351 markerEffects <- mixed.solve(y = phenoTraitMarker,
352 Z = genoDataFilteredObs
355 ordered.markerEffects <- data.matrix(markerEffects$u)
356 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
357 ordered.markerEffects <- round(ordered.markerEffects, 5)
359 colnames(ordered.markerEffects) <- c("Marker Effects")
360 ordered.markerEffects <- data.frame(ordered.markerEffects)
363 traitPhenoData <- data.frame(round(phenoTraitMarker, 2))
365 heritability <- round((trGEBV$Vg/(trGEBV$Ve + trGEBV$Vg)), 2)
367 cat("\n", file = varianceComponentsFile, append = FALSE)
368 cat('Error variance', trGEBV$Ve, file = varianceComponentsFile, sep = "\t", append = TRUE)
369 cat("\n", file = varianceComponentsFile, append = TRUE)
370 cat('Additive genetic variance', trGEBV$Vg, file = varianceComponentsFile, sep = '\t', append = TRUE)
371 cat("\n", file = varianceComponentsFile, append = TRUE)
372 cat('Heritability (h)', heritability, file = varianceComponentsFile, sep = '\t', append = TRUE)
375 trGEBV <- data.matrix(trGEBVu)
376 ordered.trGEBV <- as.data.frame(trGEBV[order(-trGEBV[, 1]), ])
377 ordered.trGEBV <- round(ordered.trGEBV, 3)
379 combinedGebvsFile <- grep('selected_traits_gebv', outputFiles, ignore.case = TRUE,value = TRUE)
381 if (length(combinedGebvsFile) != 0) {
382 fileSize <- file.info(combinedGebvsFile)$size
383 if (fileSize != 0 ) {
384 combinedGebvs <- as.data.frame(fread(combinedGebvsFile))
386 rownames(combinedGebvs) <- combinedGebvs[,1]
387 combinedGebvs[,1] <- NULL
389 colnames(ordered.trGEBV) <- c(trait)
391 traitGEBV <- as.data.frame(ordered.trGEBV)
392 allGebvs <- merge(combinedGebvs, traitGEBV,
393 by = 0,
394 all = TRUE
397 rownames(allGebvs) <- allGebvs[,1]
398 allGebvs[,1] <- NULL
402 colnames(ordered.trGEBV) <- c(trait)
404 #cross-validation
406 if (is.null(predictionFile)) {
407 genoNum <- nrow(phenoTrait)
408 if (genoNum < 20 ) {
409 warning(genoNum, " is too small number of genotypes.")
412 set.seed(4567)
414 k <- 10
415 times <- 2
416 cvFolds <- createMultiFolds(phenoTrait[, 2], k=k, times=times)
418 for ( r in 1:times) {
419 re <- paste0('Rep', r)
421 for (i in 1:k) {
422 fo <- ifelse(i < 10, 'Fold0', 'Fold')
424 trFoRe <- paste0(fo, i, '.', re)
425 trG <- cvFolds[[trFoRe]]
426 slG <- as.numeric(rownames(phenoTrait[-trG,]))
428 kblup <- paste("rKblup", i, sep = ".")
430 result <- kin.blup(data = phenoTrait[trG,],
431 geno = 'genotypes',
432 pheno = trait,
433 K = relationshipMatrixFiltered,
434 n.core = nCores,
437 assign(kblup, result)
439 #calculate cross-validation accuracy
440 valBlups <- result$g
441 valBlups <- data.frame(valBlups)
443 slG <- slG[which(slG <= nrow(phenoTrait))]
445 slGDf <- phenoTrait[(rownames(phenoTrait) %in% slG),]
446 rownames(slGDf) <- slGDf[, 1]
447 slGDf[, 1] <- NULL
449 valBlups <- valBlups[(rownames(valBlups) %in% rownames(slGDf)), ]
450 valCorData <- merge(slGDf, valBlups, by=0)
451 rownames(valCorData) <- valCorData[, 1]
452 valCorData[, 1] <- NULL
454 accuracy <- try(cor(valCorData))
456 validation <- paste("validation", trFoRe, sep = ".")
458 cvTest <- paste("CV", trFoRe, sep = " ")
460 if ( class(accuracy) != "try-error")
462 accuracy <- round(accuracy[1,2], digits = 3)
463 accuracy <- data.matrix(accuracy)
465 colnames(accuracy) <- c("correlation")
466 rownames(accuracy) <- cvTest
468 assign(validation, accuracy)
470 if (!is.na(accuracy[1,1])) {
471 validationAll <- rbind(validationAll, accuracy)
477 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
479 if (!is.null(validationAll)) {
480 validationMean <- data.matrix(round(colMeans(validationAll), digits = 2))
482 rownames(validationMean) <- c("Average")
484 validationAll <- rbind(validationAll, validationMean)
485 colnames(validationAll) <- c("Correlation")
488 validationAll <- data.frame(validationAll)
493 predictionPopResult <- c()
494 predictionPopGEBVs <- c()
496 if (length(predictionData) != 0) {
497 message("running prediction for selection candidates...marker data", ncol(predictionData), " vs. ", ncol(genoDataFilteredObs))
499 genoDataTrSl <- rbind(genoDataFilteredObs, predictionData)
500 rTrSl <- A.mat(genoDataTrSl)
502 predictionPopResult <- kin.blup(data = phenoTrait,
503 geno = 'genotypes',
504 pheno = trait,
505 K = rTrSl,
506 n.core = nCores,
509 message("running prediction for selection candidates...DONE!!")
510 predictionPopGEBVs <- round(data.frame(predictionPopResult$g), 3)
511 genotypesSl <- rownames(predictionData)
512 predictionPopGEBVs <- predictionPopGEBVs[(rownames(predictionPopGEBVs) %in% genotypesSl), ]
513 predictionPopGEBVs <- data.frame(predictionPopGEBVs)
514 predictionPopGEBVs <- data.frame(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
516 colnames(predictionPopGEBVs) <- c(trait)
519 if (!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0) {
520 fwrite(predictionPopGEBVs,
521 file = predictionPopGEBVsFile,
522 row.names = TRUE,
523 sep = "\t",
524 quote = FALSE,
528 if(!is.null(validationAll)) {
529 fwrite(validationAll,
530 file = validationFile,
531 row.names = TRUE,
532 sep = "\t",
533 quote = FALSE,
538 if (!is.null(ordered.markerEffects)) {
539 fwrite(ordered.markerEffects,
540 file = markerFile,
541 row.names = TRUE,
542 sep = "\t",
543 quote = FALSE,
548 if (!is.null(ordered.trGEBV)) {
549 fwrite(ordered.trGEBV,
550 file = blupFile,
551 row.names = TRUE,
552 sep = "\t",
553 quote = FALSE,
558 if (length(combinedGebvsFile) != 0 ) {
559 if(file.info(combinedGebvsFile)$size == 0) {
560 fwrite(ordered.trGEBV,
561 file = combinedGebvsFile,
562 row.names = TRUE,
563 sep = "\t",
564 quote = FALSE,
566 } else {
567 fwrite(allGebvs,
568 file = combinedGebvsFile,
569 row.names = TRUE,
570 sep = "\t",
571 quote = FALSE,
577 if (!is.null(traitPhenoData) & length(traitPhenoFile) != 0) {
578 fwrite(traitPhenoData,
579 file = traitPhenoFile,
580 row.names = TRUE,
581 sep = "\t",
582 quote = FALSE,
587 if (!is.null(filteredGenoData) && is.null(readFilteredGenoData)) {
588 fwrite(filteredGenoData,
589 file = filteredGenoFile,
590 row.names = TRUE,
591 sep = "\t",
592 quote = FALSE,
597 if (length(filteredPredGenoFile) != 0 && is.null(readFilteredPredGenoData)) {
598 fwrite(filteredPredGenoData,
599 file = filteredPredGenoFile,
600 row.names = TRUE,
601 sep = "\t",
602 quote = FALSE,
606 ## if (!is.null(genoDataMissing)) {
607 ## write.table(genoData,
608 ## file = genoFile,
609 ## sep = "\t",
610 ## col.names = NA,
611 ## quote = FALSE,
612 ## )
614 ## }
616 ## if (!is.null(predictionDataMissing)) {
617 ## write.table(predictionData,
618 ## file = predictionFile,
619 ## sep = "\t",
620 ## col.names = NA,
621 ## quote = FALSE,
622 ## )
623 ## }
625 if (file.info(relationshipMatrixFile)$size == 0) {
626 fwrite(relationshipMatrix,
627 file = relationshipMatrixFile,
628 row.names = TRUE,
629 sep = "\t",
630 quote = FALSE,
635 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
636 fwrite(formattedPhenoData,
637 file = formattedPhenoFile,
638 row.names = TRUE,
639 sep = "\t",
640 quote = FALSE,
644 message("Done.")
646 q(save = "no", runLast = FALSE)