2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP
5 # Isaak Y Tecle (iyt2@cornell.edu)
17 allArgs
<- commandArgs()
19 inFile
<- grep("input_files",
26 outFile
<- grep("output_files",
33 outFiles
<- scan(outFile
,
37 inFiles
<- scan(inFile
,
42 traitsFile
<- grep("traits",
49 traitFile
<- grep("trait_info",
57 traitInfo
<- scan(traitFile
,
61 traitInfo
<- strsplit(traitInfo
, "\t");
62 traitId
<- traitInfo
[[1]]
63 trait
<- traitInfo
[[2]]
65 datasetInfoFile
<- grep("dataset_info",
73 if (length(datasetInfoFile
) != 0 ) {
74 datasetInfo
<- scan(datasetInfoFile
,
78 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
81 datasetInfo
<- c('single population')
85 validationTrait
<- paste("validation", trait
, sep
= "_")
87 validationFile
<- grep(validationTrait
,
94 kinshipTrait
<- paste("kinship", trait
, sep
= "_")
96 blupFile
<- grep(kinshipTrait
,
103 markerTrait
<- paste("marker", trait
, sep
= "_")
104 markerFile
<- grep(markerTrait
,
111 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
112 traitPhenoFile
<- grep(traitPhenoFile
,
119 varianceComponentsFile
<- grep("variance_components",
126 formattedPhenoFile
<- grep("formatted_phenotype_data",
133 formattedPhenoData
<- c()
136 if (length(formattedPhenoFile
) != 0 && file
.info(formattedPhenoFile
)$size
!= 0) {
137 formattedPhenoData
<- as
.data
.frame(fread(formattedPhenoFile
,
138 na
.strings
= c("NA", " ", "--", "-", ".")
142 row
.names(formattedPhenoData
) <- formattedPhenoData
[, 1]
143 formattedPhenoData
[, 1] <- NULL
146 phenoFile
<- grep("\\/phenotype_data",
154 phenoData
<- fread(phenoFile
,
156 na
.strings
= c("NA", " ", "--", "-", "."),
161 phenoData
<- as
.data
.frame(phenoData
)
164 if (datasetInfo
== 'combined populations') {
166 if (!is
.null(formattedPhenoData
)) {
167 phenoTrait
<- subset(formattedPhenoData
, select
=trait
)
168 phenoTrait
<- na
.omit(phenoTrait
)
171 dropColumns
<- grep(trait
,
178 phenoTrait
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
180 phenoTrait
<- as
.data
.frame(phenoTrait
)
181 row
.names(phenoTrait
) <- phenoTrait
[, 1]
182 phenoTrait
[, 1] <- NULL
183 colnames(phenoTrait
) <- trait
188 if (!is
.null(formattedPhenoData
)) {
189 phenoTrait
<- subset(formattedPhenoData
, select
=trait
)
190 phenoTrait
<- na
.omit(phenoTrait
)
193 dropColumns
<- c("uniquename", "stock_name")
194 phenoData
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
196 phenoTrait
<- subset(phenoData
,
197 select
= c("object_name", "object_id", "design", "block", "replicate", trait
)
200 experimentalDesign
<- phenoTrait
[2, 'design']
202 if (class(phenoTrait
[, trait
]) != 'numeric') {
203 phenoTrait
[, trait
] <- as
.numeric(as
.character(phenoTrait
[, trait
]))
206 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
208 if ((experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') && unique(phenoTrait$block
) > 1) {
210 message("GS experimental design: ", experimentalDesign
)
212 augData
<- subset(phenoTrait
,
213 select
= c("object_name", "object_id", "block", trait
)
216 colnames(augData
)[1] <- "genotypes"
217 colnames(augData
)[4] <- "trait"
219 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|block
),
224 if (class(model
) != "try-error") {
225 phenoTrait
<- data
.frame(fixef(model
))
227 colnames(phenoTrait
) <- trait
229 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
230 rownames(phenoTrait
) <- nn
232 phenoTrait
<- round(phenoTrait
, digits
= 2)
235 } else if (experimentalDesign
== 'Alpha') {
237 message("Experimental desgin: ", experimentalDesign
)
239 alphaData
<- subset(phenoData
,
240 select
= c("object_name", "object_id","block", "replicate", trait
)
243 colnames(alphaData
)[1] <- "genotypes"
244 colnames(alphaData
)[5] <- "trait"
246 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|replicate
/block
),
251 if (class(model
) != "try-error") {
252 phenoTrait
<- data
.frame(fixef(model
))
254 colnames(phenoTrait
) <- trait
256 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
257 rownames(phenoTrait
) <- nn
259 phenoTrait
<- round(phenoTrait
, digits
= 2)
265 phenoTrait
<- subset(phenoData
,
266 select
= c("object_name", "object_id", trait
)
269 if (sum(is
.na(phenoTrait
)) > 0) {
270 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
271 phenoTrait
<- na
.omit(phenoTrait
)
274 #calculate mean of reps/plots of the same accession and
275 #create new df with the accession means
277 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
278 phenoTrait
<- data
.frame(phenoTrait
)
279 message('phenotyped lines before averaging: ', length(row
.names(phenoTrait
)))
281 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
282 message('phenotyped lines after averaging: ', length(row
.names(phenoTrait
)))
284 phenoTrait
<- subset(phenoTrait
, select
=c("object_name", trait
))
285 row
.names(phenoTrait
) <- phenoTrait
[, 1]
286 phenoTrait
[, 1] <- NULL
288 #format all-traits population phenotype dataset
289 ## formattedPhenoData <- phenoData
290 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
292 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
293 ## formattedPhenoData <- ddply(formattedPhenoData,
298 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
299 ## formattedPhenoData[, 1] <- NULL
301 ## formattedPhenoData <- round(formattedPhenoData,
308 genoFile
<- grep("genotype_data",
315 genoData
<- fread(genoFile
,
317 na
.strings
= c("NA", " ", "--", "-"),
320 genoData
<- as
.data
.frame(genoData
)
321 rownames(genoData
) <- genoData
[, 1]
322 genoData
[, 1] <- NULL
323 genoData
<- genoData
[, colSums(is
.na(genoData
)) < nrow(genoData
) * 0.5]
325 predictionTempFile
<- grep("prediction_population",
332 predictionFile
<- c()
334 message('prediction temp genotype file: ', predictionTempFile
)
336 if (length(predictionTempFile
) !=0 ) {
337 predictionFile
<- scan(predictionTempFile
,
342 message('prediction genotype file: ', predictionFile
)
344 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs",
350 message("prediction gebv file: ", predictionPopGEBVsFile
)
352 predictionData
<- c()
354 if (length(predictionFile
) !=0 ) {
356 predictionData
<- fread(predictionFile
,
357 na
.strings
= c("NA", " ", "--", "-"),
360 predictionData
<- as
.data
.frame(predictionData
)
361 rownames(predictionData
) <- predictionData
[, 1]
362 predictionData
[, 1] <- NULL
363 predictionData
<- predictionData
[, colSums(is
.na(predictionData
)) < nrow(predictionData
) * 0.5]
366 #impute genotype values for obs with missing values,
367 #based on mean of neighbouring 10 (arbitrary) obs
368 genoDataMissing
<- c()
370 if (sum(is
.na(genoData
)) > 0) {
371 genoDataMissing
<- c('yes')
373 message("sum of geno missing values, ", sum(is
.na(genoData
)) )
374 genoData
<- na
.roughfix(genoData
)
375 genoData
<- data
.matrix(genoData
)
378 genoData
<- genoData
[order(row
.names(genoData
)), ]
380 #create phenotype and genotype datasets with
382 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
383 message('genotyped lines: ', length(row
.names(genoData
)))
385 #extract observation lines with both
386 #phenotype and genotype data only.
387 commonObs
<- intersect(row
.names(phenoTrait
), row
.names(genoData
))
388 commonObs
<- data
.frame(commonObs
)
389 rownames(commonObs
)<-commonObs
[, 1]
391 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
392 #include in the genotype dataset only observation lines
394 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
395 genoDataFiltered
<- genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
396 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoDataFiltered
)))
397 #drop observation lines without genotype data
398 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
399 phenoTrait
<- merge(data
.frame(phenoTrait
), commonObs
, by
=0, all
=FALSE)
400 rownames(phenoTrait
) <-phenoTrait
[, 1]
401 phenoTrait
<- subset(phenoTrait
, select
=trait
)
403 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
404 #a set of only observation lines with genotype data
406 traitPhenoData
<- data
.frame(round(phenoTrait
, digits
=2))
407 phenoTrait
<- data
.matrix(phenoTrait
)
408 genoDataFiltered
<- data
.matrix(genoDataFiltered
)
410 #impute missing data in prediction data
411 predictionDataMissing
<- c()
412 if (length(predictionData
) != 0) {
413 #purge markers unique to both populations
414 commonMarkers
<- intersect(names(data
.frame(genoDataFiltered
)), names(predictionData
))
415 predictionData
<- subset(predictionData
, select
= commonMarkers
)
416 genoDataFiltered
<- subset(genoDataFiltered
, select
= commonMarkers
)
418 # predictionData <- data.matrix(predictionData)
420 if (sum(is
.na(predictionData
)) > 0) {
421 predictionDataMissing
<- c('yes')
422 message("sum of geno missing values, ", sum(is
.na(predictionData
)) )
423 predictionData
<- data
.matrix(na
.roughfix(predictionData
))
428 relationshipMatrixFile
<- grep("relationship_matrix",
435 message("relationship matrix file: ", relationshipMatrixFile
)
436 #message("relationship matrix file size: ", file.info(relationshipMatrixFile)$size)
437 relationshipMatrix
<- c()
438 if (length(relationshipMatrixFile
) != 0) {
439 if (file
.info(relationshipMatrixFile
)$size
> 0 ) {
440 relationshipDf
<- as
.data
.frame(fread(relationshipMatrixFile
))
442 rownames(relationshipDf
) <- relationshipDf
[, 1]
443 relationshipDf
[, 1] <- NULL
444 relationshipMatrix
<- data
.matrix(relationshipDf
)
448 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
449 genoTrCode
<- grep("2", genoDataFiltered
[1, ], fixed
=TRUE, value
=TRUE)
450 if(length(genoTrCode
) != 0) {
451 genoDataFiltered
<- genoDataFiltered
- 1
454 if (length(predictionData
) != 0 ) {
455 genoSlCode
<- grep("2", predictionData
[1, ], fixed
=TRUE, value
=TRUE)
456 if (length(genoSlCode
) != 0 ) {
457 predictionData
<- predictionData
- 1
461 ordered
.markerEffects
<- c()
462 if ( length(predictionData
) == 0 ) {
463 markerEffects
<- mixed
.solve(y
= phenoTrait
,
467 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
468 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
469 ordered
.markerEffects
<- round(ordered
.markerEffects
,
473 colnames(ordered
.markerEffects
) <- c("Marker Effects")
476 #correlation between breeding values based on
477 #marker effects and relationship matrix
478 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
481 #additive relationship model
482 #calculate the inner products for
483 #genotypes (realized relationship matrix)
484 if (length(relationshipMatrixFile
) != 0) {
485 if (file
.info(relationshipMatrixFile
)$size
== 0) {
486 relationshipMatrix
<- tcrossprod(data
.matrix(genoData
))
489 relationshipMatrixFiltered
<- relationshipMatrix
[(rownames(relationshipMatrix
) %in% rownames(commonObs
)),]
490 relationshipMatrixFiltered
<- relationshipMatrixFiltered
[, (colnames(relationshipMatrixFiltered
) %in% rownames(commonObs
))]
492 #construct an identity matrix for genotypes
493 identityMatrix
<- diag(nrow(phenoTrait
))
495 relationshipMatrixFiltered
<- data
.matrix(relationshipMatrixFiltered
)
497 iGEBV
<- mixed
.solve(y
= phenoTrait
,
499 K
= relationshipMatrixFiltered
506 if ( is
.null(predictionFile
) == TRUE ) {
507 # heritability <- round((iGEBV$Vu /(iGEBV$Vu + iGEBV$Ve) * 100), digits=3)
508 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
509 cat('Error variance', iGEBV$Ve
, file
=varianceComponentsFile
, sep
="\t", append
=TRUE)
510 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
511 cat('Additive genetic variance', iGEBV$Vu
, file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
512 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
513 cat('μ', iGEBV$beta
,file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
514 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
515 # cat('Heritability (h, %)', heritability, file=varianceComponentsFile, sep='\t', append=TRUE)
518 iGEBV
<- data
.matrix(iGEBVu
)
520 ordered
.iGEBV
<- as
.data
.frame(iGEBV
[order(-iGEBV
[, 1]), ] )
522 ordered
.iGEBV
<- round(ordered
.iGEBV
,
526 combinedGebvsFile
<- grep('selected_traits_gebv',
534 if (length(combinedGebvsFile
) != 0)
536 fileSize
<- file
.info(combinedGebvsFile
)$size
539 combinedGebvs
<- as
.data
.frame(fread(combinedGebvsFile
))
541 rownames(combinedGebvs
) <- combinedGebvs
[,1]
542 combinedGebvs
[,1] <- NULL
544 colnames(ordered
.iGEBV
) <- c(trait
)
546 traitGEBV
<- as
.data
.frame(ordered
.iGEBV
)
547 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
552 rownames(allGebvs
) <- allGebvs
[,1]
557 colnames(ordered
.iGEBV
) <- c(trait
)
562 if(is
.null(predictionFile
)) {
563 genoNum
<- nrow(phenoTrait
)
565 warning(genoNum
, " is too small number of genotypes.")
567 reps
<- round_any(genoNum
, 10, f
= ceiling
) %/% 10
571 if (genoNum
%% 10 == 0) {
572 genotypeGroups
<- rep(1:10, reps
)
574 genotypeGroups
<- rep(1:10, reps
) [- (genoNum
%% 10) ]
578 genotypeGroups
<- genotypeGroups
[ order (runif(genoNum
)) ]
581 tr
<- paste("trPop", i
, sep
= ".")
582 sl
<- paste("slPop", i
, sep
= ".")
584 trG
<- which(genotypeGroups
!= i
)
585 slG
<- which(genotypeGroups
== i
)
590 kblup
<- paste("rKblup", i
, sep
= ".")
592 result
<- kinship
.BLUP(y
= phenoTrait
[trG
, ],
593 G
.train
= genoDataFiltered
[trG
, ],
594 G
.pred
= genoDataFiltered
[slG
, ],
595 mixed
.method
= "REML",
599 assign(kblup
, result
)
601 #calculate cross-validation accuracy
602 valCorData
<- merge(phenoTrait
[slG
, ], result$g
.pred
, by
=0, all
=FALSE)
603 rownames(valCorData
) <- valCorData
[, 1]
604 valCorData
[, 1] <- NULL
606 accuracy
<- try(cor(valCorData
))
607 validation
<- paste("validation", i
, sep
= ".")
609 cvTest
<- paste("Validation test", i
, sep
= " ")
611 if ( class(accuracy
) != "try-error")
613 accuracy
<- round(accuracy
[1,2], digits
= 3)
614 accuracy
<- data
.matrix(accuracy
)
616 colnames(accuracy
) <- c("correlation")
617 rownames(accuracy
) <- cvTest
619 assign(validation
, accuracy
)
621 if (!is
.na(accuracy
[1,1])) {
622 validationAll
<- rbind(validationAll
, accuracy
)
627 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
629 if (!is
.null(validationAll
)) {
630 validationMean
<- data
.matrix(round(colMeans(validationAll
),
635 rownames(validationMean
) <- c("Average")
637 validationAll
<- rbind(validationAll
, validationMean
)
638 colnames(validationAll
) <- c("Correlation")
641 #predict GEBVs for selection population
642 #if (length(predictionData) !=0 ) {
643 # predictionData <- data.matrix(round(predictionData, digits = 0 ))
646 predictionPopResult
<- c()
647 predictionPopGEBVs
<- c()
649 if (length(predictionData
) != 0) {
650 message("running prediction for selection candidates...marker data", ncol(predictionData
), " vs. ", ncol(genoDataFiltered
))
652 predictionPopResult
<- kinship
.BLUP(y
= phenoTrait
,
653 G
.train
= genoDataFiltered
,
654 G
.pred
= predictionData
,
655 mixed
.method
= "REML",
658 message("running prediction for selection candidates...DONE!!")
660 predictionPopGEBVs
<- round(data
.matrix(predictionPopResult$g
.pred
), digits
= 3)
661 predictionPopGEBVs
<- data
.matrix(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
663 colnames(predictionPopGEBVs
) <- c(trait
)
667 if (!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0) {
668 write
.table(predictionPopGEBVs
,
669 file
= predictionPopGEBVsFile
,
677 if(!is
.null(validationAll
)) {
678 write
.table(validationAll
,
679 file
= validationFile
,
687 if (!is
.null(ordered
.markerEffects
)) {
688 write
.table(ordered
.markerEffects
,
697 if (!is
.null(ordered
.iGEBV
)) {
698 write
.table(ordered
.iGEBV
,
707 if (length(combinedGebvsFile
) != 0 ) {
708 if(file
.info(combinedGebvsFile
)$size
== 0) {
709 write
.table(ordered
.iGEBV
,
710 file
= combinedGebvsFile
,
716 write
.table(allGebvs
,
717 file
= combinedGebvsFile
,
725 if (!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0) {
726 write
.table(traitPhenoData
,
727 file
= traitPhenoFile
,
736 ## if (!is.null(genoDataMissing)) {
737 ## write.table(genoData,
746 ## if (!is.null(predictionDataMissing)) {
747 ## write.table(predictionData,
748 ## file = predictionFile,
756 if (file
.info(relationshipMatrixFile
)$size
== 0) {
757 write
.table(relationshipMatrix
,
758 file
= relationshipMatrixFile
,
766 if (file
.info(formattedPhenoFile
)$size
== 0 && !is
.null(formattedPhenoData
) ) {
767 write
.table(formattedPhenoData
,
768 file
= formattedPhenoFile
,
776 q(save
= "no", runLast
= FALSE)