1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
13 allArgs
<- commandArgs()
15 inFile
<- grep("input_files",
22 outFile
<- grep("output_files",
29 outFiles
<- scan(outFile
,
33 inFiles
<- scan(inFile
,
38 traitsFile
<- grep("traits",
45 traitFile
<- grep("trait_info",
53 traitInfo
<- scan(traitFile
,
57 traitInfo
<- strsplit(traitInfo
, "\t");
58 traitId
<- traitInfo
[[1]]
59 trait
<- traitInfo
[[2]]
61 datasetInfoFile
<- grep("dataset_info",
69 if (length(datasetInfoFile
) != 0 ) {
70 datasetInfo
<- scan(datasetInfoFile
,
74 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
77 datasetInfo
<- c('single population')
81 validationTrait
<- paste("validation", trait
, sep
= "_")
83 validationFile
<- grep(validationTrait
,
90 kinshipTrait
<- paste("kinship", trait
, sep
= "_")
92 blupFile
<- grep(kinshipTrait
,
99 markerTrait
<- paste("marker", trait
, sep
= "_")
100 markerFile
<- grep(markerTrait
,
107 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
108 traitPhenoFile
<- grep(traitPhenoFile
,
115 varianceComponentsFile
<- grep("variance_components",
122 formattedPhenoFile
<- grep("formatted_phenotype_data",
129 formattedPhenoData
<- c()
132 if (length(formattedPhenoFile
) != 0 && file
.info(formattedPhenoFile
)$size
!= 0) {
133 formattedPhenoData
<- read
.table(formattedPhenoFile
,
137 na
.strings
= c("NA", " ", "--", "-", "."),
141 phenoFile
<- grep("\\/phenotype_data",
149 phenoData
<- read
.table(phenoFile
,
153 na
.strings
= c("NA", " ", "--", "-", "."),
160 if (datasetInfo
== 'combined populations') {
162 if (!is
.null(formattedPhenoData
)) {
163 phenoTrait
<- subset(formattedPhenoData
, select
=trait
)
164 phenoTrait
<- na
.omit(phenoTrait
)
167 dropColumns
<- grep(trait
,
174 phenoTrait
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
176 phenoTrait
<- as
.data
.frame(phenoTrait
)
177 row
.names(phenoTrait
) <- phenoTrait
[, 1]
178 phenoTrait
[, 1] <- NULL
179 colnames(phenoTrait
) <- trait
184 if (!is
.null(formattedPhenoData
)) {
185 phenoTrait
<- subset(formattedPhenoData
, select
=trait
)
186 phenoTrait
<- na
.omit(phenoTrait
)
189 dropColumns
<- c("uniquename", "stock_name")
190 phenoData
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
192 phenoTrait
<- subset(phenoData
,
193 select
= c("object_name", "object_id", "design", "block", "replicate", trait
)
196 experimentalDesign
<- phenoTrait
[2, 'design']
198 if (class(phenoTrait
[, trait
]) != 'numeric') {
199 phenoTrait
[, trait
] <- as
.numeric(as
.character(phenoTrait
[, trait
]))
202 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
204 if ((experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') && unique(phenoTrait$block
) > 1) {
206 message("GS experimental design: ", experimentalDesign
)
208 augData
<- subset(phenoTrait
,
209 select
= c("object_name", "object_id", "block", trait
)
212 colnames(augData
)[1] <- "genotypes"
213 colnames(augData
)[4] <- "trait"
215 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|block
),
220 if (class(model
) != "try-error") {
221 phenoTrait
<- data
.frame(fixef(model
))
223 colnames(phenoTrait
) <- trait
225 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
226 rownames(phenoTrait
) <- nn
228 phenoTrait
<- round(phenoTrait
, digits
= 2)
231 } else if (experimentalDesign
== 'Alpha') {
233 message("Experimental desgin: ", experimentalDesign
)
235 alphaData
<- subset(phenoData
,
236 select
= c("object_name", "object_id","block", "replicate", trait
)
239 colnames(alphaData
)[1] <- "genotypes"
240 colnames(alphaData
)[5] <- "trait"
242 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|replicate
/block
),
247 if (class(model
) != "try-error") {
248 phenoTrait
<- data
.frame(fixef(model
))
250 colnames(phenoTrait
) <- trait
252 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
253 rownames(phenoTrait
) <- nn
255 phenoTrait
<- round(phenoTrait
, digits
= 2)
261 phenoTrait
<- subset(phenoData
,
262 select
= c("object_name", "object_id", trait
)
265 if (sum(is
.na(phenoTrait
)) > 0) {
266 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
267 phenoTrait
<- na
.omit(phenoTrait
)
270 #calculate mean of reps/plots of the same accession and
271 #create new df with the accession means
273 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
274 phenoTrait
<- data
.frame(phenoTrait
)
275 message('phenotyped lines before averaging: ', length(row
.names(phenoTrait
)))
277 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
278 message('phenotyped lines after averaging: ', length(row
.names(phenoTrait
)))
280 phenoTrait
<- subset(phenoTrait
, select
=c("object_name", trait
))
281 row
.names(phenoTrait
) <- phenoTrait
[, 1]
282 phenoTrait
[, 1] <- NULL
284 #format all-traits population phenotype dataset
285 ## formattedPhenoData <- phenoData
286 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
288 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
289 ## formattedPhenoData <- ddply(formattedPhenoData,
294 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
295 ## formattedPhenoData[, 1] <- NULL
297 ## formattedPhenoData <- round(formattedPhenoData,
304 genoFile
<- grep("genotype_data",
311 genoData
<- read
.table(genoFile
,
315 na
.strings
= c("NA", " ", "--", "-"),
319 genoData
<- genoData
[order(row
.names(genoData
)), ]
321 #impute genotype values for obs with missing values,
322 #based on mean of neighbouring 10 (arbitrary) obs
323 genoDataMissing
<-c()
325 if (sum(is
.na(genoData
)) > 0) {
326 genoDataMissing
<- c('yes')
328 message("sum of geno missing values, ", sum(is
.na(genoData
)) )
329 genoData
<- na
.roughfix(genoData
)
330 genoData
<- data
.matrix(genoData
)
333 predictionTempFile
<- grep("prediction_population",
340 predictionFile
<- c()
342 message('prediction temp genotype file: ', predictionTempFile
)
344 if (length(predictionTempFile
) !=0 ) {
345 predictionFile
<- scan(predictionTempFile
,
350 message('prediction genotype file: ', predictionFile
)
352 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs",
359 message("prediction gebv file: ", predictionPopGEBVsFile
)
361 predictionData
<- c()
363 if (length(predictionFile
) !=0 ) {
364 predictionData
<- read
.table(predictionFile
,
368 na
.strings
= c("NA", " ", "--", "-"),
373 #create phenotype and genotype datasets with
375 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
376 message('genotyped lines: ', length(row
.names(genoData
)))
378 #extract observation lines with both
379 #phenotype and genotype data only.
380 commonObs
<- intersect(row
.names(phenoTrait
), row
.names(genoData
))
381 commonObs
<- data
.frame(commonObs
)
382 rownames(commonObs
)<-commonObs
[, 1]
384 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
385 #include in the genotype dataset only observation lines
387 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
388 genoDataFiltered
<- genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
389 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoDataFiltered
)))
390 #drop observation lines without genotype data
391 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
393 phenoTrait
<- merge(data
.frame(phenoTrait
), commonObs
, by
=0, all
=FALSE)
394 rownames(phenoTrait
) <-phenoTrait
[, 1]
395 phenoTrait
<- subset(phenoTrait
, select
=trait
)
397 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
398 #a set of only observation lines with genotype data
400 traitPhenoData
<- data
.frame(round(phenoTrait
, digits
=2))
401 phenoTrait
<- data
.matrix(phenoTrait
)
402 genoDataFiltered
<- data
.matrix(genoDataFiltered
)
404 #impute missing data in prediction data
405 predictionDataMissing
<- c()
406 if (length(predictionData
) != 0) {
407 #purge markers unique to both populations
408 commonMarkers
<- intersect(names(data
.frame(genoDataFiltered
)), names(predictionData
))
409 predictionData
<- subset(predictionData
, select
= commonMarkers
)
410 genoDataFiltered
<- subset(genoDataFiltered
, select
= commonMarkers
)
412 predictionData
<- data
.matrix(predictionData
)
414 if (sum(is
.na(predictionData
)) > 0) {
415 predictionDataMissing
<- c('yes')
416 message("sum of geno missing values, ", sum(is
.na(predictionData
)) )
417 predictionData
<- data
.matrix(na
.roughfix(predictionData
))
422 relationshipMatrixFile
<- grep("relationship_matrix",
429 message("relationship matrix file: ", relationshipMatrixFile
)
430 #message("relationship matrix file size: ", file.info(relationshipMatrixFile)$size)
431 relationshipMatrix
<- c()
432 if (length(relationshipMatrixFile
) != 0) {
433 if (file
.info(relationshipMatrixFile
)$size
> 0 ) {
434 relationshipDf
<- read
.table(relationshipMatrixFile
,
442 relationshipMatrix
<- data
.matrix(relationshipDf
)
445 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
446 genoTrCode
<- grep("2", genoDataFiltered
[1, ], fixed
=TRUE, value
=TRUE)
447 if(length(genoTrCode
) != 0) {
448 genoDataFiltered
<- genoDataFiltered
- 1
451 if (length(predictionData
) != 0 ) {
452 genoSlCode
<- grep("2", predictionData
[1, ], fixed
=TRUE, value
=TRUE)
453 if (length(genoSlCode
) != 0 ) {
454 predictionData
<- predictionData
- 1
458 ordered
.markerEffects
<- c()
459 if ( length(predictionData
) == 0 ) {
460 markerEffects
<- mixed
.solve(y
= phenoTrait
,
464 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
465 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
466 ordered
.markerEffects
<- round(ordered
.markerEffects
,
470 colnames(ordered
.markerEffects
) <- c("Marker Effects")
473 #correlation between breeding values based on
474 #marker effects and relationship matrix
475 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
478 #additive relationship model
479 #calculate the inner products for
480 #genotypes (realized relationship matrix)
481 if (length(relationshipMatrixFile
) != 0) {
482 if (file
.info(relationshipMatrixFile
)$size
== 0) {
483 relationshipMatrix
<- tcrossprod(data
.matrix(genoData
))
486 relationshipMatrixFiltered
<- relationshipMatrix
[(rownames(relationshipMatrix
) %in% rownames(commonObs
)),]
487 relationshipMatrixFiltered
<- relationshipMatrixFiltered
[, (colnames(relationshipMatrixFiltered
) %in% rownames(commonObs
))]
489 #construct an identity matrix for genotypes
490 identityMatrix
<- diag(nrow(phenoTrait
))
492 relationshipMatrixFiltered
<- data
.matrix(relationshipMatrixFiltered
)
494 iGEBV
<- mixed
.solve(y
= phenoTrait
,
496 K
= relationshipMatrixFiltered
503 if ( is
.null(predictionFile
) == TRUE ) {
504 heritability
<- round((iGEBV$Vu
/(iGEBV$Vu
+ iGEBV$Ve
) * 100), digits
=3)
505 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
506 cat('Error variance', iGEBV$Ve
, file
=varianceComponentsFile
, sep
="\t", append
=TRUE)
507 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
508 cat('Additive genetic variance', iGEBV$Vu
, file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
509 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
510 cat('μ', iGEBV$beta
,file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
511 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
512 cat('Heritability (h, %)', heritability
, file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
515 iGEBV
<- data
.matrix(iGEBVu
)
517 ordered
.iGEBV
<- as
.data
.frame(iGEBV
[order(-iGEBV
[, 1]), ] )
519 ordered
.iGEBV
<- round(ordered
.iGEBV
,
523 combinedGebvsFile
<- grep('selected_traits_gebv',
531 if (length(combinedGebvsFile
) != 0)
533 fileSize
<- file
.info(combinedGebvsFile
)$size
536 combinedGebvs
<-read
.table(combinedGebvsFile
,
543 colnames(ordered
.iGEBV
) <- c(trait
)
545 traitGEBV
<- as
.data
.frame(ordered
.iGEBV
)
546 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
551 rownames(allGebvs
) <- allGebvs
[,1]
556 colnames(ordered
.iGEBV
) <- c(trait
)
561 if(is
.null(predictionFile
)) {
562 genoNum
<- nrow(phenoTrait
)
564 warning(genoNum
, " is too small number of genotypes.")
566 reps
<- round_any(genoNum
, 10, f
= ceiling
) %/% 10
570 if (genoNum
%% 10 == 0) {
571 genotypeGroups
<- rep(1:10, reps
)
573 genotypeGroups
<- rep(1:10, reps
) [- (genoNum
%% 10) ]
577 genotypeGroups
<- genotypeGroups
[ order (runif(genoNum
)) ]
580 tr
<- paste("trPop", i
, sep
= ".")
581 sl
<- paste("slPop", i
, sep
= ".")
583 trG
<- which(genotypeGroups
!= i
)
584 slG
<- which(genotypeGroups
== i
)
589 kblup
<- paste("rKblup", i
, sep
= ".")
591 result
<- kinship
.BLUP(y
= phenoTrait
[trG
, ],
592 G
.train
= genoDataFiltered
[trG
, ],
593 G
.pred
= genoDataFiltered
[slG
, ],
594 mixed
.method
= "REML",
598 assign(kblup
, result
)
600 #calculate cross-validation accuracy
601 valCorData
<- merge(phenoTrait
[slG
, ], result$g
.pred
, by
=0, all
=FALSE)
602 rownames(valCorData
) <- valCorData
[, 1]
603 valCorData
[, 1] <- NULL
605 accuracy
<- try(cor(valCorData
))
606 validation
<- paste("validation", i
, sep
= ".")
608 cvTest
<- paste("Validation test", i
, sep
= " ")
610 if ( class(accuracy
) != "try-error")
612 accuracy
<- round(accuracy
[1,2], digits
= 3)
613 accuracy
<- data
.matrix(accuracy
)
615 colnames(accuracy
) <- c("correlation")
616 rownames(accuracy
) <- cvTest
618 assign(validation
, accuracy
)
620 if (!is
.na(accuracy
[1,1])) {
621 validationAll
<- rbind(validationAll
, accuracy
)
626 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
628 if (!is
.null(validationAll
)) {
629 validationMean
<- data
.matrix(round(colMeans(validationAll
),
634 rownames(validationMean
) <- c("Average")
636 validationAll
<- rbind(validationAll
, validationMean
)
637 colnames(validationAll
) <- c("Correlation")
640 #predict GEBVs for selection population
641 if (length(predictionData
) !=0 ) {
642 predictionData
<- data
.matrix(round(predictionData
, digits
= 0 ))
645 predictionPopResult
<- c()
646 predictionPopGEBVs
<- c()
648 if (length(predictionData
) != 0) {
649 message("running prediction for selection candidates...marker data", ncol(predictionData
), " vs. ", ncol(genoDataFiltered
))
651 predictionPopResult
<- kinship
.BLUP(y
= phenoTrait
,
652 G
.train
= genoDataFiltered
,
653 G
.pred
= predictionData
,
654 mixed
.method
= "REML",
657 message("running prediction for selection candidates...DONE!!")
659 predictionPopGEBVs
<- round(data
.matrix(predictionPopResult$g
.pred
), digits
= 3)
660 predictionPopGEBVs
<- data
.matrix(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
662 colnames(predictionPopGEBVs
) <- c(trait
)
666 if (!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0) {
667 write
.table(predictionPopGEBVs
,
668 file
= predictionPopGEBVsFile
,
676 if(!is
.null(validationAll
)) {
677 write
.table(validationAll
,
678 file
= validationFile
,
686 if (!is
.null(ordered
.markerEffects
)) {
687 write
.table(ordered
.markerEffects
,
696 if (!is
.null(ordered
.iGEBV
)) {
697 write
.table(ordered
.iGEBV
,
706 if (length(combinedGebvsFile
) != 0 ) {
707 if(file
.info(combinedGebvsFile
)$size
== 0) {
708 write
.table(ordered
.iGEBV
,
709 file
= combinedGebvsFile
,
715 write
.table(allGebvs
,
716 file
= combinedGebvsFile
,
724 if (!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0) {
725 write
.table(traitPhenoData
,
726 file
= traitPhenoFile
,
735 if (!is
.null(genoDataMissing
)) {
736 write
.table(genoData
,
745 if (!is
.null(predictionDataMissing
)) {
746 write
.table(predictionData
,
747 file
= predictionFile
,
755 if (file
.info(relationshipMatrixFile
)$size
== 0) {
756 write
.table(relationshipMatrix
,
757 file
= relationshipMatrixFile
,
765 if (file
.info(formattedPhenoFile
)$size
== 0 & !is
.null(formattedPhenoData
) ) {
766 write
.table(formattedPhenoData
,
767 file
= formattedPhenoFile
,
775 #should also send notification to analysis owner
776 to
<- c("<iyt2@cornell.edu>")
777 subject
<- paste(trait
, ' GS analysis done', sep
= ':')
778 body
<- c("Dear User,\n\n")
779 body
<- paste(body
, 'The genomic selection analysis for', sep
= "")
780 body
<- paste(body
, trait
, sep
= " ")
781 body
<- paste(body
, "is done.\n\nRegards and Thanks.\nSGN", sep
= " ")
783 #should use SGN's smtp server eventually
784 sendmail(to
, subject
, body
, password
= "rmail")
786 q(save
= "no", runLast
= FALSE)