1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
14 allArgs
<- commandArgs()
16 inFile
<- grep("input_files",
23 outFile
<- grep("output_files",
30 outFiles
<- scan(outFile
,
34 inFiles
<- scan(inFile
,
39 traitsFile
<- grep("traits",
46 traitFile
<- grep("trait_info",
54 traitInfo
<- scan(traitFile
,
58 traitInfo
<- strsplit(traitInfo
, "\t");
59 traitId
<- traitInfo
[[1]]
60 trait
<- traitInfo
[[2]]
62 datasetInfoFile
<- grep("dataset_info",
70 if(length(datasetInfoFile
) != 0 ) {
71 datasetInfo
<- scan(datasetInfoFile
,
75 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
78 datasetInfo
<- c('single population')
82 validationTrait
<- paste("validation", trait
, sep
= "_")
84 validationFile
<- grep(validationTrait
,
91 kinshipTrait
<- paste("kinship", trait
, sep
= "_")
93 blupFile
<- grep(kinshipTrait
,
100 markerTrait
<- paste("marker", trait
, sep
= "_")
101 markerFile
<- grep(markerTrait
,
108 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
109 traitPhenoFile
<- grep(traitPhenoFile
,
117 formattedPhenoDataFile
<- grep("formatted_phenotype_data",
124 varianceComponentsFile
<- grep("variance_components",
131 phenoFile
<- grep("phenotype_data",
138 message("phenotype dataset file: ", phenoFile
)
139 message("dataset info: ", datasetInfo
)
140 message("phenotype dataset file: ", phenoFile
)
142 phenoData
<- read
.table(phenoFile
,
146 na
.strings
= c("NA", " ", "--", "-", "."),
151 formattedPhenoData
<- c()
153 if (datasetInfo
== 'combined populations') {
154 dropColumns
<- grep(trait
,
161 phenoTrait
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
163 phenoTrait
<- as
.data
.frame(phenoTrait
)
164 row
.names(phenoTrait
) <- phenoTrait
[, 1]
165 phenoTrait
[, 1] <- NULL
166 colnames(phenoTrait
) <- trait
170 dropColumns
<- c("uniquename", "stock_name")
171 phenoData
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
173 phenoTrait
<- subset(phenoData
,
174 select
= c("object_name", "object_id", "design", "block", "replicate", trait
)
177 experimentalDesign
<- phenoTrait
[2, 'design']
178 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
180 if (experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') {
181 message("experimental design: ", experimentalDesign
)
183 augData
<- subset(phenoData
,
184 select
= c("object_name", "object_id", "block", trait
)
187 colnames(augData
)[1] <- "genotypes"
188 colnames(augData
)[4] <- "trait"
190 ff
<- trait
~ 0 + genotypes
199 adjMeans
<- data
.matrix(fixed
.effects(model
))
201 colnames(adjMeans
) <- trait
203 nn
<- gsub('genotypes', '', rownames(adjMeans
))
204 rownames(adjMeans
) <- nn
205 adjMeans
<- round(adjMeans
, digits
= 2)
207 phenoTrait
<- data
.frame(adjMeans
)
209 } else if (experimentalDesign
== 'Alpha') {
210 message("experimental design: ", experimentalDesign
)
212 alphaData
<- subset(phenoData
,
213 select
= c("object_name", "object_id", "block", "replicate", trait
)
216 colnames(alphaData
)[2] <- "genotypes"
217 colnames(alphaData
)[5] <- "trait"
219 ff
<- trait
~ 0 + genotypes
223 random
= ~1|replicate
/block
,
228 adjMeans
<- data
.matrix(fixed
.effects(model
))
229 colnames(adjMeans
) <- trait
231 nn
<- gsub('genotypes', '', rownames(adjMeans
))
232 rownames(adjMeans
) <- nn
233 adjMeans
<- round(adjMeans
, digits
= 2)
235 phenoTrait
<- data
.frame(adjMeans
)
239 phenoTrait
<- subset(phenoData
,
240 select
= c("object_name", "object_id", trait
)
243 if (sum(is
.na(phenoTrait
)) > 0) {
245 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
246 phenoTrait
<- na
.omit(phenoTrait
)
250 #calculate mean of reps/plots of the same accession and
251 #create new df with the accession means
253 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
254 phenoTrait
<- data
.frame(phenoTrait
)
255 message('phenotyped lines before averaging: ', length(row
.names(phenoTrait
)))
257 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
258 message('phenotyped lines after averaging: ', length(row
.names(phenoTrait
)))
260 row
.names(phenoTrait
) <- phenoTrait
[, 1]
261 phenoTrait
[, 1] <- NULL
263 #format all-traits population phenotype dataset
264 formattedPhenoData
<- phenoData
265 dropColumns
<- c("object_id", "stock_id", "design", "block", "replicate" )
267 formattedPhenoData
<- formattedPhenoData
[, !(names(formattedPhenoData
) %in% dropColumns
)]
268 formattedPhenoData
<- ddply(formattedPhenoData
,
273 row
.names(formattedPhenoData
) <- formattedPhenoData
[, 1]
274 formattedPhenoData
[, 1] <- NULL
276 formattedPhenoData
<- round(formattedPhenoData
,
282 genoFile
<- grep("genotype_data",
289 genoData
<- read
.table(genoFile
,
293 na
.strings
= c("NA", " ", "--", "-"),
297 genoData
<- data
.matrix(genoData
[order(row
.names(genoData
)), ])
300 #impute genotype values for obs with missing values,
301 #based on mean of neighbouring 10 (arbitrary) obs
302 genoDataMissing
<-c()
304 if (sum(is
.na(genoData
)) > 0) {
305 genoDataMissing
<- c('yes')
306 message("sum of geno missing values, ", sum(is
.na(genoData
)) )
307 genoData
<- kNNImpute(genoData
, 10)
308 genoData
<- data
.frame(genoData
)
310 #extract columns with imputed values
311 genoData
<- subset(genoData
,
312 select
= grep("^x", names(genoData
))
315 #remove prefix 'x.' from imputed columns
316 names(genoData
) <- sub("x.", "", names(genoData
))
318 genoData
<- round(genoData
, digits
= 0)
319 genoData
<- data
.matrix(genoData
)
325 predictionTempFile
<- grep("prediction_population",
332 predictionFile
<- c()
334 message('prediction temp genotype file: ', predictionTempFile
)
336 if (length(predictionTempFile
) !=0 ) {
337 predictionFile
<- scan(predictionTempFile
,
342 message('prediction genotype file: ', predictionFile
)
344 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs",
351 message("prediction gebv file: ", predictionPopGEBVsFile
)
353 predictionData
<- c()
355 if (length(predictionFile
) !=0 ) {
356 predictionData
<- read
.table(predictionFile
,
360 na
.strings
= c("NA", " ", "--", "-"),
365 #create phenotype and genotype datasets with
367 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
368 message('genotyped lines: ', length(row
.names(genoData
)))
370 #extract observation lines with both
371 #phenotype and genotype data only.
372 commonObs
<- intersect(row
.names(phenoTrait
), row
.names(genoData
))
373 commonObs
<- data
.frame(commonObs
)
374 rownames(commonObs
)<-commonObs
[, 1]
376 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
377 #include in the genotype dataset only observation lines
379 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
380 genoDataFiltered
<- genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
381 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoDataFiltered
)))
382 #drop observation lines without genotype data
383 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
385 phenoTrait
<- merge(data
.frame(phenoTrait
), commonObs
, by
=0, all
=FALSE)
386 rownames(phenoTrait
) <-phenoTrait
[, 1]
387 phenoTrait
<- subset(phenoTrait
, select
=trait
)
389 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
391 #a set of only observation lines with genotype data
392 traitPhenoData
<- as
.data
.frame(round(phenoTrait
, digits
=2))
393 phenoTrait
<- data
.matrix(phenoTrait
)
394 genoDataFiltered
<- data
.matrix(genoDataFiltered
)
397 #impute missing data in prediction data
398 predictionDataMissing
<- c()
399 if (length(predictionData
) != 0) {
400 #purge markers unique to both populations
401 commonMarkers
<- intersect(names(data
.frame(genoDataFiltered
)), names(predictionData
))
402 predictionData
<- subset(predictionData
, select
= commonMarkers
)
403 genoDataFiltered
<- subset(genoDataFiltered
, select
= commonMarkers
)
405 predictionData
<- data
.matrix(predictionData
)
407 if (sum(is
.na(predictionData
)) > 0) {
408 predictionDataMissing
<- c('yes')
409 message("sum of geno missing values in prediction data: ", sum(is
.na(predictionData
)) )
410 predictionData
<-kNNImpute(predictionData
, 10)
411 predictionData
<-as
.data
.frame(predictionData
)
413 #extract columns with imputed values
414 predictionData
<- subset(predictionData
,
415 select
= grep("^x", names(predictionData
))
418 #remove prefix 'x.' from imputed columns
419 names(predictionData
) <- sub("x.", "", names(predictionData
))
421 predictionData
<- round(predictionData
, digits
= 0)
422 predictionData
<- data
.matrix(predictionData
)
427 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
428 genoTrCode
<- grep("2", genoDataFiltered
[1, ], fixed
=TRUE, value
=TRUE)
429 if(length(genoTrCode
) != 0) {
430 genoDataFiltered
<- genoDataFiltered
- 1
433 if (length(predictionData
) != 0 ) {
434 genoSlCode
<- grep("2", predictionData
[1, ], fixed
=TRUE, value
=TRUE)
435 if (length(genoSlCode
) != 0 ) {
436 predictionData
<- predictionData
- 1
440 ordered
.markerEffects
<- c()
441 if ( length(predictionData
) == 0 ) {
442 markerEffects
<- mixed
.solve(y
= phenoTrait
,
446 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
447 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
448 ordered
.markerEffects
<- round(ordered
.markerEffects
,
452 colnames(ordered
.markerEffects
) <- c("Marker Effects")
455 #correlation between breeding values based on
456 #marker effects and relationship matrix
457 #corGEBVs <- cor(genoDataMatrix %*% markerEffects$u, iGEBV$u)
460 #additive relationship model
461 #calculate the inner products for
462 #genotypes (realized relationship matrix)
463 genocrsprd
<- tcrossprod(genoDataFiltered
)
465 #construct an identity matrix for genotypes
466 identityMatrix
<- diag(nrow(phenoTrait
))
468 iGEBV
<- mixed
.solve(y
= phenoTrait
,
476 if ( is
.null(predictionFile
) == TRUE ) {
477 heritability
<- round((iGEBV$Vu
/(iGEBV$Vu
+ iGEBV$Ve
) * 100), digits
=3)
478 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
479 cat('Error variance', iGEBV$Ve
, file
=varianceComponentsFile
, sep
="\t", append
=TRUE)
480 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
481 cat('Additive genetic variance', iGEBV$Vu
, file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
482 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
483 cat('μ', iGEBV$beta
,file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
484 cat("\n", file
=varianceComponentsFile
, append
=TRUE)
485 cat('Heritability (h, %)', heritability
, file
=varianceComponentsFile
, sep
='\t', append
=TRUE)
488 iGEBV
<- data
.matrix(iGEBVu
)
490 ordered
.iGEBV
<- as
.data
.frame(iGEBV
[order(-iGEBV
[, 1]), ] )
492 ordered
.iGEBV
<- round(ordered
.iGEBV
,
496 combinedGebvsFile
<- grep('selected_traits_gebv',
504 if (length(combinedGebvsFile
) != 0)
506 fileSize
<- file
.info(combinedGebvsFile
)$size
509 combinedGebvs
<-read
.table(combinedGebvsFile
,
516 colnames(ordered
.iGEBV
) <- c(trait
)
518 traitGEBV
<- as
.data
.frame(ordered
.iGEBV
)
519 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
524 rownames(allGebvs
) <- allGebvs
[,1]
529 colnames(ordered
.iGEBV
) <- c(trait
)
534 if(is
.null(predictionFile
) == TRUE) {
535 genoNum
<- nrow(phenoTrait
)
537 warning(genoNum
, " is too small number of genotypes.")
539 reps
<- round_any(genoNum
, 10, f
= ceiling
) %/% 10
543 if (genoNum
%% 10 == 0) {
544 genotypeGroups
<- rep(1:10, reps
)
546 genotypeGroups
<- rep(1:10, reps
) [- (genoNum
%% 10) ]
550 genotypeGroups
<- genotypeGroups
[ order (runif(genoNum
)) ]
553 tr
<- paste("trPop", i
, sep
= ".")
554 sl
<- paste("slPop", i
, sep
= ".")
556 trG
<- which(genotypeGroups
!= i
)
557 slG
<- which(genotypeGroups
== i
)
562 kblup
<- paste("rKblup", i
, sep
= ".")
564 result
<- kinship
.BLUP(y
= phenoTrait
[trG
, ],
565 G
.train
= genoDataFiltered
[trG
, ],
566 G
.pred
= genoDataFiltered
[slG
, ],
567 mixed
.method
= "REML",
571 assign(kblup
, result
)
573 #calculate cross-validation accuracy
574 valCorData
<- merge(phenoTrait
[slG
, ], result$g
.pred
, by
=0, all
=FALSE)
575 rownames(valCorData
) <- valCorData
[, 1]
576 valCorData
[, 1] <- NULL
578 accuracy
<- try(cor(valCorData
))
579 validation
<- paste("validation", i
, sep
= ".")
581 cvTest
<- paste("Validation test", i
, sep
= " ")
583 if ( class(accuracy
) != "try-error")
585 accuracy
<- round(accuracy
[1,2], digits
= 3)
586 accuracy
<- data
.matrix(accuracy
)
588 colnames(accuracy
) <- c("correlation")
589 rownames(accuracy
) <- cvTest
591 assign(validation
, accuracy
)
593 if (!is
.na(accuracy
[1,1])) {
594 validationAll
<- rbind(validationAll
, accuracy
)
599 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
601 if (!is
.null(validationAll
)) {
602 validationMean
<- data
.matrix(round(colMeans(validationAll
),
607 rownames(validationMean
) <- c("Average")
609 validationAll
<- rbind(validationAll
, validationMean
)
610 colnames(validationAll
) <- c("Correlation")
613 #predict GEBVs for selection population
614 if (length(predictionData
) !=0 ) {
615 predictionData
<- data
.matrix(round(predictionData
, digits
= 0 ))
618 predictionPopResult
<- c()
619 predictionPopGEBVs
<- c()
621 if (length(predictionData
) != 0) {
622 message("running prediction for selection candidates...marker data", ncol(predictionData
), " vs. ", ncol(genoDataFiltered
))
624 predictionPopResult
<- kinship
.BLUP(y
= phenoTrait
,
625 G
.train
= genoDataFiltered
,
626 G
.pred
= predictionData
,
627 mixed
.method
= "REML",
630 message("running prediction for selection candidates...DONE!!")
632 predictionPopGEBVs
<- round(data
.matrix(predictionPopResult$g
.pred
), digits
= 2)
633 predictionPopGEBVs
<- data
.matrix(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
635 colnames(predictionPopGEBVs
) <- c(trait
)
639 if (!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0) {
640 write
.table(predictionPopGEBVs
,
641 file
= predictionPopGEBVsFile
,
649 if(is
.null(validationAll
) == FALSE) {
650 write
.table(validationAll
,
651 file
= validationFile
,
659 if (is
.null(ordered
.markerEffects
) == FALSE) {
660 write
.table(ordered
.markerEffects
,
669 if (is
.null(ordered
.iGEBV
) == FALSE) {
670 write
.table(ordered
.iGEBV
,
679 if (length(combinedGebvsFile
) != 0 ) {
680 if(file
.info(combinedGebvsFile
)$size
== 0) {
681 write
.table(ordered
.iGEBV
,
682 file
= combinedGebvsFile
,
688 write
.table(allGebvs
,
689 file
= combinedGebvsFile
,
697 if (!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0) {
698 write
.table(traitPhenoData
,
699 file
= traitPhenoFile
,
706 if (!is
.null(formattedPhenoData
) & length(formattedPhenoDataFile
) != 0) {
707 write
.table(formattedPhenoData
,
708 file
= formattedPhenoDataFile
,
715 if (!is
.null(genoDataMissing
)) {
716 write
.table(genoData
,
725 if (!is
.null(predictionDataMissing
)) {
726 write
.table(predictionData
,
727 file
= predictionFile
,
735 #should also send notification to analysis owner
736 to
<- c("<iyt2@cornell.edu>")
737 subject
<- paste(trait
, ' GS analysis done', sep
= ':')
738 body
<- c("Dear User,\n\n")
739 body
<- paste(body
, 'The genomic selection analysis for', sep
= "")
740 body
<- paste(body
, trait
, sep
= " ")
741 body
<- paste(body
, "is done.\n\nRegards and Thanks.\nSGN", sep
= " ")
743 #should use SGN's smtp server eventually
744 sendmail(to
, subject
, body
, password
= "rmail")
746 q(save
= "no", runLast
= FALSE)