2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
6 # Isaak Y Tecle (iyt2@cornell.edu)
18 allArgs
<- commandArgs()
20 inputFiles
<- scan(grep("input_files", allArgs
, ignore
.case
= TRUE, perl
= TRUE, value
= TRUE),
23 outputFiles
<- scan(grep("output_files", allArgs
, ignore
.case
= TRUE,perl
= TRUE, value
= TRUE),
26 traitsFile
<- grep("traits", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
27 traitFile
<- grep("trait_info", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
28 traitInfo
<- scan(traitFile
, what
= "character",)
29 traitInfo
<- strsplit(traitInfo
, "\t");
30 traitId
<- traitInfo
[[1]]
31 trait
<- traitInfo
[[2]]
33 datasetInfoFile
<- grep("dataset_info", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
36 if (length(datasetInfoFile
) != 0 ) {
37 datasetInfo
<- scan(datasetInfoFile
, what
= "character")
38 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
40 datasetInfo
<- c('single population')
43 validationTrait
<- paste("validation", trait
, sep
= "_")
44 validationFile
<- grep(validationTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
46 if (is
.null(validationFile
)) {
47 stop("Validation output file is missing.")
50 kinshipTrait
<- paste("kinship", trait
, sep
= "_")
51 blupFile
<- grep(kinshipTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
53 if (is
.null(blupFile
)) {
54 stop("GEBVs file is missing.")
56 markerTrait
<- paste("marker", trait
, sep
= "_")
57 markerFile
<- grep(markerTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
59 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
60 traitPhenoFile
<- grep(traitPhenoFile
, outputFiles
,ignore
.case
= TRUE, value
= TRUE)
62 varianceComponentsFile
<- grep("variance_components", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
64 formattedPhenoFile
<- grep("formatted_phenotype_data", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
66 formattedPhenoData
<- c()
69 if (length(formattedPhenoFile
) != 0 && file
.info(formattedPhenoFile
)$size
!= 0) {
70 formattedPhenoData
<- as
.data
.frame(fread(formattedPhenoFile
,
71 na
.strings
= c("NA", " ", "--", "-", ".")
74 row
.names(formattedPhenoData
) <- formattedPhenoData
[, 1]
75 formattedPhenoData
[, 1] <- NULL
77 phenoFile
<- grep("\\/phenotype_data", inputFiles
, ignore
.case
= TRUE, value
= TRUE, perl
= TRUE)
78 phenoData
<- fread(phenoFile
, na
.strings
= c("NA", " ", "--", "-", "."), header
= TRUE)
81 phenoData
<- as
.data
.frame(phenoData
)
84 if (datasetInfo
== 'combined populations') {
86 if (!is
.null(formattedPhenoData
)) {
87 phenoTrait
<- subset(formattedPhenoData
, select
= trait
)
88 phenoTrait
<- na
.omit(phenoTrait
)
91 dropColumns
<- grep(trait
, names(phenoData
), ignore
.case
= TRUE, value
= TRUE)
92 phenoTrait
<- phenoData
[, !(names(phenoData
) %in% dropColumns
)]
94 phenoTrait
<- as
.data
.frame(phenoTrait
)
95 row
.names(phenoTrait
) <- phenoTrait
[, 1]
96 phenoTrait
[, 1] <- NULL
97 colnames(phenoTrait
) <- trait
102 if (!is
.null(formattedPhenoData
)) {
103 phenoTrait
<- subset(formattedPhenoData
, select
= trait
)
104 phenoTrait
<- na
.omit(phenoTrait
)
107 dropColumns
<- c("uniquename", "stock_name")
108 phenoData
<- phenoData
[, !(names(phenoData
) %in% dropColumns
)]
110 phenoTrait
<- subset(phenoData
, select
= c("object_name", "object_id", "design", "block", "replicate", trait
))
112 experimentalDesign
<- phenoTrait
[2, 'design']
114 if (class(phenoTrait
[, trait
]) != 'numeric') {
115 phenoTrait
[, trait
] <- as
.numeric(as
.character(phenoTrait
[, trait
]))
118 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
120 if ((experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') && unique(phenoTrait$block
) > 1) {
122 message("GS experimental design: ", experimentalDesign
)
124 augData
<- subset(phenoTrait
, select
= c("object_name", "object_id", "block", trait
))
126 colnames(augData
)[1] <- "genotypes"
127 colnames(augData
)[4] <- "trait"
129 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|block
),
131 na
.action
= na
.omit
))
133 if (class(model
) != "try-error") {
134 phenoTrait
<- data
.frame(fixef(model
))
136 colnames(phenoTrait
) <- trait
138 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
139 rownames(phenoTrait
) <- nn
141 phenoTrait
<- round(phenoTrait
, digits
= 2)
144 } else if (experimentalDesign
== 'Alpha') {
146 message("Experimental desgin: ", experimentalDesign
)
148 alphaData
<- subset(phenoData
,
149 select
= c("object_name", "object_id","block", "replicate", trait
)
152 colnames(alphaData
)[1] <- "genotypes"
153 colnames(alphaData
)[5] <- "trait"
155 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|replicate
/block
),
157 na
.action
= na
.omit
))
159 if (class(model
) != "try-error") {
160 phenoTrait
<- data
.frame(fixef(model
))
162 colnames(phenoTrait
) <- trait
164 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
165 rownames(phenoTrait
) <- nn
167 phenoTrait
<- round(phenoTrait
, digits
= 2)
173 phenoTrait
<- subset(phenoData
,
174 select
= c("object_name", "object_id", trait
))
176 if (sum(is
.na(phenoTrait
)) > 0) {
177 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
178 phenoTrait
<- na
.omit(phenoTrait
)
181 #calculate mean of reps/plots of the same accession and
182 #create new df with the accession means
184 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
185 phenoTrait
<- data
.frame(phenoTrait
)
186 message('phenotyped lines before averaging: ', length(row
.names(phenoTrait
)))
188 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
189 message('phenotyped lines after averaging: ', length(row
.names(phenoTrait
)))
191 phenoTrait
<- subset(phenoTrait
, select
= c("object_name", trait
))
192 row
.names(phenoTrait
) <- phenoTrait
[, 1]
193 phenoTrait
[, 1] <- NULL
195 #format all-traits population phenotype dataset
196 ## formattedPhenoData <- phenoData
197 ## dropColumns <- c("object_id", "stock_id", "design", "block", "replicate" )
199 ## formattedPhenoData <- formattedPhenoData[, !(names(formattedPhenoData) %in% dropColumns)]
200 ## formattedPhenoData <- ddply(formattedPhenoData,
205 ## row.names(formattedPhenoData) <- formattedPhenoData[, 1]
206 ## formattedPhenoData[, 1] <- NULL
208 ## formattedPhenoData <- round(formattedPhenoData,
215 genoFile
<- grep("genotype_data", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
216 genoData
<- fread(genoFile
, na
.strings
= c("NA", " ", "--", "-"), header
= TRUE)
218 genoData
<- as
.data
.frame(genoData
)
219 rownames(genoData
) <- genoData
[, 1]
220 genoData
[, 1] <- NULL
221 genoData
<- genoData
[, colSums(is
.na(genoData
)) < nrow(genoData
) * 0.5]
223 predictionTempFile
<- grep("prediction_population", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
224 predictionFile
<- c()
226 message('prediction temp genotype file: ', predictionTempFile
)
228 if (length(predictionTempFile
) !=0 ) {
229 predictionFile
<- scan(predictionTempFile
, what
= "character")
232 message('prediction genotype file: ', predictionFile
)
234 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
235 message("prediction gebv file: ", predictionPopGEBVsFile
)
237 predictionData
<- c()
239 if (length(predictionFile
) !=0 ) {
241 predictionData
<- as
.data
.frame(fread(predictionFile
, na
.strings
= c("NA", " ", "--", "-"),))
243 rownames(predictionData
) <- predictionData
[, 1]
244 predictionData
[, 1] <- NULL
245 predictionData
<- predictionData
[, colSums(is
.na(predictionData
)) < nrow(predictionData
) * 0.5]
248 #impute genotype values for obs with missing values,
249 #based on mean of neighbouring 10 (arbitrary) obs
250 genoDataMissing
<- c()
252 if (sum(is
.na(genoData
)) > 0) {
253 genoDataMissing
<- c('yes')
255 message("sum of geno missing values, ", sum(is
.na(genoData
)) )
256 genoData
<- na
.roughfix(genoData
)
257 genoData
<- data
.matrix(genoData
)
260 genoData
<- genoData
[order(row
.names(genoData
)), ]
262 #create phenotype and genotype datasets with
264 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
265 message('genotyped lines: ', length(row
.names(genoData
)))
267 #extract observation lines with both
268 #phenotype and genotype data only.
269 commonObs
<- intersect(row
.names(phenoTrait
), row
.names(genoData
))
270 commonObs
<- data
.frame(commonObs
)
271 rownames(commonObs
)<-commonObs
[, 1]
273 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
274 #include in the genotype dataset only observation lines
276 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
277 genoDataFiltered
<- genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
278 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoDataFiltered
)))
279 #drop observation lines without genotype data
280 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
281 phenoTrait
<- merge(data
.frame(phenoTrait
), commonObs
, by
=0, all
=FALSE)
282 rownames(phenoTrait
) <-phenoTrait
[, 1]
283 phenoTrait
<- subset(phenoTrait
, select
=trait
)
285 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
286 #a set of only observation lines with genotype data
288 traitPhenoData
<- data
.frame(round(phenoTrait
, digits
= 2))
289 phenoTrait
<- data
.matrix(phenoTrait
)
290 genoDataFiltered
<- data
.matrix(genoDataFiltered
)
292 #impute missing data in prediction data
293 predictionDataMissing
<- c()
294 if (length(predictionData
) != 0) {
295 #purge markers unique to both populations
296 commonMarkers
<- intersect(names(data
.frame(genoDataFiltered
)), names(predictionData
))
297 predictionData
<- subset(predictionData
, select
= commonMarkers
)
298 genoDataFiltered
<- subset(genoDataFiltered
, select
= commonMarkers
)
300 # predictionData <- data.matrix(predictionData)
302 if (sum(is
.na(predictionData
)) > 0) {
303 predictionDataMissing
<- c('yes')
304 message("sum of geno missing values, ", sum(is
.na(predictionData
)) )
305 predictionData
<- data
.matrix(na
.roughfix(predictionData
))
310 relationshipMatrixFile
<- grep("relationship_matrix", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
312 message("relationship matrix file: ", relationshipMatrixFile
)
314 relationshipMatrix
<- c()
315 if (length(relationshipMatrixFile
) != 0) {
316 if (file
.info(relationshipMatrixFile
)$size
> 0 ) {
317 relationshipDf
<- as
.data
.frame(fread(relationshipMatrixFile
))
319 rownames(relationshipDf
) <- relationshipDf
[, 1]
320 relationshipDf
[, 1] <- NULL
321 relationshipMatrix
<- data
.matrix(relationshipDf
)
325 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
326 genoTrCode
<- grep("2", genoDataFiltered
[1, ], value
= TRUE)
327 if(length(genoTrCode
) != 0) {
328 genoDataFiltered
<- genoDataFiltered
- 1
331 if (length(predictionData
) != 0 ) {
332 genoSlCode
<- grep("2", predictionData
[1, ], value
= TRUE)
333 if (length(genoSlCode
) != 0 ) {
334 predictionData
<- predictionData
- 1
338 ordered
.markerEffects
<- c()
339 if ( length(predictionData
) == 0 ) {
340 markerEffects
<- mixed
.solve(y
= phenoTrait
,
344 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
345 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
346 ordered
.markerEffects
<- round(ordered
.markerEffects
, digits
=5)
348 colnames(ordered
.markerEffects
) <- c("Marker Effects")
352 #additive relationship model
353 #calculate the inner products for
354 #genotypes (realized relationship matrix)
355 if (length(relationshipMatrixFile
) != 0) {
356 if (file
.info(relationshipMatrixFile
)$size
== 0) {
357 relationshipMatrix
<- tcrossprod(data
.matrix(genoData
))
360 relationshipMatrixFiltered
<- relationshipMatrix
[(rownames(relationshipMatrix
) %in% rownames(commonObs
)),]
361 relationshipMatrixFiltered
<- relationshipMatrixFiltered
[, (colnames(relationshipMatrixFiltered
) %in% rownames(commonObs
))]
363 #construct an identity matrix for genotypes
364 identityMatrix
<- diag(nrow(phenoTrait
))
366 relationshipMatrixFiltered
<- data
.matrix(relationshipMatrixFiltered
)
368 iGEBV
<- mixed
.solve(y
= phenoTrait
, Z
= identityMatrix
, K
= relationshipMatrixFiltered
)
373 if ( is
.null(predictionFile
) == TRUE ) {
374 additiveEffects
<- data
.frame(iGEBVu
)
376 pN
<- nrow(phenoTrait
)
377 aN
<- nrow(additiveEffects
)
379 if (pN
<= 1 || pN
!= aN
) {
380 stop("phenoTrait and additiveEffects have different lengths: ",
381 pN
, " and ", aN
, ".")
384 if (TRUE %in% is
.na(phenoTrait
) || TRUE %in% is
.na(additiveEffects
)) {
385 stop(" Arguments phenoTrait and additiveEffects have missing values.")
388 phenoVariance
<- var(phenoTrait
)
389 gebvVariance
<- var(additiveEffects
)
390 heritability
<- round((gebvVariance
/ phenoVariance
), digits
= 2)
392 cat("\n", file
= varianceComponentsFile
, append
= FALSE)
393 cat('Error variance', iGEBV$Ve
, file
= varianceComponentsFile
, sep
= "\t", append
= TRUE)
394 cat("\n", file
= varianceComponentsFile
, append
= TRUE)
395 cat('Additive genetic variance', iGEBV$Vu
, file
= varianceComponentsFile
, sep
= '\t', append
= TRUE)
396 cat("\n", file
= varianceComponentsFile
, append
= TRUE)
397 cat('Phenotype mean', iGEBV$beta
,file
= varianceComponentsFile
, sep
= '\t', append
= TRUE)
398 cat("\n", file
= varianceComponentsFile
, append
= TRUE)
399 cat('Heritability (h)', heritability
, file
= varianceComponentsFile
, sep
= '\t', append
= TRUE)
402 iGEBV
<- data
.matrix(iGEBVu
)
403 ordered
.iGEBV
<- as
.data
.frame(iGEBV
[order(-iGEBV
[, 1]), ])
404 ordered
.iGEBV
<- round(ordered
.iGEBV
, digits
= 3)
406 combinedGebvsFile
<- grep('selected_traits_gebv', outputFiles
, ignore
.case
= TRUE,value
= TRUE)
409 if (length(combinedGebvsFile
) != 0) {
410 fileSize
<- file
.info(combinedGebvsFile
)$size
411 if (fileSize
!= 0 ) {
412 combinedGebvs
<- as
.data
.frame(fread(combinedGebvsFile
))
414 rownames(combinedGebvs
) <- combinedGebvs
[,1]
415 combinedGebvs
[,1] <- NULL
417 colnames(ordered
.iGEBV
) <- c(trait
)
419 traitGEBV
<- as
.data
.frame(ordered
.iGEBV
)
420 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
425 rownames(allGebvs
) <- allGebvs
[,1]
430 colnames(ordered
.iGEBV
) <- c(trait
)
435 if(is
.null(predictionFile
)) {
436 genoNum
<- nrow(phenoTrait
)
438 warning(genoNum
, " is too small number of genotypes.")
441 reps
<- round_any(genoNum
, 10, f
= ceiling
) %/% 10
445 if (genoNum
%% 10 == 0) {
446 genotypeGroups
<- rep(1:10, reps
)
448 genotypeGroups
<- rep(1:10, reps
) [- (genoNum
%% 10) ]
452 genotypeGroups
<- genotypeGroups
[ order (runif(genoNum
)) ]
455 tr
<- paste("trPop", i
, sep
= ".")
456 sl
<- paste("slPop", i
, sep
= ".")
458 trG
<- which(genotypeGroups
!= i
)
459 slG
<- which(genotypeGroups
== i
)
464 kblup
<- paste("rKblup", i
, sep
= ".")
466 result
<- kinship
.BLUP(y
= phenoTrait
[trG
, ],
467 G
.train
= genoDataFiltered
[trG
, ],
468 G
.pred
= genoDataFiltered
[slG
, ],
469 mixed
.method
= "REML",
473 assign(kblup
, result
)
475 #calculate cross-validation accuracy
476 valCorData
<- merge(phenoTrait
[slG
, ], result$g
.pred
, by
=0, all
=FALSE)
477 rownames(valCorData
) <- valCorData
[, 1]
478 valCorData
[, 1] <- NULL
480 accuracy
<- try(cor(valCorData
))
481 validation
<- paste("validation", i
, sep
= ".")
483 cvTest
<- paste("Validation test", i
, sep
= " ")
485 if ( class(accuracy
) != "try-error")
487 accuracy
<- round(accuracy
[1,2], digits
= 3)
488 accuracy
<- data
.matrix(accuracy
)
490 colnames(accuracy
) <- c("correlation")
491 rownames(accuracy
) <- cvTest
493 assign(validation
, accuracy
)
495 if (!is
.na(accuracy
[1,1])) {
496 validationAll
<- rbind(validationAll
, accuracy
)
501 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
503 if (!is
.null(validationAll
)) {
504 validationMean
<- data
.matrix(round(colMeans(validationAll
), digits
= 2))
506 rownames(validationMean
) <- c("Average")
508 validationAll
<- rbind(validationAll
, validationMean
)
509 colnames(validationAll
) <- c("Correlation")
513 predictionPopResult
<- c()
514 predictionPopGEBVs
<- c()
516 if (length(predictionData
) != 0) {
517 message("running prediction for selection candidates...marker data", ncol(predictionData
), " vs. ", ncol(genoDataFiltered
))
519 predictionPopResult
<- kinship
.BLUP(y
= phenoTrait
,
520 G
.train
= genoDataFiltered
,
521 G
.pred
= predictionData
,
522 mixed
.method
= "REML",
525 message("running prediction for selection candidates...DONE!!")
527 predictionPopGEBVs
<- round(data
.matrix(predictionPopResult$g
.pred
), digits
= 3)
528 predictionPopGEBVs
<- data
.matrix(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
530 colnames(predictionPopGEBVs
) <- c(trait
)
534 if (!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0) {
535 write
.table(predictionPopGEBVs
,
536 file
= predictionPopGEBVsFile
,
544 if(!is
.null(validationAll
)) {
545 write
.table(validationAll
,
546 file
= validationFile
,
554 if (!is
.null(ordered
.markerEffects
)) {
555 write
.table(ordered
.markerEffects
,
564 if (!is
.null(ordered
.iGEBV
)) {
565 write
.table(ordered
.iGEBV
,
574 if (length(combinedGebvsFile
) != 0 ) {
575 if(file
.info(combinedGebvsFile
)$size
== 0) {
576 write
.table(ordered
.iGEBV
,
577 file
= combinedGebvsFile
,
583 write
.table(allGebvs
,
584 file
= combinedGebvsFile
,
592 if (!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0) {
593 write
.table(traitPhenoData
,
594 file
= traitPhenoFile
,
603 ## if (!is.null(genoDataMissing)) {
604 ## write.table(genoData,
613 ## if (!is.null(predictionDataMissing)) {
614 ## write.table(predictionData,
615 ## file = predictionFile,
623 if (file
.info(relationshipMatrixFile
)$size
== 0) {
624 write
.table(relationshipMatrix
,
625 file
= relationshipMatrixFile
,
633 if (file
.info(formattedPhenoFile
)$size
== 0 && !is
.null(formattedPhenoData
) ) {
634 write
.table(formattedPhenoData
,
635 file
= formattedPhenoFile
,
644 q(save
= "no", runLast
= FALSE)