2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
6 # Isaak Y Tecle (iyt2@cornell.edu)
18 library(genoDataFilter
)
19 library(phenoAnalysis
)
24 allArgs
<- commandArgs()
26 inputFiles
<- scan(grep("input_files", allArgs
, ignore
.case
= TRUE, perl
= TRUE, value
= TRUE),
29 outputFiles
<- scan(grep("output_files", allArgs
, ignore
.case
= TRUE,perl
= TRUE, value
= TRUE),
32 traitsFile
<- grep("traits", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
33 traitFile
<- grep("trait_info", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
34 traitInfo
<- scan(traitFile
, what
= "character",)
35 traitInfo
<- strsplit(traitInfo
, "\t");
36 traitId
<- traitInfo
[[1]]
37 trait
<- traitInfo
[[2]]
39 datasetInfoFile
<- grep("dataset_info", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
42 if (length(datasetInfoFile
) != 0 ) {
43 datasetInfo
<- scan(datasetInfoFile
, what
= "character")
44 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
46 datasetInfo
<- c('single population')
49 validationTrait
<- paste("validation", trait
, sep
= "_")
50 validationFile
<- grep(validationTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
52 if (is
.null(validationFile
)) {
53 stop("Validation output file is missing.")
56 kinshipTrait
<- paste("rrblup_gebvs", trait
, sep
= "_")
57 blupFile
<- grep(kinshipTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
59 if (is
.null(blupFile
)) {
60 stop("GEBVs file is missing.")
62 markerTrait
<- paste("marker_effects", trait
, sep
= "_")
63 markerFile
<- grep(markerTrait
, outputFiles
, ignore
.case
= TRUE, value
= TRUE)
65 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
66 traitPhenoFile
<- grep(traitPhenoFile
, outputFiles
,ignore
.case
= TRUE, value
= TRUE)
68 varianceComponentsFile
<- grep("variance_components", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
69 filteredGenoFile
<- grep("filtered_genotype_data", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
70 formattedPhenoFile
<- grep("formatted_phenotype_data", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
72 formattedPhenoData
<- c()
75 genoFile
<- grep("genotype_data_", inputFiles
, ignore
.case
= TRUE, perl
=TRUE, value
= TRUE)
76 message('geno file ', genoFile
)
78 if (is
.null(genoFile
)) {
79 stop("genotype data file is missing.")
82 if (file
.info(genoFile
)$size
== 0) {
83 stop("genotype data file is empty.")
86 readFilteredGenoData
<- c()
87 filteredGenoData
<- c()
88 if (length(filteredGenoFile
) != 0 && file
.info(filteredGenoFile
)$size
!= 0) {
89 filteredGenoData
<- fread(filteredGenoFile
, na
.strings
= c("NA", " ", "--", "-"), header
= TRUE)
90 readFilteredGenoData
<- 1
91 message('read in filtered geno data')
95 if (is
.null(filteredGenoData
)) {
96 genoData
<- fread(genoFile
, na
.strings
= c("NA", " ", "--", "-"), header
= TRUE)
97 genoData
<- unique(genoData
, by
='V1')
98 message('read in unfiltered geno data')
101 if (length(formattedPhenoFile
) != 0 && file
.info(formattedPhenoFile
)$size
!= 0) {
102 formattedPhenoData
<- as
.data
.frame(fread(formattedPhenoFile
,
103 na
.strings
= c("NA", " ", "--", "-", ".")
107 phenoFile
<- grep("\\/phenotype_data", inputFiles
, ignore
.case
= TRUE, value
= TRUE, perl
= TRUE)
109 if (is
.null(phenoFile
)) {
110 stop("phenotype data file is missing.")
113 if (file
.info(phenoFile
)$size
== 0) {
114 stop("phenotype data file is empty.")
117 phenoData
<- fread(phenoFile
, na
.strings
= c("NA", " ", "--", "-", "."), header
= TRUE)
120 phenoData
<- as
.data
.frame(phenoData
)
123 if (datasetInfo
== 'combined populations') {
125 if (!is
.null(formattedPhenoData
)) {
126 phenoTrait
<- subset(formattedPhenoData
, select
= trait
)
127 phenoTrait
<- na
.omit(phenoTrait
)
130 dropColumns
<- grep(trait
, names(phenoData
), ignore
.case
= TRUE, value
= TRUE)
131 phenoTrait
<- phenoData
[, !(names(phenoData
) %in% dropColumns
)]
133 phenoTrait
<- as
.data
.frame(phenoTrait
)
134 colnames(phenoTrait
) <- c('genotypes', trait
)
138 if (!is
.null(formattedPhenoData
)) {
139 phenoTrait
<- subset(formattedPhenoData
, select
= c('V1', trait
))
140 phenoTrait
<- as
.data
.frame(phenoTrait
)
141 phenoTrait
<- na
.omit(phenoTrait
)
143 colnames(phenoTrait
)[1] <- 'genotypes'
145 } else if (length(grep('uploaded', phenoFile
)) != 0) {
147 phenoTrait
<- averageTrait(phenoData
, trait
)
151 phenoTrait
<- getAdjMeans(phenoData
, trait
)
156 if (is
.null(filteredGenoData
)) {
158 #genoDataFilter::filterGenoData
159 genoData
<- filterGenoData(genoData
, maf
=0.01)
160 genoData
<- roundAlleleDosage(genoData
)
162 genoData
<- as
.data
.frame(genoData
)
163 rownames(genoData
) <- genoData
[, 1]
164 genoData
[, 1] <- NULL
165 filteredGenoData
<- genoData
168 genoData
<- as
.data
.frame(filteredGenoData
)
169 rownames(genoData
) <- genoData
[, 1]
170 genoData
[, 1] <- NULL
173 genoData
<- genoData
[order(row
.names(genoData
)), ]
175 predictionTempFile
<- grep("prediction_population", inputFiles
, ignore
.case
= TRUE, value
= TRUE)
177 predictionFile
<- c()
178 filteredPredGenoFile
<- c()
179 predictionAllFiles
<- c()
181 message('prediction temp genotype file: ', predictionTempFile
)
183 if (length(predictionTempFile
) !=0 ) {
184 predictionAllFiles
<- scan(predictionTempFile
, what
= "character")
186 predictionFile
<- grep("\\/genotype_data", predictionAllFiles
, ignore
.case
= TRUE, perl
=TRUE, value
= TRUE)
187 message('prediction unfiltered genotype file: ', predictionFile
)
189 filteredPredGenoFile
<- grep("filtered_genotype_data_", predictionAllFiles
, ignore
.case
= TRUE, perl
=TRUE, value
= TRUE)
190 message('prediction filtered genotype file: ', predictionFile
)
193 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
195 message("filtered pred geno file: ", filteredPredGenoFile
)
196 message("prediction gebv file: ", predictionPopGEBVsFile
)
198 predictionData
<- c()
199 readFilteredPredGenoData
<- c()
200 filteredPredGenoData
<- c()
202 if (length(filteredPredGenoFile
) != 0 && file
.info(filteredPredGenoFile
)$size
!= 0) {
203 predictionData
<- fread(filteredPredGenoFile
, na
.strings
= c("NA", " ", "--", "-"),)
204 readFilteredPredGenoData
<- 1
206 predictionData
<- as
.data
.frame(predictionData
)
207 rownames(predictionData
) <- predictionData
[, 1]
208 predictionData
[, 1] <- NULL
210 message('read in filtered prediction genotype data')
211 } else if (length(predictionFile
) != 0) {
213 predictionData
<- fread(predictionFile
, na
.strings
= c("NA", " ", "--", "-"),)
214 predictionData
<- unique(predictionData
, by
='V1')
216 predictionData
<- filterGenoData(predictionData
, maf
=0.01)
217 predictionData
<- roundAlleleDosage(predictionData
)
219 predictionData
<- as
.data
.frame(predictionData
)
220 rownames(predictionData
) <- predictionData
[, 1]
221 predictionData
[, 1] <- NULL
222 filteredPredGenoData
<- predictionData
226 #impute genotype values for obs with missing values,
227 genoDataMissing
<- c()
229 if (sum(is
.na(genoData
)) > 0) {
230 genoDataMissing
<- c('yes')
232 message("sum of geno missing values, ", sum(is
.na(genoData
)) )
233 genoData
<- na
.roughfix(genoData
)
234 genoData
<- data
.matrix(genoData
)
237 #create phenotype and genotype datasets with
239 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
240 message('genotyped lines: ', length(row
.names(genoData
)))
242 #extract observation lines with both
243 #phenotype and genotype data only.
244 commonObs
<- intersect(phenoTrait$genotypes
, row
.names(genoData
))
245 commonObs
<- data
.frame(commonObs
)
246 rownames(commonObs
) <- commonObs
[, 1]
248 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
250 #include in the genotype dataset only phenotyped lines
251 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
252 genoDataFilteredObs
<- genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
253 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoDataFilteredObs
)))
255 #drop phenotyped lines without genotype data
256 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
257 phenoTrait
<- phenoTrait
[(phenoTrait$genotypes
%in% rownames(commonObs
)), ]
258 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
260 phenoTraitMarker
<- as
.data
.frame(phenoTrait
)
261 rownames(phenoTraitMarker
) <- phenoTraitMarker
[, 1]
262 phenoTraitMarker
[, 1] <- NULL
264 #impute missing data in prediction data
265 predictionDataMissing
<- c()
266 if (length(predictionData
) != 0) {
267 #purge markers unique to both populations
268 commonMarkers
<- intersect(names(data
.frame(genoDataFilteredObs
)), names(predictionData
))
269 predictionData
<- subset(predictionData
, select
= commonMarkers
)
270 genoDataFilteredObs
<- subset(genoDataFilteredObs
, select
= commonMarkers
)
272 if (sum(is
.na(predictionData
)) > 0) {
273 predictionDataMissing
<- c('yes')
274 message("sum of geno missing values, ", sum(is
.na(predictionData
)) )
275 predictionData
<- data
.matrix(na
.roughfix(predictionData
))
279 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
280 genoTrCode
<- grep("2", genoDataFilteredObs
[1, ], value
= TRUE)
281 if(length(genoTrCode
) != 0) {
282 genoData
<- genoData
- 1
283 genoDataFilteredObs
<- genoDataFilteredObs
- 1
286 if (length(predictionData
) != 0 ) {
287 genoSlCode
<- grep("2", predictionData
[1, ], value
= TRUE)
288 if (length(genoSlCode
) != 0 ) {
289 predictionData
<- predictionData
- 1
293 ordered
.markerEffects
<- c()
294 ordered
.trGEBV
<- c()
296 combinedGebvsFile
<- c()
298 traitPhenoData
<- c()
299 relationshipMatrix
<- c()
301 #additive relationship model
302 #calculate the inner products for
303 #genotypes (realized relationship matrix)
304 relationshipMatrixFile
<- grep("relationship_matrix", outputFiles
, ignore
.case
= TRUE, value
= TRUE)
306 message("relationship matrix file: ", relationshipMatrixFile
)
308 if (length(relationshipMatrixFile
) != 0) {
309 if (file
.info(relationshipMatrixFile
)$size
> 0 ) {
310 relationshipMatrix
<- as
.data
.frame(fread(relationshipMatrixFile
))
312 rownames(relationshipMatrix
) <- relationshipMatrix
[, 1]
313 relationshipMatrix
[, 1] <- NULL
314 colnames(relationshipMatrix
) <- rownames(relationshipMatrix
)
315 relationshipMatrix
<- data
.matrix(relationshipMatrix
)
317 relationshipMatrix
<- A
.mat(genoData
)
318 diag(relationshipMatrix
) <- diag(relationshipMatrix
) + 1e-6
319 colnames(relationshipMatrix
) <- rownames(relationshipMatrix
)
323 relationshipMatrixFiltered
<- relationshipMatrix
[(rownames(relationshipMatrix
) %in% rownames(commonObs
)), ]
324 relationshipMatrixFiltered
<- relationshipMatrixFiltered
[, (colnames(relationshipMatrixFiltered
) %in% rownames(commonObs
))]
325 relationshipMatrix
<- data
.frame(relationshipMatrix
)
327 nCores
<- detectCores()
328 message('no cores: ', nCores
)
330 nCores
<- (nCores
%/% 2)
335 message('assgined no cores: ', nCores
)
337 if (length(predictionData
) == 0) {
339 trGEBV
<- kin
.blup(data
= phenoTrait
,
342 K
= relationshipMatrixFiltered
,
348 phenoTraitMarker
<- data
.matrix(phenoTraitMarker
)
349 genoDataFilteredObs
<- data
.matrix(genoDataFilteredObs
)
351 markerEffects
<- mixed
.solve(y
= phenoTraitMarker
,
352 Z
= genoDataFilteredObs
355 ordered
.markerEffects
<- data
.matrix(markerEffects$u
)
356 ordered
.markerEffects
<- data
.matrix(ordered
.markerEffects
[order (-ordered
.markerEffects
[, 1]), ])
357 ordered
.markerEffects
<- round(ordered
.markerEffects
, 5)
359 colnames(ordered
.markerEffects
) <- c("Marker Effects")
360 ordered
.markerEffects
<- data
.frame(ordered
.markerEffects
)
363 traitPhenoData
<- data
.frame(round(phenoTraitMarker
, 2))
365 heritability
<- round((trGEBV$Vg
/(trGEBV$Ve
+ trGEBV$Vg
)), 2)
367 cat("\n", file
= varianceComponentsFile
, append
= FALSE)
368 cat('Error variance', trGEBV$Ve
, file
= varianceComponentsFile
, sep
= "\t", append
= TRUE)
369 cat("\n", file
= varianceComponentsFile
, append
= TRUE)
370 cat('Additive genetic variance', trGEBV$Vg
, file
= varianceComponentsFile
, sep
= '\t', append
= TRUE)
371 cat("\n", file
= varianceComponentsFile
, append
= TRUE)
372 cat('Heritability (h)', heritability
, file
= varianceComponentsFile
, sep
= '\t', append
= TRUE)
375 trGEBV
<- data
.matrix(trGEBVu
)
376 ordered
.trGEBV
<- as
.data
.frame(trGEBV
[order(-trGEBV
[, 1]), ])
377 ordered
.trGEBV
<- round(ordered
.trGEBV
, 3)
379 combinedGebvsFile
<- grep('selected_traits_gebv', outputFiles
, ignore
.case
= TRUE,value
= TRUE)
381 if (length(combinedGebvsFile
) != 0) {
382 fileSize
<- file
.info(combinedGebvsFile
)$size
383 if (fileSize
!= 0 ) {
384 combinedGebvs
<- as
.data
.frame(fread(combinedGebvsFile
))
386 rownames(combinedGebvs
) <- combinedGebvs
[,1]
387 combinedGebvs
[,1] <- NULL
389 colnames(ordered
.trGEBV
) <- c(trait
)
391 traitGEBV
<- as
.data
.frame(ordered
.trGEBV
)
392 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
397 rownames(allGebvs
) <- allGebvs
[,1]
402 colnames(ordered
.trGEBV
) <- c(trait
)
406 if (is
.null(predictionFile
)) {
407 genoNum
<- nrow(phenoTrait
)
409 warning(genoNum
, " is too small number of genotypes.")
416 cvFolds
<- createMultiFolds(phenoTrait
[, 2], k
=k
, times
=times
)
418 for ( r
in 1:times
) {
419 re
<- paste0('Rep', r
)
422 fo
<- ifelse(i
< 10, 'Fold0', 'Fold')
424 trFoRe
<- paste0(fo
, i
, '.', re
)
425 trG
<- cvFolds
[[trFoRe
]]
426 slG
<- as
.numeric(rownames(phenoTrait
[-trG
,]))
428 kblup
<- paste("rKblup", i
, sep
= ".")
430 result
<- kin
.blup(data
= phenoTrait
[trG
,],
433 K
= relationshipMatrixFiltered
,
437 assign(kblup
, result
)
439 #calculate cross-validation accuracy
441 valBlups
<- data
.frame(valBlups
)
443 slG
<- slG
[which(slG
<= nrow(phenoTrait
))]
445 slGDf
<- phenoTrait
[(rownames(phenoTrait
) %in% slG
),]
446 rownames(slGDf
) <- slGDf
[, 1]
449 valBlups
<- valBlups
[(rownames(valBlups
) %in% rownames(slGDf
)), ]
450 valCorData
<- merge(slGDf
, valBlups
, by
=0)
451 rownames(valCorData
) <- valCorData
[, 1]
452 valCorData
[, 1] <- NULL
454 accuracy
<- try(cor(valCorData
))
456 validation
<- paste("validation", trFoRe
, sep
= ".")
458 cvTest
<- paste("CV", trFoRe
, sep
= " ")
460 if ( class(accuracy
) != "try-error")
462 accuracy
<- round(accuracy
[1,2], digits
= 3)
463 accuracy
<- data
.matrix(accuracy
)
465 colnames(accuracy
) <- c("correlation")
466 rownames(accuracy
) <- cvTest
468 assign(validation
, accuracy
)
470 if (!is
.na(accuracy
[1,1])) {
471 validationAll
<- rbind(validationAll
, accuracy
)
477 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
479 if (!is
.null(validationAll
)) {
480 validationMean
<- data
.matrix(round(colMeans(validationAll
), digits
= 2))
482 rownames(validationMean
) <- c("Average")
484 validationAll
<- rbind(validationAll
, validationMean
)
485 colnames(validationAll
) <- c("Correlation")
488 validationAll
<- data
.frame(validationAll
)
493 predictionPopResult
<- c()
494 predictionPopGEBVs
<- c()
496 if (length(predictionData
) != 0) {
497 message("running prediction for selection candidates...marker data", ncol(predictionData
), " vs. ", ncol(genoDataFilteredObs
))
499 genoDataTrSl
<- rbind(genoDataFilteredObs
, predictionData
)
500 rTrSl
<- A
.mat(genoDataTrSl
)
502 predictionPopResult
<- kin
.blup(data
= phenoTrait
,
509 message("running prediction for selection candidates...DONE!!")
510 predictionPopGEBVs
<- round(data
.frame(predictionPopResult$g
), 3)
511 genotypesSl
<- rownames(predictionData
)
512 predictionPopGEBVs
<- predictionPopGEBVs
[(rownames(predictionPopGEBVs
) %in% genotypesSl
), ]
513 predictionPopGEBVs
<- data
.frame(predictionPopGEBVs
)
514 predictionPopGEBVs
<- data
.frame(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
516 colnames(predictionPopGEBVs
) <- c(trait
)
519 if (!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0) {
520 fwrite(predictionPopGEBVs
,
521 file
= predictionPopGEBVsFile
,
528 if(!is
.null(validationAll
)) {
529 fwrite(validationAll
,
530 file
= validationFile
,
538 if (!is
.null(ordered
.markerEffects
)) {
539 fwrite(ordered
.markerEffects
,
548 if (!is
.null(ordered
.trGEBV
)) {
549 fwrite(ordered
.trGEBV
,
558 if (length(combinedGebvsFile
) != 0 ) {
559 if(file
.info(combinedGebvsFile
)$size
== 0) {
560 fwrite(ordered
.trGEBV
,
561 file
= combinedGebvsFile
,
568 file
= combinedGebvsFile
,
577 if (!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0) {
578 fwrite(traitPhenoData
,
579 file
= traitPhenoFile
,
587 if (!is
.null(filteredGenoData
) && is
.null(readFilteredGenoData
)) {
588 fwrite(filteredGenoData
,
589 file
= filteredGenoFile
,
597 if (length(filteredPredGenoFile
) != 0 && is
.null(readFilteredPredGenoData
)) {
598 fwrite(filteredPredGenoData
,
599 file
= filteredPredGenoFile
,
606 ## if (!is.null(genoDataMissing)) {
607 ## write.table(genoData,
616 ## if (!is.null(predictionDataMissing)) {
617 ## write.table(predictionData,
618 ## file = predictionFile,
625 if (file
.info(relationshipMatrixFile
)$size
== 0) {
626 fwrite(relationshipMatrix
,
627 file
= relationshipMatrixFile
,
635 if (file
.info(formattedPhenoFile
)$size
== 0 && !is
.null(formattedPhenoData
) ) {
636 fwrite(formattedPhenoData
,
637 file
= formattedPhenoFile
,
646 q(save
= "no", runLast
= FALSE)