1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
12 allArgs
<- commandArgs()
14 inFile
<- grep("input_files",
21 outFile
<- grep("output_files",
28 outFiles
<- scan(outFile
,
33 inFiles
<- scan(inFile
,
38 traitsFile
<- grep("traits",
45 traitFile
<- grep("trait_info",
54 traitInfo
<- scan(traitFile
,
58 traitInfo
<-strsplit(traitInfo
, "\t");
59 traitId
<-traitInfo
[[1]]
62 datasetInfoFile
<- grep("dataset_info",
70 if(length(datasetInfoFile
) != 0 )
72 datasetInfo
<- scan(datasetInfoFile
,
76 datasetInfo
<- paste(datasetInfo
, collapse
= " ")
79 datasetInfo
<- c('single population')
83 validationTrait
<- paste("validation", trait
, sep
= "_")
85 validationFile
<- grep(validationTrait
,
92 kinshipTrait
<- paste("kinship", trait
, sep
= "_")
94 blupFile
<- grep(kinshipTrait
,
101 markerTrait
<- paste("marker", trait
, sep
= "_")
102 markerFile
<- grep(markerTrait
,
109 traitPhenoFile
<- paste("phenotype_trait", trait
, sep
= "_")
110 traitPhenoFile
<- grep(traitPhenoFile
,
117 print(traitPhenoFile
)
119 phenoFile
<- grep("phenotype_data",
125 message("phenotype dataset file: ", phenoFile
)
126 message("dataset info: ", datasetInfo
)
127 ## rowNamesColumn <- c()
128 ## if (datasetInfo == 'combined populations')
130 ## rowNamesColumn <- 1
132 ## rowNamesColumn <- NULL
134 ## message("row names column: ", rowNamesColumn)
135 phenoData
<- read
.table(phenoFile
,
139 na
.strings
= c("NA", " ", "--", "-"),
145 if (datasetInfo
== 'combined populations')
147 dropColumns
<- grep(trait
,
154 phenoTrait
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
156 row
.names(phenoTrait
) <- phenoTrait
[, 1]
157 phenoTrait
[, 1] <- NULL
159 print(phenoTrait
[1:10, ])
163 dropColumns
<- c("uniquename", "stock_name")
164 phenoData
<- phenoData
[,!(names(phenoData
) %in% dropColumns
)]
165 phenoTrait
<- subset(phenoData
,
166 select
= c("object_name", "stock_id", trait
)
169 if (sum(is
.na(phenoTrait
)) > 0)
171 print("sum of pheno missing values")
172 print(sum(is
.na(phenoTrait
)))
174 #fill in for missing data with mean value
175 phenoTrait
[, trait
] <- replace (phenoTrait
[, trait
],
176 is
.na(phenoTrait
[, trait
]),
177 mean(phenoTrait
[, trait
], na
.rm
=TRUE)
181 #calculate mean of reps/plots of the same accession and
182 #create new df with the accession means
183 dropColumns
<- c("stock_id")
184 phenoTrait
<- phenoTrait
[,!(names(phenoTrait
) %in% dropColumns
)]
185 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
186 phenoTrait
<- data
.frame(phenoTrait
)
187 print('phenotyped lines before averaging')
188 print(length(row
.names(phenoTrait
)))
189 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
190 print('phenotyped lines after averaging')
191 print(length(row
.names(phenoTrait
)))
193 #make stock_names row names
194 row
.names(phenoTrait
) <- phenoTrait
[, 1]
195 phenoTrait
[, 1] <- NULL
200 genoFile
<- grep("genotype_data",
209 if (trait
== 'FHB' || trait
== 'DON')
211 genoFile
<- c("~/cxgn/sgn-home/isaak/GS/barley/cap123_geno_training.txt")
214 genoData
<- read
.table(genoFile
,
218 na
.strings
= c("NA", " ", "--", "-"),
222 genoData
<- data
.matrix(genoData
[order(row
.names(genoData
)), ])
223 print(genoData
[1:10, 1:4])
225 predictionTempFile
<- grep("prediction_population",
232 predictionFile
<- c()
233 message('prediction temp genotype file: ', predictionTempFile
)
234 if (length(predictionTempFile
) !=0 )
236 predictionFile
<- scan(predictionTempFile
,
241 message('prediction genotype file: ', predictionFile
)
242 predictionPopGEBVsFile
<- grep("prediction_pop_gebvs",
249 message("prediction gebv file: ", predictionPopGEBVsFile
)
251 if (trait
== 'FHB' || trait
== 'DON')
253 predictionFile
<- c("~/cxgn/sgn-home/isaak/GS/barley/cap123_geno_prediction.txt")
256 predictionData
<- c()
258 if (length(predictionFile
) !=0 )
260 predictionData
<- read
.table(predictionFile
,
264 na
.strings
= c("NA", " ", "--", "-"),
269 #add checks for all input data
270 #create phenotype and genotype datasets with
272 message('phenotyped lines: ', length(row
.names(phenoTrait
)))
273 message('genotyped lines: ', length(row
.names(genoData
)))
275 #extract observation lines with both
276 #phenotype and genotype data only.
277 commonObs
<- intersect(row
.names(phenoTrait
), row
.names(genoData
))
278 commonObs
<-data
.frame(commonObs
)
279 rownames(commonObs
)<-commonObs
[, 1]
280 message('lines with both genotype and phenotype data: ', length(row
.names(commonObs
)))
281 #include in the genotype dataset only observation lines
283 message("genotype lines before filtering for phenotyped only: ", length(row
.names(genoData
)))
284 genoData
<-genoData
[(rownames(genoData
) %in% rownames(commonObs
)), ]
285 message("genotype lines after filtering for phenotyped only: ", length(row
.names(genoData
)))
286 #drop observation lines without genotype data
287 message("phenotype lines before filtering for genotyped only: ", length(row
.names(phenoTrait
)))
289 phenoTrait
<- merge(data
.frame(phenoTrait
), commonObs
, by
=0, all
=FALSE)
290 rownames(phenoTrait
) <-phenoTrait
[,1]
291 phenoTrait
[, 1] <- NULL
292 phenoTrait
[, 2] <- NULL
294 message("phenotype lines after filtering for genotyped only: ", length(row
.names(phenoTrait
)))
296 #a set of only observation lines with genotype data
297 traitPhenoData
<- as
.data
.frame(round(phenoTrait
, digits
=2))
299 #if (length(genotypesDiff) > 0)
300 # stop("Genotypes in the phenotype and genotype datasets don't match.")
302 phenoTrait
<- data
.matrix(phenoTrait
)
303 genoDataMatrix
<- data
.matrix(genoData
)
305 #impute genotype values for obs with missing values,
306 #based on mean of neighbouring 10 (arbitrary) obs
307 if (sum(is
.na(genoDataMatrix
)) > 0)
309 print("sum of geno missing values")
310 print(sum(is
.na(genoDataMatrix
)))
311 genoDataMatrix
<-kNNImpute(genoDataMatrix
, 10)
312 genoDataMatrix
<-as
.data
.frame(genoDataMatrix
)
314 #extract columns with imputed values
315 genoDataMatrix
<- subset(genoDataMatrix
,
316 select
= grep("^x", names(genoDataMatrix
))
319 #remove prefix 'x.' from imputed columns
320 names(genoDataMatrix
) <- sub("x.", "", names(genoDataMatrix
))
322 genoDataMatrix
<- round(genoDataMatrix
, digits
= 0)
323 genoDataMatrix
<- data
.matrix(genoDataMatrix
)
326 #impute missing data in prediction data
327 if (length(predictionData
) != 0)
329 predictionData
<- data
.matrix(predictionData
)
330 print('before imputation prediction data')
331 print(predictionData
[1:10, 1:4])
332 if (sum(is
.na(predictionData
)) > 0)
334 print("sum of geno missing values")
335 print(sum(is
.na(predictionData
)))
336 predictionData
<-kNNImpute(predictionData
, 10)
337 predictionData
<-as
.data
.frame(predictionData
)
339 #extract columns with imputed values
340 predictionData
<- subset(predictionData
,
341 select
= grep("^x", names(predictionData
))
344 #remove prefix 'x.' from imputed columns
345 names(predictionData
) <- sub("x.", "", names(predictionData
))
347 predictionData
<- round(predictionData
, digits
= 0)
348 predictionData
<- data
.matrix(predictionData
)
353 #change genotype coding to [-1, 0, 1], to use the A.mat )
354 genoDataMatrix
<- genoDataMatrix
- 1
356 if (length(predictionData
) != 0)
358 predictionData
<- predictionData
- 1
361 #use REML (default) to calculate variance components
363 #calculate GEBV using marker effects (as random effects)
365 markerGEBV
<- mixed
.solve(y
= phenoTrait
,
369 ordered
.markerGEBV2
<- data
.matrix(markerGEBV$u
)
370 ordered
.markerGEBV2
<- data
.matrix(ordered
.markerGEBV2
[order (-ordered
.markerGEBV2
[, 1]), ])
371 ordered
.markerGEBV2
<- round(ordered
.markerGEBV2
,
375 colnames(ordered
.markerGEBV2
) <- c("Marker Effects")
377 #additive relationship model
378 #calculate the inner products for
379 #genotypes (realized relationship matrix)
380 genocrsprd
<- tcrossprod(genoDataMatrix
)
382 #construct an identity matrix for genotypes
383 identityMatrix
<- diag(nrow(phenoTrait
))
385 iGEBV
<- mixed
.solve(y
= phenoTrait
,
390 #correlation between breeding values based on
391 #marker effects and relationship matrix
392 corGEBVs
<- cor(genoDataMatrix
%*% markerGEBV$u
, iGEBV$u
)
395 #iGEBVu<-iGEBVu[order(-ans$u), ]
396 iGEBV
<- data
.matrix(iGEBVu
)
398 ordered
.iGEBV
<- as
.data
.frame(iGEBV
[order(-iGEBV
[, 1]), ] )
400 ordered
.iGEBV
<- round(ordered
.iGEBV
,
404 combinedGebvsFile
<- grep('selected_traits_gebv',
412 if (length(combinedGebvsFile
) != 0)
414 fileSize
<- file
.info(combinedGebvsFile
)$size
417 combinedGebvs
<-read
.table(combinedGebvsFile
,
424 colnames(ordered
.iGEBV
) <- c(trait
)
426 traitGEBV
<- as
.data
.frame(ordered
.iGEBV
)
427 allGebvs
<- merge(combinedGebvs
, traitGEBV
,
431 rownames(allGebvs
) <- allGebvs
[,1]
436 colnames(ordered
.iGEBV
) <- c(trait
)
438 #TO-DO:account for minor allele frequency
442 reps
<- round_any(nrow(phenoTrait
), 10, f
= ceiling
) %/% 10
446 if (nrow(phenoTrait
) %% 10 == 0)
448 genotypeGroups
<- rep(1:10, reps
)
450 genotypeGroups
<- rep(1:10, reps
) [- (nrow(phenoTrait
) %% 10) ]
454 genotypeGroups
<- genotypeGroups
[ order (runif(nrow(phenoTrait
))) ]
460 tr
<- paste("trPop", i
, sep
= ".")
461 sl
<- paste("slPop", i
, sep
= ".")
463 trG
<- which(genotypeGroups
!= i
)
464 slG
<- which(genotypeGroups
== i
)
469 kblup
<- paste("rKblup", i
, sep
= ".")
471 result
<- kinship
.BLUP(y
= phenoTrait
[trG
],
472 G
.train
= genoDataMatrix
[trG
, ],
473 G
.pred
= genoDataMatrix
[slG
, ],
474 mixed
.method
= "REML",
478 assign(kblup
, result
)
480 #calculate cross-validation accuracy
481 accuracy
<- try(cor(result$g
.pred
, phenoTrait
[slG
]))
483 validation
<- paste("validation", i
, sep
= ".")
485 cvTest
<- paste("Test", i
, sep
= " ")
487 if (class(accuracy
) != "try-error")
489 accuracy
<- round(accuracy
, digits
= 2)
490 accuracy
<- data
.matrix(accuracy
)
492 colnames(accuracy
) <- c("correlation")
493 rownames(accuracy
) <- cvTest
495 assign(validation
, accuracy
)
497 validationAll
<- rbind(validationAll
, accuracy
)
501 validationAll
<- data
.matrix(validationAll
)
502 validationAll
<- data
.matrix(validationAll
[order(-validationAll
[, 1]), ])
504 if (is
.null(validationAll
) == FALSE)
506 validationMean
<- data
.matrix(round(colMeans(validationAll
),
511 rownames(validationMean
) <- c("Average")
513 validationAll
<- rbind(validationAll
, validationMean
)
514 colnames(validationAll
) <- c("Correlation")
517 #predict GEBVs for selection population
518 if (length(predictionData
) !=0 )
520 predictionData
<- data
.matrix(round(predictionData
, digits
= 0 ))
521 print("prediction genotype data")
522 print(predictionData
[1:10, 1:20])
525 predictionPopResult
<- c()
526 predictionPopGEBVs
<- c()
528 if(length(predictionData
) != 0)
530 predictionPopResult
<- kinship
.BLUP(y
= phenoTrait
,
531 G
.train
= genoDataMatrix
,
532 G
.pred
= predictionData
,
533 mixed
.method
= "REML",
537 predictionPopGEBVs
<- round(data
.matrix(predictionPopResult$g
.pred
), digits
= 2)
538 predictionPopGEBVs
<- data
.matrix(predictionPopGEBVs
[order(-predictionPopGEBVs
[, 1]), ])
541 colnames(predictionPopGEBVs
) <- c(trait
)
542 print("prediction output")
543 print(predictionPopGEBVs
[1:10,])
548 if(!is
.null(predictionPopGEBVs
) & length(predictionPopGEBVsFile
) != 0)
550 write
.table(predictionPopGEBVs
,
551 file
= predictionPopGEBVsFile
,
559 if(is
.null(validationAll
) == FALSE)
561 write
.table(validationAll
,
562 file
= validationFile
,
570 if(is
.null(ordered
.markerGEBV2
) == FALSE)
572 write
.table(ordered
.markerGEBV2
,
581 if(is
.null(ordered
.iGEBV
) == FALSE)
583 write
.table(ordered
.iGEBV
,
592 if(length(combinedGebvsFile
) != 0 )
594 if(file
.info(combinedGebvsFile
)$size
== 0)
596 write
.table(ordered
.iGEBV
,
597 file
= combinedGebvsFile
,
604 write
.table(allGebvs
,
605 file
= combinedGebvsFile
,
613 if(!is
.null(traitPhenoData
) & length(traitPhenoFile
) != 0)
615 write
.table(traitPhenoData
,
616 file
= traitPhenoFile
,
624 #should also send notification to analysis owner
625 to
<- c("<iyt2@cornell.edu>")
626 subject
<- paste(trait
, ' GS analysis done', sep
= ':')
627 body
<- c("Dear User,\n\n")
628 body
<- paste(body
, 'The genomic selection analysis for', sep
= "")
629 body
<- paste(body
, trait
, sep
= " ")
630 body
<- paste(body
, "is done.\n\nRegards and Thanks.\nSGN", sep
= " ")
632 #should use SGN's smtp server eventually
633 sendmail(to
, subject
, body
, password
= "rmail")
635 q(save
= "no", runLast
= FALSE)