add documentation for some of the functions.
[sgn.git] / R / gs.r
blob7c7f8a303b3d35392671c73487958607d16d7c53
1 #a script for calculating genomic
2 #estimated breeding values (GEBVs) using rrBLUP
4 options(echo = FALSE)
6 library(rrBLUP)
7 library(plyr)
8 library(mail)
9 library(imputation)
10 library(stringr)
12 allArgs <- commandArgs()
14 inFile <- grep("input_files",
15 allArgs,
16 ignore.case = TRUE,
17 perl = TRUE,
18 value = TRUE
21 outFile <- grep("output_files",
22 allArgs,
23 ignore.case = TRUE,
24 perl = TRUE,
25 value = TRUE
28 outFiles <- scan(outFile,
29 what = "character"
31 print(outFiles)
33 inFiles <- scan(inFile,
34 what = "character"
36 print(inFiles)
38 traitsFile <- grep("traits",
39 inFiles,
40 ignore.case = TRUE,
41 fixed = FALSE,
42 value = TRUE
45 traitFile <- grep("trait_info",
46 inFiles,
47 ignore.case = TRUE,
48 fixed = FALSE,
49 value = TRUE
52 print(traitFile)
54 traitInfo <- scan(traitFile,
55 what = "character",
58 traitInfo<-strsplit(traitInfo, "\t");
59 traitId<-traitInfo[[1]]
60 trait<-traitInfo[[2]]
62 datasetInfoFile <- grep("dataset_info",
63 inFiles,
64 ignore.case = TRUE,
65 fixed = FALSE,
66 value = TRUE
68 datasetInfo <- c()
70 if(length(datasetInfoFile) != 0 )
72 datasetInfo <- scan(datasetInfoFile,
73 what= "character"
76 datasetInfo <- paste(datasetInfo, collapse = " ")
78 } else {
79 datasetInfo <- c('single population')
83 validationTrait <- paste("validation", trait, sep = "_")
85 validationFile <- grep(validationTrait,
86 outFiles,
87 ignore.case=TRUE,
88 fixed = FALSE,
89 value=TRUE
92 kinshipTrait <- paste("kinship", trait, sep = "_")
94 blupFile <- grep(kinshipTrait,
95 outFiles,
96 ignore.case = TRUE,
97 fixed = FALSE,
98 value = TRUE
100 print(blupFile)
101 markerTrait <- paste("marker", trait, sep = "_")
102 markerFile <- grep(markerTrait,
103 outFiles,
104 ignore.case = TRUE,
105 fixed = FALSE,
106 value = TRUE
108 print(markerFile)
109 traitPhenoFile <- paste("phenotype_trait", trait, sep = "_")
110 traitPhenoFile <- grep(traitPhenoFile,
111 outFiles,
112 ignore.case = TRUE,
113 fixed = FALSE,
114 value = TRUE
117 print(traitPhenoFile)
119 phenoFile <- grep("phenotype_data",
120 inFiles,
121 ignore.case = TRUE,
122 fixed = FALSE,
123 value = TRUE
125 message("phenotype dataset file: ", phenoFile)
126 message("dataset info: ", datasetInfo)
127 ## rowNamesColumn <- c()
128 ## if (datasetInfo == 'combined populations')
129 ## {
130 ## rowNamesColumn <- 1
131 ## }else {
132 ## rowNamesColumn <- NULL
133 ## }
134 ## message("row names column: ", rowNamesColumn)
135 phenoData <- read.table(phenoFile,
136 header = TRUE,
137 row.names = NULL,
138 sep = "\t",
139 na.strings = c("NA", " ", "--", "-"),
140 dec = "."
143 phenoTrait <- c()
145 if (datasetInfo == 'combined populations')
147 dropColumns <- grep(trait,
148 names(phenoData),
149 ignore.case = TRUE,
150 value = TRUE,
151 fixed = FALSE
154 phenoTrait <- phenoData[,!(names(phenoData) %in% dropColumns)]
156 row.names(phenoTrait) <- phenoTrait[, 1]
157 phenoTrait[, 1] <- NULL
159 print(phenoTrait[1:10, ])
161 } else {
163 dropColumns <- c("uniquename", "stock_name")
164 phenoData <- phenoData[,!(names(phenoData) %in% dropColumns)]
165 phenoTrait <- subset(phenoData,
166 select = c("object_name", "stock_id", trait)
169 if (sum(is.na(phenoTrait)) > 0)
171 print("sum of pheno missing values")
172 print(sum(is.na(phenoTrait)))
174 #fill in for missing data with mean value
175 phenoTrait[, trait] <- replace (phenoTrait[, trait],
176 is.na(phenoTrait[, trait]),
177 mean(phenoTrait[, trait], na.rm =TRUE)
181 #calculate mean of reps/plots of the same accession and
182 #create new df with the accession means
183 dropColumns <- c("stock_id")
184 phenoTrait <- phenoTrait[,!(names(phenoTrait) %in% dropColumns)]
185 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
186 phenoTrait <- data.frame(phenoTrait)
187 print('phenotyped lines before averaging')
188 print(length(row.names(phenoTrait)))
189 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
190 print('phenotyped lines after averaging')
191 print(length(row.names(phenoTrait)))
193 #make stock_names row names
194 row.names(phenoTrait) <- phenoTrait[, 1]
195 phenoTrait[, 1] <- NULL
200 genoFile <- grep("genotype_data",
201 inFiles,
202 ignore.case = TRUE,
203 fixed = FALSE,
204 value = TRUE
207 print(genoFile)
209 if (trait == 'FHB' || trait == 'DON')
211 genoFile <- c("~/cxgn/sgn-home/isaak/GS/barley/cap123_geno_training.txt")
214 genoData <- read.table(genoFile,
215 header = TRUE,
216 row.names = 1,
217 sep = "\t",
218 na.strings = c("NA", " ", "--", "-"),
219 dec = "."
222 genoData <- data.matrix(genoData[order(row.names(genoData)), ])
223 print(genoData[1:10, 1:4])
225 predictionTempFile <- grep("prediction_population",
226 inFiles,
227 ignore.case = TRUE,
228 fixed = FALSE,
229 value = TRUE
232 predictionFile <- c()
233 message('prediction temp genotype file: ', predictionTempFile)
234 if (length(predictionTempFile) !=0 )
236 predictionFile <- scan(predictionTempFile,
237 what="character"
241 message('prediction genotype file: ', predictionFile)
242 predictionPopGEBVsFile <- grep("prediction_pop_gebvs",
243 outFiles,
244 ignore.case = TRUE,
245 fixed = FALSE,
246 value = TRUE
249 message("prediction gebv file: ", predictionPopGEBVsFile)
251 if (trait == 'FHB' || trait == 'DON')
253 predictionFile <- c("~/cxgn/sgn-home/isaak/GS/barley/cap123_geno_prediction.txt")
256 predictionData <- c()
258 if (length(predictionFile) !=0 )
260 predictionData <- read.table(predictionFile,
261 header = TRUE,
262 row.names = 1,
263 sep = "\t",
264 na.strings = c("NA", " ", "--", "-"),
265 dec = "."
269 #add checks for all input data
270 #create phenotype and genotype datasets with
271 #common stocks only
272 message('phenotyped lines: ', length(row.names(phenoTrait)))
273 message('genotyped lines: ', length(row.names(genoData)))
275 #extract observation lines with both
276 #phenotype and genotype data only.
277 commonObs <- intersect(row.names(phenoTrait), row.names(genoData))
278 commonObs<-data.frame(commonObs)
279 rownames(commonObs)<-commonObs[, 1]
280 message('lines with both genotype and phenotype data: ', length(row.names(commonObs)))
281 #include in the genotype dataset only observation lines
282 #with phenotype data
283 message("genotype lines before filtering for phenotyped only: ", length(row.names(genoData)))
284 genoData<-genoData[(rownames(genoData) %in% rownames(commonObs)), ]
285 message("genotype lines after filtering for phenotyped only: ", length(row.names(genoData)))
286 #drop observation lines without genotype data
287 message("phenotype lines before filtering for genotyped only: ", length(row.names(phenoTrait)))
289 phenoTrait <- merge(data.frame(phenoTrait), commonObs, by=0, all=FALSE)
290 rownames(phenoTrait) <-phenoTrait[,1]
291 phenoTrait[, 1] <- NULL
292 phenoTrait[, 2] <- NULL
294 message("phenotype lines after filtering for genotyped only: ", length(row.names(phenoTrait)))
296 #a set of only observation lines with genotype data
297 traitPhenoData <- as.data.frame(round(phenoTrait, digits=2))
299 #if (length(genotypesDiff) > 0)
300 # stop("Genotypes in the phenotype and genotype datasets don't match.")
302 phenoTrait <- data.matrix(phenoTrait)
303 genoDataMatrix <- data.matrix(genoData)
305 #impute genotype values for obs with missing values,
306 #based on mean of neighbouring 10 (arbitrary) obs
307 if (sum(is.na(genoDataMatrix)) > 0)
309 print("sum of geno missing values")
310 print(sum(is.na(genoDataMatrix)))
311 genoDataMatrix <-kNNImpute(genoDataMatrix, 10)
312 genoDataMatrix <-as.data.frame(genoDataMatrix)
314 #extract columns with imputed values
315 genoDataMatrix <- subset(genoDataMatrix,
316 select = grep("^x", names(genoDataMatrix))
319 #remove prefix 'x.' from imputed columns
320 names(genoDataMatrix) <- sub("x.", "", names(genoDataMatrix))
322 genoDataMatrix <- round(genoDataMatrix, digits = 0)
323 genoDataMatrix <- data.matrix(genoDataMatrix)
326 #impute missing data in prediction data
327 if (length(predictionData) != 0)
329 predictionData <- data.matrix(predictionData)
330 print('before imputation prediction data')
331 print(predictionData[1:10, 1:4])
332 if (sum(is.na(predictionData)) > 0)
334 print("sum of geno missing values")
335 print(sum(is.na(predictionData)))
336 predictionData <-kNNImpute(predictionData, 10)
337 predictionData <-as.data.frame(predictionData)
339 #extract columns with imputed values
340 predictionData <- subset(predictionData,
341 select = grep("^x", names(predictionData))
344 #remove prefix 'x.' from imputed columns
345 names(predictionData) <- sub("x.", "", names(predictionData))
347 predictionData <- round(predictionData, digits = 0)
348 predictionData <- data.matrix(predictionData)
353 #change genotype coding to [-1, 0, 1], to use the A.mat )
354 genoDataMatrix <- genoDataMatrix - 1
356 if (length(predictionData) != 0)
358 predictionData <- predictionData - 1
361 #use REML (default) to calculate variance components
363 #calculate GEBV using marker effects (as random effects)
365 markerGEBV <- mixed.solve(y = phenoTrait,
366 Z = genoDataMatrix
369 ordered.markerGEBV2 <- data.matrix(markerGEBV$u)
370 ordered.markerGEBV2 <- data.matrix(ordered.markerGEBV2 [order (-ordered.markerGEBV2[, 1]), ])
371 ordered.markerGEBV2 <- round(ordered.markerGEBV2,
372 digits=3
375 colnames(ordered.markerGEBV2) <- c("Marker Effects")
377 #additive relationship model
378 #calculate the inner products for
379 #genotypes (realized relationship matrix)
380 genocrsprd <- tcrossprod(genoDataMatrix)
382 #construct an identity matrix for genotypes
383 identityMatrix <- diag(nrow(phenoTrait))
385 iGEBV <- mixed.solve(y = phenoTrait,
386 Z = identityMatrix,
387 K = genocrsprd
390 #correlation between breeding values based on
391 #marker effects and relationship matrix
392 corGEBVs <- cor(genoDataMatrix %*% markerGEBV$u, iGEBV$u)
394 iGEBVu <- iGEBV$u
395 #iGEBVu<-iGEBVu[order(-ans$u), ]
396 iGEBV <- data.matrix(iGEBVu)
398 ordered.iGEBV <- as.data.frame(iGEBV[order(-iGEBV[, 1]), ] )
400 ordered.iGEBV <- round(ordered.iGEBV,
401 digits = 3
404 combinedGebvsFile <- grep('selected_traits_gebv',
405 outFiles,
406 ignore.case = TRUE,
407 fixed = FALSE,
408 value = TRUE
411 allGebvs<-c()
412 if (length(combinedGebvsFile) != 0)
414 fileSize <- file.info(combinedGebvsFile)$size
415 if (fileSize != 0 )
417 combinedGebvs<-read.table(combinedGebvsFile,
418 header = TRUE,
419 row.names = 1,
420 dec = ".",
421 sep = "\t"
424 colnames(ordered.iGEBV) <- c(trait)
426 traitGEBV <- as.data.frame(ordered.iGEBV)
427 allGebvs <- merge(combinedGebvs, traitGEBV,
428 by = 0,
429 all = TRUE
431 rownames(allGebvs) <- allGebvs[,1]
432 allGebvs[,1] <- NULL
436 colnames(ordered.iGEBV) <- c(trait)
438 #TO-DO:account for minor allele frequency
440 #cross-validation
442 reps <- round_any(nrow(phenoTrait), 10, f = ceiling) %/% 10
444 genotypeGroups <-c()
446 if (nrow(phenoTrait) %% 10 == 0)
448 genotypeGroups <- rep(1:10, reps)
449 } else {
450 genotypeGroups <- rep(1:10, reps) [- (nrow(phenoTrait) %% 10) ]
453 set.seed(4567)
454 genotypeGroups <- genotypeGroups[ order (runif(nrow(phenoTrait))) ]
456 validationAll <- c()
458 for (i in 1:10)
460 tr <- paste("trPop", i, sep = ".")
461 sl <- paste("slPop", i, sep = ".")
463 trG <- which(genotypeGroups != i)
464 slG <- which(genotypeGroups == i)
466 assign(tr, trG)
467 assign(sl, slG)
469 kblup <- paste("rKblup", i, sep = ".")
471 result <- kinship.BLUP(y = phenoTrait[trG],
472 G.train = genoDataMatrix[trG, ],
473 G.pred = genoDataMatrix[slG, ],
474 mixed.method = "REML",
475 K.method = "RR"
478 assign(kblup, result)
480 #calculate cross-validation accuracy
481 accuracy <- try(cor(result$g.pred, phenoTrait[slG]))
483 validation <- paste("validation", i, sep = ".")
485 cvTest <- paste("Test", i, sep = " ")
487 if (class(accuracy) != "try-error")
489 accuracy <- round(accuracy, digits = 2)
490 accuracy <- data.matrix(accuracy)
492 colnames(accuracy) <- c("correlation")
493 rownames(accuracy) <- cvTest
495 assign(validation, accuracy)
497 validationAll <- rbind(validationAll, accuracy)
501 validationAll <- data.matrix(validationAll)
502 validationAll <- data.matrix(validationAll[order(-validationAll[, 1]), ])
504 if (is.null(validationAll) == FALSE)
506 validationMean <- data.matrix(round(colMeans(validationAll),
507 digits = 2
511 rownames(validationMean) <- c("Average")
513 validationAll <- rbind(validationAll, validationMean)
514 colnames(validationAll) <- c("Correlation")
517 #predict GEBVs for selection population
518 if (length(predictionData) !=0 )
520 predictionData <- data.matrix(round(predictionData, digits = 0 ))
521 print("prediction genotype data")
522 print(predictionData[1:10, 1:20])
525 predictionPopResult <- c()
526 predictionPopGEBVs <- c()
528 if(length(predictionData) != 0)
530 predictionPopResult <- kinship.BLUP(y = phenoTrait,
531 G.train = genoDataMatrix,
532 G.pred = predictionData,
533 mixed.method = "REML",
534 K.method = "RR"
537 predictionPopGEBVs <- round(data.matrix(predictionPopResult$g.pred), digits = 2)
538 predictionPopGEBVs <- data.matrix(predictionPopGEBVs[order(-predictionPopGEBVs[, 1]), ])
541 colnames(predictionPopGEBVs) <- c(trait)
542 print("prediction output")
543 print(predictionPopGEBVs[1:10,])
548 if(!is.null(predictionPopGEBVs) & length(predictionPopGEBVsFile) != 0)
550 write.table(predictionPopGEBVs,
551 file = predictionPopGEBVsFile,
552 sep = "\t",
553 col.names = NA,
554 quote = FALSE,
555 append = FALSE
559 if(is.null(validationAll) == FALSE)
561 write.table(validationAll,
562 file = validationFile,
563 sep = "\t",
564 col.names = NA,
565 quote = FALSE,
566 append = FALSE
570 if(is.null(ordered.markerGEBV2) == FALSE)
572 write.table(ordered.markerGEBV2,
573 file = markerFile,
574 sep = "\t",
575 col.names = NA,
576 quote = FALSE,
577 append = FALSE
581 if(is.null(ordered.iGEBV) == FALSE)
583 write.table(ordered.iGEBV,
584 file = blupFile,
585 sep = "\t",
586 col.names = NA,
587 quote = FALSE,
588 append = FALSE
592 if(length(combinedGebvsFile) != 0 )
594 if(file.info(combinedGebvsFile)$size == 0)
596 write.table(ordered.iGEBV,
597 file = combinedGebvsFile,
598 sep = "\t",
599 col.names = NA,
600 quote = FALSE,
602 }else
604 write.table(allGebvs,
605 file = combinedGebvsFile,
606 sep = "\t",
607 quote = FALSE,
608 col.names = NA,
613 if(!is.null(traitPhenoData) & length(traitPhenoFile) != 0)
615 write.table(traitPhenoData,
616 file = traitPhenoFile,
617 sep = "\t",
618 col.names = NA,
619 quote = FALSE,
620 append = FALSE
624 #should also send notification to analysis owner
625 to <- c("<iyt2@cornell.edu>")
626 subject <- paste(trait, ' GS analysis done', sep = ':')
627 body <- c("Dear User,\n\n")
628 body <- paste(body, 'The genomic selection analysis for', sep = "")
629 body <- paste(body, trait, sep = " ")
630 body <- paste(body, "is done.\n\nRegards and Thanks.\nSGN", sep = " ")
632 #should use SGN's smtp server eventually
633 sendmail(to, subject, body, password = "rmail")
635 q(save = "no", runLast = FALSE)