Merge branch 'master' into topic/tracking_transformation
[sgn.git] / R / solGS / gs.r
blob6275c0ff57c5f30d4fd5292d95c9844b2da77450
1 #SNOPSIS
2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
3 #GBLUP method
5 #AUTHOR
6 # Isaak Y Tecle (iyt2@cornell.edu)
8 options(echo = FALSE)
9 # options(warn = -1)
10 suppressWarnings(suppressPackageStartupMessages({
11 library(methods)
12 library(rrBLUP)
13 library(plyr)
14 library(stringr)
15 library(randomForest)
16 library(parallel)
17 library(genoDataFilter)
18 library(phenoAnalysis)
19 library(caret)
20 library(dplyr)
21 library(tibble)
22 library(rlang)
23 library(jsonlite)
24 library(data.table)
25 }))
26 library(genoDataFilter)
27 library(Matrix)
29 allArgs <- commandArgs()
32 inputFiles <- scan(grep("input_files", allArgs, value = TRUE),
33 what = "character")
35 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
36 what = "character")
38 traitsFile <- grep("traits", inputFiles, value = TRUE)
39 modelInfoFile <- grep("model_info", inputFiles, value = TRUE)
40 message('model_info_file ', modelInfoFile)
42 modelInfo <- read.table(modelInfoFile,
43 header=TRUE, sep ="\t",
44 as.is = c('Value'))
46 modelInfo <- column_to_rownames(modelInfo, var="Name")
47 traitId <- modelInfo["trait_id", 1]
48 traitAbbr <- modelInfo["trait_abbr", 1]
49 modelId <- modelInfo["model_id", 1]
50 protocolId <- modelInfo["protocol_id", 1]
51 protocolPage <- modelInfo["protocol_url", 1]
53 message('trait_id ', traitId)
54 message('trait_abbr ', traitAbbr)
55 message('protocol_id ', protocolId)
56 message('protocol detail page ', protocolPage)
58 message('model_id ', modelId)
60 datasetInfoFile <- grep("dataset_info", inputFiles, value = TRUE)
61 datasetInfo <- c()
63 if (length(datasetInfoFile) != 0 ) {
64 datasetInfo <- scan(datasetInfoFile, what = "character")
65 datasetInfo <- paste(datasetInfo, collapse = " ")
66 } else {
67 datasetInfo <- c('single_population')
70 #validationTrait <- paste("validation", trait, sep = "_")
71 validationFile <- grep('validation', outputFiles, value = TRUE)
73 if (is.null(validationFile)) {
74 stop("Validation output file is missing.")
77 #kinshipTrait <- paste("rrblup_training_gebvs", trait, sep = "_")
78 blupFile <- grep('rrblup_training_gebvs', outputFiles, value = TRUE)
80 if (is.null(blupFile)) {
81 stop("GEBVs file is missing.")
84 #markerTrait <- paste("marker_effects", trait, sep = "_")
85 markerFile <- grep('marker_effects', outputFiles, value = TRUE)
87 #traitPhenoFile <- paste("trait_phenotype_data", traitId, sep = "_")
88 modelPhenoFile <- grep('model_phenodata', outputFiles, value = TRUE)
89 message('model input trait pheno file ', modelPhenoFile)
90 modelGenoFile <- grep('model_genodata', outputFiles, value = TRUE)
91 message('model input trait geno file ', modelGenoFile)
92 traitRawPhenoFile <- grep('trait_raw_phenodata', outputFiles, value = TRUE)
93 varianceComponentsFile <- grep("variance_components", outputFiles, value = TRUE)
94 analysisReportFile <- grep("_report_", outputFiles, value = TRUE)
95 genoFilteringLogFile <- grep("genotype_filtering_log", outputFiles, value = TRUE)
97 filteredTrainingGenoFile <- grep("filtered_training_genotype_data", outputFiles, value = TRUE)
98 filteredSelGenoFile <- grep("filtered_selection_genotype_data", outputFiles, value = TRUE)
99 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, value = TRUE)
101 genoFile <- grep("genotype_data_", inputFiles, value = TRUE)
103 if (is.null(genoFile)) {
104 stop("genotype data file is missing.")
107 if (file.info(genoFile)$size == 0) {
108 stop(paste0("genotype data file ", genoFile, " is empty."))
111 readfilteredTrainingGenoData <- c()
112 filteredTrainingGenoData <- c()
113 genoFilterLog <- c()
114 formattedPhenoData <- c()
115 phenoData <- c()
116 genoData <- c()
117 maf <- 0.01
118 markerFilter <- 0.6
119 cloneFilter <- 0.8
121 logHeading <- paste0("Genomic Prediction Analysis Log for ", traitAbbr, ".\n")
122 logHeading <- append(logHeading, paste0("Date: ", format(Sys.time(), "%d %b %Y %H:%M"), "\n\n\n"))
123 logHeading <- format(logHeading, width=80, justify="c")
124 trainingLog <- paste0("\n\n#Preprocessing training population genotype data.\n\n")
125 trainingLog <- append(trainingLog, "The following data filtering will be applied to the genotype dataset:\n\n")
126 trainingLog <- append(trainingLog, paste0("Markers with less or equal to ", maf * 100, "% minor allele frequency (maf) will be removed.\n"))
127 trainingLog <- append(trainingLog, paste0("\nMarkers with greater or equal to ", markerFilter * 100, "% missing values will be removed.\n"))
128 trainingLog <- append(trainingLog, paste0("Clones with greater or equal to ", cloneFilter * 100, "% missing values will be removed.\n") )
130 if (length(filteredTrainingGenoFile) != 0 && file.info(filteredTrainingGenoFile)$size != 0) {
131 filteredTrainingGenoData <- fread(filteredTrainingGenoFile,
132 na.strings = c("NA", "", "--", "-"),
133 header = TRUE)
135 genoData <- data.frame(filteredTrainingGenoData)
136 genoData <- column_to_rownames(genoData, 'V1')
137 readfilteredTrainingGenoData <- 1
140 if (is.null(filteredTrainingGenoData)) {
141 genoData <- fread(genoFile,
142 na.strings = c("NA", "", "--", "-"),
143 header = TRUE)
144 genoData <- unique(genoData, by='V1')
145 genoData <- data.frame(genoData)
146 genoData <- column_to_rownames(genoData, 'V1')
147 #genoDataFilter::filterGenoData
148 genoData <- convertToNumeric(genoData)
150 trainingLog <- append(trainingLog, "#Running training population genotype data cleaning.\n\n")
151 genoFilterOut <- filterGenoData(genoData, maf=maf, markerFilter=markerFilter, indFilter=cloneFilter, logReturn=TRUE)
153 genoData <- genoFilterOut$data
154 genoFilteringLog <- genoFilterOut$log
155 genoData <- roundAlleleDosage(genoData)
156 filteredTrainingGenoData <- genoData
158 } else {
159 genoFilteringLog <- scan(genoFilteringLogFile, what = "character", sep="\n")
160 genoFilteringLog <- paste0(genoFilteringLog, collapse="\n")
163 message("genofilteringlogfile: ", genoFilteringLogFile)
164 message(genoFilteringLog)
165 trainingLog <- append(trainingLog, genoFilteringLog)
167 genoData <- genoData[order(row.names(genoData)), ]
169 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
170 formattedPhenoData <- data.frame(fread(formattedPhenoFile,
171 header = TRUE,
172 na.strings = c("NA", "", "--", "-", ".")
175 } else {
177 if (datasetInfo == 'combined_populations') {
179 phenoFile <- grep("model_phenodata", inputFiles, value = TRUE)
180 } else {
182 phenoFile <- grep("\\/phenotype_data", inputFiles, value = TRUE)
185 if (is.null(phenoFile)) {
186 stop("phenotype data file is missing.")
189 if (file.info(phenoFile)$size == 0) {
190 stop(paste0("phenotype data file ", phenoFile, " is empty."))
194 phenoData <- data.frame(fread(phenoFile,
195 sep = "\t",
196 na.strings = c("NA", "", "--", "-", "."),
197 header = TRUE))
202 phenoTrait <- c()
203 traitRawPhenoData <- c()
204 anovaLog <- paste0("#Preprocessing training population phenotype data.\n\n")
206 if (datasetInfo == 'combined_populations') {
207 anovaLog <- scan(analysisReportFile, what = "character", sep="\n")
208 anovaLog <- paste0(anovaLog, collapse="\n")
210 if (!is.null(formattedPhenoData)) {
211 phenoTrait <- subset(formattedPhenoData, select = traitAbbr)
212 phenoTrait <- na.omit(phenoTrait)
214 } else {
216 if (any(grepl('Average', names(phenoData)))) {
217 phenoTrait <- phenoData %>% select(V1, Average) %>% data.frame
218 } else {
219 phenoTrait <- phenoData
222 colnames(phenoTrait) <- c('genotypes', traitAbbr)
224 } else {
226 if (!is.null(formattedPhenoData)) {
227 phenoTrait <- subset(formattedPhenoData, select = c('V1', traitAbbr))
228 phenoTrait <- as.data.frame(phenoTrait)
229 phenoTrait <- na.omit(phenoTrait)
230 colnames(phenoTrait)[1] <- 'genotypes'
232 } else if (length(grep('list', phenoFile)) != 0) {
233 phenoTrait <- averageTrait(phenoData, traitAbbr)
235 } else {
236 meansResult <- getAdjMeans(phenoData,
237 traitName = traitAbbr,
238 calcAverages = TRUE,
239 logReturn = TRUE)
243 anovaLog <- paste0(anovaLog, meansResult$log)
244 phenoTrait <- meansResult$adjMeans
247 keepMetaCols <- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
248 'studyYear', 'replicate', 'blockNumber', traitAbbr)
250 traitRawPhenoData <- phenoData %>%
251 select(all_of(keepMetaCols))
256 meanType <- names(phenoTrait)[2]
257 names(phenoTrait) <- c('genotypes', traitAbbr)
259 selectionTempFile <- grep("selection_population", inputFiles, value = TRUE)
261 selectionFile <- c()
262 filteredPredGenoFile <- c()
263 selectionAllFiles <- c()
265 if (length(selectionTempFile) !=0 ) {
266 selectionAllFiles <- scan(selectionTempFile, what = "character")
268 selectionFile <- grep("\\/genotype_data", selectionAllFiles, value = TRUE)
270 #filteredPredGenoFile <- grep("filtered_genotype_data_", selectionAllFiles, value = TRUE)
273 selectionPopGEBVsFile <- grep("rrblup_selection_gebvs", outputFiles, value = TRUE)
275 selectionData <- c()
276 readFilteredPredGenoData <- c()
277 filteredPredGenoData <- c()
279 ## if (length(filteredPredGenoFile) != 0 && file.info(filteredPredGenoFile)$size != 0) {
280 ## selectionData <- fread(filteredPredGenoFile, na.strings = c("NA", " ", "--", "-"),)
281 ## readFilteredPredGenoData <- 1
283 ## selectionData <- data.frame(selectionData)
284 ## rownames(selectionData) <- selectionData[, 1]
285 ## selectionData[, 1] <- NULL
287 ## } else
288 selectionLog <- c()
289 if (length(selectionFile) != 0) {
290 selectionLog <- append(selectionLog, paste0("#Data preprocessing of selection population genotype data.\n\n"))
292 selectionData <- fread(selectionFile,
293 header = TRUE,
294 na.strings = c("NA", "", "--", "-"))
296 selectionData <- data.frame(selectionData)
297 selectionData <- unique(selectionData, by='V1')
298 selectionData <- column_to_rownames(selectionData, 'V1')
299 selectionData <- convertToNumeric(selectionData)
301 selectionLog <- append(selectionLog, paste0("Running selection population genotype data cleaning."))
303 selectionFilterOut <- filterGenoData(selectionData, maf=maf, markerFilter=markerFilter, indFilter=cloneFilter, logReturn=TRUE)
304 selectionData <- selectionFilterOut$data
305 selectionLog <- append(selectionLog, selectionFilterOut$log)
306 selectionData <- roundAlleleDosage(selectionData)
309 #impute genotype values for obs with missing values,
310 genoDataMissing <- c()
312 if (sum(is.na(genoData)) > 0) {
313 genoDataMissing<- c('yes')
315 genoData <- na.roughfix(genoData)
316 genoData <- data.frame(genoData)
319 #create phenotype and genotype datasets with
320 #common stocks only
322 #extract observation lines with both
323 #phenotype and genotype data only.
324 trainingLog <- append(trainingLog, paste0("\n\n#Filtering for training population genotypes with both phenotype and marker data.\n\n"))
325 trainingLog <- append(trainingLog, paste0("After calculating trait averages, the training population phenotype dataset has ", length(rownames(phenoTrait)), " individuals.\n") )
326 trainingLog <- append(trainingLog, paste0("After cleaning up for missing values, the training population genotype dataset has ", length(rownames(genoData)), " individuals.\n") )
328 commonObs <- intersect(phenoTrait$genotypes, row.names(genoData))
330 trainingLog <- append(trainingLog, paste0(length(commonObs), " individuals are shared in both phenotype and genotype datasets.\n"))
332 #remove genotyped lines without phenotype data
333 genoDataFilteredObs <- genoData[(rownames(genoData) %in% commonObs), ]
335 trainingLog <- append(trainingLog, paste0("After removing individuals without phenotype data, this genotype dataset has ", length(rownames(genoDataFilteredObs)), " individuals.\n"))
337 #remove phenotyped lines without genotype data
338 phenoTrait <- phenoTrait[(phenoTrait$genotypes %in% commonObs), ]
340 trainingLog <- append(trainingLog, paste0("After removing individuals without genotype data, this phenotype dataset has ", length(rownames(phenoTrait)), " individuals.\n" ))
342 phenoTraitMarker <- data.frame(phenoTrait)
343 rownames(phenoTraitMarker) <- phenoTraitMarker[, 1]
344 phenoTraitMarker[, 1] <- NULL
346 #impute missing data in prediction data
348 selectionDataMissing <- c()
349 if (length(selectionData) != 0) {
350 #purge markers unique to both populations
351 trainingMarkers <- names(genoDataFilteredObs)
352 selectionMarkers <- names(selectionData)
354 selectionLog <- append(selectionLog, paste0("#Comparing markers in the training and selection populations genotype datasets.\n\n" ))
356 selectionLog <- append(selectionLog, paste0("The training population genotype dataset has ", length(trainingMarkers), " markers.\n" ))
357 selectionLog <- append(selectionLog, paste0("The selection population genotype dataset has ", length(selectionMarkers), " markers.\n" ))
359 commonMarkers <- intersect(trainingMarkers, selectionMarkers)
360 selectionLog <- append(selectionLog, paste0("The training and selection populations genotype dataset have ", length(trainingMarkers), " markers in common.\n" ))
362 genoDataFilteredObs <- subset(genoDataFilteredObs, select= commonMarkers)
363 selectionLog <- append(selectionLog, paste0("After filtering for shared markers, the training population genotype dataset has ", length(names(selectionData)), " markers.\n" ))
365 selectionData <- subset(selectionData, select = commonMarkers)
366 selectionLog <- append(selectionLog, paste0("After filtering for shared markers, the selection population genotype dataset has ", length(names(selectionData)), " markers.\n" ))
368 if (sum(is.na(selectionData)) > 0) {
369 selectionDataMissing <- c('yes')
370 selectionData <- na.roughfix(selectionData)
371 selectionData <- data.frame(selectionData)
374 #change genotype coding to [-1, 0, 1], to use the A.mat ) if [0, 1, 2]
375 genoTrCode <- grep("2", genoDataFilteredObs[1, ], value = TRUE)
376 if(length(genoTrCode)) {
377 genoData <- genoData - 1
378 genoDataFilteredObs <- genoDataFilteredObs - 1
381 if (length(selectionData) != 0 ) {
382 genoSlCode <- grep("2", selectionData[1, ], value = TRUE)
383 if (length(genoSlCode) != 0 ) {
384 selectionData <- selectionData - 1
388 ordered.markerEffects <- c()
389 trGEBV <- c()
390 validationAll <- c()
391 combinedGebvsFile <- c()
392 allGebvs <- c()
393 modelPhenoData <- c()
394 relationshipMatrix <- c()
396 #additive relationship model
397 #calculate the inner products for
398 #genotypes (realized relationship matrix)
399 relationshipMatrixFile <- grep("relationship_matrix_table", outputFiles, value = TRUE)
400 relationshipMatrixJsonFile <- grep("relationship_matrix_json", outputFiles, value = TRUE)
402 traitRelationshipMatrixFile <- grep("relationship_matrix_adjusted_table", outputFiles, value = TRUE)
403 traitRelationshipMatrixJsonFile <- grep("relationship_matrix_adjusted_json", outputFiles, value = TRUE)
405 inbreedingFile <- grep('inbreeding_coefficients', outputFiles, value=TRUE)
406 aveKinshipFile <- grep('average_kinship', outputFiles, value=TRUE)
408 inbreeding <- c()
409 aveKinship <- c()
411 if (length(relationshipMatrixFile) != 0) {
412 if (file.info(relationshipMatrixFile)$size > 0 ) {
413 relationshipMatrix <- data.frame(fread(relationshipMatrixFile,
414 header = TRUE))
416 rownames(relationshipMatrix) <- relationshipMatrix[, 1]
417 relationshipMatrix[, 1] <- NULL
418 colnames(relationshipMatrix) <- rownames(relationshipMatrix)
419 relationshipMatrix <- data.matrix(relationshipMatrix)
421 } else {
422 relationshipMatrix <- A.mat(genoData)
423 diag(relationshipMatrix) <- diag(relationshipMatrix) %>% replace(., . < 1, 1)
424 relationshipMatrix <- relationshipMatrix %>% replace(., . <= 0, 0.00001)
426 inbreeding <- diag(relationshipMatrix)
427 inbreeding <- inbreeding - 1
428 inbreeding <- data.frame(inbreeding)
430 inbreeding <- inbreeding %>%
431 rownames_to_column('genotypes') %>%
432 rename(Inbreeding = inbreeding) %>%
433 arrange(Inbreeding) %>%
434 mutate_at('Inbreeding', round, 3) %>%
435 column_to_rownames('genotypes')
439 relationshipMatrix <- data.frame(relationshipMatrix)
440 colnames(relationshipMatrix) <- rownames(relationshipMatrix)
442 relationshipMatrix <- rownames_to_column(relationshipMatrix, var="genotypes")
443 relationshipMatrix <- relationshipMatrix %>% mutate_if(is.numeric, round, 5)
444 relationshipMatrix <- column_to_rownames(relationshipMatrix, var="genotypes")
446 traitRelationshipMatrix <- relationshipMatrix[(rownames(relationshipMatrix) %in% commonObs), ]
447 traitRelationshipMatrix <- traitRelationshipMatrix[, (colnames(traitRelationshipMatrix) %in% commonObs)]
449 kinshipLog <- c()
450 if (any(eigen(traitRelationshipMatrix)$values < 0) ) {
451 kinshipLog <- paste0("\n\nNote: The kinship matrix of this dataset causes 'Not positive semi-definite error' while running the Cholesky decomposition. To fix this and run the modeling, a corrected positive semi-definite matrix was computed using the 'Matrix::nearPD' function. The negative eigen values from this decomposition nudged to positive values.\n\n")
453 traitRelationshipMatrix <- Matrix::nearPD(as.matrix(traitRelationshipMatrix))$mat
456 traitRelationshipMatrix <- data.matrix(traitRelationshipMatrix)
458 nCores <- detectCores()
460 if (nCores > 1) {
461 nCores <- (nCores %/% 2)
462 } else {
463 nCores <- 1
465 varCompData <- c()
466 modelingLog <- paste0("\n\n#Training a model for ", traitAbbr, ".\n\n")
467 modelingLog <- append(modelingLog, paste0("The genomic prediction modeling follows a two-step approach. First trait average values, as described above, are computed for each genotype. This is followed by the model fitting on the basis of single phenotype value for each genotype entry and kinship matrix computed from their marker data.\n"))
469 if (length(kinshipLog)) {
470 modelingLog <- append(modelingLog, paste0(kinshipLog))
473 if (length(selectionData) == 0) {
475 trModel <- kin.blup(data = phenoTrait,
476 geno = 'genotypes',
477 pheno = traitAbbr,
478 K = traitRelationshipMatrix,
479 n.core = nCores,
480 PEV = TRUE
483 modelingLog <- paste0(modelingLog, "The model training is based on rrBLUP R package, version ", packageVersion('rrBLUP'), ". GEBVs are predicted using the kin.blup function and GBLUP method.\n\n")
485 trGEBV <- trModel$g
486 trGEBVPEV <- trModel$PEV
487 trGEBVSE <- sqrt(trGEBVPEV)
488 trGEBVSE <- data.frame(round(trGEBVSE, 2))
490 trGEBV <- data.frame(round(trGEBV, 2))
492 colnames(trGEBVSE) <- c('SE')
493 colnames(trGEBV) <- traitAbbr
495 trGEBVSE <- rownames_to_column(trGEBVSE, var="genotypes")
496 trGEBV <- rownames_to_column(trGEBV, var="genotypes")
498 trGEBVSE <- full_join(trGEBV, trGEBVSE)
500 trGEBVSE <- trGEBVSE %>% arrange(across(traitAbbr, desc))
501 trGEBVSE <- column_to_rownames(trGEBVSE, var="genotypes")
503 trGEBV <- trGEBV %>% arrange(across(traitAbbr, desc))
504 trGEBV <- column_to_rownames(trGEBV, var="genotypes")
506 phenoTraitMarker <- data.matrix(phenoTraitMarker)
507 genoDataFilteredObs <- data.matrix(genoDataFilteredObs)
509 markerEffects <- mixed.solve(y = phenoTraitMarker,
510 Z = genoDataFilteredObs
513 ordered.markerEffects <- data.matrix(markerEffects$u)
514 ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
515 ordered.markerEffects <- round(ordered.markerEffects, 5)
517 colnames(ordered.markerEffects) <- c("Marker Effects")
518 ordered.markerEffects <- data.frame(ordered.markerEffects)
520 modelPhenoData <- data.frame(round(phenoTraitMarker, 2))
522 heritability <- round((trModel$Vg/(trModel$Ve + trModel$Vg)), 2)
523 additiveVar <- round(trModel$Vg, 2)
524 errorVar <- round(trModel$Ve, 2)
526 varCompData <- c("\nAdditive genetic variance\t", additiveVar, "\n")
527 varCompData <- append(varCompData, c("Error variance\t", errorVar, "\n"))
528 varCompData <- append(varCompData, c("SNP heritability (h)\t", heritability, "\n"))
530 combinedGebvsFile <- grep('selected_traits_gebv', outputFiles, ignore.case = TRUE,value = TRUE)
532 if (length(combinedGebvsFile) != 0) {
533 fileSize <- file.info(combinedGebvsFile)$size
534 if (fileSize != 0 ) {
535 combinedGebvs <- data.frame(fread(combinedGebvsFile,
536 header = TRUE))
538 rownames(combinedGebvs) <- combinedGebvs[,1]
539 combinedGebvs[,1] <- NULL
541 allGebvs <- merge(combinedGebvs, trGEBV,
542 by = 0,
543 all = TRUE
546 rownames(allGebvs) <- allGebvs[,1]
547 allGebvs[,1] <- NULL
551 #cross-validation
553 if (is.null(selectionFile)) {
554 genoNum <- nrow(phenoTrait)
556 if (genoNum < 20 ) {
557 warning(genoNum, " is too small number of genotypes.")
560 set.seed(4567)
562 k <- 10
563 reps <- 2
564 cvFolds <- createMultiFolds(phenoTrait[, 2], k=k, times=reps)
566 modelingLog <- paste0(modelingLog, "Model prediction accuracy is evaluated using cross-validation method. ", k, " folds, replicated ", reps, " times are used to predict the model accuracy.\n\n")
568 for ( r in 1:reps) {
569 re <- paste0('Rep', r)
571 for (i in 1:k) {
572 fo <- ifelse(i < 10, 'Fold0', 'Fold')
574 trFoRe <- paste0(fo, i, '.', re)
575 trG <- cvFolds[[trFoRe]]
576 slG <- as.numeric(rownames(phenoTrait[-trG,]))
578 kblup <- paste("rKblup", i, sep = ".")
580 result <- kin.blup(data = phenoTrait[trG,],
581 geno = 'genotypes',
582 pheno = traitAbbr,
583 K = traitRelationshipMatrix,
584 n.core = nCores,
585 PEV = TRUE
588 assign(kblup, result)
590 #calculate cross-validation accuracy
591 valBlups <- result$g
592 valBlups <- data.frame(valBlups)
594 slG <- slG[which(slG <= nrow(phenoTrait))]
596 slGDf <- phenoTrait[(rownames(phenoTrait) %in% slG),]
597 rownames(slGDf) <- slGDf[, 1]
598 slGDf[, 1] <- NULL
600 valBlups <- rownames_to_column(valBlups, var="genotypes")
601 slGDf <- rownames_to_column(slGDf, var="genotypes")
603 valCorData <- inner_join(slGDf, valBlups, by="genotypes")
604 valCorData$genotypes <- NULL
606 accuracy <- try(cor(valCorData))
607 validation <- paste("validation", trFoRe, sep = ".")
608 cvTest <- paste("CV", trFoRe, sep = " ")
610 if (inherits(accuracy, 'try-error') == FALSE)
612 accuracy <- round(accuracy[1,2], digits = 3)
613 accuracy <- data.matrix(accuracy)
615 colnames(accuracy) <- c("correlation")
616 rownames(accuracy) <- cvTest
618 assign(validation, accuracy)
620 if (!is.na(accuracy[1,1])) {
621 validationAll <- rbind(validationAll, accuracy)
627 validationAll <- data.frame(validationAll[order(-validationAll[, 1]), ])
628 colnames(validationAll) <- c('Correlation')
632 selectionPopResult <- c()
633 selectionPopGEBVs <- c()
634 selectionPopGEBVSE <- c()
636 #selection pop geno data after cleaning up and removing unique markers to selection pop
637 filteredSelGenoData <- selectionData
638 if (length(selectionData) != 0) {
640 genoDataTrSl <- rbind(genoDataFilteredObs, selectionData)
641 rTrSl <- A.mat(genoDataTrSl)
643 selectionPopResult <- kin.blup(data = phenoTrait,
644 geno = 'genotypes',
645 pheno = traitAbbr,
646 K = rTrSl,
647 n.core = nCores,
648 PEV = TRUE
651 selectionPopGEBVs <- round(data.frame(selectionPopResult$g), 2)
652 colnames(selectionPopGEBVs) <- traitAbbr
653 selectionPopGEBVs <- rownames_to_column(selectionPopGEBVs, var="genotypes")
655 selectionPopPEV <- selectionPopResult$PEV
656 selectionPopSE <- sqrt(selectionPopPEV)
657 selectionPopSE <- data.frame(round(selectionPopSE, 2))
658 colnames(selectionPopSE) <- 'SE'
659 genotypesSl <- rownames(selectionData)
661 selectionPopSE <- rownames_to_column(selectionPopSE, var="genotypes")
662 selectionPopSE <- selectionPopSE %>% filter(genotypes %in% genotypesSl)
664 selectionPopGEBVs <- selectionPopGEBVs %>% filter(genotypes %in% genotypesSl)
666 selectionPopGEBVSE <- inner_join(selectionPopGEBVs, selectionPopSE, by="genotypes")
668 sortVar <- parse_expr(traitAbbr)
669 selectionPopGEBVs <- selectionPopGEBVs %>% arrange(desc((!!sortVar)))
670 selectionPopGEBVs <- column_to_rownames(selectionPopGEBVs, var="genotypes")
672 selectionPopGEBVSE <- selectionPopGEBVSE %>% arrange(desc((!!sortVar)))
673 selectionPopGEBVSE <- column_to_rownames(selectionPopGEBVSE, var="genotypes")
676 if (!is.null(selectionPopGEBVs) & length(selectionPopGEBVsFile) != 0) {
677 fwrite(selectionPopGEBVs,
678 file = selectionPopGEBVsFile,
679 row.names = TRUE,
680 sep = "\t",
681 quote = FALSE,
685 if(!is.null(validationAll)) {
686 fwrite(validationAll,
687 file = validationFile,
688 row.names = TRUE,
689 sep = "\t",
690 quote = FALSE,
695 if (!is.null(ordered.markerEffects)) {
696 fwrite(ordered.markerEffects,
697 file = markerFile,
698 row.names = TRUE,
699 sep = "\t",
700 quote = FALSE,
705 if (!is.null(trGEBV)) {
706 fwrite(trGEBV,
707 file = blupFile,
708 row.names = TRUE,
709 sep = "\t",
710 quote = FALSE,
714 if (length(combinedGebvsFile) != 0 ) {
715 if(file.info(combinedGebvsFile)$size == 0) {
716 fwrite(trGEBV,
717 file = combinedGebvsFile,
718 row.names = TRUE,
719 sep = "\t",
720 quote = FALSE,
722 } else {
723 fwrite(allGebvs,
724 file = combinedGebvsFile,
725 row.names = TRUE,
726 sep = "\t",
727 quote = FALSE,
732 if (!is.null(modelPhenoData) && length(modelPhenoFile) != 0) {
734 if (!is.null(meanType)) {
735 colnames(modelPhenoData) <- meanType
738 fwrite(modelPhenoData,
739 file = modelPhenoFile,
740 row.names = TRUE,
741 sep = "\t",
742 quote = FALSE,
746 if (!is.null(genoDataFilteredObs) && length(modelGenoFile) != 0) {
748 fwrite(genoDataFilteredObs,
749 file = modelGenoFile,
750 row.names = TRUE,
751 sep = "\t",
752 quote = FALSE,
756 if (!is.null(traitRawPhenoData) && length(traitRawPhenoFile) != 0) {
758 fwrite(traitRawPhenoData,
759 file = traitRawPhenoFile,
760 row.names = FALSE,
761 sep = "\t",
762 na = 'NA',
763 quote = FALSE,
767 if (!is.null(filteredTrainingGenoData) && file.info(filteredTrainingGenoFile)$size == 0) {
768 fwrite(filteredTrainingGenoData,
769 file = filteredTrainingGenoFile,
770 row.names = TRUE,
771 sep = "\t",
772 quote = FALSE,
775 cat(genoFilteringLog, fill = TRUE, file = genoFilteringLogFile, append=FALSE)
778 if (length(filteredSelGenoFile) != 0 && file.info(filteredSelGenoFile)$size == 0) {
779 fwrite(filteredSelGenoData,
780 file = filteredSelGenoFile,
781 row.names = TRUE,
782 sep = "\t",
783 quote = FALSE,
787 ## if (!is.null(genoDataMissing)) {
788 ## write.table(genoData,
789 ## file = genoFile,
790 ## sep = "\t",
791 ## col.names = NA,
792 ## quote = FALSE,
793 ## )
795 ## }
797 ## if (!is.null(predictionDataMissing)) {
798 ## write.table(predictionData,
799 ## file = predictionFile,
800 ## sep = "\t",
801 ## col.names = NA,
802 ## quote = FALSE,
803 ## )
804 ## }
806 if (file.info(relationshipMatrixFile)$size == 0) {
808 fwrite(relationshipMatrix,
809 file = relationshipMatrixFile,
810 row.names = TRUE,
811 sep = "\t",
812 quote = FALSE,
816 if (file.info(relationshipMatrixJsonFile)$size == 0) {
818 relationshipMatrixJson <- relationshipMatrix
819 relationshipMatrixJson[upper.tri(relationshipMatrixJson)] <- NA
822 relationshipMatrixJson <- data.frame(relationshipMatrixJson)
824 relationshipMatrixList <- list(labels = names(relationshipMatrixJson),
825 values = relationshipMatrixJson)
827 relationshipMatrixJson <- jsonlite::toJSON(relationshipMatrixList)
830 write(relationshipMatrixJson,
831 file = relationshipMatrixJsonFile,
835 if (file.info(traitRelationshipMatrixFile)$size == 0) {
837 inbre <- diag(traitRelationshipMatrix)
838 inbre <- inbre - 1
840 diag(traitRelationshipMatrix) <- inbre
842 traitRelationshipMatrix <- data.frame(traitRelationshipMatrix) %>% replace(., . < 0, 0)
844 fwrite(traitRelationshipMatrix,
845 file = traitRelationshipMatrixFile,
846 row.names = TRUE,
847 sep = "\t",
848 quote = FALSE,
851 if (file.info(traitRelationshipMatrixJsonFile)$size == 0) {
853 traitRelationshipMatrixJson <- traitRelationshipMatrix
854 traitRelationshipMatrixJson[upper.tri(traitRelationshipMatrixJson)] <- NA
856 traitRelationshipMatrixJson <- data.frame(traitRelationshipMatrixJson)
858 traitRelationshipMatrixList <- list(labels = names(traitRelationshipMatrixJson),
859 values = traitRelationshipMatrixJson)
861 traitRelationshipMatrixJson <- jsonlite::toJSON(traitRelationshipMatrixList)
863 write(traitRelationshipMatrixJson,
864 file = traitRelationshipMatrixJsonFile,
870 if (file.info(inbreedingFile)$size == 0) {
872 fwrite(inbreeding,
873 file = inbreedingFile,
874 row.names = TRUE,
875 sep = "\t",
876 quote = FALSE,
880 if (file.info(aveKinshipFile)$size == 0) {
882 aveKinship <- data.frame(apply(traitRelationshipMatrix, 1, mean))
884 aveKinship<- aveKinship %>%
885 rownames_to_column('genotypes') %>%
886 rename(Mean_kinship = contains('traitRe')) %>%
887 arrange(Mean_kinship) %>%
888 mutate_at('Mean_kinship', round, 3) %>%
889 column_to_rownames('genotypes')
891 fwrite(aveKinship,
892 file = aveKinshipFile,
893 row.names = TRUE,
894 sep = "\t",
895 quote = FALSE,
899 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
900 fwrite(formattedPhenoData,
901 file = formattedPhenoFile,
902 row.names = TRUE,
903 sep = "\t",
904 quote = FALSE,
907 if (!is.null(varCompData)) {
908 cat(varCompData, file = varianceComponentsFile)
911 if (!is.null(selectionLog)) {
912 cat(logHeading, selectionLog, fill = TRUE, file = analysisReportFile, append=FALSE)
913 } else {
914 cat(logHeading, anovaLog, trainingLog, modelingLog, fill = TRUE, file = analysisReportFile, append=FALSE)
917 message("Done.")
919 q(save = "no", runLast = FALSE)