R/solGS/gs.r

   1 #SNOPSIS
   2 #calculates genomic estimated breeding values (GEBVs) using rrBLUP,
   3 #GBLUP method
   4
   5 #AUTHOR
   6 # Isaak Y Tecle (iyt2@cornell.edu)
   7
   8 options(echo = FALSE)
   9 # options(warn = -1)
  10 suppressWarnings(suppressPackageStartupMessages({
  11 library(methods)
  12 library(rrBLUP)
  13 library(plyr)
  14 library(stringr)
  15 library(randomForest)
  16 library(parallel)
  17 library(genoDataFilter)
  18 library(phenoAnalysis)
  19 library(caret)
  20 library(dplyr)
  21 library(tibble)
  22 library(rlang)
  23 library(jsonlite)
  24 library(data.table)
  25   }))
  26 library(genoDataFilter)
  27 library(Matrix)
  28
  29 allArgs <- commandArgs()
  30
  31
  32 inputFiles  <- scan(grep("input_files", allArgs, value = TRUE),
  33                    what = "character")
  34
  35 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
  36                     what = "character")
  37
  38 traitsFile <- grep("traits", inputFiles,  value = TRUE)
  39 modelInfoFile  <- grep("model_info", inputFiles, value = TRUE)
  40 message('model_info_file ', modelInfoFile)
  41
  42 modelInfo  <- read.table(modelInfoFile,
  43                          header=TRUE, sep ="\t",
  44                          as.is = c('Value'))
  45
  46 modelInfo  <- column_to_rownames(modelInfo, var="Name")
  47 traitId    <- modelInfo["trait_id", 1]
  48 traitAbbr  <- modelInfo["trait_abbr", 1]
  49 modelId    <- modelInfo["model_id", 1]
  50 protocolId <- modelInfo["protocol_id", 1]
  51 protocolPage <- modelInfo["protocol_url", 1]
  52
  53 message('trait_id ', traitId)
  54 message('trait_abbr ', traitAbbr)
  55 message('protocol_id ', protocolId)
  56 message('protocol detail page ', protocolPage)
  57
  58 message('model_id ', modelId)
  59
  60 datasetInfoFile <- grep("dataset_info", inputFiles, value = TRUE)
  61 datasetInfo     <- c()
  62
  63 if (length(datasetInfoFile) != 0 ) {
  64     datasetInfo <- scan(datasetInfoFile, what = "character")
  65     datasetInfo <- paste(datasetInfo, collapse = " ")
  66   } else {
  67     datasetInfo <- c('single_population')
  68   }
  69
  70 #validationTrait <- paste("validation", trait, sep = "_")
  71 validationFile  <- grep('validation', outputFiles, value = TRUE)
  72
  73 if (is.null(validationFile)) {
  74   stop("Validation output file is missing.")
  75 }
  76
  77 #kinshipTrait <- paste("rrblup_training_gebvs", trait, sep = "_")
  78 blupFile     <- grep('rrblup_training_gebvs', outputFiles, value = TRUE)
  79
  80 if (is.null(blupFile)) {
  81   stop("GEBVs file is missing.")
  82 }
  83
  84 #markerTrait <- paste("marker_effects", trait, sep = "_")
  85 markerFile  <- grep('marker_effects', outputFiles, value = TRUE)
  86
  87 #traitPhenoFile <- paste("trait_phenotype_data", traitId, sep = "_")
  88 modelPhenoFile <- grep('model_phenodata', outputFiles, value = TRUE)
  89 message('model input trait pheno file ', modelPhenoFile)
  90 modelGenoFile <- grep('model_genodata', outputFiles, value = TRUE)
  91 message('model input trait geno file ', modelGenoFile)
  92 traitRawPhenoFile <- grep('trait_raw_phenodata', outputFiles, value = TRUE)
  93 varianceComponentsFile <- grep("variance_components", outputFiles, value = TRUE)
  94 analysisReportFile <- grep("_report_", outputFiles, value = TRUE)
  95 genoFilteringLogFile <- grep("genotype_filtering_log", outputFiles, value = TRUE)
  96
  97 filteredTrainingGenoFile       <- grep("filtered_training_genotype_data", outputFiles, value = TRUE)
  98 filteredSelGenoFile       <- grep("filtered_selection_genotype_data", outputFiles, value = TRUE)
  99 formattedPhenoFile     <- grep("formatted_phenotype_data", inputFiles, value = TRUE)
 100
 101 genoFile <- grep("genotype_data_", inputFiles, value = TRUE)
 102
 103 if (is.null(genoFile)) {
 104   stop("genotype data file is missing.")
 105 }
 106
 107 if (file.info(genoFile)$size == 0) {
 108   stop(paste0("genotype data file ", genoFile, " is empty."))
 109 }
 110
 111 readfilteredTrainingGenoData <- c()
 112 filteredTrainingGenoData <- c()
 113 genoFilterLog <- c()
 114 formattedPhenoData <- c()
 115 phenoData          <- c()
 116 genoData           <- c()
 117 maf <- 0.01
 118 markerFilter <- 0.6
 119 cloneFilter <- 0.8
 120
 121 logHeading <- paste0("Genomic Prediction Analysis Log for ", traitAbbr,  ".\n")
 122 logHeading <- append(logHeading,  paste0("Date: ", format(Sys.time(), "%d %b %Y %H:%M"), "\n\n\n"))
 123 logHeading <- format(logHeading, width=80, justify="c")
 124 trainingLog <- paste0("\n\n#Preprocessing training population genotype data.\n\n")
 125 trainingLog <- append(trainingLog, "The following data filtering will be applied to the genotype dataset:\n\n")
 126 trainingLog <- append(trainingLog, paste0("Markers with less or equal to ", maf * 100, "% minor allele frequency (maf)  will be removed.\n"))
 127 trainingLog <- append(trainingLog, paste0("\nMarkers with greater or equal to ", markerFilter * 100, "% missing values will be removed.\n"))
 128 trainingLog <- append(trainingLog, paste0("Clones  with greater or equal to ", cloneFilter * 100, "% missing values  will be removed.\n") )
 129
 130 if (length(filteredTrainingGenoFile) != 0 && file.info(filteredTrainingGenoFile)$size != 0) {
 131     filteredTrainingGenoData     <- fread(filteredTrainingGenoFile,
 132                                   na.strings = c("NA", "", "--", "-"),
 133                                   header = TRUE)
 134
 135     genoData <-  data.frame(filteredTrainingGenoData)
 136     genoData <- column_to_rownames(genoData, 'V1')
 137     readfilteredTrainingGenoData <- 1
 138 }
 139
 140 if (is.null(filteredTrainingGenoData)) {
 141     genoData <- fread(genoFile,
 142                       na.strings = c("NA", "", "--", "-"),
 143                       header = TRUE)
 144     genoData <- unique(genoData, by='V1')
 145     genoData <- data.frame(genoData)
 146     genoData <- column_to_rownames(genoData, 'V1')
 147   #genoDataFilter::filterGenoData
 148     genoData <- convertToNumeric(genoData)
 149
 150     trainingLog <- append(trainingLog, "#Running training population genotype data cleaning.\n\n")
 151     genoFilterOut <- filterGenoData(genoData, maf=maf, markerFilter=markerFilter, indFilter=cloneFilter, logReturn=TRUE)
 152
 153     genoData <- genoFilterOut$data
 154     genoFilteringLog <- genoFilterOut$log
 155     genoData <- roundAlleleDosage(genoData)
 156     filteredTrainingGenoData   <- genoData
 157
 158 } else {
 159   genoFilteringLog <- scan(genoFilteringLogFile, what = "character", sep="\n")
 160   genoFilteringLog <- paste0(genoFilteringLog, collapse="\n")
 161 }
 162
 163 message("genofilteringlogfile: ", genoFilteringLogFile)
 164 message(genoFilteringLog)
 165 trainingLog <- append(trainingLog, genoFilteringLog)
 166
 167 genoData <- genoData[order(row.names(genoData)), ]
 168
 169 if (length(formattedPhenoFile) != 0 && file.info(formattedPhenoFile)$size != 0) {
 170     formattedPhenoData <- data.frame(fread(formattedPhenoFile,
 171                                            header = TRUE,
 172                                            na.strings = c("NA", "", "--", "-", ".")
 173                                             ))
 174
 175 } else {
 176
 177     if (datasetInfo == 'combined_populations') {
 178
 179          phenoFile <- grep("model_phenodata", inputFiles, value = TRUE)
 180     } else {
 181
 182         phenoFile <- grep("\\/phenotype_data", inputFiles, value = TRUE)
 183     }
 184
 185     if (is.null(phenoFile)) {
 186         stop("phenotype data file is missing.")
 187     }
 188
 189     if (file.info(phenoFile)$size == 0) {
 190        stop(paste0("phenotype data file ", phenoFile, " is empty."))
 191
 192     }
 193
 194     phenoData <- data.frame(fread(phenoFile,
 195                                   sep = "\t",
 196                                   na.strings = c("NA", "", "--", "-", "."),
 197                                   header = TRUE))
 198
 199
 200 }
 201
 202 phenoTrait <- c()
 203 traitRawPhenoData <- c()
 204 anovaLog <- paste0("#Preprocessing training population phenotype data.\n\n")
 205
 206 if (datasetInfo == 'combined_populations') {
 207    anovaLog <- scan(analysisReportFile, what = "character", sep="\n")
 208   anovaLog <- paste0(anovaLog, collapse="\n")
 209
 210    if (!is.null(formattedPhenoData)) {
 211       phenoTrait <- subset(formattedPhenoData, select = traitAbbr)
 212       phenoTrait <- na.omit(phenoTrait)
 213
 214   } else {
 215
 216       if (any(grepl('Average', names(phenoData)))) {
 217           phenoTrait <- phenoData %>% select(V1, Average) %>% data.frame
 218       } else {
 219           phenoTrait <- phenoData
 220       }
 221
 222       colnames(phenoTrait)  <- c('genotypes', traitAbbr)
 223   }
 224  } else {
 225
 226      if (!is.null(formattedPhenoData)) {
 227          phenoTrait <- subset(formattedPhenoData, select = c('V1', traitAbbr))
 228          phenoTrait <- as.data.frame(phenoTrait)
 229          phenoTrait <- na.omit(phenoTrait)
 230          colnames(phenoTrait)[1] <- 'genotypes'
 231
 232      } else if (length(grep('list', phenoFile)) != 0) {
 233          phenoTrait <- averageTrait(phenoData, traitAbbr)
 234
 235      } else {
 236          meansResult <- getAdjMeans(phenoData,
 237                                    traitName = traitAbbr,
 238                                    calcAverages = TRUE,
 239                                    logReturn = TRUE)
 240
 241
 242
 243           anovaLog <- paste0(anovaLog, meansResult$log)
 244           phenoTrait <- meansResult$adjMeans
 245      }
 246
 247      keepMetaCols <- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
 248                     'studyYear', 'replicate', 'blockNumber', traitAbbr)
 249
 250       traitRawPhenoData <- phenoData %>%
 251                                           select(all_of(keepMetaCols))
 252
 253
 254 }
 255
 256 meanType <- names(phenoTrait)[2]
 257 names(phenoTrait)  <- c('genotypes', traitAbbr)
 258
 259 selectionTempFile <- grep("selection_population", inputFiles, value = TRUE)
 260
 261 selectionFile       <- c()
 262 filteredPredGenoFile <- c()
 263 selectionAllFiles   <- c()
 264
 265 if (length(selectionTempFile) !=0 ) {
 266   selectionAllFiles <- scan(selectionTempFile, what = "character")
 267
 268   selectionFile <- grep("\\/genotype_data", selectionAllFiles, value = TRUE)
 269
 270   #filteredPredGenoFile   <- grep("filtered_genotype_data_",  selectionAllFiles, value = TRUE)
 271 }
 272
 273 selectionPopGEBVsFile <- grep("rrblup_selection_gebvs", outputFiles, value = TRUE)
 274
 275 selectionData            <- c()
 276 readFilteredPredGenoData <- c()
 277 filteredPredGenoData     <- c()
 278
 279 ## if (length(filteredPredGenoFile) != 0 && file.info(filteredPredGenoFile)$size != 0) {
 280 ##   selectionData <- fread(filteredPredGenoFile, na.strings = c("NA", " ", "--", "-"),)
 281 ##   readFilteredPredGenoData <- 1
 282
 283 ##   selectionData           <- data.frame(selectionData)
 284 ##   rownames(selectionData) <- selectionData[, 1]
 285 ##   selectionData[, 1]      <- NULL
 286
 287 ## } else
 288 selectionLog <- c()
 289 if (length(selectionFile) != 0) {
 290 selectionLog <- append(selectionLog, paste0("#Data preprocessing of selection population genotype data.\n\n"))
 291
 292     selectionData <- fread(selectionFile,
 293                            header = TRUE,
 294                            na.strings = c("NA", "", "--", "-"))
 295
 296   selectionData <- data.frame(selectionData)
 297   selectionData <- unique(selectionData, by='V1')
 298   selectionData <- column_to_rownames(selectionData, 'V1')
 299   selectionData <- convertToNumeric(selectionData)
 300
 301   selectionLog <- append(selectionLog, paste0("Running selection population genotype data cleaning."))
 302
 303   selectionFilterOut <- filterGenoData(selectionData, maf=maf, markerFilter=markerFilter, indFilter=cloneFilter, logReturn=TRUE)
 304   selectionData <- selectionFilterOut$data
 305   selectionLog <- append(selectionLog, selectionFilterOut$log)
 306   selectionData <- roundAlleleDosage(selectionData)
 307 }
 308
 309 #impute genotype values for obs with missing values,
 310 genoDataMissing <- c()
 311
 312 if (sum(is.na(genoData)) > 0) {
 313   genoDataMissing<- c('yes')
 314
 315   genoData <- na.roughfix(genoData)
 316   genoData <- data.frame(genoData)
 317 }
 318
 319 #create phenotype and genotype datasets with
 320 #common stocks only
 321
 322 #extract observation lines with both
 323 #phenotype and genotype data only.
 324 trainingLog <- append(trainingLog, paste0("\n\n#Filtering for training population genotypes with both phenotype and marker data.\n\n"))
 325 trainingLog <- append(trainingLog, paste0("After calculating trait averages, the training population phenotype dataset has ", length(rownames(phenoTrait)), " individuals.\n") )
 326 trainingLog <- append(trainingLog, paste0("After cleaning up for missing values, the training population genotype dataset has ", length(rownames(genoData)), " individuals.\n") )
 327
 328 commonObs           <- intersect(phenoTrait$genotypes, row.names(genoData))
 329
 330 trainingLog <- append(trainingLog, paste0(length(commonObs), " individuals are shared in both phenotype and genotype datasets.\n"))
 331
 332 #remove genotyped lines without phenotype data
 333 genoDataFilteredObs <- genoData[(rownames(genoData) %in% commonObs), ]
 334
 335 trainingLog <- append(trainingLog, paste0("After removing individuals without phenotype data, this genotype dataset has ", length(rownames(genoDataFilteredObs)), " individuals.\n"))
 336
 337 #remove phenotyped lines without genotype data
 338 phenoTrait <- phenoTrait[(phenoTrait$genotypes %in% commonObs), ]
 339
 340 trainingLog <- append(trainingLog, paste0("After removing individuals without genotype data, this phenotype dataset has ", length(rownames(phenoTrait)), " individuals.\n" ))
 341
 342 phenoTraitMarker           <- data.frame(phenoTrait)
 343 rownames(phenoTraitMarker) <- phenoTraitMarker[, 1]
 344 phenoTraitMarker[, 1]      <- NULL
 345
 346 #impute missing data in prediction data
 347
 348 selectionDataMissing <- c()
 349 if (length(selectionData) != 0) {
 350   #purge markers unique to both populations
 351   trainingMarkers <- names(genoDataFilteredObs)
 352   selectionMarkers <-  names(selectionData)
 353
 354 selectionLog <- append(selectionLog, paste0("#Comparing markers in the training and selection populations genotype datasets.\n\n" ))
 355
 356 selectionLog <- append(selectionLog, paste0("The training population genotype dataset has ", length(trainingMarkers), " markers.\n" ))
 357 selectionLog <- append(selectionLog, paste0("The selection population genotype dataset has ", length(selectionMarkers), " markers.\n" ))
 358
 359 commonMarkers  <- intersect(trainingMarkers, selectionMarkers)
 360 selectionLog <- append(selectionLog, paste0("The training and selection populations genotype dataset have ", length(trainingMarkers), " markers in common.\n" ))
 361
 362 genoDataFilteredObs <- subset(genoDataFilteredObs, select= commonMarkers)
 363 selectionLog <- append(selectionLog, paste0("After filtering for shared markers, the training population genotype dataset has ", length(names(selectionData)), " markers.\n" ))
 364
 365 selectionData      <- subset(selectionData, select = commonMarkers)
 366 selectionLog <- append(selectionLog, paste0("After filtering for shared markers, the selection population genotype dataset has ", length(names(selectionData)), " markers.\n" ))
 367
 368   if (sum(is.na(selectionData)) > 0) {
 369     selectionDataMissing <- c('yes')
 370     selectionData <- na.roughfix(selectionData)
 371     selectionData <- data.frame(selectionData)
 372   }
 373 }
 374 #change genotype coding to [-1, 0, 1], to use the A.mat ) if  [0, 1, 2]
 375 genoTrCode <- grep("2", genoDataFilteredObs[1, ], value = TRUE)
 376 if(length(genoTrCode)) {
 377   genoData            <- genoData - 1
 378   genoDataFilteredObs <- genoDataFilteredObs - 1
 379 }
 380
 381 if (length(selectionData) != 0 ) {
 382   genoSlCode <- grep("2", selectionData[1, ], value = TRUE)
 383   if (length(genoSlCode) != 0 ) {
 384     selectionData <- selectionData - 1
 385   }
 386 }
 387
 388 ordered.markerEffects <- c()
 389 trGEBV                <- c()
 390 validationAll         <- c()
 391 combinedGebvsFile     <- c()
 392 allGebvs              <- c()
 393 modelPhenoData        <- c()
 394 relationshipMatrix    <- c()
 395
 396 #additive relationship model
 397 #calculate the inner products for
 398 #genotypes (realized relationship matrix)
 399 relationshipMatrixFile <- grep("relationship_matrix_table", outputFiles, value = TRUE)
 400 relationshipMatrixJsonFile <- grep("relationship_matrix_json", outputFiles, value = TRUE)
 401
 402 traitRelationshipMatrixFile <- grep("relationship_matrix_adjusted_table", outputFiles, value = TRUE)
 403 traitRelationshipMatrixJsonFile <- grep("relationship_matrix_adjusted_json", outputFiles, value = TRUE)
 404
 405 inbreedingFile <- grep('inbreeding_coefficients', outputFiles, value=TRUE)
 406 aveKinshipFile <- grep('average_kinship', outputFiles, value=TRUE)
 407
 408 inbreeding <- c()
 409 aveKinship <- c()
 410
 411 if (length(relationshipMatrixFile) != 0) {
 412   if (file.info(relationshipMatrixFile)$size > 0 ) {
 413       relationshipMatrix <- data.frame(fread(relationshipMatrixFile,
 414                          header = TRUE))
 415
 416       rownames(relationshipMatrix) <- relationshipMatrix[, 1]
 417       relationshipMatrix[, 1]      <- NULL
 418       colnames(relationshipMatrix) <- rownames(relationshipMatrix)
 419       relationshipMatrix           <- data.matrix(relationshipMatrix)
 420
 421   } else {
 422     relationshipMatrix           <- A.mat(genoData)
 423   diag(relationshipMatrix)     <- diag(relationshipMatrix) %>% replace(., . < 1, 1)
 424 relationshipMatrix <- relationshipMatrix %>% replace(., . <= 0, 0.00001)
 425
 426     inbreeding <- diag(relationshipMatrix)
 427     inbreeding <- inbreeding - 1
 428     inbreeding <- data.frame(inbreeding)
 429
 430     inbreeding <- inbreeding %>%
 431         rownames_to_column('genotypes') %>%
 432         rename(Inbreeding = inbreeding) %>%
 433         arrange(Inbreeding) %>%
 434         mutate_at('Inbreeding', round, 3) %>%
 435         column_to_rownames('genotypes')
 436   }
 437 }
 438
 439 relationshipMatrix <- data.frame(relationshipMatrix)
 440 colnames(relationshipMatrix) <- rownames(relationshipMatrix)
 441
 442 relationshipMatrix <- rownames_to_column(relationshipMatrix, var="genotypes")
 443 relationshipMatrix <- relationshipMatrix %>% mutate_if(is.numeric, round, 5)
 444 relationshipMatrix <- column_to_rownames(relationshipMatrix, var="genotypes")
 445
 446 traitRelationshipMatrix <- relationshipMatrix[(rownames(relationshipMatrix) %in% commonObs), ]
 447 traitRelationshipMatrix <- traitRelationshipMatrix[, (colnames(traitRelationshipMatrix) %in% commonObs)]
 448
 449 kinshipLog <- c()
 450 if (any(eigen(traitRelationshipMatrix)$values < 0) ) {
 451 kinshipLog <- paste0("\n\nNote: The kinship matrix of this dataset causes 'Not positive semi-definite error' while running the Cholesky decomposition. To fix this and run the modeling, a corrected positive semi-definite matrix was computed using the 'Matrix::nearPD' function. The negative eigen values from this decomposition nudged to positive values.\n\n")
 452
 453 traitRelationshipMatrix <- Matrix::nearPD(as.matrix(traitRelationshipMatrix))$mat
 454 }
 455
 456 traitRelationshipMatrix <- data.matrix(traitRelationshipMatrix)
 457
 458 nCores <- detectCores()
 459
 460 if (nCores > 1) {
 461   nCores <- (nCores %/% 2)
 462 } else {
 463   nCores <- 1
 464 }
 465 varCompData <- c()
 466 modelingLog <- paste0("\n\n#Training a model for ", traitAbbr, ".\n\n")
 467 modelingLog <- append(modelingLog, paste0("The genomic prediction modeling follows a two-step approach. First trait average values, as described above, are computed for each genotype. This is followed by the model fitting on the basis of single phenotype value for each genotype entry and kinship  matrix computed from their marker data.\n"))
 468
 469 if (length(kinshipLog)) {
 470 modelingLog <- append(modelingLog, paste0(kinshipLog))
 471 }
 472
 473 if (length(selectionData) == 0) {
 474
 475   trModel  <- kin.blup(data   = phenoTrait,
 476                       geno   = 'genotypes',
 477                       pheno  = traitAbbr,
 478                       K      = traitRelationshipMatrix,
 479                       n.core = nCores,
 480                       PEV    = TRUE
 481                      )
 482
 483 modelingLog <- paste0(modelingLog, "The model training is based on rrBLUP R package, version ", packageVersion('rrBLUP'), ". GEBVs are predicted using the kin.blup function and GBLUP method.\n\n")
 484
 485   trGEBV    <- trModel$g
 486   trGEBVPEV <- trModel$PEV
 487   trGEBVSE  <- sqrt(trGEBVPEV)
 488   trGEBVSE  <- data.frame(round(trGEBVSE, 2))
 489
 490   trGEBV <- data.frame(round(trGEBV, 2))
 491
 492   colnames(trGEBVSE) <- c('SE')
 493   colnames(trGEBV) <- traitAbbr
 494
 495   trGEBVSE <- rownames_to_column(trGEBVSE, var="genotypes")
 496   trGEBV   <- rownames_to_column(trGEBV, var="genotypes")
 497
 498   trGEBVSE <- full_join(trGEBV, trGEBVSE)
 499
 500   trGEBVSE <-  trGEBVSE %>% arrange(across(traitAbbr, desc))
 501   trGEBVSE <- column_to_rownames(trGEBVSE, var="genotypes")
 502
 503   trGEBV <-  trGEBV %>% arrange(across(traitAbbr, desc))
 504   trGEBV <- column_to_rownames(trGEBV, var="genotypes")
 505
 506   phenoTraitMarker    <- data.matrix(phenoTraitMarker)
 507   genoDataFilteredObs <- data.matrix(genoDataFilteredObs)
 508
 509   markerEffects <- mixed.solve(y = phenoTraitMarker,
 510                                Z = genoDataFilteredObs
 511                                )
 512
 513   ordered.markerEffects <- data.matrix(markerEffects$u)
 514   ordered.markerEffects <- data.matrix(ordered.markerEffects [order (-ordered.markerEffects[, 1]), ])
 515   ordered.markerEffects <- round(ordered.markerEffects, 5)
 516
 517   colnames(ordered.markerEffects) <- c("Marker Effects")
 518   ordered.markerEffects <- data.frame(ordered.markerEffects)
 519
 520   modelPhenoData   <- data.frame(round(phenoTraitMarker, 2))
 521
 522   heritability  <- round((trModel$Vg/(trModel$Ve + trModel$Vg)), 2)
 523   additiveVar <- round(trModel$Vg, 2)
 524   errorVar <- round(trModel$Ve, 2)
 525
 526  varCompData <- c("\nAdditive genetic variance\t", additiveVar, "\n")
 527  varCompData <- append(varCompData, c("Error variance\t", errorVar, "\n"))
 528  varCompData <- append(varCompData, c("SNP heritability (h)\t", heritability, "\n"))
 529
 530   combinedGebvsFile <- grep('selected_traits_gebv', outputFiles, ignore.case = TRUE,value = TRUE)
 531
 532   if (length(combinedGebvsFile) != 0) {
 533       fileSize <- file.info(combinedGebvsFile)$size
 534       if (fileSize != 0 ) {
 535           combinedGebvs <- data.frame(fread(combinedGebvsFile,
 536                                             header = TRUE))
 537
 538         rownames(combinedGebvs) <- combinedGebvs[,1]
 539           combinedGebvs[,1]       <- NULL
 540
 541           allGebvs <- merge(combinedGebvs, trGEBV,
 542                             by = 0,
 543                             all = TRUE
 544                             )
 545
 546           rownames(allGebvs) <- allGebvs[,1]
 547           allGebvs[,1] <- NULL
 548       }
 549   }
 550
 551 #cross-validation
 552
 553   if (is.null(selectionFile)) {
 554       genoNum <- nrow(phenoTrait)
 555
 556       if (genoNum < 20 ) {
 557           warning(genoNum, " is too small number of genotypes.")
 558       }
 559
 560       set.seed(4567)
 561
 562       k <- 10
 563       reps <- 2
 564       cvFolds <- createMultiFolds(phenoTrait[, 2], k=k, times=reps)
 565
 566   modelingLog <- paste0(modelingLog, "Model prediction accuracy is evaluated using cross-validation method. ",  k,  " folds, replicated ", reps, " times are used to predict the model accuracy.\n\n")
 567
 568       for ( r in 1:reps) {
 569           re <- paste0('Rep', r)
 570
 571           for (i in 1:k) {
 572               fo <- ifelse(i < 10, 'Fold0', 'Fold')
 573
 574               trFoRe <- paste0(fo, i, '.', re)
 575               trG <- cvFolds[[trFoRe]]
 576               slG <- as.numeric(rownames(phenoTrait[-trG,]))
 577
 578               kblup <- paste("rKblup", i, sep = ".")
 579
 580               result <- kin.blup(data  = phenoTrait[trG,],
 581                                  geno  = 'genotypes',
 582                                  pheno = traitAbbr,
 583                                  K     = traitRelationshipMatrix,
 584                                  n.core = nCores,
 585                                  PEV    = TRUE
 586                                  )
 587
 588               assign(kblup, result)
 589
 590               #calculate cross-validation accuracy
 591               valBlups   <- result$g
 592               valBlups   <- data.frame(valBlups)
 593
 594               slG <- slG[which(slG <= nrow(phenoTrait))]
 595
 596               slGDf <- phenoTrait[(rownames(phenoTrait) %in% slG),]
 597               rownames(slGDf) <- slGDf[, 1]
 598               slGDf[, 1] <- NULL
 599
 600               valBlups <-  rownames_to_column(valBlups, var="genotypes")
 601               slGDf    <-  rownames_to_column(slGDf, var="genotypes")
 602
 603               valCorData <- inner_join(slGDf, valBlups, by="genotypes")
 604               valCorData$genotypes <- NULL
 605
 606               accuracy   <- try(cor(valCorData))
 607               validation <- paste("validation", trFoRe, sep = ".")
 608               cvTest <- paste("CV", trFoRe, sep = " ")
 609
 610               if (inherits(accuracy, 'try-error') == FALSE)
 611               {
 612                   accuracy <- round(accuracy[1,2], digits = 3)
 613                   accuracy <- data.matrix(accuracy)
 614
 615                   colnames(accuracy) <- c("correlation")
 616                   rownames(accuracy) <- cvTest
 617
 618                   assign(validation, accuracy)
 619
 620                   if (!is.na(accuracy[1,1])) {
 621                       validationAll <- rbind(validationAll, accuracy)
 622                   }
 623               }
 624           }
 625       }
 626
 627       validationAll <- data.frame(validationAll[order(-validationAll[, 1]), ])
 628       colnames(validationAll) <- c('Correlation')
 629   }
 630 }
 631
 632 selectionPopResult <- c()
 633 selectionPopGEBVs  <- c()
 634 selectionPopGEBVSE <- c()
 635
 636 #selection pop geno data after  cleaning up and removing unique markers to selection pop
 637 filteredSelGenoData <- selectionData
 638 if (length(selectionData) != 0) {
 639
 640     genoDataTrSl <- rbind(genoDataFilteredObs, selectionData)
 641     rTrSl <- A.mat(genoDataTrSl)
 642
 643     selectionPopResult <- kin.blup(data   = phenoTrait,
 644                                     geno   = 'genotypes',
 645                                     pheno  = traitAbbr,
 646                                     K      = rTrSl,
 647                                     n.core = nCores,
 648                                     PEV    = TRUE
 649                                     )
 650
 651     selectionPopGEBVs <- round(data.frame(selectionPopResult$g), 2)
 652     colnames(selectionPopGEBVs) <- traitAbbr
 653     selectionPopGEBVs <- rownames_to_column(selectionPopGEBVs, var="genotypes")
 654
 655     selectionPopPEV <- selectionPopResult$PEV
 656     selectionPopSE  <- sqrt(selectionPopPEV)
 657     selectionPopSE  <- data.frame(round(selectionPopSE, 2))
 658     colnames(selectionPopSE) <- 'SE'
 659     genotypesSl     <- rownames(selectionData)
 660
 661     selectionPopSE <- rownames_to_column(selectionPopSE, var="genotypes")
 662     selectionPopSE <-  selectionPopSE %>% filter(genotypes %in% genotypesSl)
 663
 664     selectionPopGEBVs <-  selectionPopGEBVs %>% filter(genotypes %in% genotypesSl)
 665
 666     selectionPopGEBVSE <- inner_join(selectionPopGEBVs, selectionPopSE, by="genotypes")
 667
 668     sortVar <- parse_expr(traitAbbr)
 669     selectionPopGEBVs <- selectionPopGEBVs %>% arrange(desc((!!sortVar)))
 670     selectionPopGEBVs <- column_to_rownames(selectionPopGEBVs, var="genotypes")
 671
 672     selectionPopGEBVSE <-  selectionPopGEBVSE %>% arrange(desc((!!sortVar)))
 673     selectionPopGEBVSE <- column_to_rownames(selectionPopGEBVSE, var="genotypes")
 674 }
 675
 676 if (!is.null(selectionPopGEBVs) & length(selectionPopGEBVsFile) != 0)  {
 677     fwrite(selectionPopGEBVs,
 678            file  = selectionPopGEBVsFile,
 679            row.names = TRUE,
 680            sep   = "\t",
 681            quote = FALSE,
 682            )
 683 }
 684
 685 if(!is.null(validationAll)) {
 686     fwrite(validationAll,
 687            file  = validationFile,
 688            row.names = TRUE,
 689            sep   = "\t",
 690            quote = FALSE,
 691            )
 692 }
 693
 694
 695 if (!is.null(ordered.markerEffects)) {
 696     fwrite(ordered.markerEffects,
 697            file  = markerFile,
 698            row.names = TRUE,
 699            sep   = "\t",
 700            quote = FALSE,
 701            )
 702 }
 703
 704
 705 if (!is.null(trGEBV)) {
 706     fwrite(trGEBV,
 707            file  = blupFile,
 708            row.names = TRUE,
 709            sep   = "\t",
 710            quote = FALSE,
 711            )
 712 }
 713
 714 if (length(combinedGebvsFile) != 0 ) {
 715     if(file.info(combinedGebvsFile)$size == 0) {
 716         fwrite(trGEBV,
 717                file  = combinedGebvsFile,
 718                row.names = TRUE,
 719                sep   = "\t",
 720                quote = FALSE,
 721                )
 722       } else {
 723       fwrite(allGebvs,
 724              file  = combinedGebvsFile,
 725              row.names = TRUE,
 726              sep   = "\t",
 727              quote = FALSE,
 728              )
 729     }
 730 }
 731
 732 if (!is.null(modelPhenoData) && length(modelPhenoFile) != 0) {
 733
 734     if (!is.null(meanType)) {
 735         colnames(modelPhenoData) <- meanType
 736     }
 737
 738     fwrite(modelPhenoData,
 739            file  = modelPhenoFile,
 740            row.names = TRUE,
 741            sep   = "\t",
 742            quote = FALSE,
 743            )
 744 }
 745
 746 if (!is.null(genoDataFilteredObs) && length(modelGenoFile) != 0) {
 747
 748     fwrite(genoDataFilteredObs,
 749            file  = modelGenoFile,
 750            row.names = TRUE,
 751            sep   = "\t",
 752            quote = FALSE,
 753            )
 754 }
 755
 756 if (!is.null(traitRawPhenoData) && length(traitRawPhenoFile) != 0) {
 757
 758     fwrite(traitRawPhenoData,
 759            file  = traitRawPhenoFile,
 760            row.names = FALSE,
 761            sep   = "\t",
 762            na = 'NA',
 763            quote = FALSE,
 764            )
 765 }
 766
 767 if (!is.null(filteredTrainingGenoData) && file.info(filteredTrainingGenoFile)$size == 0) {
 768   fwrite(filteredTrainingGenoData,
 769          file  = filteredTrainingGenoFile,
 770          row.names = TRUE,
 771          sep   = "\t",
 772          quote = FALSE,
 773          )
 774
 775   cat(genoFilteringLog, fill = TRUE,  file = genoFilteringLogFile, append=FALSE)
 776 }
 777
 778 if (length(filteredSelGenoFile) != 0 && file.info(filteredSelGenoFile)$size == 0) {
 779   fwrite(filteredSelGenoData,
 780          file  = filteredSelGenoFile,
 781          row.names = TRUE,
 782          sep   = "\t",
 783          quote = FALSE,
 784          )
 785 }
 786
 787 ## if (!is.null(genoDataMissing)) {
 788 ##   write.table(genoData,
 789 ##               file = genoFile,
 790 ##               sep = "\t",
 791 ##               col.names = NA,
 792 ##               quote = FALSE,
 793 ##             )
 794
 795 ## }
 796
 797 ## if (!is.null(predictionDataMissing)) {
 798 ##   write.table(predictionData,
 799 ##               file = predictionFile,
 800 ##               sep = "\t",
 801 ##               col.names = NA,
 802 ##               quote = FALSE,
 803 ##               )
 804 ## }
 805
 806 if (file.info(relationshipMatrixFile)$size == 0) {
 807
 808   fwrite(relationshipMatrix,
 809          file  = relationshipMatrixFile,
 810          row.names = TRUE,
 811          sep   = "\t",
 812          quote = FALSE,
 813          )
 814 }
 815
 816 if (file.info(relationshipMatrixJsonFile)$size == 0) {
 817
 818     relationshipMatrixJson <- relationshipMatrix
 819     relationshipMatrixJson[upper.tri(relationshipMatrixJson)] <- NA
 820
 821
 822     relationshipMatrixJson <- data.frame(relationshipMatrixJson)
 823
 824     relationshipMatrixList <- list(labels = names(relationshipMatrixJson),
 825                                        values = relationshipMatrixJson)
 826
 827     relationshipMatrixJson <- jsonlite::toJSON(relationshipMatrixList)
 828
 829
 830     write(relationshipMatrixJson,
 831                     file  = relationshipMatrixJsonFile,
 832                     )
 833 }
 834
 835 if (file.info(traitRelationshipMatrixFile)$size == 0) {
 836
 837     inbre <- diag(traitRelationshipMatrix)
 838     inbre <- inbre - 1
 839
 840     diag(traitRelationshipMatrix) <- inbre
 841
 842     traitRelationshipMatrix <- data.frame(traitRelationshipMatrix) %>% replace(., . < 0, 0)
 843
 844     fwrite(traitRelationshipMatrix,
 845            file  = traitRelationshipMatrixFile,
 846            row.names = TRUE,
 847            sep   = "\t",
 848            quote = FALSE,
 849            )
 850
 851     if (file.info(traitRelationshipMatrixJsonFile)$size == 0) {
 852
 853         traitRelationshipMatrixJson <- traitRelationshipMatrix
 854         traitRelationshipMatrixJson[upper.tri(traitRelationshipMatrixJson)] <- NA
 855
 856         traitRelationshipMatrixJson <- data.frame(traitRelationshipMatrixJson)
 857
 858         traitRelationshipMatrixList <- list(labels = names(traitRelationshipMatrixJson),
 859                                             values = traitRelationshipMatrixJson)
 860
 861         traitRelationshipMatrixJson <- jsonlite::toJSON(traitRelationshipMatrixList)
 862
 863         write(traitRelationshipMatrixJson,
 864               file  = traitRelationshipMatrixJsonFile,
 865               )
 866     }
 867 }
 868
 869
 870 if (file.info(inbreedingFile)$size == 0) {
 871
 872   fwrite(inbreeding,
 873          file  = inbreedingFile,
 874          row.names = TRUE,
 875          sep   = "\t",
 876          quote = FALSE,
 877          )
 878 }
 879
 880 if (file.info(aveKinshipFile)$size == 0) {
 881
 882     aveKinship <- data.frame(apply(traitRelationshipMatrix, 1, mean))
 883
 884     aveKinship<- aveKinship %>%
 885         rownames_to_column('genotypes') %>%
 886         rename(Mean_kinship = contains('traitRe')) %>%
 887         arrange(Mean_kinship) %>%
 888         mutate_at('Mean_kinship', round, 3) %>%
 889         column_to_rownames('genotypes')
 890
 891     fwrite(aveKinship,
 892            file  = aveKinshipFile,
 893            row.names = TRUE,
 894            sep   = "\t",
 895            quote = FALSE,
 896            )
 897 }
 898
 899 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
 900   fwrite(formattedPhenoData,
 901          file = formattedPhenoFile,
 902          row.names = TRUE,
 903          sep = "\t",
 904          quote = FALSE,
 905          )
 906 }
 907 if (!is.null(varCompData)) {
 908   cat(varCompData, file = varianceComponentsFile)
 909 }
 910
 911 if (!is.null(selectionLog)) {
 912   cat(logHeading, selectionLog, fill = TRUE,  file = analysisReportFile, append=FALSE)
 913 } else {
 914   cat(logHeading, anovaLog, trainingLog, modelingLog, fill = TRUE,  file = analysisReportFile, append=FALSE)
 915 }
 916
 917 message("Done.")
 918
 919 q(save = "no", runLast = FALSE)