R/GCPC.R

   1 ################################################################################
   2 # Genomic prediction of cross performance for YamBase
   3 ################################################################################
   4
   5 # There are ten main steps to this protocol:
   6 # 1. Load the software needed.
   7 # 2. Declare user-supplied variables.
   8 # 3. Read in the genotype data and convert to numeric allele counts.
   9 # 4. Get the genetic predictors needed.
  10 # 5. Process the phenotypic data.
  11 # 6. Fit the mixed models in sommer.
  12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
  13 # 8. Weight the marker effects and add them together to form an index of merit.
  14 # 9. Predict the crosses.
  15 # 10. Format the information needed for output.
  16
  17
  18 # Get Arguments
  19 args <- commandArgs(trailingOnly = TRUE)
  20 if (length(args) < 3) {
  21   stop("Two or more arguments are required.")
  22 }
  23 phenotypeFile <- args[1]
  24 genotypeFile <- args[2]
  25 traits <- args[3]
  26 weights <- args[4]
  27 userSexes <- args[5]
  28 userFixed <- args[6]
  29 userRandom <- args[7]
  30
  31 # userSexes = as.vector(userSexes)
  32
  33 # L = length(userSexes)
  34 # if (L==1 && userSexes[1]!="") {write('PLANT SEX is empty: ', stderr())}
  35 write(paste("PLANT SEX CVTERM: |", userSexes, "|"), stderr())
  36
  37
  38 ################################################################################
  39 # 1. Load software needed
  40 ################################################################################
  41 library(dplyr)
  42 library(tidyr)
  43 library(sommer)
  44 library(AGHmatrix)
  45 library(VariantAnnotation) # Bioconductor package
  46 library(tools)
  47 # library(rstatix)
  48 Rcpp::sourceCpp("/home/production/cxgn/QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github
  49
  50
  51
  52
  53
  54
  55 ################################################################################
  56 # 2. Declare user-supplied variables
  57 ################################################################################
  58
  59 # a. Define path with internal YamBase instructions such that the object 'userGeno'
  60 #    is defined as a VCF file of genotypes.
  61
  62 # userGeno <- path
  63
  64
  65 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'
  66 #    is defined as the phenotype file.
  67
  68 # userPheno <- path2
  69 # write(paste("READING PHENOTYPEFILE: ",phenotypeFile), stderr())
  70 userPheno <- read.delim(phenotypeFile, header = TRUE, sep = "\t", fill = TRUE) # testing only
  71 # write(colnames(userPheno), stderr())
  72 # write(summary(userPheno), stderr())
  73 ## userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage
  74
  75 # write("DONE WITH PHENOTYPEFILE"), stderr())
  76
  77 # c. The user should be able to select their fixed variables from a menu
  78 #    of the column names of the userPheno object. The possible interaction terms
  79 #    also need to be shown somehow. Then, those strings should be passed
  80 #    to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects
  81 #    besides f are requested.
  82 #    f is automatically included as a fixed effect- a note to the user would be good.
  83
  84 # userFixed <- c()
  85 # userFixed <- c("studyYear") # for testing only
  86 userFixed <- unlist(strsplit(userFixed, split = ",", fixed = T))
  87
  88
  89 # d. The user should be able to select their random variables from a menu
  90 #    of the column names of the userPheno object. The possible interaction terms
  91 #    also need to be shown somehow. Then, those strings should be passed
  92 #    to this vector, 'userRandom'.
  93
  94 # userRandom <- c()
  95 # userRandom <- "blockNumber" # for testing only
  96 userRandom <- unlist(strsplit(userRandom, split = ",", fixed = T))
  97
  98 # e. The user should be able to indicate which of the userPheno column names
  99 #    represents individual genotypes identically as they are represented in the VCF
 100 #    column names. No check to ensure matching at this stage. This single string
 101 #    should be passed to this vector, userID.
 102
 103 # userID <- c()
 104 userID <- "germplasmName" # for testing only
 105
 106
 107 # f. The user must indicate the ploidy level of their organism, and the integer
 108 #    provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp
 109 #    currently supports ploidy = {2, 4, 6}. Ideally, the user could select
 110 #    their ploidy from a drop-down to avoid errors here, and there would be a note
 111 #    that other ploidies are not currently supported. If not, a possible error is
 112 #    provided.
 113
 114 userPloidy <- c()
 115 userPloidy <- 2 # for testing only
 116
 117 # if(userPloidy %in% c(2, 4, 6) != TRUE){
 118 #   stop("Only ploidies of 2, 4, and 6 are supported currently. \n
 119 #        Please confirm your ploidy level is supported.")
 120 # }
 121
 122
 123 # g. The user should be able to select their response variables from a drop-down menu
 124 #    of the column names of the userPheno object. Then, those strings should be passed
 125 #    to this vector, 'userResponse'.
 126
 127 write(paste("TRAIT STRING:", traits), stderr())
 128 # userResponse <- c()
 129 # userResponse <- c("YIELD", "DMC", "OXBI") # for testing only
 130 userResponse <- unlist(strsplit(traits, split = ",", fixed = T))
 131
 132 write(paste("USER RESPONSE", userResponse), stderr())
 133 write(paste("first element: ", userResponse[1]), stderr())
 134 # h. The user must indicate weights for each response. The order of the vector
 135 #    of response weights must match the order of the responses in userResponse.
 136
 137 # userWeights <- c()
 138 # userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only
 139 userWeights <- as.numeric(unlist(strsplit(weights, split = ",", fixed = T)))
 140
 141 write(paste("WEIGHTS", userWeights), stderr())
 142
 143 # i. The user can indicate the number of crosses they wish to output.
 144 #    The maximum possible is a full diallel.
 145
 146 # userNCrosses <- c()
 147 userNCrosses <- 100 # for testing only
 148
 149
 150 # j. The user can (optionally) input the individuals' sexes and indicate the column
 151 #    name of the userPheno object which corresponds to sex. The column name
 152 #   string should be passed to the 'userSexes' object. If the user does not wish
 153 #   to remove crosses with incompatible sexes (e.g. because the information is not available),
 154 #   then userSexes should be set to NA.
 155
 156
 157 # userSexes <- c()
 158 # userSexes <- "Sex" # for testing only
 159 # userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only
 160 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have
 161 # different sexes. This does not matter for the code or testing.
 162
 163 ################################################################################
 164 # 3. Read in the genotype data and convert to numeric allele counts.
 165 ################################################################################
 166
 167 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix
 168 #    of allele counts in whic:
 169 #    Rownames represent the individual genotype IDs
 170 #    Colnames represent the site IDs
 171 #    A cell within a given row and column represents the row individual's
 172 #    genotype at the site in the column.
 173
 174 #   The individual's genotype should be an integer from 0... ploidy to represent
 175 #   counts of the alternate allele at the site. Diploid example:
 176 #    0 = homozygous reference
 177 #    1 = heterozygous
 178 #    2 = homozygous alternate
 179
 180 #    The genotypes must not contain monomorphic or non-biallelic sites.
 181 #    Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)
 182 #    I can put an error message into this script if a user tries to input
 183 #    monomorphic or biallelic sites which could be communicated through the GUI.
 184 #    It's also possible to filter them here.
 185
 186 if (file_ext(genotypeFile) == "vcf") {
 187   write(paste("READING VARIANT FILE ", genotypeFile), stderr())
 188   #  Import VCF with VariantAnnotation package and extract matrix of dosages
 189   myVCF <- readVcf(genotypeFile)
 190   # G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
 191   mat <- genotypeToSnpMatrix(myVCF)
 192   # G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
 193   G <- as(mat$genotypes, "numeric")
 194   G <- G[, colSums(is.na(G)) < nrow(G)]
 195
 196   #   TEST temporarily import the genotypes via HapMap:
 197   # source("R/hapMap2numeric.R") # replace and delete
 198   # G <- hapMap2numeric(genotypeFile) # replace and delete
 199 } else {
 200   # accession_names     abc      abc2    abc3
 201   # marker1                   0      0        2
 202   # marker2                   1      0        0
 203   # marker3                   0      0        0
 204
 205   write(paste("READING DOSAGE FILE ", genotypeFile), stderr())
 206   GF <- read.delim(genotypeFile)
 207   GD <- GF[, -1]
 208   GM <- as.matrix(GD)
 209   G <- t(GM)
 210 }
 211
 212 write("G Matrix start --------", stderr())
 213 write(G[1:5, 1:5], stderr())
 214 write("G Matrix end =========", stderr())
 215
 216
 217
 218
 219 ################################################################################
 220 # 4. Get the genetic predictors needed.
 221 ################################################################################
 222
 223 write("GENETIC PREDICTIONS...", stderr())
 224 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016
 225 # The following constructs f as the average heterozygosity of the individual
 226 # The coefficient of f estimated later then needs to be divided by the number of markers
 227 # in the matrix D before adding it to the estimated dominance marker effects
 228 # One unit of change in f represents changing all loci from homozygous to heterozygous
 229
 230 ### GC <- G - (userPloidy/2) #this centers G
 231 GC <- G * (userPloidy - G) * (2 / userPloidy)^2 # center at G
 232 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))
 233
 234 # Another alternate way to construct f is the total number of heterozygous loci in the individual
 235 # The coefficient of this construction of f does not need to be divided by the number of markers
 236 # It is simply added to each marker dominance effect
 237 # The coefficient of this construction of f represents the average dominance effect of a marker
 238 # One unit of change in f represents changing one locus from homozygous to heterozygous
 239 # f <- rowSums(D, na.rm = TRUE)
 240
 241
 242 write("DISTANCE MATRIX...", stderr())
 243 # 4b. Get the additive and dominance relationship matrices following Batista et al., 2021
 244 # https://doi.org/10.1007/s00122-021-03994-w
 245
 246 # Additive: this gives a different result than AGHmatrix VanRaden's Gmatrix
 247 # AGHmatrix: Weights are implemented for "VanRaden" method as described in Liu (2020)?
 248 allele_freq <- colSums(G) / (userPloidy * nrow(G))
 249 W <- t(G) - userPloidy * allele_freq
 250 WWt <- crossprod(W)
 251 denom <- sum(userPloidy * allele_freq * (1 - allele_freq))
 252 A <- WWt / denom
 253
 254 # Check with paper equation:
 255 # w <- G - (userPloidy/2)
 256 # num <- w %*% t(w)
 257 # denom = sum(userPloidy * allele_freq * (1 - allele_freq))
 258 # A2 <- num/denom
 259 # table(A == A2)
 260 # cor(as.vector(A), as.vector(A2)) # 0.9996...
 261
 262
 263 # Dominance or digenic dominance
 264 if (userPloidy == 2) {
 265   D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)
 266 }
 267
 268 if (userPloidy > 2) {
 269   # Digenic dominance
 270   C_matrix <- matrix(length(combn(userPloidy, 2)) / 2,
 271     nrow = nrow(t(G)),
 272     ncol = ncol(t(G))
 273   )
 274
 275   Ploidy_matrix <- matrix(userPloidy,
 276     nrow = nrow(t(G)),
 277     ncol = ncol(t(G))
 278   )
 279
 280   Q <- (allele_freq^2 * C_matrix) -
 281     (Ploidy_matrix - 1) * allele_freq * t(G) +
 282     0.5 * t(G) * (t(G) - 1)
 283
 284   Dnum <- crossprod(Q)
 285   denomDom <- sum(C_matrix[, 1] * allele_freq^2 * (1 - allele_freq)^2)
 286   D <- Dnum / denomDom
 287 }
 288
 289
 290
 291
 292
 293
 294 ################################################################################
 295 # 5. Process the phenotypic data.
 296 ################################################################################
 297
 298 # write(summary(userPheno), stderr())
 299
 300 # a. Paste f into the phenotype dataframe
 301 write("processing phenotypic data...", stderr())
 302 userPheno$f <- f[as.character(userPheno[, userID])]
 303
 304 # write(summary(userPheno), stderr())
 305
 306 # b. Scale the response variables.
 307 write("processing phenotypic data... scaling...", stderr())
 308 write(paste("USER RESPONSE LENGTH = ", length(userResponse)), stderr())
 309 for (i in 1:length(userResponse)) {
 310   write(paste("working on user response ", userResponse[i]), stderr())
 311   userPheno[, userResponse[i]] <- (userPheno[, userResponse[i]] - mean(userPheno[, userResponse[i]], na.rm = TRUE)) / sd(userPheno[, userResponse[i]], na.rm = TRUE)
 312 }
 313
 314 write(paste("accession count: ", length(userPheno[, userID])), stderr())
 315 write("processing phenotypic data... adding dominance effects", stderr())
 316 # c. Paste in a second ID column for the dominance effects.
 317
 318 # write(summary(userPheno), stderr())
 319
 320 dominanceEffectCol <- paste(userID, "2", sep = "")
 321 write(paste("NEW COL NAME: ", dominanceEffectCol), stderr())
 322
 323 write(paste("USER_ID COLUMN: ", userPheno[, userID]), stderr())
 324 userPheno[, dominanceEffectCol] <- userPheno[, userID]
 325
 326 write(paste("USER PHENO userID2 COL", userPheno[, dominanceEffectCol]), stderr())
 327 uniq <- length(sapply(lapply(userPheno, unique), length))
 328 write(paste("UNIQUE", uniq), stderr())
 329
 330
 331 # Additional steps could be added here to remove outliers etc.
 332
 333
 334
 335
 336
 337 ################################################################################
 338 # 6. Fit the mixed models in sommer.
 339 ################################################################################
 340
 341 write("Fit mixed model in sommer", stderr())
 342 # 6a. Make a list to save the models.
 343
 344 userModels <- list()
 345
 346 for (i in 1:length(userResponse)) {
 347   write(paste("User response: ", userResponse[i]), stderr())
 348   # check if fixed effects besides f are requested, then paste together
 349   # response variable and fixed effects
 350   if (!is.na(userFixed[1])) {
 351     fixedEff <- paste(userFixed, collapse = " + ")
 352     fixedEff <- paste(fixedEff, "f", sep = " + ")
 353     fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")
 354   }
 355   if (is.na(userFixed[1])) {
 356     fixedArg <- paste(userResponse[i], " ~ ", "f")
 357   }
 358
 359
 360   # check if random effects besides genotypic additive and dominance effects
 361   # are requested, then paste together the formula
 362
 363   write("Generate formula...", stderr())
 364
 365   if (!is.na(userRandom[1])) {
 366     randEff <- paste(userRandom, collapse = " + ")
 367     ID2 <- paste(userID, 2, sep = "")
 368     randEff2 <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
 369     randArg <- paste(randEff2, randEff, sep = " + ")
 370   }
 371   if (is.na(userRandom[1])) {
 372     ID2 <- paste(userID, 2, sep = "")
 373     randArg <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
 374   }
 375
 376   write(paste("Fit mixed GBLUP model...", randArg), stderr())
 377
 378   #  write(paste("USER PHENO:", userPheno), stderr())
 379   #  write(paste("COLNAMES: ", colnames(userPheno)), stderr())
 380   # fit the mixed GBLUP model
 381   myMod <- mmer(
 382     fixed = as.formula(fixedArg),
 383     random = as.formula(randArg),
 384     rcov = ~units,
 385     getPEV = FALSE,
 386     data = userPheno
 387   )
 388
 389
 390   # save the fit model
 391
 392   write(paste("I = ", i), stderr())
 393
 394   userModels[[i]] <- myMod
 395 }
 396
 397
 398
 399
 400
 401
 402 ######################################################################################
 403 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
 404 ######################################################################################
 405
 406 # a. Get the matrices and inverses needed
 407 #    This is not correct for polyploids yet.
 408 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)
 409 D.G <- 1 - abs(A.G) # this is the dominance genotype matrix (coded 0 1 0 for diploids)
 410
 411
 412 A.T <- A.G %*% t(A.G) ## additive genotype matrix
 413 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag
 414 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-
 415
 416 D.T <- D.G %*% t(D.G) ## dominance genotype matrix
 417 D.Tinv <- solve(D.T) ## inverse
 418 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-
 419
 420
 421 # b. Loop through and backsolve to marker effects.
 422
 423 write("backsolve marker effects...", stderr())
 424
 425 userAddEff <- list() # save them in order
 426 userDomEff <- list() # save them in order
 427
 428 for (i in 1:length(userModels)) {
 429   myMod <- userModels[[i]]
 430
 431   # get the additive and dominance effects out of the sommer list
 432   subMod <- myMod$U
 433   subModA <- subMod[[1]]
 434   subModA <- subModA[[1]]
 435   subModD <- subMod[[2]]
 436   subModD <- subModD[[1]]
 437
 438   # backsolve
 439   addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv
 440   domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol = 1) # these must be reordered to match D.TTinv
 441
 442   # add f coefficient back into the dominance effects
 443   subModf <- myMod$Beta
 444   fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient
 445   fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers
 446   dirDomEff <- domEff + fCoefScal
 447
 448   # save
 449   userAddEff[[i]] <- addEff
 450   userDomEff[[i]] <- dirDomEff
 451 }
 452
 453
 454
 455
 456
 457
 458 ################################################################################
 459 # 8. Weight the marker effects and add them together to form an index of merit.
 460 ################################################################################
 461
 462 write("weight marker effects...", stderr())
 463
 464 ai <- 0
 465 di <- 0
 466 for (i in 1:length(userWeights)) {
 467   # write(paste("USER ADD EFF : ", userAddEff[[i]]), stderr())
 468   #  write(paste("USER DOM EFF : ", userDomEff[[i]]), stderr())
 469   # write(paste("USER WEIGHT : ", userWeights[i]), stderr())
 470
 471
 472   ai <- ai + userAddEff[[i]] * userWeights[i]
 473
 474   write("DONE WITH ADDITIVE EFFECTS!\n", stderr())
 475   di <- di + userDomEff[[i]] * userWeights[i]
 476   write("DONE WITH DOM EFFECTS!\n", stderr())
 477 }
 478
 479
 480
 481
 482
 483
 484 ################################################################################
 485 # 9. Predict the crosses.
 486 ################################################################################
 487
 488 # If the genotype matrix provides information about individuals for which
 489 # cross prediction is not desired, then the genotype matrix must be subset
 490 # for use in calcCrossMean(). calcCrossMean will return predicted cross
 491 # values for all individuals in the genotype file otherwise.
 492
 493 write("Predict crosses...", stderr())
 494
 495 GP <- G[rownames(G) %in% userPheno[, userID], ]
 496
 497 print("GP:")
 498 print(head(GP))
 499
 500 write("calcCrossMean...", stderr())
 501
 502 crossPlan <- calcCrossMean(
 503   GP,
 504   ai,
 505   di,
 506   userPloidy
 507 )
 508
 509
 510 write("Done with calcCrossMean!!!!!!", stderr())
 511
 512
 513
 514 ################################################################################
 515 # 10. Format the information needed for output.
 516 ################################################################################
 517
 518 # Add option to remove crosses with incompatible sexes.
 519
 520
 521 # hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
 522
 523 # assign_hash(userPheno$germplasmName, userPheno$userSexes, hash)
 524
 525 if (userSexes != "") { # "plant sex estimation 0-4"
 526   # !is.na(userSexes)  && !is.na(sd(userPheno[, userSexes]))
 527
 528   write(paste("userSexes", sd(userPheno[, userSexes])), stderr())
 529
 530   # Reformat the cross plan
 531   crossPlan <- as.data.frame(crossPlan)
 532
 533   write(paste("CROSSPLAN = ", head(crossPlan)), stderr())
 534   crossPlan <- crossPlan[order(crossPlan[, 3], decreasing = TRUE), ] # orders the plan by predicted merit
 535   crossPlan[, 1] <- rownames(GP)[crossPlan[, 1]] # replaces internal ID with genotye file ID
 536   crossPlan[, 2] <- rownames(GP)[crossPlan[, 2]] # replaces internal ID with genotye file ID
 537   colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
 538
 539   write(paste("CROSSPLAN REPLACED = ", head(crossPlan)), stderr())
 540
 541   # Look up the parent sexes and subset
 542   crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$germplasmName), userSexes] # get sexes ordered by Parent1
 543
 544   write(paste("PARENTS1 ", head(crossPlan)), stderr())
 545
 546   crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$germplasmName), userSexes] # get sexes ordered by Parent2
 547
 548   write(paste("PARENTS2 ", head(crossPlan)), stderr())
 549   col_repl <- c("P1Sex", "P2Sex")
 550   crossPlan %>% filter(P1Sex == 0 | P2Sex == 0) # remove the 0s
 551   crossPlan %>% filter(P1Sex == 1 & P2Sex == 1) # remove same sex crosses with score of 1
 552   crossPlan %>% filter(P1Sex == 2 & P2Sex == 2) # remove same sex crosses with score of 2
 553
 554   write(paste("CROSSPLAN FILTERED = ", crossPlan), stderr())
 555   # crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents
 556
 557   ## replace plant sex numbers to male, female etc
 558
 559   crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% "NA", "NA"))
 560   crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 1, "Male"))
 561   crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 2, "Female"))
 562   crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 3, "Monoecious male (m>f)"))
 563   crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 4, "Monoecious female(f>m)"))
 564
 565   # ** summary statistics for the cross prediction merit
 566   avg <- round(mean(crossPlan$CrossPredictedMerit), digits = 3)
 567   max <- round(max(crossPlan$CrossPredictedMerit), digits = 3)
 568   min <- round(min(crossPlan$CrossPredictedMerit), digits = 3)
 569   std <- round(sd(crossPlan$CrossPredictedMerit), digits = 3)
 570   leng <- length(crossPlan$CrossPredictedMerit)
 571
 572   ## histogram
 573   histogra <- paste(phenotypeFile, ".png", sep = "")
 574   png(file = histogra, width = 600, height = 350)
 575   hist(crossPlan$CrossPredictedMerit, xlab = "Cross Predicted Merit", main = "Distribution")
 576   mtext(paste("Mean =", avg), side = 3, adj = 1, line = 0)
 577   mtext(paste("Standard Deviation = ", std), side = 3, adj = 1, line = -1)
 578   mtext(paste("Range = (", min, " to ", max, ")"), side = 3, adj = 1, line = -2)
 579   mtext(paste("No. of predictions = ", leng), side = 3, adj = 1, line = -3)
 580   dev.off()
 581
 582
 583
 584   # subset the number of crosses the user wishes to output
 585   if (nrow(crossPlan)<100) {
 586     finalcrosses = crossPlan
 587   } else {
 588     crossPlan[1:userNCrosses, ]
 589     finalcrosses=crossPlan[1:userNCrosses, ]
 590   }
 591   outputFile <- paste(phenotypeFile, ".out", sep = "")
 592
 593   write.csv(finalcrosses, outputFile)
 594 } else {
 595   # only subset the number of crosses the user wishes to output
 596   crossPlan <- as.data.frame(crossPlan)
 597   crossPlan <- na.omit(crossPlan)
 598   crossPlan <- crossPlan[order(crossPlan[, 3], decreasing = TRUE), ] # orders the plan by predicted merit
 599   crossPlan[, 1] <- rownames(GP)[crossPlan[, 1]] # replaces internal ID with genotye file ID
 600   crossPlan[, 2] <- rownames(GP)[crossPlan[, 2]] # replaces internal ID with genotye file ID
 601   colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
 602
 603   # get summary statistics for the cross prediction merit
 604   avg <- round(mean(crossPlan$CrossPredictedMerit), digits = 3)
 605   max <- round(max(crossPlan$CrossPredictedMerit), digits = 3)
 606   min <- round(min(crossPlan$CrossPredictedMerit), digits = 3)
 607   std <- round(sd(crossPlan$CrossPredictedMerit), digits = 3)
 608   leng <- length(crossPlan$CrossPredictedMerit)
 609
 610
 611   ## histogram
 612   histogra <- paste(phenotypeFile, ".png", sep = "")
 613   png(file = histogra, width = 600, height = 350)
 614   hist(crossPlan$CrossPredictedMerit, xlab = "Cross Predicted Merit", main = "Distribution")
 615   mtext(paste("Mean =", avg), side = 3, adj = 1, line = 0)
 616   mtext(paste("Standard Deviation = ", std), side = 3, adj = 1, line = -1)
 617   mtext(paste("Range = (", min, " to ", max, ")"), side = 3, adj = 1, line = -2)
 618   mtext(paste("No. of predictions = ", leng), side = 3, adj = 1, line = -3)
 619   dev.off()
 620
 621   ## save the best 100 predictions
 622   if (nrow(crossPlan)<100) {
 623     finalcrosses = crossPlan
 624   } else {
 625     crossPlan[1:userNCrosses, ]
 626     finalcrosses=crossPlan[1:userNCrosses, ]
 627   }
 628   outputFile <- paste(phenotypeFile, ".out", sep = "")
 629
 630   write.csv(finalcrosses, outputFile)
 631 }