R/GCPC.R

   1 ################################################################################
   2 # Genomic prediction of cross performance for YamBase
   3 ################################################################################
   4
   5 # There are ten main steps to this protocol:
   6 # 1. Load the software needed.
   7 # 2. Declare user-supplied variables.
   8 # 3. Read in the genotype data and convert to numeric allele counts.
   9 # 4. Get the genetic predictors needed.
  10 # 5. Process the phenotypic data.
  11 # 6. Fit the mixed models in sommer.
  12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
  13 # 8. Weight the marker effects and add them together to form an index of merit.
  14 # 9. Predict the crosses.
  15 # 10. Format the information needed for output.
  16
  17
  18 #Get Arguments
  19 args = commandArgs(trailingOnly=TRUE)
  20 if (length(args) < 3) {
  21   stop('Two or more arguments are required.')
  22 }
  23 phenotypeFile = args[1]
  24 genotypeFile = args[2]
  25 traits = args[3]
  26 weights = args[4]
  27 userSexes = args[5]
  28 userFixed = args[6]
  29 userRandom = args[7]
  30
  31
  32 write(paste('PLANT SEX CVTERM: ', userSexes), stderr())
  33
  34 ################################################################################
  35 # 1. Load software needed
  36 ################################################################################
  37
  38 library(sommer)
  39 library(AGHmatrix)
  40 library(VariantAnnotation) # Bioconductor package
  41 library(tools)
  42 Rcpp::sourceCpp("/home/production/cxgn/QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github
  43
  44
  45
  46
  47
  48
  49 ################################################################################
  50 # 2. Declare user-supplied variables
  51 ################################################################################
  52
  53 # a. Define path with internal YamBase instructions such that the object 'userGeno'
  54 #    is defined as a VCF file of genotypes.
  55
  56 #userGeno <- path
  57
  58
  59 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'
  60 #    is defined as the phenotype file.
  61
  62 #userPheno <- path2
  63 #write(paste("READING PHENOTYPEFILE: ",phenotypeFile), stderr())
  64 userPheno <- read.delim(phenotypeFile, header = TRUE, sep="\t", fill=TRUE) #testing only
  65 #write(colnames(userPheno), stderr())
  66 #write(summary(userPheno), stderr())
  67 ##userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage
  68
  69 #write("DONE WITH PHENOTYPEFILE"), stderr())
  70
  71 # c. The user should be able to select their fixed variables from a menu
  72 #    of the column names of the userPheno object. The possible interaction terms
  73 #    also need to be shown somehow. Then, those strings should be passed
  74 #    to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects
  75 #    besides f are requested.
  76 #    f is automatically included as a fixed effect- a note to the user would be good.
  77
  78 #userFixed <- c()
  79 #userFixed <- c("studyYear") # for testing only
  80 userFixed <-unlist(strsplit(userFixed, split=",", fixed=T))
  81
  82
  83 # d. The user should be able to select their random variables from a menu
  84 #    of the column names of the userPheno object. The possible interaction terms
  85 #    also need to be shown somehow. Then, those strings should be passed
  86 #    to this vector, 'userRandom'.
  87
  88 #userRandom <- c()
  89 #userRandom <- "blockNumber" # for testing only
  90 userRandom <-unlist(strsplit(userRandom, split=",", fixed=T))
  91
  92 # e. The user should be able to indicate which of the userPheno column names
  93 #    represents individual genotypes identically as they are represented in the VCF
  94 #    column names. No check to ensure matching at this stage. This single string
  95 #    should be passed to this vector, userID.
  96
  97 #userID <- c()
  98 userID <- "germplasmName" # for testing only
  99
 100
 101 # f. The user must indicate the ploidy level of their organism, and the integer
 102 #    provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp
 103 #    currently supports ploidy = {2, 4, 6}. Ideally, the user could select
 104 #    their ploidy from a drop-down to avoid errors here, and there would be a note
 105 #    that other ploidies are not currently supported. If not, a possible error is
 106 #    provided.
 107
 108 userPloidy <- c()
 109 userPloidy <- 2 # for testing only
 110
 111 # if(userPloidy %in% c(2, 4, 6) != TRUE){
 112 #   stop("Only ploidies of 2, 4, and 6 are supported currently. \n
 113 #        Please confirm your ploidy level is supported.")
 114 # }
 115
 116
 117 # g. The user should be able to select their response variables from a drop-down menu
 118 #    of the column names of the userPheno object. Then, those strings should be passed
 119 #    to this vector, 'userResponse'.
 120
 121 write(paste("TRAIT STRING:", traits), stderr())
 122 #userResponse <- c()
 123 #userResponse <- c("YIELD", "DMC", "OXBI") # for testing only
 124 userResponse <- unlist(strsplit(traits, split=",", fixed=T))
 125
 126 write(paste("USER RESPONSE", userResponse),stderr())
 127 write(paste("first element: ", userResponse[1]), stderr())
 128 # h. The user must indicate weights for each response. The order of the vector
 129 #    of response weights must match the order of the responses in userResponse.
 130
 131 #userWeights <- c()
 132 #userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only
 133 userWeights  <- as.numeric(unlist(strsplit(weights, split=",", fixed=T)))
 134
 135 write(paste("WEIGHTS", userWeights), stderr())
 136
 137 # i. The user can indicate the number of crosses they wish to output.
 138 #    The maximum possible is a full diallel.
 139
 140 #userNCrosses <- c()
 141 userNCrosses <- 100 # for testing only
 142
 143
 144 # j. The user can (optionally) input the individuals' sexes and indicate the column
 145 #    name of the userPheno object which corresponds to sex. The column name
 146 #   string should be passed to the 'userSexes' object. If the user does not wish
 147 #   to remove crosses with incompatible sexes (e.g. because the information is not available),
 148 #   then userSexes should be set to NA.
 149
 150
 151 #userSexes <- c()
 152 #userSexes <- "Sex" # for testing only
 153 #userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only
 154 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have
 155 # different sexes. This does not matter for the code or testing.
 156
 157 ################################################################################
 158 # 3. Read in the genotype data and convert to numeric allele counts.
 159 ################################################################################
 160
 161 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix
 162 #    of allele counts in whic:
 163 #    Rownames represent the individual genotype IDs
 164 #    Colnames represent the site IDs
 165 #    A cell within a given row and column represents the row individual's
 166 #    genotype at the site in the column.
 167
 168 #   The individual's genotype should be an integer from 0... ploidy to represent
 169 #   counts of the alternate allele at the site. Diploid example:
 170 #    0 = homozygous reference
 171 #    1 = heterozygous
 172 #    2 = homozygous alternate
 173
 174 #    The genotypes must not contain monomorphic or non-biallelic sites.
 175 #    Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)
 176 #    I can put an error message into this script if a user tries to input
 177 #    monomorphic or biallelic sites which could be communicated through the GUI.
 178 #    It's also possible to filter them here.
 179
 180 if (file_ext(genotypeFile) == 'vcf') {
 181    write(paste("READING VARIANT FILE ", genotypeFile), stderr())
 182    #  Import VCF with VariantAnnotation package and extract matrix of dosages
 183    myVCF <- readVcf(genotypeFile)
 184    #G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
 185    mat <- genotypeToSnpMatrix(myVCF)
 186    #G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
 187    G <- as(mat$genotypes, "numeric")
 188    G <- G[,colSums(is.na(G))<nrow(G)]
 189
 190    #   TEST temporarily import the genotypes via HapMap:
 191    #source("R/hapMap2numeric.R") # replace and delete
 192    #G <- hapMap2numeric(genotypeFile) # replace and delete
 193
 194 } else {
 195
 196   # accession_names     abc      abc2    abc3
 197   # marker1                   0      0        2
 198   # marker2                   1      0        0
 199   # marker3                   0      0        0
 200
 201    write(paste("READING DOSAGE FILE ", genotypeFile), stderr())
 202      GF <- read.delim(genotypeFile)
 203      GD <- GF[,-1]
 204      GM <- as.matrix(GD)
 205      G <- t(GM)
 206
 207
 208 }
 209
 210 write("G Matrix start --------", stderr())
 211 write(G[1:5, 1:5], stderr())
 212 write("G Matrix end =========", stderr())
 213
 214
 215
 216
 217 ################################################################################
 218 # 4. Get the genetic predictors needed.
 219 ################################################################################
 220
 221 write("GENETIC PREDICTIONS...", stderr())
 222 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016
 223 # The following constructs f as the average heterozygosity of the individual
 224 # The coefficient of f estimated later then needs to be divided by the number of markers
 225 # in the matrix D before adding it to the estimated dominance marker effects
 226 # One unit of change in f represents changing all loci from homozygous to heterozygous
 227
 228 ###GC <- G - (userPloidy/2) #this centers G
 229 GC <- G * (userPloidy - G) * (2 / userPloidy)^2 # center at G
 230 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))
 231
 232 # Another alternate way to construct f is the total number of heterozygous loci in the individual
 233 # The coefficient of this construction of f does not need to be divided by the number of markers
 234 # It is simply added to each marker dominance effect
 235 # The coefficient of this construction of f represents the average dominance effect of a marker
 236 # One unit of change in f represents changing one locus from homozygous to heterozygous
 237 # f <- rowSums(D, na.rm = TRUE)
 238
 239
 240 write("DISTANCE MATRIX...", stderr())
 241 # 4b. Get the additive and dominance relationship matrices following Batista et al., 2021
 242 # https://doi.org/10.1007/s00122-021-03994-w
 243
 244 # Additive: this gives a different result than AGHmatrix VanRaden's Gmatrix
 245 # AGHmatrix: Weights are implemented for "VanRaden" method as described in Liu (2020)?
 246 allele_freq = colSums(G) / (userPloidy * nrow(G))
 247 W = t(G) - userPloidy * allele_freq
 248 WWt = crossprod(W)
 249 denom = sum(userPloidy * allele_freq * (1 - allele_freq))
 250 A = WWt / denom
 251
 252 # Check with paper equation:
 253 #w <- G - (userPloidy/2)
 254 #num <- w %*% t(w)
 255 #denom = sum(userPloidy * allele_freq * (1 - allele_freq))
 256 #A2 <- num/denom
 257 #table(A == A2)
 258 #cor(as.vector(A), as.vector(A2)) # 0.9996...
 259
 260
 261 # Dominance or digenic dominance
 262 if(userPloidy == 2){
 263   D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)
 264 }
 265
 266 if(userPloidy > 2){
 267 # Digenic dominance
 268 C_matrix = matrix(length(combn(userPloidy, 2)) / 2,
 269                   nrow = nrow(t(G)),
 270                   ncol = ncol(t(G)))
 271
 272 Ploidy_matrix = matrix(userPloidy,
 273                        nrow = nrow(t(G)),
 274                        ncol = ncol(t(G)))
 275
 276 Q = (allele_freq^2 * C_matrix) -
 277   (Ploidy_matrix - 1) * allele_freq * t(G) +
 278   0.5 * t(G) * (t(G) - 1)
 279
 280 Dnum = crossprod(Q)
 281 denomDom = sum(C_matrix[ ,1] * allele_freq^2 * (1 - allele_freq)^2)
 282 D = Dnum/denomDom
 283 }
 284
 285
 286
 287
 288
 289
 290 ################################################################################
 291 # 5. Process the phenotypic data.
 292 ################################################################################
 293
 294 #write(summary(userPheno), stderr())
 295
 296 # a. Paste f into the phenotype dataframe
 297 write("processing phenotypic data...", stderr())
 298 userPheno$f <- f[as.character(userPheno[ , userID])]
 299
 300 #write(summary(userPheno), stderr())
 301
 302 # b. Scale the response variables.
 303 write("processing phenotypic data... scaling...", stderr())
 304 write(paste("USER RESPONSE LENGTH = ", length(userResponse)), stderr())
 305 for(i in 1:length(userResponse)){
 306   write(paste("working on user response ", userResponse[i]), stderr())
 307   userPheno[ , userResponse[i]] <- (userPheno[ , userResponse[i]] - mean(userPheno[ , userResponse[i]], na.rm = TRUE)) / sd(userPheno[ , userResponse[i]], na.rm = TRUE)
 308 }
 309
 310 write(paste("accession count: ", length(userPheno[ , userID])), stderr())
 311 write("processing phenotypic data... adding dominance effects", stderr())
 312 # c. Paste in a second ID column for the dominance effects.
 313
 314 #write(summary(userPheno), stderr())
 315
 316 dominanceEffectCol = paste(userID, "2", sep="")
 317 write(paste("NEW COL NAME: ", dominanceEffectCol), stderr())
 318
 319 write(paste("USER_ID COLUMN: ", userPheno[ , userID]), stderr());
 320
 321
 322 userPheno[ , dominanceEffectCol] <- userPheno[ , userID]
 323
 324 write(paste("USER PHENO userID2 COL", userPheno[ , dominanceEffectCol]), stderr())
 325
 326
 327 # Additional steps could be added here to remove outliers etc.
 328
 329
 330
 331
 332
 333 ################################################################################
 334 # 6. Fit the mixed models in sommer.
 335 ################################################################################
 336
 337 write("Fit mixed model in sommer", stderr());
 338
 339 # 6a. Make a list to save the models.
 340
 341 userModels <- list()
 342
 343 for(i in 1:length(userResponse)){
 344
 345   write(paste("User response: ", userResponse[i]), stderr())
 346   # check if fixed effects besides f are requested, then paste together
 347   # response variable and fixed effects
 348   if(!is.na(userFixed[1])){
 349   fixedEff <- paste(userFixed, collapse = " + ")
 350   fixedEff <- paste(fixedEff, "f", sep = " + ")
 351   fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")
 352   }
 353   if(is.na(userFixed[1])){
 354     fixedArg <- paste(userResponse[i], " ~ ", "f")
 355   }
 356
 357
 358   # check if random effects besides genotypic additive and dominance effects
 359   # are requested, then paste together the formula
 360
 361   write("Generate formula...", stderr())
 362
 363   if(!is.na(userRandom[1])){
 364     randEff <- paste(userRandom, collapse = " + ")
 365     ID2 <- paste(userID, 2, sep = "")
 366     randEff2 <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
 367     randArg <- paste(randEff2, randEff, sep = " + ")
 368   }
 369   if(is.na(userRandom[1])){
 370     ID2 <- paste(userID, 2, sep ="")
 371     randArg <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
 372   }
 373
 374   write(paste("Fit mixed GBLUP model...", randArg), stderr())
 375
 376   #  write(paste("USER PHENO:", userPheno), stderr())
 377   #  write(paste("COLNAMES: ", colnames(userPheno)), stderr())
 378   # fit the mixed GBLUP model
 379   myMod <- mmer(fixed = as.formula(fixedArg),
 380                 random = as.formula(randArg),
 381                 rcov = ~units,
 382                 getPEV = FALSE,
 383                 data = userPheno)
 384
 385
 386   # save the fit model
 387
 388   write(paste("I = ", i), stderr());
 389
 390   userModels[[i]] <- myMod
 391 }
 392
 393
 394
 395
 396
 397
 398 ######################################################################################
 399 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
 400 ######################################################################################
 401
 402 # a. Get the matrices and inverses needed
 403 #    This is not correct for polyploids yet.
 404 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)
 405 D.G <- 1 - abs(A.G)     # this is the dominance genotype matrix (coded 0 1 0 for diploids)
 406
 407
 408 A.T <- A.G %*% t(A.G) ## additive genotype matrix
 409 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag
 410 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-
 411
 412 D.T <- D.G %*% t(D.G) ## dominance genotype matrix
 413 D.Tinv <- solve(D.T) ## inverse
 414 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-
 415
 416
 417 # b. Loop through and backsolve to marker effects.
 418
 419 write("backsolve marker effects...", stderr())
 420
 421 userAddEff <- list() # save them in order
 422 userDomEff <- list() # save them in order
 423
 424 for(i in 1:length(userModels)){
 425
 426   myMod <- userModels[[i]]
 427
 428   # get the additive and dominance effects out of the sommer list
 429   subMod <- myMod$U
 430   subModA <- subMod[[1]]
 431   subModA <- subModA[[1]]
 432   subModD <- subMod[[2]]
 433   subModD <- subModD[[1]]
 434
 435   # backsolve
 436   addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv
 437   domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol=1)   # these must be reordered to match D.TTinv
 438
 439   # add f coefficient back into the dominance effects
 440   subModf <- myMod$Beta
 441   fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient
 442   fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers
 443   dirDomEff <- domEff + fCoefScal
 444
 445   # save
 446   userAddEff[[i]] <- addEff
 447   userDomEff[[i]] <- dirDomEff
 448 }
 449
 450
 451
 452
 453
 454
 455 ################################################################################
 456 # 8. Weight the marker effects and add them together to form an index of merit.
 457 ################################################################################
 458
 459 write("weight marker effects...", stderr())
 460
 461 ai <- 0
 462 di <- 0
 463 for(i in 1:length(userWeights)){
 464   write(paste("USER ADD EFF : ", userAddEff[[i]]), stderr())
 465     write(paste("USER DOM EFF : ", userDomEff[[i]]), stderr())
 466   write(paste("USER WEIGHT : ", userWeights[i]), stderr())
 467
 468
 469        ai <- ai + userAddEff[[i]] *  userWeights[i]
 470
 471  write("DONE WITH ADDITIVE EFFECTS!\n", stderr())
 472   di <- di + userDomEff[[i]] * userWeights[i]
 473    write("DONE WITH DOM EFFECTS!\n", stderr())
 474 }
 475
 476
 477
 478
 479
 480
 481 ################################################################################
 482 # 9. Predict the crosses.
 483 ################################################################################
 484
 485 # If the genotype matrix provides information about individuals for which
 486 # cross prediction is not desired, then the genotype matrix must be subset
 487 # for use in calcCrossMean(). calcCrossMean will return predicted cross
 488 # values for all individuals in the genotype file otherwise.
 489
 490 write("Predict crosses...", stderr())
 491
 492 GP <- G[rownames(G) %in% userPheno[ , userID], ]
 493
 494 print("GP:")
 495 print(head(GP))
 496
 497 write("calcCrossMean...", stderr())
 498
 499 crossPlan <- calcCrossMean(GP,
 500                            ai,
 501                            di,
 502                            userPloidy)
 503
 504
 505 write("Done with calcCrossMean!!!!!!", stderr())
 506
 507
 508
 509 ################################################################################
 510 # 10. Format the information needed for output.
 511 ################################################################################
 512
 513 # Add option to remove crosses with incompatible sexes.
 514
 515
 516 #hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
 517
 518 #assign_hash(userPheno$germplasmName, userPheno$userSexes, hash)
 519
 520 if(!is.na(userSexes)){  # "plant sex estimation 0-4"
 521
 522
 523   write(paste("userSexes", head(userSexes)), stderr())
 524
 525   # Reformat the cross plan
 526   crossPlan <- as.data.frame(crossPlan)
 527
 528   write(paste("CROSSPLAN = ", head(crossPlan)), stderr())
 529   crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit
 530   crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID
 531   crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID
 532   colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
 533
 534   write(paste("CROSSPLAN REPLACED = ", head(crossPlan)), stderr());
 535
 536   # Look up the parent sexes and subset
 537   crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$germplasmName), userSexes] # get sexes ordered by Parent1
 538
 539   write(paste("PARENTS1 ", head(crossPlan)), stderr())
 540
 541   crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$germplasmName), userSexes] # get sexes ordered by Parent2
 542
 543   write(paste("PARENTS2 ", head(crossPlan)), stderr())
 544
 545   crossPlan <- crossPlan[!(crossPlan$P1Sex==0 | crossPlan$P2Sex==0),] #remove the 0s
 546   crossPlan <- crossPlan[!(crossPlan$P1Sex==1 & crossPlan$P2Sex==1),] #remove same sex crosses with score of 1
 547   crossPlan <- crossPlan[!(crossPlan$P1Sex==2 & crossPlan$P2Sex==2),] #remove same sex crosses with score of 2
 548
 549   write(paste("CROSSPLAN FILTERED = ", head(crossPlan)), stderr())
 550   #crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents
 551
 552   # subset the number of crosses the user wishes to output
 553   crossPlan[1:userNCrosses, ]
 554   finalcrosses=crossPlan[1:userNCrosses, ]
 555   outputFile= paste(phenotypeFile, ".out", sep="")
 556
 557   write.csv(finalcrosses, outputFile)
 558
 559 }
 560
 561
 562 if(is.na(userSexes)){
 563
 564   # only subset the number of crosses the user wishes to output
 565   crossPlan <- as.data.frame(crossPlan)
 566
 567   crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit
 568   crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID
 569   crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID
 570   colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
 571
 572   crossPlan[1:userNCrosses, ]
 573   finalcrosses=crossPlan[1:userNCrosses, ]
 574   outputFile= paste(phenotypeFile, ".out", sep="")
 575
 576   write.csv(finalcrosses, outputFile)
 577
 578
 579 }