R/GCPC_YamBase.R

   1 ################################################################################\r
   2 # Genomic prediction of cross performance for YamBase\r
   3 ################################################################################\r
   4 \r
   5 # There are ten main steps to this protocol:\r
   6 # 1. Load the software needed.\r
   7 # 2. Declare user-supplied variables.\r
   8 # 3. Read in the genotype data and convert to numeric allele counts.\r
   9 # 4. Get the genetic predictors needed.\r
  10 # 5. Process the phenotypic data.\r
  11 # 6. Fit the mixed models in sommer.\r
  12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP\r
  13 # 8. Weight the marker effects and add them together to form an index of merit.\r
  14 # 9. Predict the crosses.\r
  15 # 10. Format the information needed for output.\r
  16 \r
  17 \r
  18 #Get Arguments\r
  19 args = commandArgs(trailingOnly=TRUE)\r
  20 if (length(args)!=2) {\r
  21   stop('Two Arguments are required.')\r
  22 }\r
  23 phenotypeFile = args[1]\r
  24 genotypeFile = args[2]\r
  25 traits = args[3]\r
  26 weights = args[4]\r
  27 \r
  28 ################################################################################\r
  29 # 1. Load software needed\r
  30 ################################################################################\r
  31 \r
  32 library(sommer)\r
  33 library(AGHmatrix)\r
  34 library(VariantAnnotation) # Bioconductor package\r
  35 Rcpp::sourceCpp("../QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github\r
  36 \r
  37 \r
  38 \r
  39 \r
  40 \r
  41 \r
  42 ################################################################################\r
  43 # 2. Declare user-supplied variables\r
  44 ################################################################################\r
  45 \r
  46 # a. Define path with internal YamBase instructions such that the object 'userGeno'\r
  47 #    is defined as a VCF file of genotypes.\r
  48 \r
  49 #userGeno <- path\r
  50 \r
  51 \r
  52 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'\r
  53 #    is defined as the phenotype file.\r
  54 \r
  55 #userPheno <- path2\r
  56 userPheno <- read.csv(phenotypeFile, header = TRUE) #testing only\r
  57 userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage\r
  58 \r
  59 \r
  60 # c. The user should be able to select their fixed variables from a menu\r
  61 #    of the column names of the userPheno object. The possible interaction terms\r
  62 #    also need to be shown somehow. Then, those strings should be passed\r
  63 #    to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects\r
  64 #    besides f are requested.\r
  65 #    f is automatically included as a fixed effect- a note to the user would be good.\r
  66 \r
  67 userFixed <- c()\r
  68 userFixed <- c("Year") # for testing only\r
  69 \r
  70 \r
  71 # d. The user should be able to select their random variables from a menu\r
  72 #    of the column names of the userPheno object. The possible interaction terms\r
  73 #    also need to be shown somehow. Then, those strings should be passed\r
  74 #    to this vector, 'userRandom'.\r
  75 \r
  76 userRandom <- c()\r
  77 userRandom <- "Block" # for testing only\r
  78 \r
  79 \r
  80 # e. The user should be able to indicate which of the userPheno column names\r
  81 #    represents individual genotypes identically as they are represented in the VCF\r
  82 #    column names. No check to ensure matching at this stage. This single string\r
  83 #    should be passed to this vector, userID.\r
  84 \r
  85 userID <- c()\r
  86 userID <- "Accession" # for testing only\r
  87 \r
  88 \r
  89 # f. The user must indicate the ploidy level of their organism, and the integer\r
  90 #    provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp\r
  91 #    currently supports ploidy = {2, 4, 6}. Ideally, the user could select\r
  92 #    their ploidy from a drop-down to avoid errors here, and there would be a note\r
  93 #    that other ploidies are not currently supported. If not, a possible error is\r
  94 #    provided.\r
  95 \r
  96 userPloidy <- c()\r
  97 userPloidy <- 2 # for testing only\r
  98 \r
  99 # if(userPloidy %in% c(2, 4, 6) != TRUE){\r
 100 #   stop("Only ploidies of 2, 4, and 6 are supported currently. \n\r
 101 #        Please confirm your ploidy level is supported.")\r
 102 # }\r
 103 \r
 104 \r
 105 # g. The user should be able to select their response variables from a drop-down menu\r
 106 #    of the column names of the userPheno object. Then, those strings should be passed\r
 107 #    to this vector, 'userResponse'.\r
 108 \r
 109 userResponse <- c()\r
 110 #userResponse <- c("YIELD", "DMC", "OXBI") # for testing only\r
 111 userResponse <- strsplit(traits, ",")\r
 112 \r
 113 \r
 114 # h. The user must indicate weights for each response. The order of the vector\r
 115 #    of response weights must match the order of the responses in userResponse.\r
 116 \r
 117 userWeights <- c()\r
 118 #userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only\r
 119 userWeights  <- strsplit(weights, ",")\r
 120 \r
 121 # i. The user can indicate the number of crosses they wish to output.\r
 122 #    The maximum possible is a full diallel.\r
 123 \r
 124 userNCrosses <- c()\r
 125 userNCrosses <- 40 # for testing only\r
 126 \r
 127 \r
 128 # j. The user can (optionally) input the individuals' sexes and indicate the column\r
 129 #    name of the userPheno object which corresponds to sex. The column name\r
 130 #   string should be passed to the 'userSexes' object. If the user does not wish\r
 131 #   to remove crosses with incompatible sexes (e.g. because the information is not available),\r
 132 #   then userSexes should be set to NA.\r
 133 \r
 134 \r
 135 userSexes <- c()\r
 136 userSexes <- "Sex" # for testing only\r
 137 userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only\r
 138 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have\r
 139 # different sexes. This does not matter for the code or testing.\r
 140 \r
 141 ################################################################################\r
 142 # 3. Read in the genotype data and convert to numeric allele counts.\r
 143 ################################################################################\r
 144 \r
 145 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix\r
 146 #    of allele counts in whic:\r
 147 #    Rownames represent the individual genotype IDs\r
 148 #    Colnames represent the site IDs\r
 149 #    A cell within a given row and column represents the row individual's\r
 150 #    genotype at the site in the column.\r
 151 \r
 152 #   The individual's genotype should be an integer from 0... ploidy to represent\r
 153 #   counts of the alternate allele at the site. Diploid example:\r
 154 #    0 = homozygous reference\r
 155 #    1 = heterozygous\r
 156 #    2 = homozygous alternate\r
 157 \r
 158 #    The genotypes must not contain monomorphic or non-biallelic sites.\r
 159 #    Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)\r
 160 #    I can put an error message into this script if a user tries to input\r
 161 #    monomorphic or biallelic sites which could be communicated through the GUI.\r
 162 #    It's also possible to filter them here.\r
 163 \r
 164 #  Import VCF with VariantAnnotation package and extract matrix of dosages\r
 165 myVCF <- readVcf(genotypeFile)\r
 166 G <- t(geno(myVCF)$DS) # Individual in row, genotype in column\r
 167 \r
 168 \r
 169 #   TEST temporarily import the genotypes via HapMap:\r
 170 #source("R/hapMap2numeric.R") # replace and delete\r
 171 #G <- hapMap2numeric(genotypeFile) # replace and delete\r
 172 \r
 173 \r
 174 \r
 175 \r
 176 \r
 177 \r
 178 ################################################################################\r
 179 # 4. Get the genetic predictors needed.\r
 180 ################################################################################\r
 181 \r
 182 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016\r
 183 # The following constructs f as the average heterozygosity of the individual\r
 184 # The coefficient of f estimated later then needs to be divided by the number of markers\r
 185 # in the matrix D before adding it to the estimated dominance marker effects\r
 186 # One unit of change in f represents changing all loci from homozygous to heterozygous\r
 187 \r
 188 GC <- G - (userPloidy/2) #this centers G\r
 189 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))\r
 190 \r
 191 # Another alternate way to construct f is the total number of heterozygous loci in the individual\r
 192 # The coefficient of this construction of f does not need to be divided by the number of markers\r
 193 # It is simply added to each marker dominance effect\r
 194 # The coefficient of this construction of f represents the average dominance effect of a marker\r
 195 # One unit of change in f represents changing one locus from homozygous to heterozygous\r
 196 # f <- rowSums(D, na.rm = TRUE)\r
 197 \r
 198 \r
 199 \r
 200 # 4b. Get the additive and dominance relationship matrices. This uses AGHmatrix.\r
 201 \r
 202 A <- Gmatrix(G, method = "VanRaden", ploidy = userPloidy, missingValue = NA)\r
 203 \r
 204 if(userPloidy == 2){\r
 205   D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)\r
 206 }\r
 207 \r
 208 # I haven't tested this yet\r
 209 if(userPloidy > 2){\r
 210   D <- Gmatrix(G, method = "Slater", ploidy = userPloidy, missingValue = NA)\r
 211 }\r
 212 \r
 213 \r
 214 \r
 215 \r
 216 \r
 217 \r
 218 ################################################################################\r
 219 # 5. Process the phenotypic data.\r
 220 ################################################################################\r
 221 \r
 222 # a. Paste f into the phenotype dataframe\r
 223 userPheno$f <- f[as.character(userPheno[ , userID])]\r
 224 \r
 225 \r
 226 # b. Scale the response variables.\r
 227 for(i in 1:length(userResponse)){\r
 228   userPheno[ , userResponse[i]] <- (userPheno[ , userResponse[i]] -\r
 229                                       mean(userPheno[ , userResponse[i]], na.rm = TRUE))/\r
 230     sd(userPheno[ , userResponse[i]], na.rm = TRUE)\r
 231 }\r
 232 \r
 233 \r
 234 # c. Paste in a second ID column for the dominance effects.\r
 235 userPheno[ , paste(userID, 2, sep = "")] <- userPheno[ , userID]\r
 236 \r
 237 \r
 238 # Additional steps could be added here to remove outliers etc.\r
 239 \r
 240 \r
 241 \r
 242 \r
 243 \r
 244 ################################################################################\r
 245 # 6. Fit the mixed models in sommer.\r
 246 ################################################################################\r
 247 \r
 248 \r
 249 # 6a. Make a list to save the models.\r
 250 userModels <- list()\r
 251 \r
 252 for(i in 1:length(userResponse)){\r
 253 \r
 254 \r
 255   # check if fixed effects besides f are requested, then paste together\r
 256   # response variable and fixed effects\r
 257   if(!is.na(userFixed[1])){\r
 258   fixedEff <- paste(userFixed, collapse = " + ")\r
 259   fixedEff <- paste(fixedEff, "f", sep = " + ")\r
 260   fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")\r
 261   }\r
 262   if(is.na(userFixed[1])){\r
 263     fixedArg <- paste(userResponse[i], " ~ ", "f")\r
 264   }\r
 265 \r
 266 \r
 267   # check if random effects besides genotypic additive and dominance effects\r
 268   # are requested, then paste together the formula\r
 269   if(!is.na(userRandom[1])){\r
 270     randEff <- paste(userRandom, collapse = " + ")\r
 271     ID2 <- paste(userID, 2, sep = "")\r
 272     randEff2 <- paste("~vs(", userID, ", Gu = A) + vs(", ID2, ", Gu = D)", sep = "")\r
 273     randArg <- paste(randEff2, randEff, sep = " + ")\r
 274   }\r
 275   if(is.na(userRandom[1])){\r
 276     randArg <- paste("~vs(", userID, ", Gu = A) + vs(", ID2, ", Gu = D)", sep = "")\r
 277   }\r
 278 \r
 279 \r
 280   # fit the mixed GBLUP model\r
 281   myMod <- mmer(fixed = as.formula(fixedArg),\r
 282                 random = as.formula(randArg),\r
 283                 rcov = ~units,\r
 284                 getPEV = FALSE,\r
 285                 data = userPheno)\r
 286 \r
 287 \r
 288   # save the fit model\r
 289   userModels[[i]] <- myMod\r
 290 }\r
 291 \r
 292 \r
 293 \r
 294 \r
 295 \r
 296 \r
 297 ######################################################################################\r
 298 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP\r
 299 ######################################################################################\r
 300 \r
 301 # a. Get the matrices and inverses needed\r
 302 #    This is not correct for polyploids yet.\r
 303 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)\r
 304 D.G <- GC     # this is the dominance genotype matrix (coded 0 1 0 for diploids)\r
 305 \r
 306 \r
 307 A.T <- A.G %*% t(A.G) ## additive genotype matrix\r
 308 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag\r
 309 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-\r
 310 \r
 311 D.T <- D.G %*% t(D.G) ## dominance genotype matrix\r
 312 D.Tinv <- solve(D.T) ## inverse\r
 313 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-\r
 314 \r
 315 \r
 316 # b. Loop through and backsolve to marker effects.\r
 317 \r
 318 userAddEff <- list() # save them in order\r
 319 userDomEff <- list() # save them in order\r
 320 \r
 321 for(i in 1:length(userModels)){\r
 322 \r
 323   myMod <- userModels[[i]]\r
 324 \r
 325   # get the additive and dominance effects out of the sommer list\r
 326   subMod <- myMod$U\r
 327   subModA <- subMod[[1]]\r
 328   subModA <- subModA[[1]]\r
 329   subModD <- subMod[[2]]\r
 330   subModD <- subModD[[1]]\r
 331 \r
 332   # backsolve\r
 333   addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv\r
 334   domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol=1)   # these must be reordered to match D.TTinv\r
 335 \r
 336   # add f coefficient back into the dominance effects\r
 337   subModf <- myMod$Beta\r
 338   fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient\r
 339   fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers\r
 340   dirDomEff <- domEff + fCoefScal\r
 341 \r
 342   # save\r
 343   userAddEff[[i]] <- addEff\r
 344   userDomEff[[i]] <- dirDomEff\r
 345 }\r
 346 \r
 347 \r
 348 \r
 349 \r
 350 \r
 351 \r
 352 ################################################################################\r
 353 # 8. Weight the marker effects and add them together to form an index of merit.\r
 354 ################################################################################\r
 355 \r
 356 ai <- 0\r
 357 di <- 0\r
 358 for(i in 1:length(userWeights)){\r
 359   ai <- ai + userAddEff[[i]] * userWeights[i]\r
 360   di <- di + userDomEff[[i]] * userWeights[i]\r
 361 }\r
 362 \r
 363 \r
 364 \r
 365 \r
 366 \r
 367 \r
 368 ################################################################################\r
 369 # 9. Predict the crosses.\r
 370 ################################################################################\r
 371 \r
 372 # If the genotype matrix provides information about individuals for which\r
 373 # cross prediction is not desired, then the genotype matrix must be subset\r
 374 # for use in calcCrossMean(). calcCrossMean will return predicted cross\r
 375 # values for all individuals in the genotype file otherwise.\r
 376 \r
 377 GP <- G[rownames(G) %in% userPheno[ , userID], ]\r
 378 \r
 379 \r
 380 crossPlan <- calcCrossMean(GP,\r
 381                            ai,\r
 382                            di,\r
 383                            userPloidy)\r
 384 \r
 385 \r
 386 \r
 387 \r
 388 \r
 389 \r
 390 ################################################################################\r
 391 # 10. Format the information needed for output.\r
 392 ################################################################################\r
 393 \r
 394 # Add option to remove crosses with incompatible sexes.\r
 395 \r
 396 if(!is.na(userSexes)){\r
 397   \r
 398   # Reformat the cross plan\r
 399   crossPlan <- as.data.frame(crossPlan)\r
 400   crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit\r
 401   crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID\r
 402   crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID\r
 403   colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")\r
 404   \r
 405   # Look up the parent sexes and subset\r
 406   crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$Accession), userSexes] # get sexes ordered by Parent1\r
 407   crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$Accession), userSexes] # get sexes ordered by Parent2\r
 408   crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents\r
 409   \r
 410   \r
 411   # subset the number of crosses the user wishes to output\r
 412   crossPlan[1:userNCrosses, ]\r
 413   outputFile= paste(phenotypeFile, ".out", sep="")\r
 414   \r
 415 }\r
 416 \r
 417 \r
 418 if(is.na(userSexes)){\r
 419   \r
 420   # only subset the number of crosses the user wishes to output\r
 421   crossPlan[1:userNCrosses, ]\r
 422   outputFile= paste(phenotypeFile, ".out", sep="")\r
 423   \r
 424 }\r