1 ################################################################################
\r
2 # Genomic prediction of cross performance for YamBase
\r
3 ################################################################################
\r
5 # There are ten main steps to this protocol:
\r
6 # 1. Load the software needed.
\r
7 # 2. Declare user-supplied variables.
\r
8 # 3. Read in the genotype data and convert to numeric allele counts.
\r
9 # 4. Get the genetic predictors needed.
\r
10 # 5. Process the phenotypic data.
\r
11 # 6. Fit the mixed models in sommer.
\r
12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
\r
13 # 8. Weight the marker effects and add them together to form an index of merit.
\r
14 # 9. Predict the crosses.
\r
15 # 10. Format the information needed for output.
\r
19 args = commandArgs(trailingOnly=TRUE)
\r
20 if (length(args)!=2) {
\r
21 stop('Two Arguments are required.')
\r
23 phenotypeFile = args[1]
\r
24 genotypeFile = args[2]
\r
28 ################################################################################
\r
29 # 1. Load software needed
\r
30 ################################################################################
\r
34 library(VariantAnnotation) # Bioconductor package
\r
35 Rcpp::sourceCpp("../QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github
\r
42 ################################################################################
\r
43 # 2. Declare user-supplied variables
\r
44 ################################################################################
\r
46 # a. Define path with internal YamBase instructions such that the object 'userGeno'
\r
47 # is defined as a VCF file of genotypes.
\r
52 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'
\r
53 # is defined as the phenotype file.
\r
56 userPheno <- read.csv(phenotypeFile, header = TRUE) #testing only
\r
57 userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage
\r
60 # c. The user should be able to select their fixed variables from a menu
\r
61 # of the column names of the userPheno object. The possible interaction terms
\r
62 # also need to be shown somehow. Then, those strings should be passed
\r
63 # to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects
\r
64 # besides f are requested.
\r
65 # f is automatically included as a fixed effect- a note to the user would be good.
\r
68 userFixed <- c("Year") # for testing only
\r
71 # d. The user should be able to select their random variables from a menu
\r
72 # of the column names of the userPheno object. The possible interaction terms
\r
73 # also need to be shown somehow. Then, those strings should be passed
\r
74 # to this vector, 'userRandom'.
\r
77 userRandom <- "Block" # for testing only
\r
80 # e. The user should be able to indicate which of the userPheno column names
\r
81 # represents individual genotypes identically as they are represented in the VCF
\r
82 # column names. No check to ensure matching at this stage. This single string
\r
83 # should be passed to this vector, userID.
\r
86 userID <- "Accession" # for testing only
\r
89 # f. The user must indicate the ploidy level of their organism, and the integer
\r
90 # provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp
\r
91 # currently supports ploidy = {2, 4, 6}. Ideally, the user could select
\r
92 # their ploidy from a drop-down to avoid errors here, and there would be a note
\r
93 # that other ploidies are not currently supported. If not, a possible error is
\r
97 userPloidy <- 2 # for testing only
\r
99 # if(userPloidy %in% c(2, 4, 6) != TRUE){
\r
100 # stop("Only ploidies of 2, 4, and 6 are supported currently. \n
\r
101 # Please confirm your ploidy level is supported.")
\r
105 # g. The user should be able to select their response variables from a drop-down menu
\r
106 # of the column names of the userPheno object. Then, those strings should be passed
\r
107 # to this vector, 'userResponse'.
\r
109 userResponse <- c()
\r
110 #userResponse <- c("YIELD", "DMC", "OXBI") # for testing only
\r
111 userResponse <- strsplit(traits, ",")
\r
114 # h. The user must indicate weights for each response. The order of the vector
\r
115 # of response weights must match the order of the responses in userResponse.
\r
118 #userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only
\r
119 userWeights <- strsplit(weights, ",")
\r
121 # i. The user can indicate the number of crosses they wish to output.
\r
122 # The maximum possible is a full diallel.
\r
124 userNCrosses <- c()
\r
125 userNCrosses <- 40 # for testing only
\r
128 # j. The user can (optionally) input the individuals' sexes and indicate the column
\r
129 # name of the userPheno object which corresponds to sex. The column name
\r
130 # string should be passed to the 'userSexes' object. If the user does not wish
\r
131 # to remove crosses with incompatible sexes (e.g. because the information is not available),
\r
132 # then userSexes should be set to NA.
\r
136 userSexes <- "Sex" # for testing only
\r
137 userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only
\r
138 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have
\r
139 # different sexes. This does not matter for the code or testing.
\r
141 ################################################################################
\r
142 # 3. Read in the genotype data and convert to numeric allele counts.
\r
143 ################################################################################
\r
145 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix
\r
146 # of allele counts in whic:
\r
147 # Rownames represent the individual genotype IDs
\r
148 # Colnames represent the site IDs
\r
149 # A cell within a given row and column represents the row individual's
\r
150 # genotype at the site in the column.
\r
152 # The individual's genotype should be an integer from 0... ploidy to represent
\r
153 # counts of the alternate allele at the site. Diploid example:
\r
154 # 0 = homozygous reference
\r
156 # 2 = homozygous alternate
\r
158 # The genotypes must not contain monomorphic or non-biallelic sites.
\r
159 # Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)
\r
160 # I can put an error message into this script if a user tries to input
\r
161 # monomorphic or biallelic sites which could be communicated through the GUI.
\r
162 # It's also possible to filter them here.
\r
164 # Import VCF with VariantAnnotation package and extract matrix of dosages
\r
165 myVCF <- readVcf(genotypeFile)
\r
166 G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
\r
169 # TEST temporarily import the genotypes via HapMap:
\r
170 #source("R/hapMap2numeric.R") # replace and delete
\r
171 #G <- hapMap2numeric(genotypeFile) # replace and delete
\r
178 ################################################################################
\r
179 # 4. Get the genetic predictors needed.
\r
180 ################################################################################
\r
182 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016
\r
183 # The following constructs f as the average heterozygosity of the individual
\r
184 # The coefficient of f estimated later then needs to be divided by the number of markers
\r
185 # in the matrix D before adding it to the estimated dominance marker effects
\r
186 # One unit of change in f represents changing all loci from homozygous to heterozygous
\r
188 GC <- G - (userPloidy/2) #this centers G
\r
189 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))
\r
191 # Another alternate way to construct f is the total number of heterozygous loci in the individual
\r
192 # The coefficient of this construction of f does not need to be divided by the number of markers
\r
193 # It is simply added to each marker dominance effect
\r
194 # The coefficient of this construction of f represents the average dominance effect of a marker
\r
195 # One unit of change in f represents changing one locus from homozygous to heterozygous
\r
196 # f <- rowSums(D, na.rm = TRUE)
\r
200 # 4b. Get the additive and dominance relationship matrices. This uses AGHmatrix.
\r
202 A <- Gmatrix(G, method = "VanRaden", ploidy = userPloidy, missingValue = NA)
\r
204 if(userPloidy == 2){
\r
205 D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)
\r
208 # I haven't tested this yet
\r
209 if(userPloidy > 2){
\r
210 D <- Gmatrix(G, method = "Slater", ploidy = userPloidy, missingValue = NA)
\r
218 ################################################################################
\r
219 # 5. Process the phenotypic data.
\r
220 ################################################################################
\r
222 # a. Paste f into the phenotype dataframe
\r
223 userPheno$f <- f[as.character(userPheno[ , userID])]
\r
226 # b. Scale the response variables.
\r
227 for(i in 1:length(userResponse)){
\r
228 userPheno[ , userResponse[i]] <- (userPheno[ , userResponse[i]] -
\r
229 mean(userPheno[ , userResponse[i]], na.rm = TRUE))/
\r
230 sd(userPheno[ , userResponse[i]], na.rm = TRUE)
\r
234 # c. Paste in a second ID column for the dominance effects.
\r
235 userPheno[ , paste(userID, 2, sep = "")] <- userPheno[ , userID]
\r
238 # Additional steps could be added here to remove outliers etc.
\r
244 ################################################################################
\r
245 # 6. Fit the mixed models in sommer.
\r
246 ################################################################################
\r
249 # 6a. Make a list to save the models.
\r
250 userModels <- list()
\r
252 for(i in 1:length(userResponse)){
\r
255 # check if fixed effects besides f are requested, then paste together
\r
256 # response variable and fixed effects
\r
257 if(!is.na(userFixed[1])){
\r
258 fixedEff <- paste(userFixed, collapse = " + ")
\r
259 fixedEff <- paste(fixedEff, "f", sep = " + ")
\r
260 fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")
\r
262 if(is.na(userFixed[1])){
\r
263 fixedArg <- paste(userResponse[i], " ~ ", "f")
\r
267 # check if random effects besides genotypic additive and dominance effects
\r
268 # are requested, then paste together the formula
\r
269 if(!is.na(userRandom[1])){
\r
270 randEff <- paste(userRandom, collapse = " + ")
\r
271 ID2 <- paste(userID, 2, sep = "")
\r
272 randEff2 <- paste("~vs(", userID, ", Gu = A) + vs(", ID2, ", Gu = D)", sep = "")
\r
273 randArg <- paste(randEff2, randEff, sep = " + ")
\r
275 if(is.na(userRandom[1])){
\r
276 randArg <- paste("~vs(", userID, ", Gu = A) + vs(", ID2, ", Gu = D)", sep = "")
\r
280 # fit the mixed GBLUP model
\r
281 myMod <- mmer(fixed = as.formula(fixedArg),
\r
282 random = as.formula(randArg),
\r
288 # save the fit model
\r
289 userModels[[i]] <- myMod
\r
297 ######################################################################################
\r
298 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
\r
299 ######################################################################################
\r
301 # a. Get the matrices and inverses needed
\r
302 # This is not correct for polyploids yet.
\r
303 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)
\r
304 D.G <- GC # this is the dominance genotype matrix (coded 0 1 0 for diploids)
\r
307 A.T <- A.G %*% t(A.G) ## additive genotype matrix
\r
308 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag
\r
309 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-
\r
311 D.T <- D.G %*% t(D.G) ## dominance genotype matrix
\r
312 D.Tinv <- solve(D.T) ## inverse
\r
313 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-
\r
316 # b. Loop through and backsolve to marker effects.
\r
318 userAddEff <- list() # save them in order
\r
319 userDomEff <- list() # save them in order
\r
321 for(i in 1:length(userModels)){
\r
323 myMod <- userModels[[i]]
\r
325 # get the additive and dominance effects out of the sommer list
\r
327 subModA <- subMod[[1]]
\r
328 subModA <- subModA[[1]]
\r
329 subModD <- subMod[[2]]
\r
330 subModD <- subModD[[1]]
\r
333 addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv
\r
334 domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol=1) # these must be reordered to match D.TTinv
\r
336 # add f coefficient back into the dominance effects
\r
337 subModf <- myMod$Beta
\r
338 fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient
\r
339 fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers
\r
340 dirDomEff <- domEff + fCoefScal
\r
343 userAddEff[[i]] <- addEff
\r
344 userDomEff[[i]] <- dirDomEff
\r
352 ################################################################################
\r
353 # 8. Weight the marker effects and add them together to form an index of merit.
\r
354 ################################################################################
\r
358 for(i in 1:length(userWeights)){
\r
359 ai <- ai + userAddEff[[i]] * userWeights[i]
\r
360 di <- di + userDomEff[[i]] * userWeights[i]
\r
368 ################################################################################
\r
369 # 9. Predict the crosses.
\r
370 ################################################################################
\r
372 # If the genotype matrix provides information about individuals for which
\r
373 # cross prediction is not desired, then the genotype matrix must be subset
\r
374 # for use in calcCrossMean(). calcCrossMean will return predicted cross
\r
375 # values for all individuals in the genotype file otherwise.
\r
377 GP <- G[rownames(G) %in% userPheno[ , userID], ]
\r
380 crossPlan <- calcCrossMean(GP,
\r
390 ################################################################################
\r
391 # 10. Format the information needed for output.
\r
392 ################################################################################
\r
394 # Add option to remove crosses with incompatible sexes.
\r
396 if(!is.na(userSexes)){
\r
398 # Reformat the cross plan
\r
399 crossPlan <- as.data.frame(crossPlan)
\r
400 crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit
\r
401 crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID
\r
402 crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID
\r
403 colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
\r
405 # Look up the parent sexes and subset
\r
406 crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$Accession), userSexes] # get sexes ordered by Parent1
\r
407 crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$Accession), userSexes] # get sexes ordered by Parent2
\r
408 crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents
\r
411 # subset the number of crosses the user wishes to output
\r
412 crossPlan[1:userNCrosses, ]
\r
413 outputFile= paste(phenotypeFile, ".out", sep="")
\r
418 if(is.na(userSexes)){
\r
420 # only subset the number of crosses the user wishes to output
\r
421 crossPlan[1:userNCrosses, ]
\r
422 outputFile= paste(phenotypeFile, ".out", sep="")
\r