1 ################################################################################
2 # Genomic prediction of cross performance for YamBase
3 ################################################################################
5 # There are ten main steps to this protocol:
6 # 1. Load the software needed.
7 # 2. Declare user-supplied variables.
8 # 3. Read in the genotype data and convert to numeric allele counts.
9 # 4. Get the genetic predictors needed.
10 # 5. Process the phenotypic data.
11 # 6. Fit the mixed models in sommer.
12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
13 # 8. Weight the marker effects and add them together to form an index of merit.
14 # 9. Predict the crosses.
15 # 10. Format the information needed for output.
19 args = commandArgs(trailingOnly=TRUE)
20 if (length(args) < 3) {
21 stop('Two or more arguments are required.')
23 phenotypeFile = args[1]
24 genotypeFile = args[2]
32 write(paste('PLANT SEX CVTERM: ', userSexes), stderr())
34 ################################################################################
35 # 1. Load software needed
36 ################################################################################
40 library(VariantAnnotation) # Bioconductor package
42 Rcpp::sourceCpp("/home/production/cxgn/QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github
49 ################################################################################
50 # 2. Declare user-supplied variables
51 ################################################################################
53 # a. Define path with internal YamBase instructions such that the object 'userGeno'
54 # is defined as a VCF file of genotypes.
59 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'
60 # is defined as the phenotype file.
63 #write(paste("READING PHENOTYPEFILE: ",phenotypeFile), stderr())
64 userPheno <- read.delim(phenotypeFile, header = TRUE, sep="\t", fill=TRUE) #testing only
65 #write(colnames(userPheno), stderr())
66 #write(summary(userPheno), stderr())
67 ##userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage
69 #write("DONE WITH PHENOTYPEFILE"), stderr())
71 # c. The user should be able to select their fixed variables from a menu
72 # of the column names of the userPheno object. The possible interaction terms
73 # also need to be shown somehow. Then, those strings should be passed
74 # to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects
75 # besides f are requested.
76 # f is automatically included as a fixed effect- a note to the user would be good.
79 #userFixed <- c("studyYear") # for testing only
80 userFixed <-unlist(strsplit(userFixed, split=",", fixed=T))
83 # d. The user should be able to select their random variables from a menu
84 # of the column names of the userPheno object. The possible interaction terms
85 # also need to be shown somehow. Then, those strings should be passed
86 # to this vector, 'userRandom'.
89 #userRandom <- "blockNumber" # for testing only
90 userRandom <-unlist(strsplit(userRandom, split=",", fixed=T))
92 # e. The user should be able to indicate which of the userPheno column names
93 # represents individual genotypes identically as they are represented in the VCF
94 # column names. No check to ensure matching at this stage. This single string
95 # should be passed to this vector, userID.
98 userID <- "germplasmName" # for testing only
101 # f. The user must indicate the ploidy level of their organism, and the integer
102 # provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp
103 # currently supports ploidy = {2, 4, 6}. Ideally, the user could select
104 # their ploidy from a drop-down to avoid errors here, and there would be a note
105 # that other ploidies are not currently supported. If not, a possible error is
109 userPloidy <- 2 # for testing only
111 # if(userPloidy %in% c(2, 4, 6) != TRUE){
112 # stop("Only ploidies of 2, 4, and 6 are supported currently. \n
113 # Please confirm your ploidy level is supported.")
117 # g. The user should be able to select their response variables from a drop-down menu
118 # of the column names of the userPheno object. Then, those strings should be passed
119 # to this vector, 'userResponse'.
121 write(paste("TRAIT STRING:", traits), stderr())
123 #userResponse <- c("YIELD", "DMC", "OXBI") # for testing only
124 userResponse <- unlist(strsplit(traits, split=",", fixed=T))
126 write(paste("USER RESPONSE", userResponse),stderr())
127 write(paste("first element: ", userResponse[1]), stderr())
128 # h. The user must indicate weights for each response. The order of the vector
129 # of response weights must match the order of the responses in userResponse.
132 #userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only
133 userWeights <- as.numeric(unlist(strsplit(weights, split=",", fixed=T)))
135 write(paste("WEIGHTS", userWeights), stderr())
137 # i. The user can indicate the number of crosses they wish to output.
138 # The maximum possible is a full diallel.
141 userNCrosses <- 100 # for testing only
144 # j. The user can (optionally) input the individuals' sexes and indicate the column
145 # name of the userPheno object which corresponds to sex. The column name
146 # string should be passed to the 'userSexes' object. If the user does not wish
147 # to remove crosses with incompatible sexes (e.g. because the information is not available),
148 # then userSexes should be set to NA.
152 #userSexes <- "Sex" # for testing only
153 #userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only
154 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have
155 # different sexes. This does not matter for the code or testing.
157 ################################################################################
158 # 3. Read in the genotype data and convert to numeric allele counts.
159 ################################################################################
161 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix
162 # of allele counts in whic:
163 # Rownames represent the individual genotype IDs
164 # Colnames represent the site IDs
165 # A cell within a given row and column represents the row individual's
166 # genotype at the site in the column.
168 # The individual's genotype should be an integer from 0... ploidy to represent
169 # counts of the alternate allele at the site. Diploid example:
170 # 0 = homozygous reference
172 # 2 = homozygous alternate
174 # The genotypes must not contain monomorphic or non-biallelic sites.
175 # Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)
176 # I can put an error message into this script if a user tries to input
177 # monomorphic or biallelic sites which could be communicated through the GUI.
178 # It's also possible to filter them here.
180 if (file_ext(genotypeFile) == 'vcf') {
181 write(paste("READING VARIANT FILE ", genotypeFile), stderr())
182 # Import VCF with VariantAnnotation package and extract matrix of dosages
183 myVCF <- readVcf(genotypeFile)
184 #G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
185 mat <- genotypeToSnpMatrix(myVCF)
186 #G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
187 G <- as(mat$genotypes, "numeric")
188 G <- G[,colSums(is.na(G))<nrow(G)]
190 # TEST temporarily import the genotypes via HapMap:
191 #source("R/hapMap2numeric.R") # replace and delete
192 #G <- hapMap2numeric(genotypeFile) # replace and delete
196 # accession_names abc abc2 abc3
201 write(paste("READING DOSAGE FILE ", genotypeFile), stderr())
202 GF <- read.delim(genotypeFile)
210 write("G Matrix start --------", stderr())
211 write(G[1:5, 1:5], stderr())
212 write("G Matrix end =========", stderr())
217 ################################################################################
218 # 4. Get the genetic predictors needed.
219 ################################################################################
221 write("GENETIC PREDICTIONS...", stderr())
222 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016
223 # The following constructs f as the average heterozygosity of the individual
224 # The coefficient of f estimated later then needs to be divided by the number of markers
225 # in the matrix D before adding it to the estimated dominance marker effects
226 # One unit of change in f represents changing all loci from homozygous to heterozygous
228 ###GC <- G - (userPloidy/2) #this centers G
229 GC <- G * (userPloidy - G) * (2 / userPloidy)^2 # center at G
230 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))
232 # Another alternate way to construct f is the total number of heterozygous loci in the individual
233 # The coefficient of this construction of f does not need to be divided by the number of markers
234 # It is simply added to each marker dominance effect
235 # The coefficient of this construction of f represents the average dominance effect of a marker
236 # One unit of change in f represents changing one locus from homozygous to heterozygous
237 # f <- rowSums(D, na.rm = TRUE)
240 write("DISTANCE MATRIX...", stderr())
241 # 4b. Get the additive and dominance relationship matrices following Batista et al., 2021
242 # https://doi.org/10.1007/s00122-021-03994-w
244 # Additive: this gives a different result than AGHmatrix VanRaden's Gmatrix
245 # AGHmatrix: Weights are implemented for "VanRaden" method as described in Liu (2020)?
246 allele_freq = colSums(G) / (userPloidy * nrow(G))
247 W = t(G) - userPloidy * allele_freq
249 denom = sum(userPloidy * allele_freq * (1 - allele_freq))
252 # Check with paper equation:
253 #w <- G - (userPloidy/2)
255 #denom = sum(userPloidy * allele_freq * (1 - allele_freq))
258 #cor(as.vector(A), as.vector(A2)) # 0.9996...
261 # Dominance or digenic dominance
263 D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)
268 C_matrix = matrix(length(combn(userPloidy, 2)) / 2,
272 Ploidy_matrix = matrix(userPloidy,
276 Q = (allele_freq^2 * C_matrix) -
277 (Ploidy_matrix - 1) * allele_freq * t(G) +
278 0.5 * t(G) * (t(G) - 1)
281 denomDom = sum(C_matrix[ ,1] * allele_freq^2 * (1 - allele_freq)^2)
290 ################################################################################
291 # 5. Process the phenotypic data.
292 ################################################################################
294 #write(summary(userPheno), stderr())
296 # a. Paste f into the phenotype dataframe
297 write("processing phenotypic data...", stderr())
298 userPheno$f <- f[as.character(userPheno[ , userID])]
300 #write(summary(userPheno), stderr())
302 # b. Scale the response variables.
303 write("processing phenotypic data... scaling...", stderr())
304 write(paste("USER RESPONSE LENGTH = ", length(userResponse)), stderr())
305 for(i in 1:length(userResponse)){
306 write(paste("working on user response ", userResponse[i]), stderr())
307 userPheno[ , userResponse[i]] <- (userPheno[ , userResponse[i]] - mean(userPheno[ , userResponse[i]], na.rm = TRUE)) / sd(userPheno[ , userResponse[i]], na.rm = TRUE)
310 write(paste("accession count: ", length(userPheno[ , userID])), stderr())
311 write("processing phenotypic data... adding dominance effects", stderr())
312 # c. Paste in a second ID column for the dominance effects.
314 #write(summary(userPheno), stderr())
316 dominanceEffectCol = paste(userID, "2", sep="")
317 write(paste("NEW COL NAME: ", dominanceEffectCol), stderr())
319 write(paste("USER_ID COLUMN: ", userPheno[ , userID]), stderr());
322 userPheno[ , dominanceEffectCol] <- userPheno[ , userID]
324 write(paste("USER PHENO userID2 COL", userPheno[ , dominanceEffectCol]), stderr())
327 # Additional steps could be added here to remove outliers etc.
333 ################################################################################
334 # 6. Fit the mixed models in sommer.
335 ################################################################################
337 write("Fit mixed model in sommer", stderr());
339 # 6a. Make a list to save the models.
343 for(i in 1:length(userResponse)){
345 write(paste("User response: ", userResponse[i]), stderr())
346 # check if fixed effects besides f are requested, then paste together
347 # response variable and fixed effects
348 if(!is.na(userFixed[1])){
349 fixedEff <- paste(userFixed, collapse = " + ")
350 fixedEff <- paste(fixedEff, "f", sep = " + ")
351 fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")
353 if(is.na(userFixed[1])){
354 fixedArg <- paste(userResponse[i], " ~ ", "f")
358 # check if random effects besides genotypic additive and dominance effects
359 # are requested, then paste together the formula
361 write("Generate formula...", stderr())
363 if(!is.na(userRandom[1])){
364 randEff <- paste(userRandom, collapse = " + ")
365 ID2 <- paste(userID, 2, sep = "")
366 randEff2 <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
367 randArg <- paste(randEff2, randEff, sep = " + ")
369 if(is.na(userRandom[1])){
370 ID2 <- paste(userID, 2, sep ="")
371 randArg <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
374 write(paste("Fit mixed GBLUP model...", randArg), stderr())
376 # write(paste("USER PHENO:", userPheno), stderr())
377 # write(paste("COLNAMES: ", colnames(userPheno)), stderr())
378 # fit the mixed GBLUP model
379 myMod <- mmer(fixed = as.formula(fixedArg),
380 random = as.formula(randArg),
388 write(paste("I = ", i), stderr());
390 userModels[[i]] <- myMod
398 ######################################################################################
399 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
400 ######################################################################################
402 # a. Get the matrices and inverses needed
403 # This is not correct for polyploids yet.
404 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)
405 D.G <- 1 - abs(A.G) # this is the dominance genotype matrix (coded 0 1 0 for diploids)
408 A.T <- A.G %*% t(A.G) ## additive genotype matrix
409 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag
410 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-
412 D.T <- D.G %*% t(D.G) ## dominance genotype matrix
413 D.Tinv <- solve(D.T) ## inverse
414 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-
417 # b. Loop through and backsolve to marker effects.
419 write("backsolve marker effects...", stderr())
421 userAddEff <- list() # save them in order
422 userDomEff <- list() # save them in order
424 for(i in 1:length(userModels)){
426 myMod <- userModels[[i]]
428 # get the additive and dominance effects out of the sommer list
430 subModA <- subMod[[1]]
431 subModA <- subModA[[1]]
432 subModD <- subMod[[2]]
433 subModD <- subModD[[1]]
436 addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv
437 domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol=1) # these must be reordered to match D.TTinv
439 # add f coefficient back into the dominance effects
440 subModf <- myMod$Beta
441 fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient
442 fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers
443 dirDomEff <- domEff + fCoefScal
446 userAddEff[[i]] <- addEff
447 userDomEff[[i]] <- dirDomEff
455 ################################################################################
456 # 8. Weight the marker effects and add them together to form an index of merit.
457 ################################################################################
459 write("weight marker effects...", stderr())
463 for(i in 1:length(userWeights)){
464 write(paste("USER ADD EFF : ", userAddEff[[i]]), stderr())
465 write(paste("USER DOM EFF : ", userDomEff[[i]]), stderr())
466 write(paste("USER WEIGHT : ", userWeights[i]), stderr())
469 ai <- ai + userAddEff[[i]] * userWeights[i]
471 write("DONE WITH ADDITIVE EFFECTS!\n", stderr())
472 di <- di + userDomEff[[i]] * userWeights[i]
473 write("DONE WITH DOM EFFECTS!\n", stderr())
481 ################################################################################
482 # 9. Predict the crosses.
483 ################################################################################
485 # If the genotype matrix provides information about individuals for which
486 # cross prediction is not desired, then the genotype matrix must be subset
487 # for use in calcCrossMean(). calcCrossMean will return predicted cross
488 # values for all individuals in the genotype file otherwise.
490 write("Predict crosses...", stderr())
492 GP <- G[rownames(G) %in% userPheno[ , userID], ]
497 write("calcCrossMean...", stderr())
499 crossPlan <- calcCrossMean(GP,
505 write("Done with calcCrossMean!!!!!!", stderr())
509 ################################################################################
510 # 10. Format the information needed for output.
511 ################################################################################
513 # Add option to remove crosses with incompatible sexes.
516 #hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
518 #assign_hash(userPheno$germplasmName, userPheno$userSexes, hash)
520 if(!is.na(userSexes)){ # "plant sex estimation 0-4"
523 write(paste("userSexes", head(userSexes)), stderr())
525 # Reformat the cross plan
526 crossPlan <- as.data.frame(crossPlan)
528 write(paste("CROSSPLAN = ", head(crossPlan)), stderr())
529 crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit
530 crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID
531 crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID
532 colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
534 write(paste("CROSSPLAN REPLACED = ", head(crossPlan)), stderr());
536 # Look up the parent sexes and subset
537 crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$germplasmName), userSexes] # get sexes ordered by Parent1
539 write(paste("PARENTS1 ", head(crossPlan)), stderr())
541 crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$germplasmName), userSexes] # get sexes ordered by Parent2
543 write(paste("PARENTS2 ", head(crossPlan)), stderr())
545 crossPlan <- crossPlan[!(crossPlan$P1Sex==0 | crossPlan$P2Sex==0),] #remove the 0s
546 crossPlan <- crossPlan[!(crossPlan$P1Sex==1 & crossPlan$P2Sex==1),] #remove same sex crosses with score of 1
547 crossPlan <- crossPlan[!(crossPlan$P1Sex==2 & crossPlan$P2Sex==2),] #remove same sex crosses with score of 2
549 write(paste("CROSSPLAN FILTERED = ", head(crossPlan)), stderr())
550 #crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents
552 # subset the number of crosses the user wishes to output
553 crossPlan[1:userNCrosses, ]
554 finalcrosses=crossPlan[1:userNCrosses, ]
555 outputFile= paste(phenotypeFile, ".out", sep="")
557 write.csv(finalcrosses, outputFile)
562 if(is.na(userSexes)){
564 # only subset the number of crosses the user wishes to output
565 crossPlan <- as.data.frame(crossPlan)
567 crossPlan <- crossPlan[order(crossPlan[,3], decreasing = TRUE), ] # orders the plan by predicted merit
568 crossPlan[ ,1] <- rownames(GP)[crossPlan[ ,1]] # replaces internal ID with genotye file ID
569 crossPlan[ ,2] <- rownames(GP)[crossPlan[ ,2]] # replaces internal ID with genotye file ID
570 colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
572 crossPlan[1:userNCrosses, ]
573 finalcrosses=crossPlan[1:userNCrosses, ]
574 outputFile= paste(phenotypeFile, ".out", sep="")
576 write.csv(finalcrosses, outputFile)