1 ################################################################################
2 # Genomic prediction of cross performance for YamBase
3 ################################################################################
5 # There are ten main steps to this protocol:
6 # 1. Load the software needed.
7 # 2. Declare user-supplied variables.
8 # 3. Read in the genotype data and convert to numeric allele counts.
9 # 4. Get the genetic predictors needed.
10 # 5. Process the phenotypic data.
11 # 6. Fit the mixed models in sommer.
12 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
13 # 8. Weight the marker effects and add them together to form an index of merit.
14 # 9. Predict the crosses.
15 # 10. Format the information needed for output.
19 args <- commandArgs(trailingOnly = TRUE)
20 if (length(args) < 3) {
21 stop("Two or more arguments are required.")
23 phenotypeFile <- args[1]
24 genotypeFile <- args[2]
31 # userSexes = as.vector(userSexes)
33 # L = length(userSexes)
34 # if (L==1 && userSexes[1]!="") {write('PLANT SEX is empty: ', stderr())}
35 write(paste("PLANT SEX CVTERM: |", userSexes, "|"), stderr())
38 ################################################################################
39 # 1. Load software needed
40 ################################################################################
45 library(VariantAnnotation) # Bioconductor package
48 Rcpp::sourceCpp("/home/production/cxgn/QuantGenResources/CalcCrossMeans.cpp") # this is called CalcCrossMean.cpp on Github
55 ################################################################################
56 # 2. Declare user-supplied variables
57 ################################################################################
59 # a. Define path with internal YamBase instructions such that the object 'userGeno'
60 # is defined as a VCF file of genotypes.
65 # b. Define path2 with internal YamBase instructions such that the object 'userPheno'
66 # is defined as the phenotype file.
69 # write(paste("READING PHENOTYPEFILE: ",phenotypeFile), stderr())
70 userPheno <- read.delim(phenotypeFile, header = TRUE, sep = "\t", fill = TRUE) # testing only
71 # write(colnames(userPheno), stderr())
72 # write(summary(userPheno), stderr())
73 ## userPheno <- userPheno[userPheno$Trial == "SCG", ] #testing only-- needs to replaced with 2-stage
75 # write("DONE WITH PHENOTYPEFILE"), stderr())
77 # c. The user should be able to select their fixed variables from a menu
78 # of the column names of the userPheno object. The possible interaction terms
79 # also need to be shown somehow. Then, those strings should be passed
80 # to this vector, 'userFixed'. Please set userFixed to NA if no fixed effects
81 # besides f are requested.
82 # f is automatically included as a fixed effect- a note to the user would be good.
85 # userFixed <- c("studyYear") # for testing only
86 userFixed <- unlist(strsplit(userFixed, split = ",", fixed = T))
89 # d. The user should be able to select their random variables from a menu
90 # of the column names of the userPheno object. The possible interaction terms
91 # also need to be shown somehow. Then, those strings should be passed
92 # to this vector, 'userRandom'.
95 # userRandom <- "blockNumber" # for testing only
96 userRandom <- unlist(strsplit(userRandom, split = ",", fixed = T))
98 # e. The user should be able to indicate which of the userPheno column names
99 # represents individual genotypes identically as they are represented in the VCF
100 # column names. No check to ensure matching at this stage. This single string
101 # should be passed to this vector, userID.
104 userID <- "germplasmName" # for testing only
107 # f. The user must indicate the ploidy level of their organism, and the integer
108 # provided should be passed to the vector 'userPloidy'. CalcCrossMeans.cpp
109 # currently supports ploidy = {2, 4, 6}. Ideally, the user could select
110 # their ploidy from a drop-down to avoid errors here, and there would be a note
111 # that other ploidies are not currently supported. If not, a possible error is
115 userPloidy <- 2 # for testing only
117 # if(userPloidy %in% c(2, 4, 6) != TRUE){
118 # stop("Only ploidies of 2, 4, and 6 are supported currently. \n
119 # Please confirm your ploidy level is supported.")
123 # g. The user should be able to select their response variables from a drop-down menu
124 # of the column names of the userPheno object. Then, those strings should be passed
125 # to this vector, 'userResponse'.
127 write(paste("TRAIT STRING:", traits), stderr())
128 # userResponse <- c()
129 # userResponse <- c("YIELD", "DMC", "OXBI") # for testing only
130 userResponse <- unlist(strsplit(traits, split = ",", fixed = T))
132 write(paste("USER RESPONSE", userResponse), stderr())
133 write(paste("first element: ", userResponse[1]), stderr())
134 # h. The user must indicate weights for each response. The order of the vector
135 # of response weights must match the order of the responses in userResponse.
138 # userWeights <- c(1, 0.8, 0.2) # for YIELD, DMC, and OXBI respectively; for testing only
139 userWeights <- as.numeric(unlist(strsplit(weights, split = ",", fixed = T)))
141 write(paste("WEIGHTS", userWeights), stderr())
143 # i. The user can indicate the number of crosses they wish to output.
144 # The maximum possible is a full diallel.
146 # userNCrosses <- c()
147 userNCrosses <- 100 # for testing only
150 # j. The user can (optionally) input the individuals' sexes and indicate the column
151 # name of the userPheno object which corresponds to sex. The column name
152 # string should be passed to the 'userSexes' object. If the user does not wish
153 # to remove crosses with incompatible sexes (e.g. because the information is not available),
154 # then userSexes should be set to NA.
158 # userSexes <- "Sex" # for testing only
159 # userPheno$Sex <- sample(c("M", "F"), size = nrow(userPheno), replace = TRUE, prob = c(0.7, 0.3)) # for testing only
160 # Please note that for the test above, sex is sampled randomly for each entry, so the same accession can have
161 # different sexes. This does not matter for the code or testing.
163 ################################################################################
164 # 3. Read in the genotype data and convert to numeric allele counts.
165 ################################################################################
167 # a. The VCF file object 'userGeno' needs to be converted to a numeric matrix
168 # of allele counts in whic:
169 # Rownames represent the individual genotype IDs
170 # Colnames represent the site IDs
171 # A cell within a given row and column represents the row individual's
172 # genotype at the site in the column.
174 # The individual's genotype should be an integer from 0... ploidy to represent
175 # counts of the alternate allele at the site. Diploid example:
176 # 0 = homozygous reference
178 # 2 = homozygous alternate
180 # The genotypes must not contain monomorphic or non-biallelic sites.
181 # Users need to pre-process their VCF to remove these (e.g. in TASSEL or R)
182 # I can put an error message into this script if a user tries to input
183 # monomorphic or biallelic sites which could be communicated through the GUI.
184 # It's also possible to filter them here.
186 if (file_ext(genotypeFile) == "vcf") {
187 write(paste("READING VARIANT FILE ", genotypeFile), stderr())
188 # Import VCF with VariantAnnotation package and extract matrix of dosages
189 myVCF <- readVcf(genotypeFile)
190 # G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
191 mat <- genotypeToSnpMatrix(myVCF)
192 # G <- t(geno(myVCF)$DS) # Individual in row, genotype in column
193 G <- as(mat$genotypes, "numeric")
194 G <- G[, colSums(is.na(G)) < nrow(G)]
196 # TEST temporarily import the genotypes via HapMap:
197 # source("R/hapMap2numeric.R") # replace and delete
198 # G <- hapMap2numeric(genotypeFile) # replace and delete
200 # accession_names abc abc2 abc3
205 write(paste("READING DOSAGE FILE ", genotypeFile), stderr())
206 GF <- read.delim(genotypeFile)
212 write("G Matrix start --------", stderr())
213 write(G[1:5, 1:5], stderr())
214 write("G Matrix end =========", stderr())
219 ################################################################################
220 # 4. Get the genetic predictors needed.
221 ################################################################################
223 write("GENETIC PREDICTIONS...", stderr())
224 # 4a. Get the inbreeding coefficent, f, as described by Xiang et al., 2016
225 # The following constructs f as the average heterozygosity of the individual
226 # The coefficient of f estimated later then needs to be divided by the number of markers
227 # in the matrix D before adding it to the estimated dominance marker effects
228 # One unit of change in f represents changing all loci from homozygous to heterozygous
230 ### GC <- G - (userPloidy/2) #this centers G
231 GC <- G * (userPloidy - G) * (2 / userPloidy)^2 # center at G
232 f <- rowSums(GC, na.rm = TRUE) / apply(GC, 1, function(x) sum(!is.na(x)))
234 # Another alternate way to construct f is the total number of heterozygous loci in the individual
235 # The coefficient of this construction of f does not need to be divided by the number of markers
236 # It is simply added to each marker dominance effect
237 # The coefficient of this construction of f represents the average dominance effect of a marker
238 # One unit of change in f represents changing one locus from homozygous to heterozygous
239 # f <- rowSums(D, na.rm = TRUE)
242 write("DISTANCE MATRIX...", stderr())
243 # 4b. Get the additive and dominance relationship matrices following Batista et al., 2021
244 # https://doi.org/10.1007/s00122-021-03994-w
246 # Additive: this gives a different result than AGHmatrix VanRaden's Gmatrix
247 # AGHmatrix: Weights are implemented for "VanRaden" method as described in Liu (2020)?
248 allele_freq <- colSums(G) / (userPloidy * nrow(G))
249 W <- t(G) - userPloidy * allele_freq
251 denom <- sum(userPloidy * allele_freq * (1 - allele_freq))
254 # Check with paper equation:
255 # w <- G - (userPloidy/2)
257 # denom = sum(userPloidy * allele_freq * (1 - allele_freq))
260 # cor(as.vector(A), as.vector(A2)) # 0.9996...
263 # Dominance or digenic dominance
264 if (userPloidy == 2) {
265 D <- Gmatrix(G, method = "Su", ploidy = userPloidy, missingValue = NA)
268 if (userPloidy > 2) {
270 C_matrix <- matrix(length(combn(userPloidy, 2)) / 2,
275 Ploidy_matrix <- matrix(userPloidy,
280 Q <- (allele_freq^2 * C_matrix) -
281 (Ploidy_matrix - 1) * allele_freq * t(G) +
282 0.5 * t(G) * (t(G) - 1)
285 denomDom <- sum(C_matrix[, 1] * allele_freq^2 * (1 - allele_freq)^2)
294 ################################################################################
295 # 5. Process the phenotypic data.
296 ################################################################################
298 # write(summary(userPheno), stderr())
300 # a. Paste f into the phenotype dataframe
301 write("processing phenotypic data...", stderr())
302 userPheno$f <- f[as.character(userPheno[, userID])]
304 # write(summary(userPheno), stderr())
306 # b. Scale the response variables.
307 write("processing phenotypic data... scaling...", stderr())
308 write(paste("USER RESPONSE LENGTH = ", length(userResponse)), stderr())
309 for (i in 1:length(userResponse)) {
310 write(paste("working on user response ", userResponse[i]), stderr())
311 userPheno[, userResponse[i]] <- (userPheno[, userResponse[i]] - mean(userPheno[, userResponse[i]], na.rm = TRUE)) / sd(userPheno[, userResponse[i]], na.rm = TRUE)
314 write(paste("accession count: ", length(userPheno[, userID])), stderr())
315 write("processing phenotypic data... adding dominance effects", stderr())
316 # c. Paste in a second ID column for the dominance effects.
318 # write(summary(userPheno), stderr())
320 dominanceEffectCol <- paste(userID, "2", sep = "")
321 write(paste("NEW COL NAME: ", dominanceEffectCol), stderr())
323 write(paste("USER_ID COLUMN: ", userPheno[, userID]), stderr())
324 userPheno[, dominanceEffectCol] <- userPheno[, userID]
326 write(paste("USER PHENO userID2 COL", userPheno[, dominanceEffectCol]), stderr())
327 uniq <- length(sapply(lapply(userPheno, unique), length))
328 write(paste("UNIQUE", uniq), stderr())
331 # Additional steps could be added here to remove outliers etc.
337 ################################################################################
338 # 6. Fit the mixed models in sommer.
339 ################################################################################
341 write("Fit mixed model in sommer", stderr())
342 # 6a. Make a list to save the models.
346 for (i in 1:length(userResponse)) {
347 write(paste("User response: ", userResponse[i]), stderr())
348 # check if fixed effects besides f are requested, then paste together
349 # response variable and fixed effects
350 if (!is.na(userFixed[1])) {
351 fixedEff <- paste(userFixed, collapse = " + ")
352 fixedEff <- paste(fixedEff, "f", sep = " + ")
353 fixedArg <- paste(userResponse[i], " ~ ", fixedEff, sep = "")
355 if (is.na(userFixed[1])) {
356 fixedArg <- paste(userResponse[i], " ~ ", "f")
360 # check if random effects besides genotypic additive and dominance effects
361 # are requested, then paste together the formula
363 write("Generate formula...", stderr())
365 if (!is.na(userRandom[1])) {
366 randEff <- paste(userRandom, collapse = " + ")
367 ID2 <- paste(userID, 2, sep = "")
368 randEff2 <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
369 randArg <- paste(randEff2, randEff, sep = " + ")
371 if (is.na(userRandom[1])) {
372 ID2 <- paste(userID, 2, sep = "")
373 randArg <- paste("~vsr(", userID, ", Gu = A) + vsr(", ID2, ", Gu = D)", sep = "")
376 write(paste("Fit mixed GBLUP model...", randArg), stderr())
378 # write(paste("USER PHENO:", userPheno), stderr())
379 # write(paste("COLNAMES: ", colnames(userPheno)), stderr())
380 # fit the mixed GBLUP model
382 fixed = as.formula(fixedArg),
383 random = as.formula(randArg),
392 write(paste("I = ", i), stderr())
394 userModels[[i]] <- myMod
402 ######################################################################################
403 # 7. Backsolve from individual estimates to marker effect estimates / GBLUP -> RR-BLUP
404 ######################################################################################
406 # a. Get the matrices and inverses needed
407 # This is not correct for polyploids yet.
408 A.G <- G - (userPloidy / 2) # this is the additive genotype matrix (coded -1 0 1 for diploids)
409 D.G <- 1 - abs(A.G) # this is the dominance genotype matrix (coded 0 1 0 for diploids)
412 A.T <- A.G %*% t(A.G) ## additive genotype matrix
413 A.Tinv <- solve(A.T) # inverse; may cause an error sometimes, if so, add a small amount to the diag
414 A.TTinv <- t(A.G) %*% A.Tinv # M'%*% (M'M)-
416 D.T <- D.G %*% t(D.G) ## dominance genotype matrix
417 D.Tinv <- solve(D.T) ## inverse
418 D.TTinv <- t(D.G) %*% D.Tinv # M'%*% (M'M)-
421 # b. Loop through and backsolve to marker effects.
423 write("backsolve marker effects...", stderr())
425 userAddEff <- list() # save them in order
426 userDomEff <- list() # save them in order
428 for (i in 1:length(userModels)) {
429 myMod <- userModels[[i]]
431 # get the additive and dominance effects out of the sommer list
433 subModA <- subMod[[1]]
434 subModA <- subModA[[1]]
435 subModD <- subMod[[2]]
436 subModD <- subModD[[1]]
439 addEff <- A.TTinv %*% matrix(subModA[colnames(A.TTinv)], ncol = 1) # these must be reordered to match A.TTinv
440 domEff <- D.TTinv %*% matrix(subModD[colnames(D.TTinv)], ncol = 1) # these must be reordered to match D.TTinv
442 # add f coefficient back into the dominance effects
443 subModf <- myMod$Beta
444 fCoef <- subModf[subModf$Effect == "f", "Estimate"] # raw f coefficient
445 fCoefScal <- fCoef / ncol(G) # divides f coefficient by number of markers
446 dirDomEff <- domEff + fCoefScal
449 userAddEff[[i]] <- addEff
450 userDomEff[[i]] <- dirDomEff
458 ################################################################################
459 # 8. Weight the marker effects and add them together to form an index of merit.
460 ################################################################################
462 write("weight marker effects...", stderr())
466 for (i in 1:length(userWeights)) {
467 # write(paste("USER ADD EFF : ", userAddEff[[i]]), stderr())
468 # write(paste("USER DOM EFF : ", userDomEff[[i]]), stderr())
469 # write(paste("USER WEIGHT : ", userWeights[i]), stderr())
472 ai <- ai + userAddEff[[i]] * userWeights[i]
474 write("DONE WITH ADDITIVE EFFECTS!\n", stderr())
475 di <- di + userDomEff[[i]] * userWeights[i]
476 write("DONE WITH DOM EFFECTS!\n", stderr())
484 ################################################################################
485 # 9. Predict the crosses.
486 ################################################################################
488 # If the genotype matrix provides information about individuals for which
489 # cross prediction is not desired, then the genotype matrix must be subset
490 # for use in calcCrossMean(). calcCrossMean will return predicted cross
491 # values for all individuals in the genotype file otherwise.
493 write("Predict crosses...", stderr())
495 GP <- G[rownames(G) %in% userPheno[, userID], ]
500 write("calcCrossMean...", stderr())
502 crossPlan <- calcCrossMean(
510 write("Done with calcCrossMean!!!!!!", stderr())
514 ################################################################################
515 # 10. Format the information needed for output.
516 ################################################################################
518 # Add option to remove crosses with incompatible sexes.
521 # hash <- new.env(hash = TRUE, parent = emptyenv(), size = 100L)
523 # assign_hash(userPheno$germplasmName, userPheno$userSexes, hash)
525 if (userSexes != "") { # "plant sex estimation 0-4"
526 # !is.na(userSexes) && !is.na(sd(userPheno[, userSexes]))
528 write(paste("userSexes", sd(userPheno[, userSexes])), stderr())
530 # Reformat the cross plan
531 crossPlan <- as.data.frame(crossPlan)
533 write(paste("CROSSPLAN = ", head(crossPlan)), stderr())
534 crossPlan <- crossPlan[order(crossPlan[, 3], decreasing = TRUE), ] # orders the plan by predicted merit
535 crossPlan[, 1] <- rownames(GP)[crossPlan[, 1]] # replaces internal ID with genotye file ID
536 crossPlan[, 2] <- rownames(GP)[crossPlan[, 2]] # replaces internal ID with genotye file ID
537 colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
539 write(paste("CROSSPLAN REPLACED = ", head(crossPlan)), stderr())
541 # Look up the parent sexes and subset
542 crossPlan$P1Sex <- userPheno[match(crossPlan$Parent1, userPheno$germplasmName), userSexes] # get sexes ordered by Parent1
544 write(paste("PARENTS1 ", head(crossPlan)), stderr())
546 crossPlan$P2Sex <- userPheno[match(crossPlan$Parent2, userPheno$germplasmName), userSexes] # get sexes ordered by Parent2
548 write(paste("PARENTS2 ", head(crossPlan)), stderr())
549 col_repl <- c("P1Sex", "P2Sex")
550 crossPlan %>% filter(P1Sex == 0 | P2Sex == 0) # remove the 0s
551 crossPlan %>% filter(P1Sex == 1 & P2Sex == 1) # remove same sex crosses with score of 1
552 crossPlan %>% filter(P1Sex == 2 & P2Sex == 2) # remove same sex crosses with score of 2
554 write(paste("CROSSPLAN FILTERED = ", crossPlan), stderr())
555 # crossPlan <- crossPlan[crossPlan$P1Sex != crossPlan$P2Sex, ] # remove crosses with same-sex parents
557 ## replace plant sex numbers to male, female etc
559 crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% "NA", "NA"))
560 crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 1, "Male"))
561 crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 2, "Female"))
562 crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 3, "Monoecious male (m>f)"))
563 crossPlan[col_repl] <- sapply(crossPlan[col_repl], function(x) replace(x, x %in% 4, "Monoecious female(f>m)"))
565 # ** summary statistics for the cross prediction merit
566 avg <- round(mean(crossPlan$CrossPredictedMerit), digits = 3)
567 max <- round(max(crossPlan$CrossPredictedMerit), digits = 3)
568 min <- round(min(crossPlan$CrossPredictedMerit), digits = 3)
569 std <- round(sd(crossPlan$CrossPredictedMerit), digits = 3)
570 leng <- length(crossPlan$CrossPredictedMerit)
573 histogra <- paste(phenotypeFile, ".png", sep = "")
574 png(file = histogra, width = 600, height = 350)
575 hist(crossPlan$CrossPredictedMerit, xlab = "Cross Predicted Merit", main = "Distribution")
576 mtext(paste("Mean =", avg), side = 3, adj = 1, line = 0)
577 mtext(paste("Standard Deviation = ", std), side = 3, adj = 1, line = -1)
578 mtext(paste("Range = (", min, " to ", max, ")"), side = 3, adj = 1, line = -2)
579 mtext(paste("No. of predictions = ", leng), side = 3, adj = 1, line = -3)
584 # subset the number of crosses the user wishes to output
585 if (nrow(crossPlan)<100) {
586 finalcrosses = crossPlan
588 crossPlan[1:userNCrosses, ]
589 finalcrosses=crossPlan[1:userNCrosses, ]
591 outputFile <- paste(phenotypeFile, ".out", sep = "")
593 write.csv(finalcrosses, outputFile)
595 # only subset the number of crosses the user wishes to output
596 crossPlan <- as.data.frame(crossPlan)
597 crossPlan <- na.omit(crossPlan)
598 crossPlan <- crossPlan[order(crossPlan[, 3], decreasing = TRUE), ] # orders the plan by predicted merit
599 crossPlan[, 1] <- rownames(GP)[crossPlan[, 1]] # replaces internal ID with genotye file ID
600 crossPlan[, 2] <- rownames(GP)[crossPlan[, 2]] # replaces internal ID with genotye file ID
601 colnames(crossPlan) <- c("Parent1", "Parent2", "CrossPredictedMerit")
603 # get summary statistics for the cross prediction merit
604 avg <- round(mean(crossPlan$CrossPredictedMerit), digits = 3)
605 max <- round(max(crossPlan$CrossPredictedMerit), digits = 3)
606 min <- round(min(crossPlan$CrossPredictedMerit), digits = 3)
607 std <- round(sd(crossPlan$CrossPredictedMerit), digits = 3)
608 leng <- length(crossPlan$CrossPredictedMerit)
612 histogra <- paste(phenotypeFile, ".png", sep = "")
613 png(file = histogra, width = 600, height = 350)
614 hist(crossPlan$CrossPredictedMerit, xlab = "Cross Predicted Merit", main = "Distribution")
615 mtext(paste("Mean =", avg), side = 3, adj = 1, line = 0)
616 mtext(paste("Standard Deviation = ", std), side = 3, adj = 1, line = -1)
617 mtext(paste("Range = (", min, " to ", max, ")"), side = 3, adj = 1, line = -2)
618 mtext(paste("No. of predictions = ", leng), side = 3, adj = 1, line = -3)
621 ## save the best 100 predictions
622 if (nrow(crossPlan)<100) {
623 finalcrosses = crossPlan
625 crossPlan[1:userNCrosses, ]
626 finalcrosses=crossPlan[1:userNCrosses, ]
628 outputFile <- paste(phenotypeFile, ".out", sep = "")
630 write.csv(finalcrosses, outputFile)