R/stability/ammi_script.R

   1
   2 library("methods")
   3 library("agricolae")
   4 library("dplyr")
   5 library("tidyr")
   6
   7
   8 set.seed(54326)
   9
  10 ##### Getting data #####
  11 args = commandArgs(trailingOnly = TRUE)
  12
  13 pheno <- read.table(args[1], sep = "\t", header = TRUE)
  14 study_trait <- args[2]
  15 imputPheno <- args[3]
  16 stability_method <- args[4]
  17 jsonFile <- args[5]
  18 graphFile <- args[6]
  19 messageFile <- args[7]
  20 jsonSummary <- args[8]
  21
  22
  23 study_trait <- gsub("\\."," ",study_trait)
  24
  25 #Making names standard
  26 names <- colnames(pheno)
  27 new_names <- gsub(".CO.*","", names)
  28 new_names <- gsub("\\."," ",new_names)
  29 colnames(pheno) <- new_names
  30
  31 ## Function for pheno imputation
  32 pheno_imputation <- function(inData){
  33   library(mice)
  34
  35   dat_imput <- spread(inData, Env, Yield)
  36   colnames(dat_imput) <- gsub(" ", "", colnames(dat_imput))
  37   #pmm is is the predictive mean matching
  38   pre_imputed <- mice(dat_imput,  method = "mean", m = 3, maxit = 10)
  39   final_imputed <- complete(pre_imputed, 1)
  40
  41   final_imputed <- gather(final_imputed, "Env", "Yield", -c("Gen","Rep"))
  42   cat("Imput ok!")
  43   return(final_imputed)
  44 }
  45
  46 imputation_accuracy <- function(checkDF, fMissing){
  47   checkDF$Gen <- as.character(checkDF$Gen)
  48   checkDF$Env <- as.character(checkDF$Env)
  49
  50   realData <- pheno_imputation(checkDF)
  51
  52   # Subseting to check imputation accuracy
  53   realData <- realData[order(realData$Gen, realData$Env), ]
  54   data_missing <- sample(rownames(realData), nrow(realData)*fMissing)
  55   testindDF <- realData
  56   testindDF <- testindDF[order(testindDF$Gen, testindDF$Env), ]
  57   testindDF[rownames(testindDF)%in% data_missing, "Yield"] <- NA
  58
  59   testindDF$Gen <- as.factor(testindDF$Gen)
  60   testindDF$Env <- as.factor(testindDF$Env)
  61   testindDF$Rep <- as.factor(testindDF$Rep)
  62
  63   imputed_test <- pheno_imputation(testindDF)
  64   imputed_test <- imputed_test[order(imputed_test$Gen, imputed_test$Env), ]
  65   cat("Imput Acc ok!\n")
  66   corrImput <- cor(realData$Yield,imputed_test$Yield, use = "complete")
  67   return(corrImput)
  68
  69 }
  70
  71
  72
  73 # Setting dataframe to required format
  74 pheno <- pheno[,colnames(pheno) %in% c("locationName", "germplasmName", "replicate", "entryType", study_trait[1])]
  75
  76
  77
  78 ########################################
  79 #           Quality Control            #
  80 # Checking conditions to run analysis  #
  81 #                                      #
  82 ########################################
  83
  84 # Getting locations, accessions and reps
  85 accessions_sel <- unique(pheno$germplasmName)
  86 # Some breeding programs are using accession named Filler
  87 accessions_sel <- accessions_sel[!accessions_sel == "Filler" | !accessions_sel == "filler"]
  88
  89 # Taking test accessions
  90 dat <- pheno %>% dplyr::select(germplasmName, locationName, replicate, entryType, trait=study_trait[1])
  91 dat <- dat[dat$entryType == "test",]
  92 nLoc <- length(unique(dat$locationName))
  93 nReps <- max(dat$replicate)
  94
  95 # Number of reps mat be different for checks
  96 # Fixing here
  97 pheno <- pheno[pheno$replicate <= nReps & pheno$germplasmName %in% accessions_sel,]
  98
  99 # Preparing dataset for for analysis
 100 dat1 <- pheno %>% dplyr::select(germplasmName, locationName, replicate, trait=study_trait[1])
 101
 102 fMissing <- nrow(dat1[is.na(dat1$trait),])/nrow(dat1)
 103 message2 <- paste0("The frequency of missing data is too high. Please, check the dataset.")
 104
 105 summaryData <- data.frame(mean = mean(dat1[,4], na.rm=TRUE),
 106                                                                                                         min = min(dat1[,4], na.rm=TRUE),
 107                                                                                                         maxV = max(dat1[,4], na.rm=TRUE),
 108                                                                                                         sdV = sd(dat1[,4], na.rm=TRUE),
 109                                                                                                         missing = fMissing
 110                                                                                                         )
 111
 112 myJson <- jsonlite::toJSON(summaryData)
 113 jsonlite::write_json(myJson, jsonSummary)
 114
 115
 116 find_duplications <- function(inData){
 117   # Step 1: Identify duplicate rows based on the combination of four columns
 118   duplicated_rows <- duplicated(inData[, c("Gen", "Env", "Rep")])
 119
 120   # Step 2: Calculate the average of 'Yield' for each unique combination of the four columns
 121   averages <- aggregate(Yield ~ Gen + Env + Rep, data = dat1, FUN = mean)
 122
 123   # Step 3: Replace the duplicated rows with the calculated average
 124   pheno_unique <- inData[!duplicated_rows, ]  # Keep only unique rows
 125   pheno_unique <- merge(pheno_unique, averages, by = c("Gen", "Env", "Rep"), all.x = TRUE)
 126
 127   # If there are any missing values (for rows that were not duplicated), fill them with the original 'Yield'
 128   pheno_unique$Yield <- ifelse(is.na(pheno_unique$Yield.y), pheno_unique$Yield.x, pheno_unique$Yield.y)
 129
 130   # Remove unnecessary columns (if needed)
 131   pheno_unique <- pheno_unique[, !(names(pheno_unique) %in% c("Yield.x", "Yield.y"))]
 132
 133   return(pheno_unique)
 134
 135   # View the resulting data frame
 136 }
 137
 138
 139 dat1$germplasmName <- as.factor(dat1$germplasmName)
 140 dat1$locationName <- as.factor(dat1$locationName)
 141 dat1$replicate <- as.factor(dat1$replicate)
 142 dat1$trait <- as.double(dat1$trait)
 143
 144 # Running model from stability package
 145 # Assuming dat1 is your data frame
 146 dat1 <- dat1[order(dat1$germplasmName, dat1$locationName), ]
 147 colnames(dat1) <- c("Gen", "Env", "Rep", "Yield")
 148
 149
 150 dat2 <- find_duplications(dat1)
 151
 152 if(imputPheno == "imput_yes"){
 153   testCor <- imputation_accuracy(dat2, fMissing)
 154   dat2 <- pheno_imputation(dat2)
 155   cat("The imputation accuracy is: ", sprintf("%.3f",testCor), "\n")
 156 }
 157
 158 # testing if all accessions have the same number of reps in all locations
 159 dat <- na.omit(dat2)
 160 replicate_df <- data.frame(replicate = tapply(dat$Yield, list(dat$Gen, dat$Env), length))
 161 replicate_df$germplasmName <- rownames(replicate_df)
 162 gathered_df <- gather(replicate_df, key = "replicate_locations", value = "value", -germplasmName)
 163 repAcc <- unique(gathered_df$value)
 164 message1 <- paste0("Please, check dataset for accessions per locations and reps.")
 165
 166 # Saving error message
 167 errorMessages <- c()
 168 if(fMissing >= 0.4){
 169         errorMessages <- append(errorMessages, message2)
 170 }else if(length(repAcc)>1){
 171   errorMessages <- append(errorMessages, message1)
 172 }
 173
 174 # Setting second rep to compare augmented design with 1 rep per location
 175 if(nReps == 1 && stability_method == "ammi"){
 176   dat2.1 <- dat2
 177   dat2.1$Rep <- as.numeric(dat2.1$Rep)
 178   dat2.1$Rep <- 2
 179   dat2$Rep <- as.numeric(dat2$Rep)
 180   dat2 <- rbind(dat2,dat2.1)
 181   dat2$Rep <- as.factor(dat2$Rep)
 182 }else if (nReps == 1 && stability_method == "gge"){
 183         errorMessages <- append(errorMessages, "The number of replication must be greater than 1 for gge.")
 184 }
 185
 186
 187 if(stability_method=="ammi" && length(errorMessages) == 0 ){
 188         library("agricolae")
 189
 190         model <- NULL
 191   # Running AMMI model
 192         tryCatch({
 193           # Attempt to execute the code
 194           model<- with(dat2,AMMI(Env, Gen, Rep, Yield, console=FALSE))
 195         }, error = function(e) {
 196           # Handle the error by printing a message
 197           cat("An error occurred:", conditionMessage(e), "\n")
 198           append(errorMessages, paste0("An error occurred:", conditionMessage(e)))
 199         })
 200
 201         index<-index.AMMI(model)
 202         index <- tibble::rownames_to_column(index, "Accession")
 203         indexDF <- data.frame(Accession = index$Accession, ASV = index$ASV, Rank = index$rASV)
 204
 205         # Adding a column to scale ASV
 206         # This column will be used to plot stability lines
 207         slopes <- data.frame(Accession = index$Accession, slope = index$ASV, scaled= "NA")
 208         slopes$scaled <-as.vector(scale(slopes$slope, center = T, scale = F))
 209         indexDF <- left_join(indexDF, slopes, by="Accession")
 210
 211         indexDF <- as.data.frame(indexDF)
 212         indexDF <- indexDF[order(indexDF[,3]),]
 213
 214         ## Preparing table with GxE effects
 215         list_data <- list(model$genXenv)
 216         dataGraphic <- data.frame(plyr::ldply(list_data))
 217         rownames(dataGraphic) <- model$means$GEN[1:(nrow(dataGraphic))]
 218         dataGraphic <- tibble::rownames_to_column(dataGraphic, "Accession")
 219
 220         myMeans <- data.frame(Accession = model$means$GEN, location = model$means$ENV, means = model$means$Yield)
 221         myMeans$average <- myMeans$means
 222         myMeans$means <- as.vector(scale(myMeans$means, center = T, scale = FALSE))
 223         meanDF <- data.frame(tapply(myMeans$means, myMeans$Accession, mean))
 224         meanDF$Accession <- rownames(meanDF)
 225         colnames(meanDF) <- c("means", "Accession")
 226         meanDF <- left_join(meanDF, slopes, by = "Accession") %>% select(Accession, means, slope)
 227         meanDF$start <- 0
 228         meanDF$end <- length(unique(pheno$locationName))
 229         minDF <- data.frame(tapply(myMeans$means, myMeans$Accession, min))
 230         minDF$Accession <- rownames(minDF)
 231         colnames(minDF) <- c("minimum", "Accession")
 232
 233         meanDFF <- left_join(meanDF, minDF, by = "Accession")
 234         meanDFF$value <- meanDFF$minimum*meanDF$slope
 235         meanDFF <- gather(meanDFF, key = "T", value = "X",start,end) %>% select(Accession, minimum, value, X) %>% arrange(Accession)
 236         meanDFF <- left_join(meanDFF, slopes, by = "Accession")
 237
 238         for ( i in 1:nrow(meanDFF)){
 239           if(meanDFF$X[i] == 0){
 240             meanDFF$value[i] <- 0
 241           }else{
 242             meanDFF$value[i] <- (-1)*meanDFF$slope[i]
 243           }
 244         }
 245
 246         scaled_values <- (meanDFF$value - min(meanDFF$value)) / (max(meanDFF$value) - min(meanDFF$value))
 247         meanDFF$value <- scaled_values
 248
 249         myData <- data.frame(pivot_longer(dataGraphic, 2:ncol(dataGraphic), names_to = "location", values_to = "Effect"))
 250         myData$location <- gsub("\\."," ", myData$location)
 251
 252         preFinal <- left_join(myData, myMeans, by = c("Accession", "location"))
 253         selLocations <- data.frame(locName = unique(preFinal$location), locNumber = c(1:length(unique(preFinal$location))))
 254         preFinal <- left_join(preFinal, selLocations, by = c("location"="locName"))
 255
 256         finalDF <- left_join(preFinal, indexDF, by="Accession") %>% dplyr::select(Accession, location, Effect, Rank, means, slope, scaled)
 257         finalDF <- finalDF[order(finalDF$Rank, -finalDF$Effect),]
 258         finalDF$Effect <- sprintf("%.3f", finalDF$Effect)
 259         finalDF$means <- sprintf("%.3f", finalDF$means)
 260         finalDF$slope <- sprintf("%.3f", finalDF$slope)
 261         finalDF$scaled <- sprintf("%.3f", finalDF$scaled)
 262
 263         graphicJson <- jsonlite::toJSON(meanDFF)
 264         jsonlite::write_json(graphicJson, graphFile)
 265
 266         # Parsing files to combine Accession, locations, effects, rank, averages, slopes
 267         preFinal <- left_join(myData, myMeans, by = c("Accession", "location"))
 268         finalDF <- left_join(preFinal, indexDF, by="Accession") %>% dplyr::select(Accession, location, Effect, Rank, average, slope, scaled)
 269         colnames(finalDF)[5] <- "means"
 270         if(imputPheno == "imput_yes"){finalDF$imputAcc <- testCor}
 271         finalDF <- finalDF[order(finalDF$Rank, -finalDF$Effect),]
 272
 273         myJson <- jsonlite::toJSON(finalDF)
 274         jsonlite::write_json(myJson, jsonFile)
 275
 276         }else if( stability_method=="gge" && length(errorMessages) == 0){
 277                 library("stability")
 278                 cat("running gge", "\n")
 279                 # dat2 <- pheno_imputation(dat2)
 280
 281     dat2$Env <- as.character(dat2$Env)
 282                 effectsDF <- NULL
 283                 # Use tryCatch to handle potential errors
 284                 tryCatch({
 285                     # Attempt to execute the code
 286                     effectsDF <- stability::ge_means(.data = dat2, .y = Yield, .gen = Gen, .env = Env)
 287                 }, error = function(e) {
 288                     # Handle the error by printing a message
 289                     cat("An error occurred:", conditionMessage(e), "\n")
 290                     append(errorMessages, paste0("An error occurred:", conditionMessage(e)))
 291                 })
 292
 293                 effectsDF <-  stability::ge_means(.data=dat2, .y= Yield, .gen=Gen, .env=Env)
 294
 295                 test <- data.frame(effectsDF$ge_ranks)
 296                 test$location <- rownames(test)
 297                 gathered_df <- gather(test, key = "rank", value = "genotypes", -location)
 298                 gathered_df$rank <- gsub("X", "", gathered_df$rank)
 299                 gathered_df$rank <- as.numeric(gathered_df$rank)
 300
 301                 test1 <- data.frame(effectsDF$ge_means)
 302                 gathered_mean <- gather(test1, key = "locations", value = "means", -Gen)
 303
 304
 305                 gathered_df$track <- paste0(gathered_df$genotypes,"_",gathered_df$location)
 306                 gathered_mean$track <- paste0(gathered_mean$Gen,"_",gathered_mean$locations)
 307
 308                 gathered_df$track <- gsub(" ","",gathered_df$track)
 309                 gathered_df$track <- gsub("\\.","",gathered_df$track)
 310
 311                 gathered_mean$track <- gsub(" ","",gathered_mean$track)
 312                 gathered_mean$track <- gsub("\\.","",gathered_mean$track)
 313
 314
 315                 finalDF <- left_join(gathered_df, gathered_mean, by = "track") %>% dplyr::select(
 316                   genotypes, locations, means, rank
 317                 )
 318                 finalDF$locations <- gsub("\\."," ",finalDF$locations)
 319
 320                 rankDF <- data.frame(sumRank=tapply(finalDF$rank, finalDF$genotypes, sum))
 321                 rankDF$genotypes <- rownames(rankDF)
 322                 rankDF$genotypesRank <- rank(rankDF$sumRank, ties.method = "min")
 323                 finalDF <- left_join(finalDF, rankDF, by = "genotypes") %>% dplyr::select(genotypes, locations, means, rank, genotypesRank)
 324                 colnames(finalDF) <- c("genotypes","locations", "means", "locationRank","genotypeRank")
 325                 finalDF$means <- sprintf("%.2f", finalDF$means)
 326
 327                 colnames(finalDF) <- c("Accession", "location", "means", "locationRank", "genotypeRank")
 328                 if(imputPheno == "imput_yes"){finalDF$imputAcc <- testCor}
 329                 # Saving Json
 330                 myJson <- jsonlite::toJSON(finalDF)
 331                 jsonlite::write_json(myJson, jsonFile)
 332
 333
 334                 ## Preparing Graphic dataset
 335                 prepGraph <- as.data.frame(tapply(dat2$Yield, list(dat2$Gen, dat2$Env), mean))
 336                 prepGraph <- tibble::rownames_to_column(prepGraph, var="Accession")
 337                 meanGraph <- gather(prepGraph, key = "location", value = "mean", -Accession)
 338                 prepGraph2 <- as.data.frame(tapply(dat2$Yield, list(dat2$Gen, dat2$Env), sd))
 339                 prepGraph2 <- tibble::rownames_to_column(prepGraph2, var="Accession")
 340                 sdGraph <- gather(prepGraph2, key = "location", value = "sd", -Accession)
 341                 finalGraph <- left_join(meanGraph, sdGraph, by=c("Accession","location"))
 342
 343                 graphicJson <- jsonlite::toJSON(finalGraph)
 344                 jsonlite::write_json(graphicJson, graphFile)
 345
 346         }
 347
 348 if ( is.null(errorMessages) == F ) {
 349   print(sprintf("Writing Error Messages to file: %s", messageFile))
 350   print(errorMessages)
 351   write(errorMessages, messageFile)
 352 }
 353
 354