Added eval; site now shows clean dataset missing message instead of server error...
[sgn.git] / R / stability / ammi_script.R
blob529d6103bfdb226507e4a7e93941ab8c822ca044
2 library("methods")
3 library("agricolae")
4 library("dplyr")
5 library("tidyr")
8 set.seed(54326)
10 ##### Getting data #####
11 args = commandArgs(trailingOnly = TRUE)
13 pheno <- read.table(args[1], sep = "\t", header = TRUE)
14 study_trait <- args[2]
15 imputPheno <- args[3]
16 stability_method <- args[4]
17 jsonFile <- args[5]
18 graphFile <- args[6]
19 messageFile <- args[7]
20 jsonSummary <- args[8]
23 study_trait <- gsub("\\."," ",study_trait)
25 #Making names standard
26 names <- colnames(pheno)
27 new_names <- gsub(".CO.*","", names)
28 new_names <- gsub("\\."," ",new_names)
29 colnames(pheno) <- new_names
31 ## Function for pheno imputation
32 pheno_imputation <- function(inData){
33   library(mice)
34   
35   dat_imput <- spread(inData, Env, Yield)
36   colnames(dat_imput) <- gsub(" ", "", colnames(dat_imput))
37   #pmm is is the predictive mean matching
38   pre_imputed <- mice(dat_imput,  method = "mean", m = 3, maxit = 10)
39   final_imputed <- complete(pre_imputed, 1)
40   
41   final_imputed <- gather(final_imputed, "Env", "Yield", -c("Gen","Rep"))
42   cat("Imput ok!")
43   return(final_imputed)
46 imputation_accuracy <- function(checkDF, fMissing){
47   checkDF$Gen <- as.character(checkDF$Gen)
48   checkDF$Env <- as.character(checkDF$Env)
49   
50   realData <- pheno_imputation(checkDF)
51   
52   # Subseting to check imputation accuracy
53   realData <- realData[order(realData$Gen, realData$Env), ]
54   data_missing <- sample(rownames(realData), nrow(realData)*fMissing)
55   testindDF <- realData
56   testindDF <- testindDF[order(testindDF$Gen, testindDF$Env), ]
57   testindDF[rownames(testindDF)%in% data_missing, "Yield"] <- NA
58   
59   testindDF$Gen <- as.factor(testindDF$Gen)
60   testindDF$Env <- as.factor(testindDF$Env)
61   testindDF$Rep <- as.factor(testindDF$Rep)
62   
63   imputed_test <- pheno_imputation(testindDF)
64   imputed_test <- imputed_test[order(imputed_test$Gen, imputed_test$Env), ]
65   cat("Imput Acc ok!\n")
66   corrImput <- cor(realData$Yield,imputed_test$Yield, use = "complete")
67   return(corrImput)
68   
73 # Setting dataframe to required format
74 pheno <- pheno[,colnames(pheno) %in% c("locationName", "germplasmName", "replicate", "entryType", study_trait[1])]
78 ########################################
79 #           Quality Control            #
80 # Checking conditions to run analysis  #
81 #                                      #
82 ########################################
84 # Getting locations, accessions and reps
85 accessions_sel <- unique(pheno$germplasmName)
86 # Some breeding programs are using accession named Filler
87 accessions_sel <- accessions_sel[!accessions_sel == "Filler" | !accessions_sel == "filler"]
89 # Taking test accessions 
90 dat <- pheno %>% dplyr::select(germplasmName, locationName, replicate, entryType, trait=study_trait[1])
91 dat <- dat[dat$entryType == "test",]
92 nLoc <- length(unique(dat$locationName))
93 nReps <- max(dat$replicate)
95 # Number of reps mat be different for checks
96 # Fixing here
97 pheno <- pheno[pheno$replicate <= nReps & pheno$germplasmName %in% accessions_sel,]
99 # Preparing dataset for for analysis
100 dat1 <- pheno %>% dplyr::select(germplasmName, locationName, replicate, trait=study_trait[1])
102 fMissing <- nrow(dat1[is.na(dat1$trait),])/nrow(dat1)
103 message2 <- paste0("The frequency of missing data is too high. Please, check the dataset.")
105 summaryData <- data.frame(mean = mean(dat1[,4], na.rm=TRUE),
106                                                                                                         min = min(dat1[,4], na.rm=TRUE),
107                                                                                                         maxV = max(dat1[,4], na.rm=TRUE),
108                                                                                                         sdV = sd(dat1[,4], na.rm=TRUE),
109                                                                                                         missing = fMissing
110                                                                                                         )
112 myJson <- jsonlite::toJSON(summaryData)
113 jsonlite::write_json(myJson, jsonSummary)
116 find_duplications <- function(inData){
117   # Step 1: Identify duplicate rows based on the combination of four columns
118   duplicated_rows <- duplicated(inData[, c("Gen", "Env", "Rep")])
119   
120   # Step 2: Calculate the average of 'Yield' for each unique combination of the four columns
121   averages <- aggregate(Yield ~ Gen + Env + Rep, data = dat1, FUN = mean)
122   
123   # Step 3: Replace the duplicated rows with the calculated average
124   pheno_unique <- inData[!duplicated_rows, ]  # Keep only unique rows
125   pheno_unique <- merge(pheno_unique, averages, by = c("Gen", "Env", "Rep"), all.x = TRUE)
126   
127   # If there are any missing values (for rows that were not duplicated), fill them with the original 'Yield'
128   pheno_unique$Yield <- ifelse(is.na(pheno_unique$Yield.y), pheno_unique$Yield.x, pheno_unique$Yield.y)
129   
130   # Remove unnecessary columns (if needed)
131   pheno_unique <- pheno_unique[, !(names(pheno_unique) %in% c("Yield.x", "Yield.y"))]
133   return(pheno_unique)
134   
135   # View the resulting data frame
139 dat1$germplasmName <- as.factor(dat1$germplasmName)
140 dat1$locationName <- as.factor(dat1$locationName)
141 dat1$replicate <- as.factor(dat1$replicate)
142 dat1$trait <- as.double(dat1$trait)
144 # Running model from stability package
145 # Assuming dat1 is your data frame
146 dat1 <- dat1[order(dat1$germplasmName, dat1$locationName), ]
147 colnames(dat1) <- c("Gen", "Env", "Rep", "Yield")
150 dat2 <- find_duplications(dat1)
152 if(imputPheno == "imput_yes"){
153   testCor <- imputation_accuracy(dat2, fMissing)
154   dat2 <- pheno_imputation(dat2)
155   cat("The imputation accuracy is: ", sprintf("%.3f",testCor), "\n")
158 # testing if all accessions have the same number of reps in all locations
159 dat <- na.omit(dat2)
160 replicate_df <- data.frame(replicate = tapply(dat$Yield, list(dat$Gen, dat$Env), length))
161 replicate_df$germplasmName <- rownames(replicate_df)
162 gathered_df <- gather(replicate_df, key = "replicate_locations", value = "value", -germplasmName)
163 repAcc <- unique(gathered_df$value)
164 message1 <- paste0("Please, check dataset for accessions per locations and reps.")
166 # Saving error message
167 errorMessages <- c()
168 if(fMissing >= 0.4){
169         errorMessages <- append(errorMessages, message2)
170 }else if(length(repAcc)>1){
171   errorMessages <- append(errorMessages, message1)
174 # Setting second rep to compare augmented design with 1 rep per location
175 if(nReps == 1 && stability_method == "ammi"){
176   dat2.1 <- dat2
177   dat2.1$Rep <- as.numeric(dat2.1$Rep)
178   dat2.1$Rep <- 2
179   dat2$Rep <- as.numeric(dat2$Rep)
180   dat2 <- rbind(dat2,dat2.1)
181   dat2$Rep <- as.factor(dat2$Rep)
182 }else if (nReps == 1 && stability_method == "gge"){
183         errorMessages <- append(errorMessages, "The number of replication must be greater than 1 for gge.")
187 if(stability_method=="ammi" && length(errorMessages) == 0 ){
188         library("agricolae")
190         model <- NULL
191   # Running AMMI model
192         tryCatch({
193           # Attempt to execute the code
194           model<- with(dat2,AMMI(Env, Gen, Rep, Yield, console=FALSE))
195         }, error = function(e) {
196           # Handle the error by printing a message
197           cat("An error occurred:", conditionMessage(e), "\n")
198           append(errorMessages, paste0("An error occurred:", conditionMessage(e)))
199         })
201         index<-index.AMMI(model)
202         index <- tibble::rownames_to_column(index, "Accession")
203         indexDF <- data.frame(Accession = index$Accession, ASV = index$ASV, Rank = index$rASV)
205         # Adding a column to scale ASV
206         # This column will be used to plot stability lines
207         slopes <- data.frame(Accession = index$Accession, slope = index$ASV, scaled= "NA")
208         slopes$scaled <-as.vector(scale(slopes$slope, center = T, scale = F))
209         indexDF <- left_join(indexDF, slopes, by="Accession")
211         indexDF <- as.data.frame(indexDF)
212         indexDF <- indexDF[order(indexDF[,3]),]
214         ## Preparing table with GxE effects
215         list_data <- list(model$genXenv)
216         dataGraphic <- data.frame(plyr::ldply(list_data))
217         rownames(dataGraphic) <- model$means$GEN[1:(nrow(dataGraphic))]
218         dataGraphic <- tibble::rownames_to_column(dataGraphic, "Accession")
220         myMeans <- data.frame(Accession = model$means$GEN, location = model$means$ENV, means = model$means$Yield)
221         myMeans$average <- myMeans$means
222         myMeans$means <- as.vector(scale(myMeans$means, center = T, scale = FALSE))
223         meanDF <- data.frame(tapply(myMeans$means, myMeans$Accession, mean))
224         meanDF$Accession <- rownames(meanDF)
225         colnames(meanDF) <- c("means", "Accession")
226         meanDF <- left_join(meanDF, slopes, by = "Accession") %>% select(Accession, means, slope)
227         meanDF$start <- 0
228         meanDF$end <- length(unique(pheno$locationName)) 
229         minDF <- data.frame(tapply(myMeans$means, myMeans$Accession, min))
230         minDF$Accession <- rownames(minDF)
231         colnames(minDF) <- c("minimum", "Accession")
233         meanDFF <- left_join(meanDF, minDF, by = "Accession")
234         meanDFF$value <- meanDFF$minimum*meanDF$slope
235         meanDFF <- gather(meanDFF, key = "T", value = "X",start,end) %>% select(Accession, minimum, value, X) %>% arrange(Accession)
236         meanDFF <- left_join(meanDFF, slopes, by = "Accession")
238         for ( i in 1:nrow(meanDFF)){
239           if(meanDFF$X[i] == 0){
240             meanDFF$value[i] <- 0 
241           }else{
242             meanDFF$value[i] <- (-1)*meanDFF$slope[i]
243           }
244         }
246         scaled_values <- (meanDFF$value - min(meanDFF$value)) / (max(meanDFF$value) - min(meanDFF$value))
247         meanDFF$value <- scaled_values
249         myData <- data.frame(pivot_longer(dataGraphic, 2:ncol(dataGraphic), names_to = "location", values_to = "Effect"))
250         myData$location <- gsub("\\."," ", myData$location)
252         preFinal <- left_join(myData, myMeans, by = c("Accession", "location"))
253         selLocations <- data.frame(locName = unique(preFinal$location), locNumber = c(1:length(unique(preFinal$location))))
254         preFinal <- left_join(preFinal, selLocations, by = c("location"="locName")) 
256         finalDF <- left_join(preFinal, indexDF, by="Accession") %>% dplyr::select(Accession, location, Effect, Rank, means, slope, scaled)
257         finalDF <- finalDF[order(finalDF$Rank, -finalDF$Effect),]
258         finalDF$Effect <- sprintf("%.3f", finalDF$Effect)
259         finalDF$means <- sprintf("%.3f", finalDF$means)
260         finalDF$slope <- sprintf("%.3f", finalDF$slope)
261         finalDF$scaled <- sprintf("%.3f", finalDF$scaled)
263         graphicJson <- jsonlite::toJSON(meanDFF)
264         jsonlite::write_json(graphicJson, graphFile)
266         # Parsing files to combine Accession, locations, effects, rank, averages, slopes
267         preFinal <- left_join(myData, myMeans, by = c("Accession", "location"))
268         finalDF <- left_join(preFinal, indexDF, by="Accession") %>% dplyr::select(Accession, location, Effect, Rank, average, slope, scaled)
269         colnames(finalDF)[5] <- "means"
270         if(imputPheno == "imput_yes"){finalDF$imputAcc <- testCor}
271         finalDF <- finalDF[order(finalDF$Rank, -finalDF$Effect),]
273         myJson <- jsonlite::toJSON(finalDF)
274         jsonlite::write_json(myJson, jsonFile)
276         }else if( stability_method=="gge" && length(errorMessages) == 0){
277                 library("stability")
278                 cat("running gge", "\n")
279                 # dat2 <- pheno_imputation(dat2)
280     
281     dat2$Env <- as.character(dat2$Env)
282                 effectsDF <- NULL
283                 # Use tryCatch to handle potential errors
284                 tryCatch({
285                     # Attempt to execute the code
286                     effectsDF <- stability::ge_means(.data = dat2, .y = Yield, .gen = Gen, .env = Env)
287                 }, error = function(e) {
288                     # Handle the error by printing a message
289                     cat("An error occurred:", conditionMessage(e), "\n")
290                     append(errorMessages, paste0("An error occurred:", conditionMessage(e)))
291                 })
292                 
293                 effectsDF <-  stability::ge_means(.data=dat2, .y= Yield, .gen=Gen, .env=Env)
295                 test <- data.frame(effectsDF$ge_ranks)
296                 test$location <- rownames(test)
297                 gathered_df <- gather(test, key = "rank", value = "genotypes", -location)
298                 gathered_df$rank <- gsub("X", "", gathered_df$rank)
299                 gathered_df$rank <- as.numeric(gathered_df$rank)
301                 test1 <- data.frame(effectsDF$ge_means)
302                 gathered_mean <- gather(test1, key = "locations", value = "means", -Gen)
305                 gathered_df$track <- paste0(gathered_df$genotypes,"_",gathered_df$location)
306                 gathered_mean$track <- paste0(gathered_mean$Gen,"_",gathered_mean$locations)
308                 gathered_df$track <- gsub(" ","",gathered_df$track)
309                 gathered_df$track <- gsub("\\.","",gathered_df$track)
311                 gathered_mean$track <- gsub(" ","",gathered_mean$track)
312                 gathered_mean$track <- gsub("\\.","",gathered_mean$track)
315                 finalDF <- left_join(gathered_df, gathered_mean, by = "track") %>% dplyr::select(
316                   genotypes, locations, means, rank
317                 )
318                 finalDF$locations <- gsub("\\."," ",finalDF$locations)
320                 rankDF <- data.frame(sumRank=tapply(finalDF$rank, finalDF$genotypes, sum))
321                 rankDF$genotypes <- rownames(rankDF)
322                 rankDF$genotypesRank <- rank(rankDF$sumRank, ties.method = "min")
323                 finalDF <- left_join(finalDF, rankDF, by = "genotypes") %>% dplyr::select(genotypes, locations, means, rank, genotypesRank) 
324                 colnames(finalDF) <- c("genotypes","locations", "means", "locationRank","genotypeRank")
325                 finalDF$means <- sprintf("%.2f", finalDF$means)
327                 colnames(finalDF) <- c("Accession", "location", "means", "locationRank", "genotypeRank")
328                 if(imputPheno == "imput_yes"){finalDF$imputAcc <- testCor}
329                 # Saving Json
330                 myJson <- jsonlite::toJSON(finalDF)
331                 jsonlite::write_json(myJson, jsonFile)
334                 ## Preparing Graphic dataset
335                 prepGraph <- as.data.frame(tapply(dat2$Yield, list(dat2$Gen, dat2$Env), mean))
336                 prepGraph <- tibble::rownames_to_column(prepGraph, var="Accession")
337                 meanGraph <- gather(prepGraph, key = "location", value = "mean", -Accession)
338                 prepGraph2 <- as.data.frame(tapply(dat2$Yield, list(dat2$Gen, dat2$Env), sd))
339                 prepGraph2 <- tibble::rownames_to_column(prepGraph2, var="Accession")
340                 sdGraph <- gather(prepGraph2, key = "location", value = "sd", -Accession)
341                 finalGraph <- left_join(meanGraph, sdGraph, by=c("Accession","location"))
343                 graphicJson <- jsonlite::toJSON(finalGraph)
344                 jsonlite::write_json(graphicJson, graphFile)
346         }
348 if ( is.null(errorMessages) == F ) {
349   print(sprintf("Writing Error Messages to file: %s", messageFile))
350   print(errorMessages)
351   write(errorMessages, messageFile)