R/solGS/combine_populations.r

   1 #formats and combines phenotype (of a single trait)
   2 #and genotype datasets of multiple
   3 #populations
   4
   5 options(echo = FALSE)
   6
   7 library(stringr)
   8 library(randomForest)
   9 library(lme4)
  10 library(data.table)
  11 library(phenoAnalysis)
  12 library(dplyr)
  13 library(tibble)
  14 library(genoDataFilter)
  15
  16 allArgs <- commandArgs()
  17
  18 inFile <- grep("input_files",
  19                allArgs,
  20                ignore.case = TRUE,
  21                perl = TRUE,
  22                value = TRUE
  23                )
  24
  25 inFiles <- scan(inFile,
  26                 what = "character"
  27                 )
  28
  29 outFile <- grep("output_files",
  30                 allArgs,
  31                 ignore.case = TRUE,
  32                 perl = TRUE,
  33                 value = TRUE
  34                 )
  35
  36 outFiles <- scan(outFile,
  37                  what = "character"
  38                  )
  39
  40 combinedGenoFile <- grep("genotype_data",
  41                          outFiles,
  42                          ignore.case = TRUE,
  43                          fixed = FALSE,
  44                          value = TRUE
  45                          )
  46
  47 combinedPhenoFile <- grep("model_phenodata",
  48                           outFiles,
  49                           ignore.case = TRUE,
  50                           fixed = FALSE,
  51                           value = TRUE
  52                           )
  53
  54 analysisReportFile <- grep("_report_",
  55                           outFiles,
  56                           ignore.case = TRUE,
  57                           fixed = FALSE,
  58                           value = TRUE
  59                           )
  60
  61 ## traitFile <- grep("model_phenodata",
  62 ##                   inFiles,
  63 ##                   ignore.case = TRUE,
  64 ##                   fixed = FALSE,
  65 ##                   value = TRUE
  66 ##                   )
  67
  68 ## trait <- scan(traitFile,
  69 ##               what = "character",
  70 ##               )
  71
  72 ## traitInfo <- strsplit(trait, "\t");
  73 ## traitId   <- traitInfo[[1]]
  74 ## traitName <- traitInfo[[2]]
  75
  76 modelInfoFile  <- grep("model_info", inFiles, value = TRUE)
  77 message('model_info_file ', modelInfoFile)
  78
  79 traitRawPhenoFile <- grep('trait_raw_phenodata', outFiles, value = TRUE)
  80
  81 modelInfo  <- read.table(modelInfoFile,
  82                          header=TRUE, sep ="\t",
  83                          as.is = c('Value'))
  84
  85 modelInfo  <- column_to_rownames(modelInfo, var="Name")
  86 traitId    <- modelInfo["trait_id", 1]
  87 traitAbbr  <- modelInfo["trait_abbr", 1]
  88 modelId    <- modelInfo["model_id", 1]
  89 protocolId <- modelInfo["protocol_id", 1]
  90
  91 message('class ', class(traitAbbr))
  92 message('trait_id ', traitId)
  93 message('trait_abbr ', traitAbbr)
  94 message('protocol_id ', protocolId)
  95 message('model_id ', modelId)
  96
  97 #extract trait phenotype data from all populations
  98 #and combine them into one dataset
  99
 100 allPhenoFiles <- grep("phenotype_data",
 101                   inFiles,
 102                   ignore.case = TRUE,
 103                   fixed = FALSE,
 104                   value = TRUE
 105                   )
 106 message("phenotype files: ", allPhenoFiles)
 107
 108 allGenoFiles <- grep("genotype_data",
 109                   inFiles,
 110                   ignore.case = TRUE,
 111                   fixed = FALSE,
 112                   value = TRUE
 113                   )
 114
 115 popIds            <- c()
 116 combinedPhenoPops <- c()
 117 cnt               <- 0
 118 traitRawPhenoData <- c()
 119
 120 logHeading <- paste0("Genomic Prediction Analysis Log for ", traitAbbr,  ".\n")
 121 logHeading <- append(logHeading,  paste0("Date: ", format(Sys.time(), "%d %b %Y %H:%M"), "\n\n\n"))
 122 logHeading <- format(logHeading, width=80, justify="c")
 123
 124 anovaLog <- paste0("#Preprocessing training population phenotype data.\n\n")
 125 anovaLog <- append(anovaLog, paste0("This training population is composed of ", length(allPhenoFiles), " trials. Adjusted means or arithmetic trait values for each trial will be calculated as described below. After that, single trait values for the entries will be calculated by averaging trait adjusted means/arithmetic averages across the trials.\n\n"))
 126
 127 for (popPhenoFile in allPhenoFiles) {
 128
 129      cnt <- cnt + 1
 130
 131     phenoData <- fread(popPhenoFile,
 132                        header = TRUE,
 133                        sep="\t",
 134                        na.strings = c("NA", "", "--", "-", "."))
 135
 136      phenoData <- data.frame(phenoData)
 137
 138      meansResult <- getAdjMeans(phenoData,
 139                                traitName = traitAbbr,
 140                                calcAverages = TRUE,
 141                                logReturn = TRUE)
 142
 143    anovaLog <- append(anovaLog, meansResult$log)
 144    phenoTrait <- meansResult$adjMeans
 145
 146     keepMetaCols <- c('observationUnitName', 'germplasmName', 'studyDbId', 'locationName',
 147                   'studyYear', 'replicate', 'blockNumber')
 148
 149     trialTraitRawPhenoData <- phenoData %>%
 150                                         select(c(keepMetaCols, traitAbbr))
 151
 152      popIdFile <- basename(popPhenoFile)
 153      popId     <- str_extract(popIdFile, "\\d+")
 154      popIds    <- c(popIds, popId)
 155
 156      newTraitName <- paste(traitAbbr, popId, sep = "_")
 157      colnames(phenoTrait)[2] <- newTraitName
 158
 159      if (cnt == 1 ) {
 160          combinedPhenoPops <- phenoTrait
 161          traitRawPhenoData <- trialTraitRawPhenoData
 162      } else {
 163            combinedPhenoPops <- full_join(combinedPhenoPops, phenoTrait, by='germplasmName')
 164            traitRawPhenoData <- bind_rows(traitRawPhenoData, trialTraitRawPhenoData)
 165      }
 166  }
 167
 168 combinedPhenoPops <- column_to_rownames(combinedPhenoPops, var='germplasmName')
 169
 170 # #fill in missing data in combined phenotype dataset
 171 # #using row means
 172 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
 173 combinedPhenoPops <- as.matrix(combinedPhenoPops)
 174 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
 175 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
 176
 177 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
 178
 179 combinedPhenoPops$Average<-round(apply(combinedPhenoPops, 1, function(x) { mean(x) }), digits = 2)
 180
 181 combinedGenoPops <- c()
 182
 183 if (file.size(combinedGenoFile) < 100 ) {
 184     combinedGenoPops       <- combineGenoData(allGenoFiles)
 185     combinedGenoPops$trial <- NULL
 186     combinedGenoPops       <- combinedGenoPops[order(rownames(combinedGenoPops)), ]
 187 }
 188
 189 message("writing data to files...")
 190 #if(length(combinedPhenoFile) != 0 )
 191 #  {
 192       fwrite(combinedPhenoPops,
 193                   file = combinedPhenoFile,
 194                   sep = "\t",
 195                   quote = FALSE,
 196                   row.names = TRUE,
 197                   )
 198 #  }
 199 if (!is.null(traitRawPhenoData) & length(traitRawPhenoFile) != 0) {
 200
 201     fwrite(traitRawPhenoData,
 202            file  = traitRawPhenoFile,
 203            row.names = FALSE,
 204            sep   = "\t",
 205            na = 'NA',
 206            quote = FALSE,
 207            )
 208 }
 209 if(!is.null(combinedGenoPops)) {
 210     fwrite(combinedGenoPops,
 211            file = combinedGenoFile,
 212            sep = "\t",
 213            quote = FALSE,
 214            row.names = TRUE,
 215            )
 216  }
 217
 218 cat(anovaLog, fill = TRUE,  file = analysisReportFile, append=FALSE)
 219
 220 q(save = "no", runLast = FALSE)