R/combine_populations.r

   1 #formats and combines phenotype (of a single trait)
   2 #and genotype datasets of multiple
   3 #training populations
   4
   5 options(echo = FALSE)
   6
   7 library(stats)
   8 library(stringr)
   9 library(imputation)
  10 library(plyr)
  11
  12 allArgs <- commandArgs()
  13
  14 inFile <- grep("input_files",
  15                allArgs,
  16                ignore.case = TRUE,
  17                perl = TRUE,
  18                value = TRUE
  19                )
  20
  21 outFile <- grep("output_files",
  22                 allArgs,
  23                 ignore.case = TRUE,
  24                 perl = TRUE,
  25                 value = TRUE
  26                 )
  27
  28 outFiles <- scan(outFile,
  29                  what = "character"
  30                  )
  31 print(outFiles)
  32 combinedGenoFile <- grep("genotype_data",
  33                          outFiles,
  34                          ignore.case = TRUE,
  35                          fixed = FALSE,
  36                          value = TRUE
  37                          )
  38
  39 combinedPhenoFile <- grep("phenotype_data",
  40                           outFiles,
  41                           ignore.case = TRUE,
  42                           fixed = FALSE,
  43                           value = TRUE
  44                           )
  45
  46 inFiles <- scan(inFile,
  47                 what = "character"
  48                 )
  49 print(inFiles)
  50
  51 traitFile <- grep("trait_",
  52                   inFiles,
  53                   ignore.case = TRUE,
  54                   fixed = FALSE,
  55                   value = TRUE
  56                   )
  57
  58 trait <- scan(traitFile,
  59               what = "character",
  60               )
  61 print(trait)
  62
  63 traitInfo<-strsplit(trait, "\t");
  64 traitId<-traitInfo[[1]]
  65 traitName<-traitInfo[[2]]
  66
  67 #extract trait phenotype data from all populations
  68 #and combine them into one dataset
  69
  70 allPhenoFiles <- grep("phenotype_data",
  71                   inFiles,
  72                   ignore.case = TRUE,
  73                   fixed = FALSE,
  74                   value = TRUE
  75                   )
  76 print(allPhenoFiles)
  77
  78 allGenoFiles <- grep("genotype_data",
  79                   inFiles,
  80                   ignore.case = TRUE,
  81                   fixed = FALSE,
  82                   value = TRUE
  83                   )
  84
  85
  86 print(allGenoFiles)
  87
  88 popsPhenoSize     <- length(allPhenoFiles)
  89 popsGenoSize      <- length(allGenoFiles)
  90 popIds            <- c()
  91 combinedPhenoPops <- c()
  92
  93 for (i in 1:popsPhenoSize)
  94   {
  95     popId <- str_extract(allPhenoFiles[[i]], "\\d+")
  96     popIds <- append(popIds, popId)
  97
  98     print(popId)
  99     phenoData <- read.table(allPhenoFiles[[i]],
 100                             header = TRUE,
 101                             row.names = 1,
 102                             sep = "\t",
 103                             na.strings = c("NA", " ", "--", "-"),
 104                             dec = "."
 105                            )
 106
 107
 108     phenoTrait <- subset(phenoData,
 109                          select = c("object_name", "stock_id", traitName)
 110                          )
 111
 112     if (sum(is.na(phenoTrait)) > 0)
 113       {
 114         print("sum of pheno missing values")
 115         print(sum(is.na(phenoTrait)))
 116
 117         #fill in for missing data with mean value
 118         phenoTrait[, traitName]  <- replace (phenoTrait[, traitName],
 119                                              is.na(phenoTrait[, traitName]),
 120                                              mean(phenoTrait[, traitName], na.rm =TRUE)
 121                                             )
 122
 123        #calculate mean of reps/plots of the same accession and
 124        #create new df with the accession means
 125         phenoTrait$stock_id <- NULL
 126         phenoTrait   <- phenoTrait[order(row.names(phenoTrait)), ]
 127
 128         print('phenotyped lines before averaging')
 129         print(length(row.names(phenoTrait)))
 130
 131         phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
 132
 133         print('phenotyped lines after averaging')
 134         print(length(row.names(phenoTrait)))
 135
 136         row.names(phenoTrait) <- phenoTrait[, 1]
 137         phenoTrait[, 1] <- NULL
 138
 139         phenoTrait <- round(phenoTrait, digits = 2)
 140
 141       } else {
 142       print ('No missing data')
 143       phenoTrait$stock_id <- NULL
 144       phenoTrait   <- phenoTrait[order(row.names(phenoTrait)), ]
 145
 146       print('phenotyped lines before averaging')
 147       print(length(row.names(phenoTrait)))
 148
 149       phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
 150
 151       print('phenotyped lines after averaging')
 152       print(length(row.names(phenoTrait)))
 153
 154       row.names(phenoTrait) <- phenoTrait[, 1]
 155       phenoTrait[, 1] <- NULL
 156
 157       phenoTrait <- round(phenoTrait, digits = 2)
 158
 159     }
 160
 161     newTraitName = paste(traitName, popId, sep = "_")
 162     colnames(phenoTrait)[1] <- newTraitName
 163
 164     if (i == 1 )
 165       {
 166         print('no need to combine, yet')
 167         combinedPhenoPops <- phenoTrait
 168
 169       } else {
 170       print('combining...')
 171       combinedPhenoPops <- merge(combinedPhenoPops, phenoTrait,
 172                             by = 0,
 173                             all=TRUE,
 174                             )
 175
 176       rownames(combinedPhenoPops) <- combinedPhenoPops[, 1]
 177       combinedPhenoPops$Row.names <- NULL
 178
 179     }
 180 }
 181
 182 #fill in missing data in combined phenotype dataset
 183 #using row means
 184 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
 185 combinedPhenoPops <- as.matrix(combinedPhenoPops)
 186 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
 187 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
 188 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
 189 #combinedPhenoPops<-ddply(combinedPhenoPops, by=0, colwise(mean))
 190 #message("combined total number of stocks in phenotype dataset (after averaging): ", length(rownames(combinedPhenoPops)))
 191
 192 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
 193                                        1,
 194                                        function(x)
 195                                        { mean(x) }
 196                                        ),
 197                                  digits = 2
 198                                  )
 199
 200 print(combinedPhenoPops[1:30, ])
 201
 202 markersList <- c()
 203 combinedGenoPops <- c()
 204 for (i in 1:popsGenoSize)
 205   {
 206     popId <- str_extract(allGenoFiles[[i]], "\\d+")
 207     popIds <- append(popIds, popId)
 208
 209     print(popId)
 210     genoData <- read.table(allGenoFiles[[i]],
 211                             header = TRUE,
 212                             row.names = 1,
 213                             sep = "\t",
 214                             na.strings = c("NA", " ", "--", "-"),
 215                             dec = "."
 216                            )
 217
 218     popMarkers <- colnames(genoData)
 219     message("No of markers from population ", popId, ": ", length(popMarkers))
 220     #print(popMarkers)
 221
 222     if (sum(is.na(genoData)) > 0)
 223       {
 224         print("sum of geno missing values")
 225         print(sum(is.na(genoData)))
 226
 227         #impute missing genotypes
 228         genoData <-kNNImpute(genoData, 10)
 229         genoData <-as.data.frame(genoData)
 230
 231         #extract columns with imputed values
 232         genoData <- subset(genoData,
 233                                 select = grep("^x", names(genoData))
 234                                 )
 235
 236         #remove prefix 'x.' from imputed columns
 237         print(genoData[1:50, 1:4])
 238         names(genoData) <- sub("x.", "", names(genoData))
 239
 240         genoData <- round(genoData, digits = 0)
 241         message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
 242       }
 243
 244     if (i == 1 )
 245       {
 246         print('no need to combine, yet')
 247         combinedGenoPops <- genoData
 248
 249       } else {
 250         print('combining genotype datasets...')
 251         combinedGenoPops <-rbind(combinedGenoPops, genoData)
 252       }
 253
 254
 255   }
 256 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
 257 #discard duplicate clones
 258 combinedGenoPops <- unique(combinedGenoPops)
 259 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
 260
 261 message("writing data into files...")
 262 if(length(combinedPhenoFile) != 0 )
 263   {
 264       write.table(combinedPhenoPops,
 265                   file = combinedPhenoFile,
 266                   sep = "\t",
 267                   quote = FALSE,
 268                   col.names = NA,
 269                   )
 270   }
 271
 272 if(length(combinedGenoFile) != 0 )
 273   {
 274       write.table(combinedGenoPops,
 275                   file = combinedGenoFile,
 276                   sep = "\t",
 277                   quote = FALSE,
 278                   col.names = NA,
 279                   )
 280   }
 281
 282 q(save = "no", runLast = FALSE)