R/phenotypic_correlation.r

   1  #SYNOPSIS
   2
   3  #Commands for running phenotypic correlation analysis.
   4  #Correlation coefficients are stored in tabular and JSON formats.
   5
   6  #AUTHOR
   7  # Isaak Y Tecle (iyt2@cornell.edu)
   8
   9
  10 options(echo = FALSE)
  11
  12 library(gplots)
  13 library(ltm)
  14 library(plyr)
  15 library(rjson)
  16 library(nlme)
  17
  18 allargs<-commandArgs()
  19
  20 refererQtl <- grep("qtl",
  21                    allargs,
  22                    ignore.case=TRUE,
  23                    perl=TRUE,
  24                    value=TRUE
  25                    )
  26
  27 phenoDataFile <- grep("phenotype_data",
  28                       allargs,
  29                       ignore.case=TRUE,
  30                       perl=TRUE,
  31                       value=TRUE
  32                       )
  33
  34 correCoefficientsFile <- grep("corre_coefficients_table",
  35                               allargs,
  36                               ignore.case=TRUE,
  37                               perl=TRUE,
  38                               value=TRUE
  39                               )
  40
  41 correCoefficientsJsonFile <- grep("corre_coefficients_json",
  42                                   allargs,
  43                                   ignore.case=TRUE,
  44                                   perl=TRUE,
  45                                   value=TRUE
  46                                   )
  47
  48 phenoData <- c()
  49
  50 if ( length(refererQtl) != 0 ) {
  51
  52   phenoData <- read.csv(phenoDataFile,
  53                         header=TRUE,
  54                         row.names = NULL,
  55                         dec=".",
  56                         sep=",",
  57                         na.strings=c("NA", "-", " ", ".")
  58                         )
  59
  60 } else {
  61   phenoData <- read.table(phenoDataFile,
  62                           header = TRUE,
  63                           row.names = NULL,
  64                           sep = "\t",
  65                           na.strings = c("NA", " ", "--", "-", "."),
  66                           dec = "."
  67                           )
  68
  69 }
  70
  71 formattedPhenoData <- c()
  72 allTraitNames      <- c()
  73
  74 if (length(refererQtl) != 0) {
  75
  76   allNames      <- names(phenoData)
  77   nonTraitNames <- c("ID")
  78
  79   allTraitNames <- allNames[! allNames %in% nonTraitNames]
  80
  81 } else {
  82   dropColumns <- c("uniquename", "stock_name")
  83   phenoData   <- phenoData[,!(names(phenoData) %in% dropColumns)]
  84
  85   allNames      <- names(phenoData)
  86   nonTraitNames <- c("object_name", "object_id", "stock_id", "design", "block", "replicate")
  87
  88   allTraitNames <- allNames[! allNames %in% nonTraitNames]
  89
  90 }
  91
  92 for (i in allTraitNames) {
  93   if (all(is.nan(phenoData$i))) {
  94     phenoData[, i] <- sapply(phenoData[, i], function(x) ifelse(is.numeric(x), x, NA))
  95   }
  96 }
  97
  98 phenoData <- phenoData[, colSums(is.na(phenoData)) < nrow(phenoData)]
  99
 100 trait <- c()
 101 cnt   <- 0
 102
 103 if (length(refererQtl) == 0) {
 104   for (i in allTraitNames) {
 105     cnt   <- cnt + 1
 106     trait <- i
 107
 108     phenoTrait         <- c()
 109     experimentalDesign <- c()
 110
 111     if ('design' %in% colnames(phenoData)) {
 112
 113     phenoTrait  <- subset(phenoData,
 114                           select = c("object_name", "object_id", "design", "block", "replicate", trait)
 115                           )
 116
 117     experimentalDesign <- phenoTrait[2, 'design']
 118
 119     if (is.na(experimentalDesign) == TRUE) {
 120       experimentalDesign <- c('No Design')
 121     }
 122
 123   } else {
 124     experimentalDesign <- c('No Design')
 125   }
 126
 127   if (experimentalDesign == 'augmented' || experimentalDesign == 'RCBD') {
 128
 129     message("experimental design: ", experimentalDesign)
 130
 131     augData <- subset(phenoTrait,
 132                         select = c("object_name", "object_id",  "block",  trait)
 133                         )
 134
 135     colnames(augData)[1] <- "genotypes"
 136     colnames(augData)[4] <- "trait"
 137
 138     ff <- trait ~ 0 + genotypes
 139
 140     model <- try(lme(ff,
 141                      data=augData,
 142                      random = ~1|block,
 143                      method="REML",
 144                      na.action = na.omit
 145                      ))
 146
 147     if (class(model) != "try-error") {
 148       adjMeans <- data.matrix(fixed.effects(model))
 149
 150       colnames(adjMeans) <- trait
 151
 152       nn <- gsub('genotypes', '', rownames(adjMeans))
 153       rownames(adjMeans) <- nn
 154       adjMeans <- round(adjMeans, digits = 2)
 155
 156       phenoTrait <- data.frame(adjMeans)
 157
 158       colnames(phenoTrait) <- trait
 159
 160       if(cnt == 1 ) {
 161         formattedPhenoData <- data.frame(adjMeans)
 162       } else {
 163         formattedPhenoData <-  merge(formattedPhenoData, phenoTrait, by=0, all=TRUE)
 164         row.names(formattedPhenoData) <- formattedPhenoData[, 1]
 165         formattedPhenoData[, 1] <- NULL
 166       }
 167     }
 168
 169   } else if (experimentalDesign == 'alpha') {
 170
 171     trait <- i
 172     alphaData <- subset(phenoData,
 173                           select = c("object_name", "object_id","block", "replicate", trait)
 174                           )
 175
 176     colnames(alphaData)[2] <- "genotypes"
 177     colnames(alphaData)[5] <- "trait"
 178
 179     ff <- trait ~ 0 + genotypes
 180
 181     model <- try(lme(ff,
 182                      data = alphaData,
 183                      random = ~1|replicate/block,
 184                      method = "REML",
 185                      na.action = na.omit
 186                      ))
 187
 188     if (class(model) != "try-error") {
 189       adjMeans <- data.matrix(fixed.effects(model))
 190       colnames(adjMeans) <- trait
 191
 192       nn <- gsub('genotypes', '', rownames(adjMeans))
 193       rownames(adjMeans) <- nn
 194       adjMeans <- round(adjMeans, digits = 2)
 195
 196       phenoTrait <- data.frame(adjMeans)
 197       colnames(phenoTrait) <- trait
 198
 199       if(cnt == 1 ) {
 200         formattedPhenoData <- data.frame(adjMeans)
 201       } else {
 202         formattedPhenoData <-  merge(formattedPhenoData, phenoTrait, by=0, all=TRUE)
 203         row.names(formattedPhenoData) <- formattedPhenoData[, 1]
 204         formattedPhenoData[, 1] <- NULL
 205       }
 206     }
 207
 208   } else {
 209     message("experimental design: ", experimentalDesign)
 210     message("GS stuff")
 211
 212     dropColumns <- c("object_id", "stock_id", "design",  "block", "replicate")
 213
 214     formattedPhenoData <- phenoData[, !(names(phenoData) %in% dropColumns)]
 215
 216     formattedPhenoData <- ddply(formattedPhenoData,
 217                                 "object_name",
 218                                 colwise(mean, na.rm=TRUE)
 219                                 )
 220
 221     row.names(formattedPhenoData) <- formattedPhenoData[, 1]
 222     formattedPhenoData[, 1] <- NULL
 223
 224   }
 225   }
 226
 227 } else {
 228   message("qtl stuff")
 229   formattedPhenoData <- ddply(phenoData,
 230                               "ID",
 231                               colwise(mean, na.rm=TRUE)
 232                               )
 233
 234   row.names(formattedPhenoData) <- formattedPhenoData[, 1]
 235   formattedPhenoData[, 1] <- NULL
 236
 237 }
 238
 239 formattedPhenoData <- round(formattedPhenoData,
 240                              digits = 2
 241                              )
 242
 243 coefpvalues <- rcor.test(formattedPhenoData,
 244                          method="pearson",
 245                          use="pairwise"
 246                          )
 247
 248
 249 coefficients <- coefpvalues$cor.mat
 250 allcordata   <- coefpvalues$cor.mat
 251 allcordata[lower.tri(allcordata)] <- coefpvalues$p.values[, 3]
 252 diag(allcordata) <- 1.00
 253
 254 pvalues <- as.matrix(allcordata)
 255
 256 pvalues <- round(pvalues,
 257                  digits=2
 258                  )
 259
 260 coefficients <- round(coefficients,
 261                       digits=3
 262                       )
 263
 264 allcordata <- round(allcordata,
 265                     digits=3
 266                     )
 267
 268 #remove rows and columns that are all "NA"
 269 if ( apply(coefficients,
 270            1,
 271            function(x)any(is.na(x))
 272            )
 273     ||
 274     apply(coefficients,
 275           2,
 276           function(x)any(is.na(x))
 277           )
 278     )
 279   {
 280
 281     coefficients<-coefficients[-which(apply(coefficients,
 282                                             1,
 283                                             function(x)all(is.na(x)))
 284                                       ),
 285                                -which(apply(coefficients,
 286                                             2,
 287                                             function(x)all(is.na(x)))
 288                                       )
 289                                ]
 290   }
 291
 292
 293 pvalues[upper.tri(pvalues)]<-NA
 294 coefficients[upper.tri(coefficients)]<-NA
 295
 296 coefficients2json <- function(mat){
 297     mat <- as.list(as.data.frame(t(mat)))
 298     names(mat) <- NULL
 299     toJSON(mat)
 300 }
 301
 302 traits <- colnames(coefficients)
 303
 304 correlationList <- list(
 305                    "traits"=toJSON(traits),
 306                    "coefficients"=coefficients2json(coefficients)
 307                    )
 308
 309 correlationJson <- paste("{",paste("\"", names(correlationList), "\":", correlationList, collapse=","), "}")
 310
 311 write.table(coefficients,
 312       file=correCoefficientsFile,
 313       col.names=TRUE,
 314       row.names=TRUE,
 315       quote=FALSE,
 316       dec="."
 317       )
 318
 319 write.table(correlationJson,
 320       file=correCoefficientsJsonFile,
 321       col.names=FALSE,
 322       row.names=FALSE,
 323       )
 324
 325 q(save = "no", runLast = FALSE)