R/solGS/correlation.r

   1  #SNOPSIS
   2
   3  #runs correlation analysis.
   4  #Correlation coeffiecients are stored in tabular and json formats
   5
   6  #AUTHOR
   7  # Isaak Y Tecle (iyt2@cornell.edu)
   8
   9
  10 options(echo = FALSE)
  11
  12 library(ltm)
  13 #library(rjson)
  14 library(jsonlite)
  15 library(data.table)
  16 library(phenoAnalysis)
  17 library(dplyr)
  18 library(tibble)
  19 #library(rbenchmark)
  20 library(methods)
  21
  22 allArgs <- commandArgs()
  23
  24 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
  25                     what = "character")
  26 inputFiles  <- scan(grep("input_files", allArgs, value = TRUE),
  27                     what = "character")
  28
  29 message('inputFiles: ', inputFiles)
  30
  31 correInputData <- c()
  32 correCoefsTableFile <- c()
  33 correCoefsJsonFile <- c()
  34
  35 if (any(grepl("phenotype_data", inputFiles))) {
  36   refererQtl <- grep("qtl", inputFiles, value=TRUE)
  37
  38   phenoDataFile      <- grep("\\/phenotype_data", inputFiles, value=TRUE)
  39   formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, fixed = FALSE, value = TRUE)
  40   metaFile       <-  grep("metadata", inputFiles, value=TRUE)
  41   correCoefsTableFile     <- grep("pheno_corr_table", outputFiles, value=TRUE)
  42   correCoefsJsonFile <- grep("pheno_corr_json", outputFiles, value=TRUE)
  43
  44   formattedPhenoData <- c()
  45   phenoData          <- c()
  46
  47   if ( length(refererQtl) != 0 ) {
  48     phenoDataFile      <- grep("\\/phenodata", inputFiles, value=TRUE)
  49     phenoData <- data.frame(fread(phenoDataFile,
  50           header=TRUE,
  51           sep=",",
  52           na.strings=c("NA", "-", " ", ".", "..")))
  53   }
  54
  55   metaData <- scan(metaFile, what="character")
  56
  57   allTraitNames <- c()
  58   nonTraitNames <- c()
  59   naTraitNames  <- c()
  60
  61   if (length(refererQtl) != 0) {
  62     allNames      <- names(phenoData)
  63     nonTraitNames <- c("ID")
  64     allTraitNames <- allNames[! allNames %in% nonTraitNames]
  65   }
  66
  67   correPhenoData <- c()
  68
  69   if (length(refererQtl) == 0  ) {
  70       averagedPhenoData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
  71       allNames <- names(averagedPhenoData)
  72       nonTraitNames <- metaData
  73       allTraitNames <- allNames[! allNames %in% nonTraitNames]
  74
  75     rownames(averagedPhenoData) <- NULL
  76     correPhenoData <- averagedPhenoData
  77   } else {
  78     message("qtl stuff")
  79     correPhenoData <- phenoData %>%
  80                           group_by(ID) %>%
  81                           summarise_if(is.numeric, mean, na.rm=TRUE) %>%
  82                           select(-ID) %>%
  83                           round(., 2) %>%
  84                           data.frame
  85
  86   }
  87
  88   correInputData <- correPhenoData
  89 print(head(correPhenoData))
  90
  91 } else if (any(grepl("combined_gebvs", inputFiles)) ||
  92               any(grepl("selection_index", inputFiles))) {
  93
  94     correCoefsTableFile <- grep("genetic_corr_table", outputFiles, value=TRUE)
  95     correCoefsJsonFile  <- grep("genetic_corr_json", outputFiles, value=TRUE)
  96     geneticDataFile <- grep("combined_gebvs", inputFiles, value=TRUE)
  97     selectionIndexFile <- grep("selection_index", inputFiles, value=TRUE)
  98
  99     message('selectionIndexFile', selectionIndexFile)
 100     message('geneticDataFile', geneticDataFile)
 101
 102     geneticData <- read.table(geneticDataFile,
 103                               header = TRUE,
 104                               row.names = 1,
 105                               sep = "\t",
 106                               na.strings = c("NA"),
 107                               dec = "."
 108                               )
 109
 110     indexData <- c()
 111     if (length(selectionIndexFile) != 0
 112         && file.info(selectionIndexFile)$size != 0) {
 113         indexData <- read.table(selectionIndexFile,
 114                                 header = TRUE,
 115                                 row.names = 1,
 116                                 sep = "\t",
 117                                 na.strings = c("NA"),
 118                                 dec = "."
 119                                 )
 120     }
 121
 122     corrData <- c()
 123
 124     if (!is.null(indexData)) {
 125         geneticData <- rownames_to_column(geneticData, var="genotypes")
 126         indexData   <- rownames_to_column(indexData, var="genotypes")
 127
 128         geneticData <- geneticData %>% arrange(genotypes)
 129         indexData   <- indexData %>% arrange(genotypes)
 130
 131         corrData <- full_join(geneticData, indexData)
 132         corrData <- column_to_rownames(corrData, var="genotypes")
 133
 134     } else {
 135         corrData <- geneticData
 136     }
 137
 138     correInputData <- corrData
 139
 140 }
 141
 142 if (is.null(correInputData)) {
 143   stop("Can't run correlation analysis. There is no input data.")
 144 }
 145
 146 correInputData <- correInputData %>%
 147                   select(where(~n_distinct(.) > 2))
 148
 149 coefpvalues <- rcor.test(correInputData,
 150                          method="pearson",
 151                          use="pairwise"
 152                          )
 153
 154 coefs <- coefpvalues$cor.mat
 155 #remove rows and columns that are all "NA"
 156 coefs <- data.frame(coefs)
 157  if (any(is.na(coefs))) {
 158   coefs <- coefs[ , colSums(is.na(coefs)) < nrow(coefs)]
 159 }
 160
 161 coefs[upper.tri(coefs)] <- NA
 162 pvalues <- coefpvalues$cor.mat
 163 pvalues[lower.tri(pvalues)] <- coefpvalues$p.values[, 3]
 164 pvalues <- round(pvalues, 3)
 165 pvalues[upper.tri(pvalues)] <- NA
 166 pvalues <- data.frame(pvalues)
 167
 168 allcordata   <- coefpvalues$cor.mat
 169 allcordata[upper.tri(allcordata)] <- coefpvalues$p.values[, 3]
 170 diag(allcordata) <- 1
 171 allcordata   <- round(allcordata, 3)
 172
 173 traits <- colnames(coefs)
 174
 175 correlationList <- list(
 176                      labels = traits,
 177                     values  = coefs,
 178                     pvalues = pvalues
 179                    )
 180
 181 correlationJson <- jsonlite::toJSON(correlationList)
 182
 183 write.table(allcordata,
 184        file = correCoefsTableFile,
 185        sep  = "\t",
 186        row.names = TRUE,
 187        quote = FALSE
 188        )
 189
 190 write(correlationJson,
 191        file = correCoefsJsonFile)
 192
 193 message("Done running correlation.")
 194 q(save = "no", runLast = FALSE)