Merge pull request #4719 from solgenomics/topic/fixing_histogram
[sgn.git] / R / solGS / correlation.r
blobb39686096684f205ee4cab09815e02a633894433
1 #SNOPSIS
3 #runs correlation analysis.
4 #Correlation coeffiecients are stored in tabular and json formats
6 #AUTHOR
7 # Isaak Y Tecle (iyt2@cornell.edu)
10 options(echo = FALSE)
12 library(ltm)
13 #library(rjson)
14 library(jsonlite)
15 library(data.table)
16 library(phenoAnalysis)
17 library(dplyr)
18 library(tibble)
19 #library(rbenchmark)
20 library(methods)
22 allArgs <- commandArgs()
24 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
25 what = "character")
26 inputFiles <- scan(grep("input_files", allArgs, value = TRUE),
27 what = "character")
29 message('inputFiles: ', inputFiles)
31 correInputData <- c()
32 correCoefsTableFile <- c()
33 correCoefsJsonFile <- c()
35 if (any(grepl("phenotype_data", inputFiles))) {
36 refererQtl <- grep("qtl", inputFiles, value=TRUE)
38 phenoDataFile <- grep("\\/phenotype_data", inputFiles, value=TRUE)
39 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, fixed = FALSE, value = TRUE)
40 metaFile <- grep("metadata", inputFiles, value=TRUE)
41 correCoefsTableFile <- grep("pheno_corr_table", outputFiles, value=TRUE)
42 correCoefsJsonFile <- grep("pheno_corr_json", outputFiles, value=TRUE)
44 formattedPhenoData <- c()
45 phenoData <- c()
47 if ( length(refererQtl) != 0 ) {
48 phenoDataFile <- grep("\\/phenodata", inputFiles, value=TRUE)
49 phenoData <- data.frame(fread(phenoDataFile,
50 header=TRUE,
51 sep=",",
52 na.strings=c("NA", "-", " ", ".", "..")))
55 metaData <- scan(metaFile, what="character")
57 allTraitNames <- c()
58 nonTraitNames <- c()
59 naTraitNames <- c()
61 if (length(refererQtl) != 0) {
62 allNames <- names(phenoData)
63 nonTraitNames <- c("ID")
64 allTraitNames <- allNames[! allNames %in% nonTraitNames]
67 correPhenoData <- c()
69 if (length(refererQtl) == 0 ) {
70 averagedPhenoData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
71 allNames <- names(averagedPhenoData)
72 nonTraitNames <- metaData
73 allTraitNames <- allNames[! allNames %in% nonTraitNames]
75 rownames(averagedPhenoData) <- NULL
76 correPhenoData <- averagedPhenoData
77 } else {
78 message("qtl stuff")
79 correPhenoData <- phenoData %>%
80 group_by(ID) %>%
81 summarise_if(is.numeric, mean, na.rm=TRUE) %>%
82 select(-ID) %>%
83 round(., 2) %>%
84 data.frame
88 correInputData <- correPhenoData
89 print(head(correPhenoData))
91 } else if (any(grepl("combined_gebvs", inputFiles)) ||
92 any(grepl("selection_index", inputFiles))) {
94 correCoefsTableFile <- grep("genetic_corr_table", outputFiles, value=TRUE)
95 correCoefsJsonFile <- grep("genetic_corr_json", outputFiles, value=TRUE)
96 geneticDataFile <- grep("combined_gebvs", inputFiles, value=TRUE)
97 selectionIndexFile <- grep("selection_index", inputFiles, value=TRUE)
99 message('selectionIndexFile', selectionIndexFile)
100 message('geneticDataFile', geneticDataFile)
102 geneticData <- read.table(geneticDataFile,
103 header = TRUE,
104 row.names = 1,
105 sep = "\t",
106 na.strings = c("NA"),
107 dec = "."
110 indexData <- c()
111 if (length(selectionIndexFile) != 0
112 && file.info(selectionIndexFile)$size != 0) {
113 indexData <- read.table(selectionIndexFile,
114 header = TRUE,
115 row.names = 1,
116 sep = "\t",
117 na.strings = c("NA"),
118 dec = "."
122 corrData <- c()
124 if (!is.null(indexData)) {
125 geneticData <- rownames_to_column(geneticData, var="genotypes")
126 indexData <- rownames_to_column(indexData, var="genotypes")
128 geneticData <- geneticData %>% arrange(genotypes)
129 indexData <- indexData %>% arrange(genotypes)
131 corrData <- full_join(geneticData, indexData)
132 corrData <- column_to_rownames(corrData, var="genotypes")
134 } else {
135 corrData <- geneticData
138 correInputData <- corrData
142 if (is.null(correInputData)) {
143 stop("Can't run correlation analysis. There is no input data.")
146 coefpvalues <- rcor.test(correInputData,
147 method="pearson",
148 use="pairwise"
151 coefs <- coefpvalues$cor.mat
152 #remove rows and columns that are all "NA"
153 coefs <- data.frame(coefs)
154 if (any(is.na(coefs))) {
155 coefs <- coefs[ , colSums(is.na(coefs)) < nrow(coefs)]
158 coefs[upper.tri(coefs)] <- NA
159 pvalues <- coefpvalues$cor.mat
160 pvalues[lower.tri(pvalues)] <- coefpvalues$p.values[, 3]
161 pvalues <- round(pvalues, 3)
162 pvalues[upper.tri(pvalues)] <- NA
163 pvalues <- data.frame(pvalues)
165 allcordata <- coefpvalues$cor.mat
166 allcordata[upper.tri(allcordata)] <- coefpvalues$p.values[, 3]
167 diag(allcordata) <- 1
168 allcordata <- round(allcordata, 3)
170 traits <- colnames(coefs)
172 correlationList <- list(
173 labels = traits,
174 values = coefs,
175 pvalues = pvalues
178 correlationJson <- jsonlite::toJSON(correlationList)
180 write.table(allcordata,
181 file = correCoefsTableFile,
182 sep = "\t",
183 row.names = TRUE,
184 quote = FALSE
187 write(correlationJson,
188 file = correCoefsJsonFile)
190 message("Done running correlation.")
191 q(save = "no", runLast = FALSE)