Merge pull request #5243 from solgenomics/topic/observations_upload_catch_error
[sgn.git] / R / solGS / correlation.r
blobb015a7546a85ea948d4f7564ab1e6f44c4e32f4e
1 #SNOPSIS
3 #runs correlation analysis.
4 #Correlation coeffiecients are stored in tabular and json formats
6 #AUTHOR
7 # Isaak Y Tecle (iyt2@cornell.edu)
10 options(echo = FALSE)
12 library(ltm)
13 #library(rjson)
14 library(jsonlite)
15 library(data.table)
16 library(phenoAnalysis)
17 library(dplyr)
18 library(tibble)
19 #library(rbenchmark)
20 library(methods)
22 allArgs <- commandArgs()
24 outputFiles <- scan(grep("output_files", allArgs, value = TRUE),
25 what = "character")
26 inputFiles <- scan(grep("input_files", allArgs, value = TRUE),
27 what = "character")
29 message('inputFiles: ', inputFiles)
31 correInputData <- c()
32 correCoefsTableFile <- c()
33 correCoefsJsonFile <- c()
35 if (any(grepl("phenotype_data", inputFiles))) {
36 refererQtl <- grep("qtl", inputFiles, value=TRUE)
38 phenoDataFile <- grep("\\/phenotype_data", inputFiles, value=TRUE)
39 formattedPhenoFile <- grep("formatted_phenotype_data", inputFiles, fixed = FALSE, value = TRUE)
40 metaFile <- grep("metadata", inputFiles, value=TRUE)
41 correCoefsTableFile <- grep("pheno_corr_table", outputFiles, value=TRUE)
42 correCoefsJsonFile <- grep("pheno_corr_json", outputFiles, value=TRUE)
44 formattedPhenoData <- c()
45 phenoData <- c()
47 if ( length(refererQtl) != 0 ) {
48 phenoDataFile <- grep("\\/phenodata", inputFiles, value=TRUE)
49 phenoData <- data.frame(fread(phenoDataFile,
50 header=TRUE,
51 sep=",",
52 na.strings=c("NA", "-", " ", ".", "..")))
55 metaData <- scan(metaFile, what="character")
57 allTraitNames <- c()
58 nonTraitNames <- c()
59 naTraitNames <- c()
61 if (length(refererQtl) != 0) {
62 allNames <- names(phenoData)
63 nonTraitNames <- c("ID")
64 allTraitNames <- allNames[! allNames %in% nonTraitNames]
67 correPhenoData <- c()
69 if (length(refererQtl) == 0 ) {
70 averagedPhenoData <- cleanAveragePhenotypes(inputFiles, metaDataFile = metaFile)
71 allNames <- names(averagedPhenoData)
72 nonTraitNames <- metaData
73 allTraitNames <- allNames[! allNames %in% nonTraitNames]
75 rownames(averagedPhenoData) <- NULL
76 correPhenoData <- averagedPhenoData
77 } else {
78 message("qtl stuff")
79 correPhenoData <- phenoData %>%
80 group_by(ID) %>%
81 summarise_if(is.numeric, mean, na.rm=TRUE) %>%
82 select(-ID) %>%
83 round(., 2) %>%
84 data.frame
88 correInputData <- correPhenoData
89 print(head(correPhenoData))
91 } else if (any(grepl("combined_gebvs", inputFiles)) ||
92 any(grepl("selection_index", inputFiles))) {
94 correCoefsTableFile <- grep("genetic_corr_table", outputFiles, value=TRUE)
95 correCoefsJsonFile <- grep("genetic_corr_json", outputFiles, value=TRUE)
96 geneticDataFile <- grep("combined_gebvs", inputFiles, value=TRUE)
97 selectionIndexFile <- grep("selection_index", inputFiles, value=TRUE)
99 message('selectionIndexFile', selectionIndexFile)
100 message('geneticDataFile', geneticDataFile)
102 geneticData <- read.table(geneticDataFile,
103 header = TRUE,
104 row.names = 1,
105 sep = "\t",
106 na.strings = c("NA"),
107 dec = "."
110 indexData <- c()
111 if (length(selectionIndexFile) != 0
112 && file.info(selectionIndexFile)$size != 0) {
113 indexData <- read.table(selectionIndexFile,
114 header = TRUE,
115 row.names = 1,
116 sep = "\t",
117 na.strings = c("NA"),
118 dec = "."
122 corrData <- c()
124 if (!is.null(indexData)) {
125 geneticData <- rownames_to_column(geneticData, var="genotypes")
126 indexData <- rownames_to_column(indexData, var="genotypes")
128 geneticData <- geneticData %>% arrange(genotypes)
129 indexData <- indexData %>% arrange(genotypes)
131 corrData <- full_join(geneticData, indexData)
132 corrData <- column_to_rownames(corrData, var="genotypes")
134 } else {
135 corrData <- geneticData
138 correInputData <- corrData
142 if (is.null(correInputData)) {
143 stop("Can't run correlation analysis. There is no input data.")
146 correInputData <- correInputData %>%
147 select(where(~n_distinct(.) > 2))
149 coefpvalues <- rcor.test(correInputData,
150 method="pearson",
151 use="pairwise"
154 coefs <- coefpvalues$cor.mat
155 #remove rows and columns that are all "NA"
156 coefs <- data.frame(coefs)
157 if (any(is.na(coefs))) {
158 coefs <- coefs[ , colSums(is.na(coefs)) < nrow(coefs)]
161 coefs[upper.tri(coefs)] <- NA
162 pvalues <- coefpvalues$cor.mat
163 pvalues[lower.tri(pvalues)] <- coefpvalues$p.values[, 3]
164 pvalues <- round(pvalues, 3)
165 pvalues[upper.tri(pvalues)] <- NA
166 pvalues <- data.frame(pvalues)
168 allcordata <- coefpvalues$cor.mat
169 allcordata[upper.tri(allcordata)] <- coefpvalues$p.values[, 3]
170 diag(allcordata) <- 1
171 allcordata <- round(allcordata, 3)
173 traits <- colnames(coefs)
175 correlationList <- list(
176 labels = traits,
177 values = coefs,
178 pvalues = pvalues
181 correlationJson <- jsonlite::toJSON(correlationList)
183 write.table(allcordata,
184 file = correCoefsTableFile,
185 sep = "\t",
186 row.names = TRUE,
187 quote = FALSE
190 write(correlationJson,
191 file = correCoefsJsonFile)
193 message("Done running correlation.")
194 q(save = "no", runLast = FALSE)