clean
[sgn.git] / R / phenotypic_correlation.r
blobbfd5600f90cb2770daa7b7ad6fbbffa6df8753a2
1 #SNOPSIS
3 #runs phenotypic correlation analysis.
4 #Correlation coeffiecients are stored in tabular and json formats
6 #AUTHOR
7 # Isaak Y Tecle (iyt2@cornell.edu)
10 options(echo = FALSE)
12 library(gplots)
13 library(ltm)
14 library(plyr)
15 library(rjson)
16 library(lme4)
17 library(data.table)
18 #library(rbenchmark)
21 allargs<-commandArgs()
23 refererQtl <- grep("qtl",
24 allargs,
25 ignore.case=TRUE,
26 perl=TRUE,
27 value=TRUE
30 phenoDataFile <- grep("\\/phenotype_data",
31 allargs,
32 ignore.case=TRUE,
33 perl=TRUE,
34 value=TRUE
37 correCoefficientsFile <- grep("corre_coefficients_table",
38 allargs,
39 ignore.case=TRUE,
40 perl=TRUE,
41 value=TRUE
44 correCoefficientsJsonFile <- grep("corre_coefficients_json",
45 allargs,
46 ignore.case=TRUE,
47 perl=TRUE,
48 value=TRUE
52 formattedPhenoFile <- grep("formatted_phenotype_data",
53 allargs,
54 ignore.case = TRUE,
55 fixed = FALSE,
56 value = TRUE
60 formattedPhenoData <- c()
61 phenoData <- c()
63 if (file.info(formattedPhenoFile)$size > 0 ) {
65 formattedPhenoData <- as.data.frame(fread(formattedPhenoFile,
66 na.strings = c("NA", " ", "--", "-", "."),
69 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
70 formattedPhenoData[, 1] <- NULL
72 } else {
74 if ( length(refererQtl) != 0 ) {
76 phenoData <- as.data.frame(fread(phenoDataFile,
77 sep=",",
78 na.strings=c("NA", "-", " ", ".", "..")
80 } else {
82 phenoData <- as.data.frame(fread(phenoDataFile,
83 na.strings = c("NA", " ", "--", "-", ".", "..")
84 ))
88 allTraitNames <- c()
89 nonTraitNames <- c()
91 if (length(refererQtl) != 0) {
93 allNames <- names(phenoData)
94 nonTraitNames <- c("ID")
95 allTraitNames <- allNames[! allNames %in% nonTraitNames]
97 } else if (file.info(formattedPhenoFile)$size == 0 && length(refererQtl) == 0) {
99 dropColumns <- c("uniquename", "stock_name")
100 phenoData <- phenoData[,!(names(phenoData) %in% dropColumns)]
102 allNames <- names(phenoData)
103 nonTraitNames <- c("object_name", "object_id", "stock_id", "design", "block", "replicate")
104 allTraitNames <- allNames[! allNames %in% nonTraitNames]
108 if (!is.null(phenoData) && length(refererQtl) == 0) {
110 for (i in allTraitNames) {
112 if (class(phenoData[, i]) != 'numeric') {
113 phenoData[, i] <- as.numeric(as.character(phenoData[, i]))
116 if (all(is.nan(phenoData$i))) {
117 phenoData[, i] <- sapply(phenoData[, i], function(x) ifelse(is.numeric(x), x, NA))
122 phenoData <- phenoData[, colSums(is.na(phenoData)) < nrow(phenoData)]
123 allTraitNames <- names(phenoData)[! names(phenoData) %in% nonTraitNames]
125 ###############################
126 if (length(refererQtl) == 0 ) {
127 if (file.info(formattedPhenoFile)$size == 0) {
129 cnt <- 0
131 for (i in allTraitNames) {
133 cnt <- cnt + 1
134 trait <- i
136 experimentalDesign <- c()
138 if ('design' %in% colnames(phenoData)) {
140 experimentalDesign <- phenoData[2, 'design']
142 if (is.na(experimentalDesign)) {
143 experimentalDesign <- c('No Design')
146 } else {
147 experimentalDesign <- c('No Design')
150 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoData$block) > 1) {
152 message("GS experimental design: ", experimentalDesign)
154 augData <- subset(phenoData,
155 select = c("object_name", "object_id", "block", trait)
158 colnames(augData)[1] <- "genotypes"
159 colnames(augData)[4] <- "trait"
161 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
162 augData,
163 na.action = na.omit
165 genoEffects <- c()
167 if (class(model) != "try-error") {
168 genoEffects <- data.frame(fixef(model))
170 colnames(genoEffects) <- trait
172 nn <- gsub('genotypes', '', rownames(genoEffects))
173 rownames(genoEffects) <- nn
175 genoEffects <- round(genoEffects, digits = 2)
178 if (cnt == 1 ) {
179 formattedPhenoData <- data.frame(genoEffects)
180 } else {
181 formattedPhenoData <- merge(formattedPhenoData, genoEffects, by=0, all=TRUE)
182 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
183 formattedPhenoData[, 1] <- NULL
186 } else if (experimentalDesign == 'Alpha') {
187 trait <- i
189 message("Experimental desgin: ", experimentalDesign)
191 alphaData <- subset(phenoData,
192 select = c("object_name", "object_id","block", "replicate", trait)
195 colnames(alphaData)[1] <- "genotypes"
196 colnames(alphaData)[5] <- "trait"
198 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
199 alphaData,
200 na.action = na.omit
203 if (class(model) != "try-error") {
204 genoEffects <- data.frame(fixef(model))
206 colnames(genoEffects) <- trait
208 nn <- gsub('genotypes', '', rownames(genoEffects))
209 rownames(genoEffects) <- nn
211 genoEffects <- round(genoEffects, digits = 2)
215 if(cnt == 1 ) {
216 formattedPhenoData <- genoEffects
217 } else {
218 formattedPhenoData <- merge(formattedPhenoData, genoEffects, by=0, all=TRUE)
219 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
220 formattedPhenoData[, 1] <- NULL
223 } else {
224 message("GS experimental design: ", experimentalDesign)
226 dropColumns <- c("object_id", "stock_id", "design", "block", "replicate")
228 formattedPhenoData <- phenoData[, !(names(phenoData) %in% dropColumns)]
230 formattedPhenoData <- ddply(formattedPhenoData,
231 "object_name",
232 colwise(mean, na.rm=TRUE)
235 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
236 formattedPhenoData[, 1] <- NULL
241 } else {
242 message("qtl stuff")
243 formattedPhenoData <- ddply(phenoData,
244 "ID",
245 colwise(mean, na.rm=TRUE)
248 row.names(formattedPhenoData) <- formattedPhenoData[, 1]
249 formattedPhenoData[, 1] <- NULL
251 formattedPhenoData <- round(formattedPhenoData, digits = 2)
254 coefpvalues <- rcor.test(formattedPhenoData,
255 method="pearson",
256 use="pairwise"
259 coefficients <- coefpvalues$cor.mat
260 allcordata <- coefpvalues$cor.mat
262 allcordata[lower.tri(allcordata)] <- coefpvalues$p.values[, 3]
263 diag(allcordata) <- 1.00
265 pvalues <- as.matrix(allcordata)
267 pvalues <- round(pvalues,
268 digits=2
271 coefficients <- round(coefficients,
272 digits=3
275 allcordata <- round(allcordata,
276 digits=3
279 #remove rows and columns that are all "NA"
280 if ( apply(coefficients,
282 function(x)any(is.na(x))
285 apply(coefficients,
287 function(x)any(is.na(x))
292 coefficients<-coefficients[-which(apply(coefficients,
294 function(x)all(is.na(x)))
296 -which(apply(coefficients,
298 function(x)all(is.na(x)))
304 pvalues[upper.tri(pvalues)] <- NA
305 coefficients[upper.tri(coefficients)] <- NA
307 coefficients2json <- function(mat) {
308 mat <- as.list(as.data.frame(t(mat)))
309 names(mat) <- NULL
310 toJSON(mat)
313 traits <- colnames(coefficients)
315 correlationList <- list(
316 "traits" = toJSON(traits),
317 "coefficients " =coefficients2json(coefficients)
320 correlationJson <- paste("{",paste("\"", names(correlationList), "\":", correlationList, collapse=","), "}")
322 write.table(coefficients,
323 file=correCoefficientsFile,
324 col.names=TRUE,
325 row.names=TRUE,
326 quote=FALSE,
327 dec="."
330 write.table(correlationJson,
331 file=correCoefficientsJsonFile,
332 col.names=FALSE,
333 row.names=FALSE,
337 if (file.info(formattedPhenoFile)$size == 0 && !is.null(formattedPhenoData) ) {
338 write.table(formattedPhenoData,
339 file = formattedPhenoFile,
340 sep = "\t",
341 col.names = NA,
342 quote = FALSE,
347 q(save = "no", runLast = FALSE)