Merge pull request #2678 from solgenomics/BBase-15
[sgn.git] / R / analyze_phenotype.r
blobe47e578241e0e8bedddea4839e88861d500dde5e
2 # R CMD BATCH --no-save --no-restore '--args phenotype_file="blabla.txt" output_file="blalba.png" ' analyze_phenotype.r output.txt
3 library(plyr)
4 library(tidyr)
5 library(ggplot2)
7 args=(commandArgs(TRUE))
9 if(length(args)==0){
10 print("No arguments supplied.")
11 ##supply default values
12 phenotype_file = 'phenotypes.txt'
13 output_file = paste0(phenotype_file, ".png", sep="")
14 } else {
15 for(i in 1:length(args)){
16 eval(parse(text=args[[i]]))
20 write(paste("phenotype file: ", phenotype_file), stderr())
21 write(paste("output_file: ", output_file), stderr())
23 errorfile = paste(phenotype_file, ".err", sep="");
25 phenodata = read.csv(phenotype_file,fill=TRUE, sep=",", header = TRUE, stringsAsFactors = T, na.strings="NA");
27 blocks = unique(phenodata$blockNumber)
28 print(paste("blocks: ", blocks));
29 studyNames = unique(phenodata$studyName)
30 accessions = unique(phenodata$germplasmName)
31 datamatrix <- c()
32 datasetnames <- c()
33 trial_accessions <- c()
34 all_accessions = unique(phenodata$germplasmName)
36 datamatrix = matrix(nrow = length(all_accessions), ncol=length(studyNames)) # * length(blocks))
37 wfAlltrialsdata <- c()
38 for (i in 1:(length(studyNames))) {
40 trialdata <- phenodata[phenodata[,"studyName"]==studyNames[i], ] # & phenodata[,"blockNumber"]==n, ]
42 metadata <- c('studyYear', 'studyDbId', 'studyName', 'studyDesign', 'locationDbId', 'locationName', 'germplasmDbId', 'germplasmSynonyms', 'observationLevel', 'observationUnitDbId', 'observationUnitName', 'plotNumber')
44 trialdata <- trialdata[, !(names(trialdata) %in% metadata)]
46 trialdata <- trialdata[, !(names(trialdata) %in% c('replicate', 'blockNumber'))]
48 trialdata <- ddply(trialdata,
49 "germplasmName",
50 colwise(mean, na.rm=TRUE)
53 trialdata <- data.frame(trialdata)
55 trialdata <- trialdata[complete.cases(trialdata), ]
57 colnames(trialdata)[2]<- make.names(studyNames[i])
59 if( i == 1) {
60 wfAllTrialsData <- trialdata
61 } else {
62 wfAllTrialsData <- merge(wfAllTrialsData, trialdata, by="germplasmName")
66 #create a fake 4 trials dataset
67 #wfAllTrialsData <- merge(wfAllTrialsData, wfAllTrialsData, by="germplasmName")
69 names(wfAllTrialsData) <- make.names(names(wfAllTrialsData))
71 lfTrialsData <- function (wfTrialsData) {
72 lfTrDa <- gather(wfTrialsData, Trials, Trait,
73 2:length(wfTrialsData),
74 factor_key=TRUE)
76 return (lfTrDa)
80 datamatrix <- data.matrix(wfAllTrialsData)
83 if (nrow(datamatrix)==0) {
84 write("No data was retrieved from the database for this combination of trials: ", file = errorfile);
86 if (ncol(datamatrix) < 2) {
87 write("No data. Try again", file = errorfile);
90 # correlation
92 panel.cor <- function(x, y, digits=2, cex.cor)
94 usr <- par("usr"); on.exit(par(usr))
95 par(usr = c(0, 1, 0, 1))
96 r <- abs(cor(x, y, use ="na.or.complete"))
97 txt <- format(c(r, 0.123456789), digits=digits)[1]
98 test <- cor.test(x,y,use ="na.or.complete")
99 Signif <- ifelse(round(test$p.value,3)<0.001,"p<0.001",paste("p=",round(test$p.value,3)))
100 text(0.5, 0.25, paste("r=",txt))
101 text(.5, .75, Signif)
104 #pairs(data_test,lower.panel=panel.smooth,upper.panel=panel.cor)
106 #smooth
108 panel.smooth<-function (x, y, col = "black", bg = NA, pch = 18, cex = 0.8, col.smooth = "red", span = 2/3, iter = 3, ...)
110 points(x, y, pch = pch, col = col, bg = bg, cex = cex)
111 ok <- is.finite(x) & is.finite(y)
112 if (any(ok))
113 lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
114 col = col.smooth, ...)
118 #pairs(data,lower.panel=panel.smooth,upper.panel=panel.smooth)
120 #histo
121 panel.hist <- function(x, ...)
123 usr <- par("usr"); on.exit(par(usr))
124 par(usr = c(usr[1:2], 0, 1.5) )
125 h <- hist(x, plot = TRUE)
126 breaks <- h$breaks; nB <- length(breaks)
127 y <- h$counts; y <- y/max(y)
128 rect(breaks[-nB], 0, breaks[-1], y, col="red", ...)
131 scatterPlot <- function (wfTrialsData) {
133 lfTrDa <- lfTrialsData(wfTrialsData)
135 scatter <- ggplot(wfTrialsData, aes_string(x=names(wfTrialsData)[2], y=names(wfTrialsData)[3])) +
136 theme(plot.title = element_text(size=18, face="bold", color="olivedrab4", margin = margin(40, 40, 40, 40)),
137 plot.margin = unit(c(0.75, 1, 0.75, 1), "cm"),
138 axis.title.x = element_text(size=14, face="bold", color="olivedrab4"),
139 axis.title.y = element_text(size=14, face="bold", color="olivedrab4"),
140 axis.text.x = element_text(angle=90, vjust=0.5, size=10, color="olivedrab4"),
141 axis.text.y = element_text(size=10, color="olivedrab4")) +
142 geom_point(shape=1, color='DodgerBlue') +
143 scale_x_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
144 scale_y_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
145 geom_smooth(method=lm, se=FALSE)
147 return(scatter)
152 freqPlot <- function (wfTrialsData) {
154 lfTrDa <- lfTrialsData(wfTrialsData)
156 averages <- ddply(lfTrDa, "Trials", summarise, traitAverage = mean(Trait))
158 freq <- ggplot(lfTrDa, aes(x=Trait, fill=Trials)) +
159 xlab("Trait values") +
160 ylab("Frequency") +
161 theme(plot.title = element_text(size=18, face="bold", color="olivedrab4", margin = margin(40, 40, 40, 40)),
162 plot.margin = unit(c(0.75, 1, 0.75, 1), "cm"),
163 axis.title.x = element_text(size=14, face="bold", color="olivedrab4"),
164 axis.title.y = element_text(size=14, face="bold", color="olivedrab4"),
165 axis.text.x = element_text(angle=90, size=10, color="olivedrab4"),
166 axis.text.y = element_text(size=10, color="olivedrab4"),
167 legend.title=element_blank(),
168 legend.text=element_text(size=12, color="olivedrab4"),
169 legend.position="bottom") +
170 geom_histogram(binwidth=2, alpha=.5, position="identity") +
171 scale_x_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
172 scale_fill_manual(values=c("ForestGreen", "DodgerBlue")) +
173 geom_vline(data=averages,
174 aes(xintercept=traitAverage, colour=Trials),
175 linetype="dashed", size=2)
177 return(freq)
181 # Multiple plot function: for how to use this function go here:
182 # http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
184 multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
185 library(grid)
187 plots <- c(list(...), plotlist)
189 numPlots = length(plots)
191 if (is.null(layout)) {
192 layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
193 ncol = cols, nrow = ceiling(numPlots/cols))
196 if (numPlots==1) {
197 print(plots[[1]])
199 } else {
200 grid.newpage()
201 pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
203 for (i in 1:numPlots) {
204 matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
206 print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
207 layout.pos.col = matchidx$col))
213 getTrialsPairs <- function (wfAllTrialsData) {
214 combiTrMx <- combn(names(wfAllTrialsData[, 2:length(names(wfAllTrialsData))]), 2)
215 nPairs <- dim(combiTrMx)[2]
217 allPairs <- c()
218 for (i in 1:nPairs) {
219 pairs <- combiTrMx[, i]
220 message("pair " , i, " ", pairs )
221 allPairs[i] <- list(i=pairs)
224 return (list("trialsPairs"= allPairs, "pairsCount"= nPairs))
228 prTr <- getTrialsPairs(wfAllTrialsData)
229 trialsPairs <- prTr[["trialsPairs"]]
230 pairsCount <- prTr[["pairsCount"]]
232 message("pairs count ", pairsCount)
234 createGraphNames <- function (pairsCount) {
235 graphNames <- c()
237 for (i in 1:pairsCount) {
238 pf <- paste("freq", i, sep="")
239 ps <- paste("scatter", i, sep="")
241 graphNames[i] <- list(i=c(ps, pf))
244 return(graphNames)
248 png(output_file, height= pairsCount * 400, width=800)
250 graphNames <- createGraphNames(pairsCount)
252 for (i in 1:pairsCount) {
253 pnames <- graphNames[[i]]
254 message(pnames, " ", pnames[1], " ", pnames[2])
256 scatter <- scatterPlot(wfAllTrialsData[, c("germplasmName", trialsPairs[[i]])])
257 freq <- freqPlot(wfAllTrialsData[, c("germplasmName", trialsPairs[[i]])])
259 assign(pnames[1], scatter)
260 assign(pnames[2], freq)
264 if (pairsCount == 1) {
265 multiplot(scatter1, freq1, cols=2)
266 } else if (pairsCount == 3) {
267 multiplot(scatter1, freq1, scatter2, freq2, scatter3, freq3, cols=2)
268 } else if (pairsCount == 6) {
269 multiplot(scatter1, freq1, scatter2, freq2, scatter3, freq3, scatter4, freq4, scatter5, freq5, scatter6, freq6, cols=2)
272 dev.off()