R/analyze_phenotype.r

   1
   2 # R CMD BATCH --no-save --no-restore '--args phenotype_file="blabla.txt" output_file="blalba.png" ' analyze_phenotype.r output.txt
   3 library(plyr)
   4 library(tidyr)
   5 library(ggplot2)
   6
   7 args=(commandArgs(TRUE))
   8
   9 if(length(args)==0){
  10    print("No arguments supplied.")
  11    ##supply default values
  12    phenotype_file = 'phenotypes.txt'
  13    output_file = paste0(phenotype_file, ".png", sep="")
  14 } else {
  15    for(i in 1:length(args)){
  16        eval(parse(text=args[[i]]))
  17    }
  18 }
  19
  20 write(paste("phenotype file: ", phenotype_file), stderr())
  21 write(paste("output_file: ", output_file), stderr())
  22
  23 errorfile = paste(phenotype_file, ".err", sep="");
  24
  25 phenodata = read.csv(phenotype_file,fill=TRUE, sep=",", header = TRUE, stringsAsFactors = T, na.strings="NA");
  26
  27 blocks = unique(phenodata$blockNumber)
  28 print(paste("blocks: ", blocks));
  29 studyNames = unique(phenodata$studyName)
  30 accessions = unique(phenodata$germplasmName)
  31 datamatrix <- c()
  32 datasetnames <- c()
  33 trial_accessions <- c()
  34 all_accessions = unique(phenodata$germplasmName)
  35
  36 datamatrix = matrix(nrow = length(all_accessions), ncol=length(studyNames)) # * length(blocks))
  37 wfAlltrialsdata <- c()
  38 for (i in 1:(length(studyNames))) {
  39
  40   trialdata <- phenodata[phenodata[,"studyName"]==studyNames[i], ] # & phenodata[,"blockNumber"]==n, ]
  41
  42   metadata <- c('studyYear', 'studyDbId', 'studyName', 'studyDesign', 'locationDbId', 'locationName', 'germplasmDbId', 'germplasmSynonyms', 'observationLevel', 'observationUnitDbId', 'observationUnitName', 'plotNumber')
  43
  44   trialdata <- trialdata[, !(names(trialdata) %in% metadata)]
  45
  46   trialdata <- trialdata[, !(names(trialdata) %in% c('replicate', 'blockNumber'))]
  47
  48   trialdata <- ddply(trialdata,
  49                      "germplasmName",
  50                      colwise(mean, na.rm=TRUE)
  51                      )
  52
  53   trialdata <- data.frame(trialdata)
  54
  55   trialdata <- trialdata[complete.cases(trialdata), ]
  56
  57   colnames(trialdata)[2]<- make.names(studyNames[i])
  58
  59   if( i == 1) {
  60     wfAllTrialsData <- trialdata
  61   } else {
  62     wfAllTrialsData <- merge(wfAllTrialsData, trialdata, by="germplasmName")
  63   }
  64 }
  65
  66 #create a fake 4 trials dataset
  67 #wfAllTrialsData <- merge(wfAllTrialsData, wfAllTrialsData, by="germplasmName")
  68
  69 names(wfAllTrialsData) <- make.names(names(wfAllTrialsData))
  70
  71 lfTrialsData <- function (wfTrialsData) {
  72  lfTrDa <- gather(wfTrialsData, Trials, Trait,
  73                         2:length(wfTrialsData),
  74                         factor_key=TRUE)
  75
  76   return (lfTrDa)
  77 }
  78
  79
  80 datamatrix <- data.matrix(wfAllTrialsData)
  81
  82
  83 if (nrow(datamatrix)==0) {
  84    write("No data was retrieved from the database for this combination of trials: ", file = errorfile);
  85 }
  86 if (ncol(datamatrix) < 2) {
  87    write("No data. Try again", file = errorfile);
  88 }
  89
  90 # correlation
  91 #
  92 panel.cor <- function(x, y, digits=2, cex.cor)
  93 {
  94    usr <- par("usr"); on.exit(par(usr))
  95    par(usr = c(0, 1, 0, 1))
  96    r <- abs(cor(x, y, use ="na.or.complete"))
  97    txt <- format(c(r, 0.123456789), digits=digits)[1]
  98    test <- cor.test(x,y,use ="na.or.complete")
  99    Signif <- ifelse(round(test$p.value,3)<0.001,"p<0.001",paste("p=",round(test$p.value,3)))
 100    text(0.5, 0.25, paste("r=",txt))
 101    text(.5, .75, Signif)
 102 }
 103
 104 #pairs(data_test,lower.panel=panel.smooth,upper.panel=panel.cor)
 105
 106 #smooth
 107
 108 panel.smooth<-function (x, y, col = "black", bg = NA, pch = 18, cex = 0.8, col.smooth = "red", span = 2/3, iter = 3, ...)
 109 {
 110     points(x, y, pch = pch, col = col, bg = bg, cex = cex)
 111     ok <- is.finite(x) & is.finite(y)
 112     if (any(ok))
 113         lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
 114         col = col.smooth, ...)
 115     }
 116
 117
 118 #pairs(data,lower.panel=panel.smooth,upper.panel=panel.smooth)
 119
 120 #histo
 121 panel.hist <- function(x, ...)
 122 {
 123     usr <- par("usr"); on.exit(par(usr))
 124     par(usr = c(usr[1:2], 0, 1.5) )
 125     h <- hist(x, plot = TRUE)
 126     breaks <- h$breaks; nB <- length(breaks)
 127     y <- h$counts; y <- y/max(y)
 128     rect(breaks[-nB], 0, breaks[-1], y, col="red", ...)
 129 }
 130
 131 scatterPlot <- function (wfTrialsData) {
 132
 133   lfTrDa <-  lfTrialsData(wfTrialsData)
 134
 135   scatter <- ggplot(wfTrialsData, aes_string(x=names(wfTrialsData)[2], y=names(wfTrialsData)[3])) +
 136                 theme(plot.title = element_text(size=18,  face="bold", color="olivedrab4", margin = margin(40, 40, 40, 40)),
 137                       plot.margin = unit(c(0.75, 1, 0.75, 1), "cm"),
 138                       axis.title.x = element_text(size=14, face="bold", color="olivedrab4"),
 139                       axis.title.y = element_text(size=14, face="bold", color="olivedrab4"),
 140                       axis.text.x  = element_text(angle=90, vjust=0.5, size=10, color="olivedrab4"),
 141                       axis.text.y  = element_text(size=10, color="olivedrab4")) +
 142                 geom_point(shape=1, color='DodgerBlue') +
 143                 scale_x_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
 144                 scale_y_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
 145                 geom_smooth(method=lm, se=FALSE)
 146
 147  return(scatter)
 148
 149 }
 150
 151
 152 freqPlot <- function (wfTrialsData) {
 153
 154   lfTrDa <- lfTrialsData(wfTrialsData)
 155
 156   averages <- ddply(lfTrDa,  "Trials", summarise, traitAverage = mean(Trait))
 157
 158   freq <- ggplot(lfTrDa, aes(x=Trait, fill=Trials)) +
 159   xlab("Trait values") +
 160   ylab("Frequency") +
 161   theme(plot.title = element_text(size=18, face="bold", color="olivedrab4",  margin = margin(40, 40, 40, 40)),
 162         plot.margin = unit(c(0.75, 1, 0.75, 1), "cm"),
 163         axis.title.x = element_text(size=14, face="bold", color="olivedrab4"),
 164         axis.title.y = element_text(size=14, face="bold", color="olivedrab4"),
 165         axis.text.x  = element_text(angle=90, size=10, color="olivedrab4"),
 166         axis.text.y  = element_text(size=10, color="olivedrab4"),
 167         legend.title=element_blank(),
 168         legend.text=element_text(size=12, color="olivedrab4"),
 169         legend.position="bottom") +
 170   geom_histogram(binwidth=2, alpha=.5, position="identity") +
 171   scale_x_continuous(breaks = round(seq(min(lfTrDa$Trait), max(lfTrDa$Trait), by = 2),1)) +
 172   scale_fill_manual(values=c("ForestGreen", "DodgerBlue")) +
 173   geom_vline(data=averages,
 174              aes(xintercept=traitAverage,  colour=Trials),
 175              linetype="dashed", size=2)
 176
 177  return(freq)
 178 }
 179
 180
 181 # Multiple plot function: for how to use this function go here:
 182 # http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/
 183
 184 multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
 185   library(grid)
 186
 187   plots <- c(list(...), plotlist)
 188
 189   numPlots = length(plots)
 190
 191   if (is.null(layout)) {
 192     layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
 193                     ncol = cols, nrow = ceiling(numPlots/cols))
 194   }
 195
 196  if (numPlots==1) {
 197     print(plots[[1]])
 198
 199   } else {
 200     grid.newpage()
 201     pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
 202
 203     for (i in 1:numPlots) {
 204       matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
 205
 206       print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
 207                                       layout.pos.col = matchidx$col))
 208     }
 209   }
 210 }
 211
 212
 213 getTrialsPairs <- function (wfAllTrialsData) {
 214   combiTrMx <- combn(names(wfAllTrialsData[, 2:length(names(wfAllTrialsData))]), 2)
 215   nPairs   <- dim(combiTrMx)[2]
 216
 217   allPairs <- c()
 218   for (i in 1:nPairs) {
 219     pairs  <- combiTrMx[, i]
 220    message("pair  " , i, " ",  pairs )
 221     allPairs[i] <- list(i=pairs)
 222   }
 223
 224  return (list("trialsPairs"= allPairs, "pairsCount"= nPairs))
 225 }
 226
 227
 228 prTr        <- getTrialsPairs(wfAllTrialsData)
 229 trialsPairs <- prTr[["trialsPairs"]]
 230 pairsCount  <- prTr[["pairsCount"]]
 231
 232 message("pairs count ", pairsCount)
 233
 234 createGraphNames <- function (pairsCount) {
 235   graphNames <- c()
 236
 237   for (i in 1:pairsCount) {
 238     pf <- paste("freq", i, sep="")
 239     ps <- paste("scatter", i, sep="")
 240
 241     graphNames[i] <- list(i=c(ps, pf))
 242   }
 243
 244   return(graphNames)
 245 }
 246
 247
 248 png(output_file, height= pairsCount * 400, width=800)
 249
 250 graphNames <- createGraphNames(pairsCount)
 251
 252 for (i in 1:pairsCount) {
 253   pnames <- graphNames[[i]]
 254   message(pnames, "  ", pnames[1], " ", pnames[2])
 255
 256   scatter <- scatterPlot(wfAllTrialsData[, c("germplasmName", trialsPairs[[i]])])
 257   freq <- freqPlot(wfAllTrialsData[, c("germplasmName", trialsPairs[[i]])])
 258
 259   assign(pnames[1], scatter)
 260   assign(pnames[2], freq)
 261
 262 }
 263
 264 if (pairsCount == 1) {
 265   multiplot(scatter1, freq1, cols=2)
 266 } else if (pairsCount == 3) {
 267   multiplot(scatter1, freq1, scatter2, freq2, scatter3, freq3, cols=2)
 268 } else if (pairsCount == 6) {
 269   multiplot(scatter1, freq1, scatter2, freq2, scatter3, freq3, scatter4, freq4, scatter5, freq5, scatter6, freq6, cols=2)
 270 }
 271
 272 dev.off()
 273
 274