Merge branch 'master' into topic/dt_feature_search
[sgn.git] / R / combine_populations.r
blob45ebf5369cd1761b0f05d5cfa95dd49f45dfb570
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #populations
5 options(echo = FALSE)
7 library(stats)
8 library(stringr)
9 library(randomForest)
10 library(plyr)
11 library(lme4)
14 allArgs <- commandArgs()
16 inFile <- grep("input_files",
17 allArgs,
18 ignore.case = TRUE,
19 perl = TRUE,
20 value = TRUE
23 outFile <- grep("output_files",
24 allArgs,
25 ignore.case = TRUE,
26 perl = TRUE,
27 value = TRUE
30 outFiles <- scan(outFile,
31 what = "character"
34 combinedGenoFile <- grep("genotype_data",
35 outFiles,
36 ignore.case = TRUE,
37 fixed = FALSE,
38 value = TRUE
41 combinedPhenoFile <- grep("phenotype_data",
42 outFiles,
43 ignore.case = TRUE,
44 fixed = FALSE,
45 value = TRUE
48 inFiles <- scan(inFile,
49 what = "character"
51 print(inFiles)
53 traitFile <- grep("trait_",
54 inFiles,
55 ignore.case = TRUE,
56 fixed = FALSE,
57 value = TRUE
60 trait <- scan(traitFile,
61 what = "character",
64 traitInfo<-strsplit(trait, "\t");
65 traitId<-traitInfo[[1]]
66 traitName<-traitInfo[[2]]
68 #extract trait phenotype data from all populations
69 #and combine them into one dataset
71 allPhenoFiles <- grep("phenotype_data",
72 inFiles,
73 ignore.case = TRUE,
74 fixed = FALSE,
75 value = TRUE
77 message("phenotype files: ", allPhenoFiles)
79 allGenoFiles <- grep("genotype_data",
80 inFiles,
81 ignore.case = TRUE,
82 fixed = FALSE,
83 value = TRUE
86 popsPhenoSize <- length(allPhenoFiles)
87 popsGenoSize <- length(allGenoFiles)
88 popIds <- c()
89 combinedPhenoPops <- c()
91 for (popPhenoNum in 1:popsPhenoSize)
93 popId <- str_extract(allPhenoFiles[[popPhenoNum]], "\\d+")
94 popIds <- append(popIds, popId)
96 phenoData <- read.table(allPhenoFiles[[popPhenoNum]],
97 header = TRUE,
98 row.names = 1,
99 sep = "\t",
100 na.strings = c("NA", " ", "--", "-", "."),
101 dec = "."
105 phenoTrait <- subset(phenoData,
106 select = c("object_name", "object_id", "design", "block", "replicate", traitName)
109 experimentalDesign <- phenoTrait[2, 'design']
111 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
113 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoTrait$block) > 1) {
115 message("experimental design: ", experimentalDesign)
117 augData <- subset(phenoTrait,
118 select = c("object_name", "object_id", "block", traitName)
121 colnames(augData)[1] <- "genotypes"
122 colnames(augData)[4] <- "trait"
124 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
125 augData,
126 na.action = na.omit
129 if (class(model) != "try-error") {
130 phenoTrait <- data.frame(fixef(model))
132 colnames(phenoTrait) <- traitName
134 nn <- gsub('genotypes', '', rownames(phenoTrait))
135 rownames(phenoTrait) <- nn
137 phenoTrait <- round(phenoTrait, digits = 2)
139 formattedPhenoData[, traitName] <- phenoTrait
142 } else if (experimentalDesign == 'Alpha') {
144 alphaData <- phenoTrait
146 colnames(alphaData)[1] <- "genotypes"
147 colnames(alphaData)[5] <- "trait"
149 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
150 alphaData,
151 na.action = na.omit
154 if (class(model) != "try-error") {
155 phenoTrait <- data.frame(fixef(model))
157 colnames(phenoTrait) <- traitName
159 nn <- gsub('genotypes', '', rownames(phenotrait))
160 rownames(phenoTrait) <- nn
162 phenoTrait <- round(phenoTrait, digits = 2)
164 formattedPhenoData[, i] <- phenoTrait
167 } else {
169 phenoTrait <- subset(phenoData,
170 select = c("object_name", "stock_id", traitName)
173 if (sum(is.na(phenoTrait)) > 0) {
174 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
176 phenoTrait <- na.omit(phenoTrait)
178 #calculate mean of reps/plots of the same accession and
179 #create new df with the accession means
180 phenoTrait$stock_id <- NULL
181 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
183 print('phenotyped lines before averaging')
184 print(length(row.names(phenoTrait)))
186 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
188 print('phenotyped lines after averaging')
189 print(length(row.names(phenoTrait)))
191 row.names(phenoTrait) <- phenoTrait[, 1]
192 phenoTrait[, 1] <- NULL
194 phenoTrait <- round(phenoTrait, digits = 2)
196 } else {
197 print ('No missing data')
198 phenoTrait$stock_id <- NULL
199 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
201 print('phenotyped lines before averaging')
202 print(length(row.names(phenoTrait)))
204 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
206 print('phenotyped lines after averaging')
207 print(length(row.names(phenoTrait)))
209 row.names(phenoTrait) <- phenoTrait[, 1]
210 phenoTrait[, 1] <- NULL
212 phenoTrait <- round(phenoTrait, digits = 2)
216 newTraitName = paste(traitName, popId, sep = "_")
217 colnames(phenoTrait)[1] <- newTraitName
219 if (popPhenoNum == 1 )
221 print('no need to combine, yet')
222 combinedPhenoPops <- phenoTrait
224 } else {
225 print('combining...')
226 combinedPhenoPops <- merge(combinedPhenoPops, phenoTrait,
227 by = 0,
228 all=TRUE,
231 rownames(combinedPhenoPops) <- combinedPhenoPops[, 1]
232 combinedPhenoPops$Row.names <- NULL
237 #fill in missing data in combined phenotype dataset
238 #using row means
239 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
240 combinedPhenoPops <- as.matrix(combinedPhenoPops)
241 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
242 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
244 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
246 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
248 function(x)
249 { mean(x) }
251 digits = 2
254 markersList <- c()
255 combinedGenoPops <- c()
257 for (popGenoNum in 1:popsGenoSize)
259 popId <- str_extract(allGenoFiles[[popGenoNum]], "\\d+")
260 popIds <- append(popIds, popId)
262 genoData <- read.table(allGenoFiles[[popGenoNum]],
263 header = TRUE,
264 row.names = 1,
265 sep = "\t",
266 na.strings = c("NA", " ", "--", "-"),
267 dec = "."
270 popMarkers <- colnames(genoData)
271 message("No of markers from population ", popId, ": ", length(popMarkers))
272 #print(popMarkers)
274 if (sum(is.na(genoData)) > 0)
276 message("sum of geno missing values: ", sum(is.na(genoData)))
277 genoData <- na.roughfix(genoData)
278 message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
281 if (popGenoNum == 1 )
283 print('no need to combine, yet')
284 combinedGenoPops <- genoData
286 } else {
287 print('combining genotype datasets...')
288 combinedGenoPops <-rbind(combinedGenoPops, genoData)
293 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
294 #discard duplicate clones
295 combinedGenoPops <- unique(combinedGenoPops)
296 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
298 message("writing data into files...")
299 if(length(combinedPhenoFile) != 0 )
301 write.table(combinedPhenoPops,
302 file = combinedPhenoFile,
303 sep = "\t",
304 quote = FALSE,
305 col.names = NA,
309 if(length(combinedGenoFile) != 0 )
311 write.table(combinedGenoPops,
312 file = combinedGenoFile,
313 sep = "\t",
314 quote = FALSE,
315 col.names = NA,
319 q(save = "no", runLast = FALSE)