consolidated unit fixture folder tests
[sgn.git] / R / combine_populations.r
blobaffef783711683c253f69012796a92dd7aa595e0
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #populations
5 options(echo = FALSE)
7 library(stats)
8 library(stringr)
9 library(randomForest)
10 library(plyr)
11 library(lme4)
12 library(data.table)
15 allArgs <- commandArgs()
17 inFile <- grep("input_files",
18 allArgs,
19 ignore.case = TRUE,
20 perl = TRUE,
21 value = TRUE
24 outFile <- grep("output_files",
25 allArgs,
26 ignore.case = TRUE,
27 perl = TRUE,
28 value = TRUE
31 outFiles <- scan(outFile,
32 what = "character"
35 combinedGenoFile <- grep("genotype_data",
36 outFiles,
37 ignore.case = TRUE,
38 fixed = FALSE,
39 value = TRUE
42 combinedPhenoFile <- grep("phenotype_data",
43 outFiles,
44 ignore.case = TRUE,
45 fixed = FALSE,
46 value = TRUE
49 inFiles <- scan(inFile,
50 what = "character"
52 print(inFiles)
54 traitFile <- grep("trait_",
55 inFiles,
56 ignore.case = TRUE,
57 fixed = FALSE,
58 value = TRUE
61 trait <- scan(traitFile,
62 what = "character",
65 traitInfo<-strsplit(trait, "\t");
66 traitId<-traitInfo[[1]]
67 traitName<-traitInfo[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles <- grep("phenotype_data",
73 inFiles,
74 ignore.case = TRUE,
75 fixed = FALSE,
76 value = TRUE
78 message("phenotype files: ", allPhenoFiles)
80 allGenoFiles <- grep("genotype_data",
81 inFiles,
82 ignore.case = TRUE,
83 fixed = FALSE,
84 value = TRUE
87 popsPhenoSize <- length(allPhenoFiles)
88 popsGenoSize <- length(allGenoFiles)
89 popIds <- c()
90 combinedPhenoPops <- c()
92 for (popPhenoNum in 1:popsPhenoSize)
94 popId <- str_extract(allPhenoFiles[[popPhenoNum]], "\\d+")
95 popIds <- append(popIds, popId)
97 phenoData <- fread(allPhenoFiles[[popPhenoNum]],
98 na.strings = c("NA", " ", "--", "-", "."),
102 phenoTrait <- subset(phenoData,
103 select = c("object_name", "object_id", "design", "block", "replicate", traitName)
106 experimentalDesign <- phenoTrait[2, 'design']
108 if (is.na(experimentalDesign) == TRUE) {experimentalDesign <- c('No Design')}
110 if ((experimentalDesign == 'Augmented' || experimentalDesign == 'RCBD') && unique(phenoTrait$block) > 1) {
112 message("experimental design: ", experimentalDesign)
114 augData <- subset(phenoTrait,
115 select = c("object_name", "object_id", "block", traitName)
118 colnames(augData)[1] <- "genotypes"
119 colnames(augData)[4] <- "trait"
121 model <- try(lmer(trait ~ 0 + genotypes + (1|block),
122 augData,
123 na.action = na.omit
126 if (class(model) != "try-error") {
127 phenoTrait <- data.frame(fixef(model))
129 colnames(phenoTrait) <- traitName
131 nn <- gsub('genotypes', '', rownames(phenoTrait))
132 rownames(phenoTrait) <- nn
134 phenoTrait <- round(phenoTrait, digits = 2)
136 formattedPhenoData[, traitName] <- phenoTrait
139 } else if (experimentalDesign == 'Alpha') {
141 alphaData <- phenoTrait
143 colnames(alphaData)[1] <- "genotypes"
144 colnames(alphaData)[5] <- "trait"
146 model <- try(lmer(trait ~ 0 + genotypes + (1|replicate/block),
147 alphaData,
148 na.action = na.omit
151 if (class(model) != "try-error") {
152 phenoTrait <- data.frame(fixef(model))
154 colnames(phenoTrait) <- traitName
156 nn <- gsub('genotypes', '', rownames(phenotrait))
157 rownames(phenoTrait) <- nn
159 phenoTrait <- round(phenoTrait, digits = 2)
161 formattedPhenoData[, i] <- phenoTrait
164 } else {
166 phenoTrait <- subset(phenoData,
167 select = c("object_name", "stock_id", traitName)
170 if (sum(is.na(phenoTrait)) > 0) {
171 message("No. of pheno missing values: ", sum(is.na(phenoTrait)))
173 phenoTrait <- na.omit(phenoTrait)
175 #calculate mean of reps/plots of the same accession and
176 #create new df with the accession means
177 phenoTrait$stock_id <- NULL
178 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
180 print('phenotyped lines before averaging')
181 print(length(row.names(phenoTrait)))
183 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
185 print('phenotyped lines after averaging')
186 print(length(row.names(phenoTrait)))
188 row.names(phenoTrait) <- phenoTrait[, 1]
189 phenoTrait[, 1] <- NULL
191 phenoTrait <- round(phenoTrait, digits = 2)
193 } else {
194 print ('No missing data')
195 phenoTrait$stock_id <- NULL
196 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
198 print('phenotyped lines before averaging')
199 print(length(row.names(phenoTrait)))
201 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
203 print('phenotyped lines after averaging')
204 print(length(row.names(phenoTrait)))
206 row.names(phenoTrait) <- phenoTrait[, 1]
207 phenoTrait[, 1] <- NULL
209 phenoTrait <- round(phenoTrait, digits = 2)
213 newTraitName = paste(traitName, popId, sep = "_")
214 colnames(phenoTrait)[1] <- newTraitName
216 if (popPhenoNum == 1 )
218 print('no need to combine, yet')
219 combinedPhenoPops <- phenoTrait
221 } else {
222 print('combining...')
223 combinedPhenoPops <- merge(combinedPhenoPops, phenoTrait,
224 by = 0,
225 all=TRUE,
228 rownames(combinedPhenoPops) <- combinedPhenoPops[, 1]
229 combinedPhenoPops$Row.names <- NULL
234 #fill in missing data in combined phenotype dataset
235 #using row means
236 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
237 combinedPhenoPops <- as.matrix(combinedPhenoPops)
238 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
239 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
241 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
243 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
245 function(x)
246 { mean(x) }
248 digits = 2
251 markersList <- c()
252 combinedGenoPops <- c()
254 for (popGenoNum in 1:popsGenoSize)
256 popId <- str_extract(allGenoFiles[[popGenoNum]], "\\d+")
257 popIds <- append(popIds, popId)
259 genoData <- fread(allGenoFiles[[popGenoNum]],
260 na.strings = c("NA", " ", "--", "-"),
263 genoData <- as.data.frame(genoData)
264 rownames(genoData) <- genoData[, 1]
265 genoData[, 1] <- NULL
267 popMarkers <- colnames(genoData)
268 message("No of markers from population ", popId, ": ", length(popMarkers))
269 #print(popMarkers)
271 if (sum(is.na(genoData)) > 0)
273 message("sum of geno missing values: ", sum(is.na(genoData)))
274 genoData <- na.roughfix(genoData)
275 message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
278 if (popGenoNum == 1 )
280 print('no need to combine, yet')
281 combinedGenoPops <- genoData
283 } else {
284 print('combining genotype datasets...')
285 combinedGenoPops <-rbind(combinedGenoPops, genoData)
290 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
291 #discard duplicate clones
292 combinedGenoPops <- unique(combinedGenoPops)
293 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
295 message("writing data into files...")
296 #if(length(combinedPhenoFile) != 0 )
298 write.table(combinedPhenoPops,
299 file = combinedPhenoFile,
300 sep = "\t",
301 quote = FALSE,
302 col.names = NA,
306 #if(length(combinedGenoFile) != 0 )
308 write.table(combinedGenoPops,
309 file = combinedGenoFile,
310 sep = "\t",
311 quote = FALSE,
312 col.names = NA,
316 q(save = "no", runLast = FALSE)