Merge branch 'master' into topic/refactor_bulk
[sgn.git] / R / combine_populations.r
blob965891d5967d12270301e31900c04653ba709197
1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
3 #training populations
5 options(echo = FALSE)
7 library(stats)
8 library(stringr)
9 library(imputation)
10 library(plyr)
12 allArgs <- commandArgs()
14 inFile <- grep("input_files",
15 allArgs,
16 ignore.case = TRUE,
17 perl = TRUE,
18 value = TRUE
21 outFile <- grep("output_files",
22 allArgs,
23 ignore.case = TRUE,
24 perl = TRUE,
25 value = TRUE
28 outFiles <- scan(outFile,
29 what = "character"
31 print(outFiles)
32 combinedGenoFile <- grep("genotype_data",
33 outFiles,
34 ignore.case = TRUE,
35 fixed = FALSE,
36 value = TRUE
39 combinedPhenoFile <- grep("phenotype_data",
40 outFiles,
41 ignore.case = TRUE,
42 fixed = FALSE,
43 value = TRUE
46 inFiles <- scan(inFile,
47 what = "character"
49 print(inFiles)
51 traitFile <- grep("trait_",
52 inFiles,
53 ignore.case = TRUE,
54 fixed = FALSE,
55 value = TRUE
58 trait <- scan(traitFile,
59 what = "character",
61 print(trait)
63 traitInfo<-strsplit(trait, "\t");
64 traitId<-traitInfo[[1]]
65 traitName<-traitInfo[[2]]
67 #extract trait phenotype data from all populations
68 #and combine them into one dataset
70 allPhenoFiles <- grep("phenotype_data",
71 inFiles,
72 ignore.case = TRUE,
73 fixed = FALSE,
74 value = TRUE
76 print(allPhenoFiles)
78 allGenoFiles <- grep("genotype_data",
79 inFiles,
80 ignore.case = TRUE,
81 fixed = FALSE,
82 value = TRUE
86 print(allGenoFiles)
88 popsPhenoSize <- length(allPhenoFiles)
89 popsGenoSize <- length(allGenoFiles)
90 popIds <- c()
91 combinedPhenoPops <- c()
93 for (i in 1:popsPhenoSize)
95 popId <- str_extract(allPhenoFiles[[i]], "\\d+")
96 popIds <- append(popIds, popId)
98 print(popId)
99 phenoData <- read.table(allPhenoFiles[[i]],
100 header = TRUE,
101 row.names = 1,
102 sep = "\t",
103 na.strings = c("NA", " ", "--", "-"),
104 dec = "."
108 phenoTrait <- subset(phenoData,
109 select = c("object_name", "stock_id", traitName)
112 if (sum(is.na(phenoTrait)) > 0)
114 print("sum of pheno missing values")
115 print(sum(is.na(phenoTrait)))
117 #fill in for missing data with mean value
118 phenoTrait[, traitName] <- replace (phenoTrait[, traitName],
119 is.na(phenoTrait[, traitName]),
120 mean(phenoTrait[, traitName], na.rm =TRUE)
123 #calculate mean of reps/plots of the same accession and
124 #create new df with the accession means
125 phenoTrait$stock_id <- NULL
126 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
128 print('phenotyped lines before averaging')
129 print(length(row.names(phenoTrait)))
131 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
133 print('phenotyped lines after averaging')
134 print(length(row.names(phenoTrait)))
136 row.names(phenoTrait) <- phenoTrait[, 1]
137 phenoTrait[, 1] <- NULL
139 phenoTrait <- round(phenoTrait, digits = 2)
141 } else {
142 print ('No missing data')
143 phenoTrait$stock_id <- NULL
144 phenoTrait <- phenoTrait[order(row.names(phenoTrait)), ]
146 print('phenotyped lines before averaging')
147 print(length(row.names(phenoTrait)))
149 phenoTrait<-ddply(phenoTrait, "object_name", colwise(mean))
151 print('phenotyped lines after averaging')
152 print(length(row.names(phenoTrait)))
154 row.names(phenoTrait) <- phenoTrait[, 1]
155 phenoTrait[, 1] <- NULL
157 phenoTrait <- round(phenoTrait, digits = 2)
161 newTraitName = paste(traitName, popId, sep = "_")
162 colnames(phenoTrait)[1] <- newTraitName
164 if (i == 1 )
166 print('no need to combine, yet')
167 combinedPhenoPops <- phenoTrait
169 } else {
170 print('combining...')
171 combinedPhenoPops <- merge(combinedPhenoPops, phenoTrait,
172 by = 0,
173 all=TRUE,
176 rownames(combinedPhenoPops) <- combinedPhenoPops[, 1]
177 combinedPhenoPops$Row.names <- NULL
182 #fill in missing data in combined phenotype dataset
183 #using row means
184 naIndices <- which(is.na(combinedPhenoPops), arr.ind=TRUE)
185 combinedPhenoPops <- as.matrix(combinedPhenoPops)
186 combinedPhenoPops[naIndices] <- rowMeans(combinedPhenoPops, na.rm=TRUE)[naIndices[,1]]
187 combinedPhenoPops <- as.data.frame(combinedPhenoPops)
188 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops)))
189 #combinedPhenoPops<-ddply(combinedPhenoPops, by=0, colwise(mean))
190 #message("combined total number of stocks in phenotype dataset (after averaging): ", length(rownames(combinedPhenoPops)))
192 combinedPhenoPops$Average<-round(apply(combinedPhenoPops,
194 function(x)
195 { mean(x) }
197 digits = 2
200 print(combinedPhenoPops[1:30, ])
202 markersList <- c()
203 combinedGenoPops <- c()
204 for (i in 1:popsGenoSize)
206 popId <- str_extract(allGenoFiles[[i]], "\\d+")
207 popIds <- append(popIds, popId)
209 print(popId)
210 genoData <- read.table(allGenoFiles[[i]],
211 header = TRUE,
212 row.names = 1,
213 sep = "\t",
214 na.strings = c("NA", " ", "--", "-"),
215 dec = "."
218 popMarkers <- colnames(genoData)
219 message("No of markers from population ", popId, ": ", length(popMarkers))
220 #print(popMarkers)
222 if (sum(is.na(genoData)) > 0)
224 print("sum of geno missing values")
225 print(sum(is.na(genoData)))
227 #impute missing genotypes
228 genoData <-kNNImpute(genoData, 10)
229 genoData <-as.data.frame(genoData)
231 #extract columns with imputed values
232 genoData <- subset(genoData,
233 select = grep("^x", names(genoData))
236 #remove prefix 'x.' from imputed columns
237 print(genoData[1:50, 1:4])
238 names(genoData) <- sub("x.", "", names(genoData))
240 genoData <- round(genoData, digits = 0)
241 message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
244 if (i == 1 )
246 print('no need to combine, yet')
247 combinedGenoPops <- genoData
249 } else {
250 print('combining genotype datasets...')
251 combinedGenoPops <-rbind(combinedGenoPops, genoData)
256 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
257 #discard duplicate clones
258 combinedGenoPops <- unique(combinedGenoPops)
259 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops)))
261 message("writing data into files...")
262 if(length(combinedPhenoFile) != 0 )
264 write.table(combinedPhenoPops,
265 file = combinedPhenoFile,
266 sep = "\t",
267 quote = FALSE,
268 col.names = NA,
272 if(length(combinedGenoFile) != 0 )
274 write.table(combinedGenoPops,
275 file = combinedGenoFile,
276 sep = "\t",
277 quote = FALSE,
278 col.names = NA,
282 q(save = "no", runLast = FALSE)