1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
15 allArgs
<- commandArgs()
17 inFile
<- grep("input_files",
24 outFile
<- grep("output_files",
31 outFiles
<- scan(outFile
,
35 combinedGenoFile
<- grep("genotype_data",
42 combinedPhenoFile
<- grep("phenotype_data",
49 inFiles
<- scan(inFile
,
54 traitFile
<- grep("trait_",
61 trait
<- scan(traitFile
,
65 traitInfo
<-strsplit(trait
, "\t");
66 traitId
<-traitInfo
[[1]]
67 traitName
<-traitInfo
[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles
<- grep("phenotype_data",
78 message("phenotype files: ", allPhenoFiles
)
80 allGenoFiles
<- grep("genotype_data",
87 popsPhenoSize
<- length(allPhenoFiles
)
88 popsGenoSize
<- length(allGenoFiles
)
90 combinedPhenoPops
<- c()
92 for (popPhenoNum
in 1:popsPhenoSize
)
94 popId
<- str_extract(allPhenoFiles
[[popPhenoNum
]], "\\d+")
95 popIds
<- append(popIds
, popId
)
97 phenoData
<- fread(allPhenoFiles
[[popPhenoNum
]],
98 na
.strings
= c("NA", " ", "--", "-", "."),
102 phenoTrait
<- subset(phenoData
,
103 select
= c("object_name", "object_id", "design", "block", "replicate", traitName
)
106 experimentalDesign
<- phenoTrait
[2, 'design']
108 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
110 if ((experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') && unique(phenoTrait$block
) > 1) {
112 message("experimental design: ", experimentalDesign
)
114 augData
<- subset(phenoTrait
,
115 select
= c("object_name", "object_id", "block", traitName
)
118 colnames(augData
)[1] <- "genotypes"
119 colnames(augData
)[4] <- "trait"
121 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|block
),
126 if (class(model
) != "try-error") {
127 phenoTrait
<- data
.frame(fixef(model
))
129 colnames(phenoTrait
) <- traitName
131 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
132 rownames(phenoTrait
) <- nn
134 phenoTrait
<- round(phenoTrait
, digits
= 2)
136 formattedPhenoData
[, traitName
] <- phenoTrait
139 } else if (experimentalDesign
== 'Alpha') {
141 alphaData
<- phenoTrait
143 colnames(alphaData
)[1] <- "genotypes"
144 colnames(alphaData
)[5] <- "trait"
146 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|replicate
/block
),
151 if (class(model
) != "try-error") {
152 phenoTrait
<- data
.frame(fixef(model
))
154 colnames(phenoTrait
) <- traitName
156 nn
<- gsub('genotypes', '', rownames(phenotrait
))
157 rownames(phenoTrait
) <- nn
159 phenoTrait
<- round(phenoTrait
, digits
= 2)
161 formattedPhenoData
[, i
] <- phenoTrait
166 phenoTrait
<- subset(phenoData
,
167 select
= c("object_name", "stock_id", traitName
)
170 if (sum(is
.na(phenoTrait
)) > 0) {
171 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
173 phenoTrait
<- na
.omit(phenoTrait
)
175 #calculate mean of reps/plots of the same accession and
176 #create new df with the accession means
177 phenoTrait$stock_id
<- NULL
178 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
180 print('phenotyped lines before averaging')
181 print(length(row
.names(phenoTrait
)))
183 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
185 print('phenotyped lines after averaging')
186 print(length(row
.names(phenoTrait
)))
188 row
.names(phenoTrait
) <- phenoTrait
[, 1]
189 phenoTrait
[, 1] <- NULL
191 phenoTrait
<- round(phenoTrait
, digits
= 2)
194 print ('No missing data')
195 phenoTrait$stock_id
<- NULL
196 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
198 print('phenotyped lines before averaging')
199 print(length(row
.names(phenoTrait
)))
201 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
203 print('phenotyped lines after averaging')
204 print(length(row
.names(phenoTrait
)))
206 row
.names(phenoTrait
) <- phenoTrait
[, 1]
207 phenoTrait
[, 1] <- NULL
209 phenoTrait
<- round(phenoTrait
, digits
= 2)
213 newTraitName
= paste(traitName
, popId
, sep
= "_")
214 colnames(phenoTrait
)[1] <- newTraitName
216 if (popPhenoNum
== 1 )
218 print('no need to combine, yet')
219 combinedPhenoPops
<- phenoTrait
222 print('combining...')
223 combinedPhenoPops
<- merge(combinedPhenoPops
, phenoTrait
,
228 rownames(combinedPhenoPops
) <- combinedPhenoPops
[, 1]
229 combinedPhenoPops$Row
.names
<- NULL
234 #fill in missing data in combined phenotype dataset
236 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
237 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
238 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
239 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
241 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
243 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
,
252 combinedGenoPops
<- c()
254 for (popGenoNum
in 1:popsGenoSize
)
256 popId
<- str_extract(allGenoFiles
[[popGenoNum
]], "\\d+")
257 popIds
<- append(popIds
, popId
)
259 genoData
<- fread(allGenoFiles
[[popGenoNum
]],
260 na
.strings
= c("NA", " ", "--", "-"),
263 genoData
<- as
.data
.frame(genoData
)
264 rownames(genoData
) <- genoData
[, 1]
265 genoData
[, 1] <- NULL
267 popMarkers
<- colnames(genoData
)
268 message("No of markers from population ", popId
, ": ", length(popMarkers
))
271 if (sum(is
.na(genoData
)) > 0)
273 message("sum of geno missing values: ", sum(is
.na(genoData
)))
274 genoData
<- na
.roughfix(genoData
)
275 message("total number of stocks for pop ", popId
,": ", length(rownames(genoData
)))
278 if (popGenoNum
== 1 )
280 print('no need to combine, yet')
281 combinedGenoPops
<- genoData
284 print('combining genotype datasets...')
285 combinedGenoPops
<-rbind(combinedGenoPops
, genoData
)
290 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
291 #discard duplicate clones
292 combinedGenoPops
<- unique(combinedGenoPops
)
293 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
295 message("writing data into files...")
296 #if(length(combinedPhenoFile) != 0 )
298 write
.table(combinedPhenoPops
,
299 file
= combinedPhenoFile
,
306 #if(length(combinedGenoFile) != 0 )
308 write
.table(combinedGenoPops
,
309 file
= combinedGenoFile
,
316 q(save
= "no", runLast
= FALSE)