1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
14 allArgs
<- commandArgs()
16 inFile
<- grep("input_files",
23 outFile
<- grep("output_files",
30 outFiles
<- scan(outFile
,
34 combinedGenoFile
<- grep("genotype_data",
41 combinedPhenoFile
<- grep("phenotype_data",
48 inFiles
<- scan(inFile
,
53 traitFile
<- grep("trait_",
60 trait
<- scan(traitFile
,
64 traitInfo
<-strsplit(trait
, "\t");
65 traitId
<-traitInfo
[[1]]
66 traitName
<-traitInfo
[[2]]
68 #extract trait phenotype data from all populations
69 #and combine them into one dataset
71 allPhenoFiles
<- grep("phenotype_data",
77 message("phenotype files: ", allPhenoFiles
)
79 allGenoFiles
<- grep("genotype_data",
86 popsPhenoSize
<- length(allPhenoFiles
)
87 popsGenoSize
<- length(allGenoFiles
)
89 combinedPhenoPops
<- c()
91 for (popPhenoNum
in 1:popsPhenoSize
)
93 popId
<- str_extract(allPhenoFiles
[[popPhenoNum
]], "\\d+")
94 popIds
<- append(popIds
, popId
)
96 phenoData
<- read
.table(allPhenoFiles
[[popPhenoNum
]],
100 na
.strings
= c("NA", " ", "--", "-", "."),
105 phenoTrait
<- subset(phenoData
,
106 select
= c("object_name", "object_id", "design", "block", "replicate", traitName
)
109 experimentalDesign
<- phenoTrait
[2, 'design']
111 if (is
.na(experimentalDesign
) == TRUE) {experimentalDesign
<- c('No Design')}
113 if ((experimentalDesign
== 'Augmented' || experimentalDesign
== 'RCBD') && unique(phenoTrait$block
) > 1) {
115 message("experimental design: ", experimentalDesign
)
117 augData
<- subset(phenoTrait
,
118 select
= c("object_name", "object_id", "block", traitName
)
121 colnames(augData
)[1] <- "genotypes"
122 colnames(augData
)[4] <- "trait"
124 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|block
),
129 if (class(model
) != "try-error") {
130 phenoTrait
<- data
.frame(fixef(model
))
132 colnames(phenoTrait
) <- traitName
134 nn
<- gsub('genotypes', '', rownames(phenoTrait
))
135 rownames(phenoTrait
) <- nn
137 phenoTrait
<- round(phenoTrait
, digits
= 2)
139 formattedPhenoData
[, traitName
] <- phenoTrait
142 } else if (experimentalDesign
== 'Alpha') {
144 alphaData
<- phenoTrait
146 colnames(alphaData
)[1] <- "genotypes"
147 colnames(alphaData
)[5] <- "trait"
149 model
<- try(lmer(trait
~ 0 + genotypes
+ (1|replicate
/block
),
154 if (class(model
) != "try-error") {
155 phenoTrait
<- data
.frame(fixef(model
))
157 colnames(phenoTrait
) <- traitName
159 nn
<- gsub('genotypes', '', rownames(phenotrait
))
160 rownames(phenoTrait
) <- nn
162 phenoTrait
<- round(phenoTrait
, digits
= 2)
164 formattedPhenoData
[, i
] <- phenoTrait
169 phenoTrait
<- subset(phenoData
,
170 select
= c("object_name", "stock_id", traitName
)
173 if (sum(is
.na(phenoTrait
)) > 0) {
174 message("No. of pheno missing values: ", sum(is
.na(phenoTrait
)))
176 phenoTrait
<- na
.omit(phenoTrait
)
178 #calculate mean of reps/plots of the same accession and
179 #create new df with the accession means
180 phenoTrait$stock_id
<- NULL
181 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
183 print('phenotyped lines before averaging')
184 print(length(row
.names(phenoTrait
)))
186 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
188 print('phenotyped lines after averaging')
189 print(length(row
.names(phenoTrait
)))
191 row
.names(phenoTrait
) <- phenoTrait
[, 1]
192 phenoTrait
[, 1] <- NULL
194 phenoTrait
<- round(phenoTrait
, digits
= 2)
197 print ('No missing data')
198 phenoTrait$stock_id
<- NULL
199 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
201 print('phenotyped lines before averaging')
202 print(length(row
.names(phenoTrait
)))
204 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
206 print('phenotyped lines after averaging')
207 print(length(row
.names(phenoTrait
)))
209 row
.names(phenoTrait
) <- phenoTrait
[, 1]
210 phenoTrait
[, 1] <- NULL
212 phenoTrait
<- round(phenoTrait
, digits
= 2)
216 newTraitName
= paste(traitName
, popId
, sep
= "_")
217 colnames(phenoTrait
)[1] <- newTraitName
219 if (popPhenoNum
== 1 )
221 print('no need to combine, yet')
222 combinedPhenoPops
<- phenoTrait
225 print('combining...')
226 combinedPhenoPops
<- merge(combinedPhenoPops
, phenoTrait
,
231 rownames(combinedPhenoPops
) <- combinedPhenoPops
[, 1]
232 combinedPhenoPops$Row
.names
<- NULL
237 #fill in missing data in combined phenotype dataset
239 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
240 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
241 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
242 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
244 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
246 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
,
255 combinedGenoPops
<- c()
257 for (popGenoNum
in 1:popsGenoSize
)
259 popId
<- str_extract(allGenoFiles
[[popGenoNum
]], "\\d+")
260 popIds
<- append(popIds
, popId
)
262 genoData
<- read
.table(allGenoFiles
[[popGenoNum
]],
266 na
.strings
= c("NA", " ", "--", "-"),
270 popMarkers
<- colnames(genoData
)
271 message("No of markers from population ", popId
, ": ", length(popMarkers
))
274 if (sum(is
.na(genoData
)) > 0)
276 message("sum of geno missing values: ", sum(is
.na(genoData
)))
277 genoData
<- na
.roughfix(genoData
)
278 message("total number of stocks for pop ", popId
,": ", length(rownames(genoData
)))
281 if (popGenoNum
== 1 )
283 print('no need to combine, yet')
284 combinedGenoPops
<- genoData
287 print('combining genotype datasets...')
288 combinedGenoPops
<-rbind(combinedGenoPops
, genoData
)
293 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
294 #discard duplicate clones
295 combinedGenoPops
<- unique(combinedGenoPops
)
296 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
298 message("writing data into files...")
299 if(length(combinedPhenoFile
) != 0 )
301 write
.table(combinedPhenoPops
,
302 file
= combinedPhenoFile
,
309 if(length(combinedGenoFile
) != 0 )
311 write
.table(combinedGenoPops
,
312 file
= combinedGenoFile
,
319 q(save
= "no", runLast
= FALSE)