1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
12 allArgs
<- commandArgs()
14 inFile
<- grep("input_files",
21 outFile
<- grep("output_files",
28 outFiles
<- scan(outFile
,
32 combinedGenoFile
<- grep("genotype_data",
39 combinedPhenoFile
<- grep("phenotype_data",
46 inFiles
<- scan(inFile
,
51 traitFile
<- grep("trait_",
58 trait
<- scan(traitFile
,
63 traitInfo
<-strsplit(trait
, "\t");
64 traitId
<-traitInfo
[[1]]
65 traitName
<-traitInfo
[[2]]
67 #extract trait phenotype data from all populations
68 #and combine them into one dataset
70 allPhenoFiles
<- grep("phenotype_data",
78 allGenoFiles
<- grep("genotype_data",
88 popsPhenoSize
<- length(allPhenoFiles
)
89 popsGenoSize
<- length(allGenoFiles
)
91 combinedPhenoPops
<- c()
93 for (i
in 1:popsPhenoSize
)
95 popId
<- str_extract(allPhenoFiles
[[i
]], "\\d+")
96 popIds
<- append(popIds
, popId
)
99 phenoData
<- read
.table(allPhenoFiles
[[i
]],
103 na
.strings
= c("NA", " ", "--", "-"),
108 phenoTrait
<- subset(phenoData
,
109 select
= c("object_name", "stock_id", traitName
)
112 if (sum(is
.na(phenoTrait
)) > 0)
114 print("sum of pheno missing values")
115 print(sum(is
.na(phenoTrait
)))
117 #fill in for missing data with mean value
118 phenoTrait
[, traitName
] <- replace (phenoTrait
[, traitName
],
119 is
.na(phenoTrait
[, traitName
]),
120 mean(phenoTrait
[, traitName
], na
.rm
=TRUE)
123 #calculate mean of reps/plots of the same accession and
124 #create new df with the accession means
125 phenoTrait$stock_id
<- NULL
126 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
128 print('phenotyped lines before averaging')
129 print(length(row
.names(phenoTrait
)))
131 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
133 print('phenotyped lines after averaging')
134 print(length(row
.names(phenoTrait
)))
136 row
.names(phenoTrait
) <- phenoTrait
[, 1]
137 phenoTrait
[, 1] <- NULL
139 phenoTrait
<- round(phenoTrait
, digits
= 2)
142 print ('No missing data')
143 phenoTrait$stock_id
<- NULL
144 phenoTrait
<- phenoTrait
[order(row
.names(phenoTrait
)), ]
146 print('phenotyped lines before averaging')
147 print(length(row
.names(phenoTrait
)))
149 phenoTrait
<-ddply(phenoTrait
, "object_name", colwise(mean
))
151 print('phenotyped lines after averaging')
152 print(length(row
.names(phenoTrait
)))
154 row
.names(phenoTrait
) <- phenoTrait
[, 1]
155 phenoTrait
[, 1] <- NULL
157 phenoTrait
<- round(phenoTrait
, digits
= 2)
161 newTraitName
= paste(traitName
, popId
, sep
= "_")
162 colnames(phenoTrait
)[1] <- newTraitName
166 print('no need to combine, yet')
167 combinedPhenoPops
<- phenoTrait
170 print('combining...')
171 combinedPhenoPops
<- merge(combinedPhenoPops
, phenoTrait
,
176 rownames(combinedPhenoPops
) <- combinedPhenoPops
[, 1]
177 combinedPhenoPops$Row
.names
<- NULL
182 #fill in missing data in combined phenotype dataset
184 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
185 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
186 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
187 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
188 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
189 #combinedPhenoPops<-ddply(combinedPhenoPops, by=0, colwise(mean))
190 #message("combined total number of stocks in phenotype dataset (after averaging): ", length(rownames(combinedPhenoPops)))
192 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
,
200 print(combinedPhenoPops
[1:30, ])
203 combinedGenoPops
<- c()
204 for (i
in 1:popsGenoSize
)
206 popId
<- str_extract(allGenoFiles
[[i
]], "\\d+")
207 popIds
<- append(popIds
, popId
)
210 genoData
<- read
.table(allGenoFiles
[[i
]],
214 na
.strings
= c("NA", " ", "--", "-"),
218 popMarkers
<- colnames(genoData
)
219 message("No of markers from population ", popId
, ": ", length(popMarkers
))
222 if (sum(is
.na(genoData
)) > 0)
224 print("sum of geno missing values")
225 print(sum(is
.na(genoData
)))
227 #impute missing genotypes
228 genoData
<-kNNImpute(genoData
, 10)
229 genoData
<-as
.data
.frame(genoData
)
231 #extract columns with imputed values
232 genoData
<- subset(genoData
,
233 select
= grep("^x", names(genoData
))
236 #remove prefix 'x.' from imputed columns
237 print(genoData
[1:50, 1:4])
238 names(genoData
) <- sub("x.", "", names(genoData
))
240 genoData
<- round(genoData
, digits
= 0)
241 message("total number of stocks for pop ", popId
,": ", length(rownames(genoData
)))
246 print('no need to combine, yet')
247 combinedGenoPops
<- genoData
250 print('combining genotype datasets...')
251 combinedGenoPops
<-rbind(combinedGenoPops
, genoData
)
256 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
257 #discard duplicate clones
258 combinedGenoPops
<- unique(combinedGenoPops
)
259 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
261 message("writing data into files...")
262 if(length(combinedPhenoFile
) != 0 )
264 write
.table(combinedPhenoPops
,
265 file
= combinedPhenoFile
,
272 if(length(combinedGenoFile
) != 0 )
274 write
.table(combinedGenoPops
,
275 file
= combinedGenoFile
,
282 q(save
= "no", runLast
= FALSE)