1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
11 library(phenoAnalysis
)
15 allArgs
<- commandArgs()
17 inFile
<- grep("input_files",
24 outFile
<- grep("output_files",
31 outFiles
<- scan(outFile
,
35 combinedGenoFile
<- grep("genotype_data",
42 combinedPhenoFile
<- grep("phenotype_data",
49 inFiles
<- scan(inFile
,
54 traitFile
<- grep("trait_",
61 trait
<- scan(traitFile
,
65 traitInfo
<- strsplit(trait
, "\t");
66 traitId
<- traitInfo
[[1]]
67 traitName
<- traitInfo
[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles
<- grep("phenotype_data",
78 message("phenotype files: ", allPhenoFiles
)
80 allGenoFiles
<- grep("genotype_data",
88 combinedPhenoPops
<- c()
91 for (popPhenoFile
in allPhenoFiles
) {
95 phenoData
<- fread(popPhenoFile
,
96 na
.strings
= c("NA", " ", "--", "-", "."))
98 phenoData
<- data
.frame(phenoData
)
100 phenoTrait
<- getAdjMeans(phenoData
,
104 popIdFile
<- basename(popPhenoFile
)
105 popId
<- str_extract(popIdFile
, "\\d+")
106 popIds
<- c(popIds
, popId
)
108 newTraitName
<- paste(traitName
, popId
, sep
= "_")
109 colnames(phenoTrait
)[2] <- newTraitName
112 print('no need to combine, yet')
113 combinedPhenoPops
<- phenoTrait
116 print('combining...phenotypes')
118 combinedPhenoPops
<- full_join(combinedPhenoPops
, phenoTrait
, by
='genotypes')
122 combinedPhenoPops
<- column_to_rownames(combinedPhenoPops
, var
='genotypes')
124 # #fill in missing data in combined phenotype dataset
126 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
127 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
128 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
129 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
131 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
133 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
,
142 combinedGenoPops
<- c()
145 for (popGenoFile
in allGenoFiles
) {
150 genoData
<- fread(popGenoFile
,
151 na
.strings
= c("NA", " ", "--", "-"),
154 genoData
<- data
.frame(genoData
)
155 message('cnt of genotypes in dataset: ', length(rownames(genoData
)))
156 genoData
<- genoData
[!duplicated(genoData
[,'V1']), ]
157 message('cnt of unique genotypes in dataset: ', length(rownames(genoData
)))
158 rownames(genoData
) <- genoData
[, 1]
159 genoData
[, 1] <- NULL
161 popGenoFile
<- basename(popGenoFile
)
162 popId
<- str_extract(popGenoFile
, "\\d+")
163 popIds
<- c(popIds
, popId
)
165 #popMarkers <- colnames(genoData)
166 #message("No of markers from population ", popId, ": ", length(popMarkers))
168 # message("sum of geno missing values: ", sum(is.na(genoData)))
169 # genoData <- genoData[, colSums(is.na(genoData)) < nrow(genoData) * 0.5]
170 # genoData <- data.frame(genoData)
171 # message("sum of geno missing values: ", sum(is.na(genoData)))
172 ## if (sum(is.na(genoData)) > 0) {
173 ## message("sum of geno missing values: ", sum(is.na(genoData)))
174 ## genoData <- na.roughfix(genoData)
175 ## message("total number of stocks for pop ", popId,": ", length(rownames(genoData)))
179 print('no need to combine, yet')
180 message('cnt of genotypes first dataset: ', length(rownames(genoData
)))
181 combinedGenoPops
<- genoData
184 print('combining genotype datasets...')
186 uniqGenoNames
<- unique(rownames(combinedGenoPops
))
188 message('cnt of genotypes in new dataset ', popId
, ': ', length(rownames(genoData
)) )
190 genoData
<- genoData
[!(rownames(genoData
) %in% uniqGenoNames
),]
192 message('cnt of unique genotypes from new dataset ', popId
, ': ', length(rownames(genoData
)))
194 if (!is
.null(genoData
)) {
195 combinedGenoPops
<- rbind(combinedGenoPops
, genoData
)
197 message('dataset ', popId
, ' has no unique genotypes.')
203 combinedGenoPops
<- combinedGenoPops
[order(rownames(combinedGenoPops
)), ]
204 message("combined number of genotypes in combined dataset: ", length(rownames(combinedGenoPops
)))
206 message("writing data to files...")
207 #if(length(combinedPhenoFile) != 0 )
209 fwrite(combinedPhenoPops
,
210 file
= combinedPhenoFile
,
217 #if(length(combinedGenoFile) != 0 )
219 fwrite(combinedGenoPops
,
220 file
= combinedGenoFile
,
227 q(save
= "no", runLast
= FALSE)