1 #formats and combines phenotype (of a single trait)
2 #and genotype datasets of multiple
13 library(phenoAnalysis
)
15 allArgs
<- commandArgs()
17 inFile
<- grep("input_files",
24 outFile
<- grep("output_files",
31 outFiles
<- scan(outFile
,
35 combinedGenoFile
<- grep("genotype_data",
42 combinedPhenoFile
<- grep("phenotype_data",
49 inFiles
<- scan(inFile
,
54 traitFile
<- grep("trait_",
61 trait
<- scan(traitFile
,
65 traitInfo
<-strsplit(trait
, "\t");
66 traitId
<-traitInfo
[[1]]
67 traitName
<-traitInfo
[[2]]
69 #extract trait phenotype data from all populations
70 #and combine them into one dataset
72 allPhenoFiles
<- grep("phenotype_data",
78 message("phenotype files: ", allPhenoFiles
)
80 allGenoFiles
<- grep("genotype_data",
87 popsPhenoSize
<- length(allPhenoFiles
)
88 popsGenoSize
<- length(allGenoFiles
)
90 combinedPhenoPops
<- c()
92 for (popPhenoNum
in 1:popsPhenoSize
) {
93 popId
<- str_extract_all(allPhenoFiles
[[popPhenoNum
]], "\\d+")
95 popId
<- popId
[[1]][2]
96 popIds
<- c(popIds
, popId
)
98 phenoData
<- fread(allPhenoFiles
[[popPhenoNum
]],
99 na
.strings
= c("NA", " ", "--", "-", "."),
101 phenoData
<- data
.frame(phenoData
)
103 phenoTrait
<- getAdjMeans(phenoData
, traitName
)
105 newTraitName
<- paste(traitName
, popId
, sep
= "_")
106 colnames(phenoTrait
)[2] <- newTraitName
108 if (popPhenoNum
== 1 )
110 print('no need to combine, yet')
111 combinedPhenoPops
<- phenoTrait
114 print('combining...')
116 combinedPhenoPops
<- merge(combinedPhenoPops
, phenoTrait
, all
=TRUE)
117 rownames(combinedPhenoPops
) <- combinedPhenoPops
[, 1]
118 combinedPhenoPops
[, 1] <- NULL
123 #fill in missing data in combined phenotype dataset
125 naIndices
<- which(is
.na(combinedPhenoPops
), arr
.ind
=TRUE)
126 combinedPhenoPops
<- as
.matrix(combinedPhenoPops
)
127 combinedPhenoPops
[naIndices
] <- rowMeans(combinedPhenoPops
, na
.rm
=TRUE)[naIndices
[,1]]
128 combinedPhenoPops
<- as
.data
.frame(combinedPhenoPops
)
130 message("combined total number of stocks in phenotype dataset (before averaging): ", length(rownames(combinedPhenoPops
)))
132 combinedPhenoPops$Average
<-round(apply(combinedPhenoPops
,
141 combinedGenoPops
<- c()
143 for (popGenoNum
in 1:popsGenoSize
)
145 popId
<- str_extract(allGenoFiles
[[popGenoNum
]], "\\d+")
146 popIds
<- append(popIds
, popId
)
148 genoData
<- fread(allGenoFiles
[[popGenoNum
]],
149 na
.strings
= c("NA", " ", "--", "-"),
152 genoData
<- as
.data
.frame(genoData
)
153 rownames(genoData
) <- genoData
[, 1]
154 genoData
[, 1] <- NULL
156 popMarkers
<- colnames(genoData
)
157 message("No of markers from population ", popId
, ": ", length(popMarkers
))
159 message("sum of geno missing values: ", sum(is
.na(genoData
)))
160 genoData
<- genoData
[, colSums(is
.na(genoData
)) < nrow(genoData
) * 0.5]
161 message("sum of geno missing values: ", sum(is
.na(genoData
)))
163 if (sum(is
.na(genoData
)) > 0)
165 message("sum of geno missing values: ", sum(is
.na(genoData
)))
166 genoData
<- na
.roughfix(genoData
)
167 message("total number of stocks for pop ", popId
,": ", length(rownames(genoData
)))
170 if (popGenoNum
== 1 )
172 print('no need to combine, yet')
173 combinedGenoPops
<- genoData
176 print('combining genotype datasets...')
177 combinedGenoPops
<-rbind(combinedGenoPops
, genoData
)
183 message("combined total number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
184 #discard duplicate clones
185 combinedGenoPops
<- unique(combinedGenoPops
)
186 message("combined unique number of stocks in genotype dataset: ", length(rownames(combinedGenoPops
)))
188 message("writing data into files...")
189 #if(length(combinedPhenoFile) != 0 )
191 fwrite(combinedPhenoPops
,
192 file
= combinedPhenoFile
,
199 #if(length(combinedGenoFile) != 0 )
201 fwrite(combinedGenoPops
,
202 file
= combinedGenoFile
,
209 q(save
= "no", runLast
= FALSE)